From fee62922d8857ce93f1d4e90fd7240629d606997 Mon Sep 17 00:00:00 2001
From: Luiz Irber
Date: Mon, 5 Feb 2024 08:26:15 -0800
Subject: [PATCH] Pre-commit updates (#2427)

Ref #2421
Fix #2908

Updates older pre-commit hooks (initial experiment: https://github.com/sourmash-bio/sourmash/pull/680), mostly based (again) on the [tox configs](https://github.com/tox-dev/tox/blob/main/.pre-commit-config.yaml).

Uses ruff in place of pyupgrade/isort/black/flake8.

This PR has a couple of commits: the first ones update the configs, and the last one runs `tox -e fix_lint` to apply pre-commit. The aim is to iterate on the first (config) commits without breaking the tests run by the last (formatting) commit.

---
 .pre-commit-config.yaml | 95 +- benchmarks/benchmarks.py | 65 +- doc/conf.py | 191 +- pyproject.toml | 29 +- src/sourmash/__init__.py | 55 +- src/sourmash/__main__.py | 9 +- src/sourmash/cli/__init__.py | 75 +- src/sourmash/cli/categorize.py | 28 +- src/sourmash/cli/compare.py | 93 +- src/sourmash/cli/compute.py | 107 +- src/sourmash/cli/gather.py | 131 +- src/sourmash/cli/import_csv.py | 12 +- src/sourmash/cli/index.py | 64 +- src/sourmash/cli/info.py | 23 +- src/sourmash/cli/lca/__init__.py | 19 +- src/sourmash/cli/lca/classify.py | 51 +- src/sourmash/cli/lca/compare_csv.py | 34 +- src/sourmash/cli/lca/index.py | 80 +- src/sourmash/cli/lca/rankinfo.py | 20 +- src/sourmash/cli/lca/summarize.py | 59 +- src/sourmash/cli/migrate.py | 6 +- src/sourmash/cli/multigather.py | 58 +- src/sourmash/cli/plot.py | 76 +- src/sourmash/cli/prefetch.py | 76 +- src/sourmash/cli/sbt_combine.py | 15 +- src/sourmash/cli/scripts/__init__.py | 24 +- src/sourmash/cli/search.py | 99 +- src/sourmash/cli/sig/__init__.py | 22 +- src/sourmash/cli/sig/cat.py | 41 +- src/sourmash/cli/sig/check.py | 57 +- src/sourmash/cli/sig/collect.py | 55 +- src/sourmash/cli/sig/describe.py | 31 +- src/sourmash/cli/sig/downsample.py | 36 +- src/sourmash/cli/sig/export.py | 19 +- src/sourmash/cli/sig/extract.py | 43 +- src/sourmash/cli/sig/fileinfo.py | 24 +- src/sourmash/cli/sig/filter.py | 44 +- src/sourmash/cli/sig/flatten.py | 38 +- src/sourmash/cli/sig/grep.py | 65 +- src/sourmash/cli/sig/inflate.py | 23 +- src/sourmash/cli/sig/ingest.py | 22 +- src/sourmash/cli/sig/intersect.py | 33 +- src/sourmash/cli/sig/kmers.py | 50 +- src/sourmash/cli/sig/manifest.py | 36 +- src/sourmash/cli/sig/merge.py | 35 +- src/sourmash/cli/sig/overlap.py | 12 +- src/sourmash/cli/sig/rename.py | 38 +- src/sourmash/cli/sig/split.py | 32 +- src/sourmash/cli/sig/subtract.py | 32 +- src/sourmash/cli/sketch/__init__.py | 19 +- src/sourmash/cli/sketch/dna.py | 79 +- src/sourmash/cli/sketch/fromfile.py | 63 +- src/sourmash/cli/sketch/protein.py | 83 +- src/sourmash/cli/sketch/translate.py | 87 +- src/sourmash/cli/storage/__init__.py | 19 +- src/sourmash/cli/storage/convert.py | 13 +- src/sourmash/cli/tax/__init__.py | 22 +- src/sourmash/cli/tax/annotate.py | 71 +- src/sourmash/cli/tax/genome.py | 105 +- src/sourmash/cli/tax/grep.py | 76 +- src/sourmash/cli/tax/metagenome.py | 113 +- src/sourmash/cli/tax/prepare.py | 55 +- src/sourmash/cli/tax/summarize.py | 47 +- src/sourmash/cli/utils.py | 190 +- src/sourmash/cli/watch.py | 29 +- src/sourmash/command_compute.py | 257 +- src/sourmash/command_sketch.py | 309 +- src/sourmash/commands.py | 756 ++- src/sourmash/compare.py | 119 +- src/sourmash/distance_utils.py | 114 +- src/sourmash/exceptions.py | 11 +- src/sourmash/fig.py | 43 +- src/sourmash/hll.py | 2 +- src/sourmash/index/__init__.py | 245 +- src/sourmash/index/revindex.py | 154 +- src/sourmash/index/sqlite_index.py
| 416 +- src/sourmash/lca/__init__.py | 13 +- src/sourmash/lca/__main__.py | 9 +- src/sourmash/lca/command_classify.py | 55 +- src/sourmash/lca/command_compare_csv.py | 54 +- src/sourmash/lca/command_index.py | 237 +- src/sourmash/lca/command_rankinfo.py | 7 +- src/sourmash/lca/command_summarize.py | 79 +- src/sourmash/lca/lca_db.py | 220 +- src/sourmash/lca/lca_utils.py | 90 +- src/sourmash/logging.py | 54 +- src/sourmash/manifest.py | 142 +- src/sourmash/minhash.py | 442 +- src/sourmash/nodegraph.py | 46 +- src/sourmash/np_utils.py | 6 +- src/sourmash/picklist.py | 108 +- src/sourmash/plugins.py | 80 +- src/sourmash/save_load.py | 105 +- src/sourmash/sbt.py | 510 +- src/sourmash/sbt_storage.py | 76 +- src/sourmash/sbtmh.py | 24 +- src/sourmash/search.py | 416 +- src/sourmash/sig/__init__.py | 2 +- src/sourmash/sig/__main__.py | 693 ++- src/sourmash/sig/grep.py | 31 +- src/sourmash/signature.py | 124 +- src/sourmash/sketchcomparison.py | 106 +- src/sourmash/sourmash_args.py | 259 +- src/sourmash/sqlite_utils.py | 23 +- src/sourmash/tax/__main__.py | 387 +- src/sourmash/tax/tax_utils.py | 1167 ++-- src/sourmash/utils.py | 2 +- tests/conftest.py | 36 +- tests/sourmash_tst_utils.py | 84 +- tests/test__minhash_hypothesis.py | 16 +- tests/test_api.py | 35 +- tests/test_bugs.py | 9 +- tests/test_cmd_signature.py | 3280 ++++++----- tests/test_cmd_signature_collect.py | 410 +- tests/test_cmd_signature_fileinfo.py | 210 +- tests/test_cmd_signature_grep.py | 268 +- tests/test_compare.py | 150 +- tests/test_deprecated.py | 7 +- tests/test_distance_utils.py | 270 +- tests/test_hll.py | 36 +- tests/test_index.py | 746 +-- tests/test_index_protocol.py | 323 +- tests/test_jaccard.py | 44 +- tests/test_lca.py | 2520 +++++---- tests/test_lca_db_protocol.py | 56 +- tests/test_lca_functions.py | 356 +- tests/test_manifest.py | 57 +- tests/test_manifest_protocol.py | 119 +- tests/test_minhash.py | 731 ++- tests/test_nodegraph.py | 22 +- tests/test_np_utils.py | 1 - tests/test_picklist.py | 34 +- tests/test_plugin_framework.py | 185 +- tests/test_prefetch.py | 799 ++- tests/test_sbt.py | 572 +- tests/test_search.py | 413 +- tests/test_signature.py | 246 +- tests/test_sketchcomparison.py | 506 +- tests/test_sourmash.py | 6435 +++++++++++++--------- tests/test_sourmash_args.py | 298 +- tests/test_sourmash_compute.py | 874 +-- tests/test_sourmash_sketch.py | 1497 +++-- tests/test_sqlite_index.py | 380 +- tests/test_tax.py | 4702 +++++++++++----- tests/test_tax_utils.py | 3945 +++++++++---- tests/test_test_framework.py | 2 +- tox.ini | 259 +- utils/cardinality_estimate_confidence.py | 86 +- utils/check-tree.py | 10 +- utils/compute-dna-mh-another-way.py | 19 +- utils/compute-input-prot-another-way.py | 108 +- utils/compute-prot-mh-another-way.py | 108 +- 152 files changed, 26467 insertions(+), 16243 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bca7329143..50ab4e2c26 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,71 +1,26 @@ -default_language_version: - python: python3 repos: -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: v3.2.0 - hooks: - - id: check-ast -# - id: check-builtin-literals - - id: check-docstring-first - - id: check-merge-conflict - - id: check-yaml - - id: check-toml - - id: debug-statements -# - id: end-of-file-fixer -# exclude: 'tests/test-data' -# - id: trailing-whitespace -# exclude: 'tests/test-data' -#- repo: https://github.com/asottile/pyupgrade -# rev: v2.7.2 -# hooks: -# - id: pyupgrade -#- repo: 
https://github.com/pre-commit/mirrors-isort -# rev: v5.4.2 -# hooks: -# - id: isort -# additional_dependencies: [toml] - -# format using black -# when the full codebase is black, use it directly; -# while it isn't, let's use darker to format new/changed code -- repo: https://github.com/akaihola/darker - rev: 1.7.1 - hooks: - - id: darker -#- repo: https://github.com/psf/black -# rev: 20.8b1 -# hooks: -# - id: black -# args: -# - --safe -# language_version: python3.8 -#- repo: https://github.com/asottile/blacken-docs -# rev: v1.8.0 -# hooks: -# - id: blacken-docs -# additional_dependencies: -# - black==19.10b0 -# language_version: python3.8 - -#- repo: https://github.com/asottile/add-trailing-comma -# rev: v2.0.1 -# hooks: -# - id: add-trailing-comma -#- repo: https://github.com/pre-commit/pygrep-hooks -# rev: v1.6.0 -# hooks: -# - id: rst-backticks -#- repo: https://github.com/asottile/setup-cfg-fmt -# rev: v1.11.0 -# hooks: -# - id: setup-cfg-fmt -# args: -# - --min-py3-version -# - '3.7' -#- repo: https://gitlab.com/pycqa/flake8 -# rev: 3.8.3 -# hooks: -# - id: flake8 -# additional_dependencies: -# - flake8-bugbear == 20.1.2 -# language_version: python3.8 + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: check-ast + - id: check-builtin-literals + - id: check-docstring-first + - id: check-merge-conflict + - id: check-yaml + - id: check-toml + - id: debug-statements + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.2.0 + hooks: + - id: ruff-format + - id: ruff + args: ["--fix", "--unsafe-fixes", "--exit-non-zero-on-fix"] + - repo: https://github.com/tox-dev/tox-ini-fmt + rev: "0.5.2" + hooks: + - id: tox-ini-fmt + args: ["-p", "fix_lint"] + - repo: meta + hooks: + - id: check-hooks-apply + - id: check-useless-excludes diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py index b2b3d7180b..d517bf7b2f 100644 --- a/benchmarks/benchmarks.py +++ b/benchmarks/benchmarks.py @@ -4,30 +4,31 @@ from sourmash.sbt_storage import ZipStorage from sourmash.minhash import MinHash -RANDOM_SEQ_SIZE=3000 -RANDOM_SEQ_NUMBER=300 +RANDOM_SEQ_SIZE = 3000 +RANDOM_SEQ_NUMBER = 300 -MINHASH_NUM=500 -MINHASH_K=21 +MINHASH_NUM = 500 +MINHASH_K = 21 -GET_MINS_RANGE=500 -ADD_HASH_RANGE=10_000 -ADD_MANY_RANGE=1000 -SIMILARITY_TIMES=500 -COUNT_COMMON_TIMES=500 -MERGE_TIMES=500 -COPY_TIMES=500 -CONCAT_TIMES=500 -SET_ABUNDANCES_RANGE=500 -ZIP_STORAGE_WRITE=100_000 -ZIP_STORAGE_LOAD=20 +GET_MINS_RANGE = 500 +ADD_HASH_RANGE = 10_000 +ADD_MANY_RANGE = 1000 +SIMILARITY_TIMES = 500 +COUNT_COMMON_TIMES = 500 +MERGE_TIMES = 500 +COPY_TIMES = 500 +CONCAT_TIMES = 500 +SET_ABUNDANCES_RANGE = 500 +ZIP_STORAGE_WRITE = 100_000 +ZIP_STORAGE_LOAD = 20 def load_sequences(): sequences = [] for i in range(10): - random_seq = random.sample("A,C,G,T".split(",") * RANDOM_SEQ_SIZE, - RANDOM_SEQ_NUMBER) + random_seq = random.sample( + "A,C,G,T".split(",") * RANDOM_SEQ_SIZE, RANDOM_SEQ_NUMBER + ) sequences.append("".join(random_seq)) return sequences @@ -35,12 +36,12 @@ def load_sequences(): class TimeMinHashSuite: def setup(self): self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=False) - self.protein_mh = MinHash(MINHASH_NUM, MINHASH_K, is_protein=True, - track_abundance=False) + self.protein_mh = MinHash( + MINHASH_NUM, MINHASH_K, is_protein=True, track_abundance=False + ) self.sequences = load_sequences() - self.populated_mh = MinHash(MINHASH_NUM, MINHASH_K, - track_abundance=False) + self.populated_mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=False) for seq in 
self.sequences: self.populated_mh.add_sequence(seq) @@ -103,8 +104,9 @@ def time_concat(self): class PeakmemMinHashSuite: def setup(self): self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True) - self.protein_mh = MinHash(MINHASH_NUM, MINHASH_K, - is_protein=True, track_abundance=True) + self.protein_mh = MinHash( + MINHASH_NUM, MINHASH_K, is_protein=True, track_abundance=True + ) self.sequences = load_sequences() def peakmem_add_sequence(self): @@ -158,21 +160,25 @@ def time_set_abundances_noclear(self): for i in range(SET_ABUNDANCES_RANGE): mh.set_abundances(mins, clear=False) + class PeakmemMinAbundanceSuite(PeakmemMinHashSuite): def setup(self): PeakmemMinHashSuite.setup(self) self.mh = MinHash(MINHASH_NUM, MINHASH_K, track_abundance=True) + #################### -class TimeZipStorageSuite: +class TimeZipStorageSuite: def setup(self): import zipfile + self.zipfile = NamedTemporaryFile() - with zipfile.ZipFile(self.zipfile, mode='w', - compression=zipfile.ZIP_STORED) as storage: + with zipfile.ZipFile( + self.zipfile, mode="w", compression=zipfile.ZIP_STORED + ) as storage: for i in range(ZIP_STORAGE_WRITE): # just so we have lots of entries storage.writestr(str(i), b"0") @@ -196,17 +202,18 @@ def teardown(self): class PeakmemZipStorageSuite: def setup(self): import zipfile + self.zipfile = NamedTemporaryFile() - with zipfile.ZipFile(self.zipfile, mode='w', - compression=zipfile.ZIP_STORED) as storage: + with zipfile.ZipFile( + self.zipfile, mode="w", compression=zipfile.ZIP_STORED + ) as storage: for i in range(ZIP_STORAGE_WRITE): # just so we have lots of entries storage.writestr(str(i), b"0") # one big-ish entry storage.writestr("sig1", b"9" * 1_000_000) - def peakmem_load_from_zipstorage(self): with ZipStorage(self.zipfile.name) as storage: for i in range(ZIP_STORAGE_LOAD): diff --git a/doc/conf.py b/doc/conf.py index fdd819b93a..43623fcfc5 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # # sourmash documentation build configuration file, created by # sphinx-quickstart on Sat Jun 4 16:35:43 2016. @@ -17,57 +16,59 @@ import os import sourmash -print('sourmash at:', sourmash) + +print("sourmash at:", sourmash) # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -sys.path.insert(0, os.path.abspath('..')) +sys.path.insert(0, os.path.abspath("..")) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.doctest', - 'sphinx.ext.coverage', - 'sphinx.ext.viewcode', -# 'sphinx.ext.napoleon', - 'nbsphinx', - 'IPython.sphinxext.ipython_console_highlighting', - 'myst_parser' + "sphinx.ext.autodoc", + "sphinx.ext.doctest", + "sphinx.ext.coverage", + "sphinx.ext.viewcode", + # 'sphinx.ext.napoleon', + "nbsphinx", + "IPython.sphinxext.ipython_console_highlighting", + "myst_parser", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. 
# You can specify multiple suffix as a list of string: # source_suffix = ['.rst', '.md'] -source_suffix = ['.rst', '.md'] +source_suffix = [".rst", ".md"] # The encoding of source files. -#source_encoding = 'utf-8-sig' +# source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = 'sourmash' -copyright = '2016-2023, C. Titus Brown, Luiz Irber, and N. Tessa Pierce-Ward' -author = 'C. Titus Brown, Luiz Irber, and N. Tessa Pierce-Ward' +project = "sourmash" +copyright = "2016-2023, C. Titus Brown, Luiz Irber, and N. Tessa Pierce-Ward" +author = "C. Titus Brown, Luiz Irber, and N. Tessa Pierce-Ward" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. from importlib.metadata import version -release = version('sourmash') -version = '.'.join(release.split('.')[:2]) + +release = version("sourmash") +version = ".".join(release.split(".")[:2]) # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -78,208 +79,208 @@ # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # The reST default role (used for this markup: `text`) to use for all # documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False +# keep_warnings = False # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False # CTB: suppress warnings about circularity in ToC. # see https://github.com/sphinx-doc/sphinx/issues/7410. -suppress_warnings = ['toc.circular'] +suppress_warnings = ["toc.circular"] # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'alabaster' +html_theme = "alabaster" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. 
html_theme_options = { - 'logo': 'logo.png', - 'logo_name': True, - 'description': 'Quickly search, compare, and analyze genomic and metagenomic data sets', - 'sidebar_collapse': False, + "logo": "logo.png", + "logo_name": True, + "description": "Quickly search, compare, and analyze genomic and metagenomic data sets", + "sidebar_collapse": False, } # Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] +# html_theme_path = [] # The name for this set of Sphinx documents. # " v documentation" by default. -#html_title = 'sourmash v1.0' +# html_title = 'sourmash v1.0' # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (relative to this directory) to use as a favicon of # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. -#html_extra_path = [] +# html_extra_path = [] # If not None, a 'Last updated on:' timestamp is inserted at every page # bottom, using the given strftime format. # The empty string is equivalent to '%b %d, %Y'. -#html_last_updated_fmt = None +# html_last_updated_fmt = None # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -html_sidebars = {'*': ['about.html', 'navigation.html', 'relations.html', - 'sourcelink.html', 'searchbox.html']} +html_sidebars = { + "*": [ + "about.html", + "navigation.html", + "relations.html", + "sourcelink.html", + "searchbox.html", + ] +} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True +# html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True +# html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). 
-#html_file_suffix = None +# html_file_suffix = None # Language to be used for generating the HTML full-text search index. # Sphinx supports the following languages: # 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' # 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr', 'zh' -#html_search_language = 'en' +# html_search_language = 'en' # A dictionary with options for the search language support, empty by default. # 'ja' uses this config value. # 'zh' user can custom change `jieba` dictionary path. -#html_search_options = {'type': 'default'} +# html_search_options = {'type': 'default'} # The name of a javascript file (relative to the configuration directory) that # implements a search results scorer. If empty, the default will be used. -#html_search_scorer = 'scorer.js' +# html_search_scorer = 'scorer.js' # Output file base name for HTML help builder. -htmlhelp_basename = 'sourmashdoc' +htmlhelp_basename = "sourmashdoc" # -- Options for LaTeX output --------------------------------------------- latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', - -# Latex figure (float) alignment -#'figure_align': 'htbp', + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + #'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + #'preamble': '', + # Latex figure (float) alignment + #'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'sourmash.tex', 'sourmash Documentation', - 'C. Titus Brown', 'manual'), + (master_doc, "sourmash.tex", "sourmash Documentation", "C. Titus Brown", "manual"), ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output --------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'sourmash', 'sourmash Documentation', - [author], 1) -] +man_pages = [(master_doc, "sourmash", "sourmash Documentation", [author], 1)] # If true, show URL addresses after external links. 
-#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ------------------------------------------- @@ -288,22 +289,28 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'sourmash', 'sourmash Documentation', - author, 'sourmash', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "sourmash", + "sourmash Documentation", + author, + "sourmash", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. -#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. -#texinfo_no_detailmenu = False +# texinfo_no_detailmenu = False autodoc_mock_imports = ["sourmash.minhash"] myst_heading_anchors = 3 diff --git a/pyproject.toml b/pyproject.toml index d1de447e72..3f2331b97c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -100,9 +100,11 @@ license = { text = "BSD 3-Clause License" } [project.optional-dependencies] test = [ "pytest>=6.2.4,<8.1.0", - "pytest-cov>=2.12,<5.0", - "pytest-xdist", + "pytest-cov>=4,<5.0", + "pytest-xdist>=3.1", "pyyaml>=6,<7", + "diff-cover>=7.3", + "covdefaults>=2.2.2", "recommonmark", "hypothesis", "build", @@ -155,6 +157,12 @@ macos-deployment-target = "10.14" [tool.maturin.target.aarch64-apple-darwin] macos-deployment-target = "11.0" +[tool.ruff.lint] +extend-select = [ + "UP", # pyupgrade +] +ignore = ["F401", "E712", "E402", "F821", "E722", "E741", "F811", "F403", "F822"] + [tool.isort] known_third_party = ["deprecation", "hypothesis", "mmh3", "numpy", "pytest", "screed", "sourmash_tst_utils"] multi_line_output = 3 @@ -212,3 +220,20 @@ testpaths = [ "tests", "doc", ] + +[tool.coverage] +html.show_contexts = true +html.skip_covered = false +paths.source = [ + "src", + ".tox*/*/lib/python*/site-packages", + ".tox*/pypy*/site-packages", + ".tox*\\*\\Lib\\site-packages", + "*/src", + "*\\src", + "*/tests", + "*\tests", +] +report.fail_under = 88 +run.parallel = true +run.plugins = ["covdefaults"] diff --git a/src/sourmash/__init__.py b/src/sourmash/__init__.py index 33170edcd8..53ee6e4803 100644 --- a/src/sourmash/__init__.py +++ b/src/sourmash/__init__.py @@ -18,17 +18,19 @@ class MinHash - hash sketch class from deprecation import deprecated from importlib.metadata import version -__all__ = ['MinHash', 'SourmashSignature', - 'load_one_signature', - 'SourmashSignature', - 'load_file_as_index', - 'load_file_as_signatures', - 'save_signatures', - 'create_sbt_index', - 'load_signatures', # deprecated - remove in 5.0 - 'load_sbt_index', # deprecated - remove in 5.0 - 'search_sbt_index', # deprecated - remove in 5.0 - ] +__all__ = [ + "MinHash", + "SourmashSignature", + "load_one_signature", + "SourmashSignature", + "load_file_as_index", + "load_file_as_signatures", + "save_signatures", + "create_sbt_index", + "load_signatures", # deprecated - remove in 5.0 + "load_sbt_index", # deprecated - remove in 5.0 + "search_sbt_index", # deprecated - remove in 5.0 +] from ._lowlevel import ffi, lib @@ -48,9 +50,13 @@ class MinHash - hash sketch class save_signatures, ) -@deprecated(deprecated_in="3.5.1", removed_in="5.0", - current_version=VERSION, - details='Use load_file_as_signatures instead.') + 
+@deprecated( + deprecated_in="3.5.1", + removed_in="5.0", + current_version=VERSION, + details="Use load_file_as_signatures instead.", +) def load_signatures(*args, **kwargs): """Load a JSON string with signatures into classes. @@ -65,12 +71,17 @@ def load_signatures(*args, **kwargs): """ return load_signatures_private(*args, **kwargs) + from .sbtmh import load_sbt_index as load_sbt_index_private from .sbtmh import search_sbt_index as search_sbt_index_private -@deprecated(deprecated_in="3.5.1", removed_in="5.0", - current_version=VERSION, - details='Use load_file_as_index instead.') + +@deprecated( + deprecated_in="3.5.1", + removed_in="5.0", + current_version=VERSION, + details="Use load_file_as_index instead.", +) def load_sbt_index(*args, **kwargs): """Load and return an SBT index. @@ -80,9 +91,12 @@ def load_sbt_index(*args, **kwargs): return load_sbt_index_private(*args, **kwargs) -@deprecated(deprecated_in="3.5.1", removed_in="5.0", - current_version=VERSION, - details='Use the new Index API instead.') +@deprecated( + deprecated_in="3.5.1", + removed_in="5.0", + current_version=VERSION, + details="Use the new Index API instead.", +) def search_sbt_index(*args, **kwargs): """\ Search an SBT index `tree` with signature `query` for matches above @@ -98,6 +112,7 @@ def search_sbt_index(*args, **kwargs): """ return search_sbt_index_private(*args, **kwargs) + from .sbtmh import create_sbt_index from . import lca from . import tax diff --git a/src/sourmash/__main__.py b/src/sourmash/__main__.py index 74fdf270c0..a8c70878fa 100644 --- a/src/sourmash/__main__.py +++ b/src/sourmash/__main__.py @@ -7,18 +7,19 @@ def main(arglist=None): import sourmash + args = sourmash.cli.parse_args(arglist) - if hasattr(args, 'subcmd'): + if hasattr(args, "subcmd"): mod = getattr(sourmash.cli, args.cmd) submod = getattr(mod, args.subcmd) - mainmethod = getattr(submod, 'main') + mainmethod = getattr(submod, "main") else: mod = getattr(sourmash.cli, args.cmd) - mainmethod = getattr(mod, 'main') + mainmethod = getattr(mod, "main") retval = mainmethod(args) raise SystemExit(retval) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/src/sourmash/cli/__init__.py b/src/sourmash/cli/__init__.py index 575bbdb0f5..a02487f4fd 100644 --- a/src/sourmash/cli/__init__.py +++ b/src/sourmash/cli/__init__.py @@ -45,7 +45,7 @@ class SourmashParser(ArgumentParser): _citation_printed = False def __init__(self, citation=True, **kwargs): - super(SourmashParser, self).__init__(**kwargs) + super().__init__(**kwargs) self.citation = citation @classmethod @@ -53,6 +53,7 @@ def print_citation(cls): if cls._citation_printed: return from sourmash.logging import notify + notify(f"\n== This is sourmash version {sourmash.VERSION}. ==") notify("== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. 
==\n") cls._citation_printed = True @@ -70,53 +71,56 @@ def _subparser_from_name(self, name): def print_help(self): self.print_citation() - super(SourmashParser, self).print_help() - + super().print_help() def parse_args(self, args=None, namespace=None): - if (args is None and len(sys.argv) == 1) or (args is not None and len(args) == 0): + if (args is None and len(sys.argv) == 1) or ( + args is not None and len(args) == 0 + ): self.print_help() raise SystemExit(1) - args = super(SourmashParser, self).parse_args(args=args, namespace=namespace) - if ('quiet' not in args or not args.quiet) and self.citation: + args = super().parse_args(args=args, namespace=namespace) + if ("quiet" not in args or not args.quiet) and self.citation: self.print_citation() - if 'subcmd' in args and args.subcmd is None: + if "subcmd" in args and args.subcmd is None: self._subparser_from_name(args.cmd).print_help() raise SystemExit(1) # BEGIN: dirty hacks to simultaneously support new and previous interface - if hasattr(args, 'subcmd') and args.subcmd == 'import': - args.subcmd = 'ingest' + if hasattr(args, "subcmd") and args.subcmd == "import": + args.subcmd = "ingest" # END: dirty hacks to simultaneously support new and previous interface return args def get_parser(): module_descs = { - 'tax': 'Integrate taxonomy information based on "gather" results', - 'lca': 'Taxonomic operations', - 'sketch': 'Create signatures', - 'sig': 'Manipulate signature files', - 'storage': 'Operations on storage', - 'scripts': "Plug-ins", + "tax": 'Integrate taxonomy information based on "gather" results', + "lca": "Taxonomic operations", + "sketch": "Create signatures", + "sig": "Manipulate signature files", + "storage": "Operations on storage", + "scripts": "Plug-ins", } alias = { "sig": "signature", "ext": "scripts", } - expert = set(['categorize', 'import_csv', 'migrate', 'multigather', 'sbt_combine', 'watch']) + expert = set( + ["categorize", "import_csv", "migrate", "multigather", "sbt_combine", "watch"] + ) clidir = os.path.dirname(__file__) basic_ops = utils.command_list(clidir) # provide a list of the basic operations - not expert, not submodules. user_ops = [op for op in basic_ops if op not in expert and op not in module_descs] - usage = ' Basic operations\n' + usage = " Basic operations\n" for op in user_ops: docstring = getattr(sys.modules[__name__], op).__doc__ - helpstring = 'sourmash {op:s} --help'.format(op=op) - usage += ' {hs:25s} {ds:s}\n'.format(hs=helpstring, ds=docstring) + helpstring = f"sourmash {op:s} --help" + usage += f" {helpstring:25s} {docstring:s}\n" # next, all the subcommand ones - dive into subdirectories. 
cmd_group_dirs = next(os.walk(clidir))[1] cmd_group_dirs = filter(utils.opfilter, cmd_group_dirs) @@ -124,18 +128,33 @@ def get_parser(): cmd_group_usage = [cmd for cmd in cmd_group_dirs if cmd not in alias.values()] for dirpath in cmd_group_usage: - usage += '\n ' + module_descs[dirpath] + '\n' - usage += ' sourmash {gd:s} --help\n'.format(gd=dirpath) + usage += "\n " + module_descs[dirpath] + "\n" + usage += f" sourmash {dirpath:s} --help\n" if dirpath in alias: - usage += ' sourmash {gd:s} --help\n'.format(gd=alias[dirpath]) + usage += f" sourmash {alias[dirpath]:s} --help\n" - desc = 'Create, compare, and manipulate k-mer sketches of biological sequences.\n\nUsage instructions:\n' + usage - parser = SourmashParser(prog='sourmash', description=desc, formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS) - parser._optionals.title = 'Options' - parser.add_argument('-v', '--version', action='version', version='sourmash '+ sourmash.VERSION) - parser.add_argument('-q', '--quiet', action='store_true', help='don\'t print citation information') + desc = ( + "Create, compare, and manipulate k-mer sketches of biological sequences.\n\nUsage instructions:\n" + + usage + ) + parser = SourmashParser( + prog="sourmash", + description=desc, + formatter_class=RawDescriptionHelpFormatter, + usage=SUPPRESS, + ) + parser._optionals.title = "Options" + parser.add_argument( + "-v", "--version", action="version", version="sourmash " + sourmash.VERSION + ) + parser.add_argument( + "-q", "--quiet", action="store_true", help="don't print citation information" + ) sub = parser.add_subparsers( - title='Instructions', dest='cmd', metavar='cmd', help=SUPPRESS, + title="Instructions", + dest="cmd", + metavar="cmd", + help=SUPPRESS, ) for op in basic_ops + cmd_group_dirs: getattr(sys.modules[__name__], op).subparser(sub) diff --git a/src/sourmash/cli/categorize.py b/src/sourmash/cli/categorize.py index e3c41ec773..0c8002e224 100644 --- a/src/sourmash/cli/categorize.py +++ b/src/sourmash/cli/categorize.py @@ -4,32 +4,36 @@ def subparser(subparsers): - subparser = subparsers.add_parser('categorize') - subparser.add_argument('database', help='location of signature collection/database to load') + subparser = subparsers.add_parser("categorize") subparser.add_argument( - 'queries', nargs='+', - help='locations of signatures to categorize' + "database", help="location of signature collection/database to load" ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "queries", nargs="+", help="locations of signatures to categorize" + ) + subparser.add_argument( + "-q", "--quiet", action="store_true", help="suppress non-error output" ) add_ksize_arg(subparser) subparser.add_argument( - '--threshold', default=0.08, type=float, - help='minimum threshold for reporting matches; default=0.08' + "--threshold", + default=0.08, + type=float, + help="minimum threshold for reporting matches; default=0.08", ) subparser.add_argument( - '--ignore-abundance', action='store_true', - help='do NOT use k-mer abundances if present' + "--ignore-abundance", + action="store_true", + help="do NOT use k-mer abundances if present", ) add_moltype_args(subparser) # TODO: help messages in these - subparser.add_argument('--csv', help='output summary CSV to this file') - subparser.add_argument('--load-csv', default=None) + subparser.add_argument("--csv", help="output summary CSV to this file") + subparser.add_argument("--load-csv", default=None) def main(args): import sourmash + return 
sourmash.commands.categorize(args) diff --git a/src/sourmash/cli/compare.py b/src/sourmash/cli/compare.py index 54864d6c93..74da5bd837 100644 --- a/src/sourmash/cli/compare.py +++ b/src/sourmash/cli/compare.py @@ -1,6 +1,6 @@ """create a similarity matrix comparing many samples""" -usage=""" +usage = """ The `compare` subcommand compares one or more signatures (created with `sketch`) using estimated Jaccard index [1] or (if signatures are @@ -27,69 +27,91 @@ --- """ -from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, - add_picklist_args, add_pattern_args, - add_scaled_arg) +from sourmash.cli.utils import ( + add_ksize_arg, + add_moltype_args, + add_picklist_args, + add_pattern_args, + add_scaled_arg, +) def subparser(subparsers): - subparser = subparsers.add_parser('compare', description=__doc__, usage=usage) + subparser = subparsers.add_parser("compare", description=__doc__, usage=usage) subparser.add_argument( - 'signatures', nargs='*', help='list of signatures to compare', - default=[] + "signatures", nargs="*", help="list of signatures to compare", default=[] ) subparser.add_argument( - '-q', '--quiet', action='store_true', help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-o', '--output', metavar='F', - help='file to which output will be written; default is terminal ' - '(standard output)' + "-o", + "--output", + metavar="F", + help="file to which output will be written; default is terminal " + "(standard output)", ) subparser.add_argument( - '--ignore-abundance', action='store_true', - help='do NOT use k-mer abundances even if present' + "--ignore-abundance", + action="store_true", + help="do NOT use k-mer abundances even if present", ) subparser.add_argument( - '--containment', action='store_true', - help='calculate containment instead of similarity' + "--containment", + action="store_true", + help="calculate containment instead of similarity", ) subparser.add_argument( - '--max-containment', action='store_true', - help='calculate max containment instead of similarity' + "--max-containment", + action="store_true", + help="calculate max containment instead of similarity", ) subparser.add_argument( - '--avg-containment', '--average-containment', action='store_true', - help='calculate average containment instead of similarity' + "--avg-containment", + "--average-containment", + action="store_true", + help="calculate average containment instead of similarity", ) subparser.add_argument( - '--estimate-ani', '--ANI', '--ani', action='store_true', - help='return ANI estimated from jaccard, containment, average containment, or max containment; see https://doi.org/10.1101/2022.01.11.475870' + "--estimate-ani", + "--ANI", + "--ani", + action="store_true", + help="return ANI estimated from jaccard, containment, average containment, or max containment; see https://doi.org/10.1101/2022.01.11.475870", ) subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='continue past errors in file loading' + "-f", + "--force", + action="store_true", + help="continue past errors in file loading", ) subparser.add_argument( - '--csv', metavar='F', - help='write matrix to specified file in CSV format (with column ' - 'headers)' + "--csv", + metavar="F", + help="write matrix to specified file in CSV 
format (with column " "headers)", ) subparser.add_argument( - '-p', '--processes', metavar='N', type=int, default=None, - help='Number of processes to use to calculate similarity') + "-p", + "--processes", + metavar="N", + type=int, + default=None, + help="Number of processes to use to calculate similarity", + ) subparser.add_argument( - '--distance-matrix', action='store_true', - help='output a distance matrix, instead of a similarity matrix' + "--distance-matrix", + action="store_true", + help="output a distance matrix, instead of a similarity matrix", ) subparser.add_argument( - '--similarity-matrix', action='store_false', - dest='distance_matrix', - help='output a similarity matrix; this is the default', + "--similarity-matrix", + action="store_false", + dest="distance_matrix", + help="output a similarity matrix; this is the default", ) add_ksize_arg(subparser) @@ -101,4 +123,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.commands.compare(args) diff --git a/src/sourmash/cli/compute.py b/src/sourmash/cli/compute.py index 7b3b48d20d..cfdb48f42a 100644 --- a/src/sourmash/cli/compute.py +++ b/src/sourmash/cli/compute.py @@ -1,6 +1,6 @@ """compute sequence signatures for inputs""" -usage=""" +usage = """ ** WARNING: the sourmash compute command is DEPRECATED as of 4.0 and ** will be removed in 5.0. Please see the 'sourmash sketch' command instead. @@ -35,8 +35,8 @@ def ksize_parser(ksizes): # get list of k-mer sizes for which to compute sketches - if ',' in ksizes: - ksizes = ksizes.split(',') + if "," in ksizes: + ksizes = ksizes.split(",") ksizes = list(map(int, ksizes)) else: ksizes = [int(ksizes)] @@ -45,81 +45,98 @@ def ksize_parser(ksizes): def subparser(subparsers): - subparser = subparsers.add_parser('compute', description=__doc__, usage=usage) + subparser = subparsers.add_parser("compute", description=__doc__, usage=usage) - sketch_args = subparser.add_argument_group('Sketching options') + sketch_args = subparser.add_argument_group("Sketching options") sketch_args.add_argument( - '-k', '--ksizes', default='21,31,51', + "-k", + "--ksizes", + default="21,31,51", type=ksize_parser, - help='comma-separated list of k-mer sizes; default=%(default)s' + help="comma-separated list of k-mer sizes; default=%(default)s", ) sketch_args.add_argument( - '--track-abundance', action='store_true', - help='track k-mer abundances in the generated signature' + "--track-abundance", + action="store_true", + help="track k-mer abundances in the generated signature", ) sketch_args.add_argument( - '--scaled', type=float, default=0, - help='choose number of hashes as 1 in FRACTION of input k-mers' + "--scaled", + type=float, + default=0, + help="choose number of hashes as 1 in FRACTION of input k-mers", ) add_construct_moltype_args(sketch_args) sketch_args.add_argument( - '--input-is-protein', action='store_true', - help='Consume protein sequences - no translation needed.' 
+ "--input-is-protein", + action="store_true", + help="Consume protein sequences - no translation needed.", ) sketch_args.add_argument( - '--seed', type=int, default=get_minhash_default_seed(), - help='seed used by MurmurHash; default=%(default)i' + "--seed", + type=int, + default=get_minhash_default_seed(), + help="seed used by MurmurHash; default=%(default)i", ) - file_args = subparser.add_argument_group('File handling options') + file_args = subparser.add_argument_group("File handling options") file_args.add_argument( - '-f', '--force', action='store_true', - help='recompute signatures even if the file exists' + "-f", + "--force", + action="store_true", + help="recompute signatures even if the file exists", ) file_args.add_argument( - '-o', '--output', - help='output computed signatures to this file' + "-o", "--output", help="output computed signatures to this file" ) file_args.add_argument( - '--output-dir', '--outdir', - help='output computed signatures to this directory', + "--output-dir", + "--outdir", + help="output computed signatures to this directory", ) file_args.add_argument( - '--singleton', action='store_true', - help='compute a signature for each sequence record individually' + "--singleton", + action="store_true", + help="compute a signature for each sequence record individually", ) file_args.add_argument( - '--merge', '--name', type=str, default='', metavar="FILE", - help='merge all input files into one signature file with the ' - 'specified name' + "--merge", + "--name", + type=str, + default="", + metavar="FILE", + help="merge all input files into one signature file with the " "specified name", ) file_args.add_argument( - '--name-from-first', action='store_true', - help='name the signature generated from each file after the first ' - 'record in the file' + "--name-from-first", + action="store_true", + help="name the signature generated from each file after the first " + "record in the file", ) file_args.add_argument( - '--randomize', action='store_true', - help='shuffle the list of input filenames randomly' + "--randomize", + action="store_true", + help="shuffle the list of input filenames randomly", ) subparser.add_argument( - '-q', '--quiet', action='store_true', help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '--check-sequence', action='store_true', - help='complain if input sequence is invalid' + "--check-sequence", + action="store_true", + help="complain if input sequence is invalid", ) subparser.add_argument( - '--license', default='CC0', type=str, - help='signature license. Currently only CC0 is supported.' + "--license", + default="CC0", + type=str, + help="signature license. Currently only CC0 is supported.", ) - subparser.add_argument( - 'filenames', nargs='+', help='file(s) of sequences' - ) - subparser._positionals.title = 'Required arguments' - subparser._optionals.title = 'Miscellaneous options' + subparser.add_argument("filenames", nargs="+", help="file(s) of sequences") + subparser._positionals.title = "Required arguments" + subparser._optionals.title = "Miscellaneous options" add_num_arg(sketch_args, 500) @@ -127,8 +144,10 @@ def main(args): from sourmash.command_compute import compute from sourmash.logging import notify - notify("""\ + notify( + """\ ** WARNING: the sourmash compute command is DEPRECATED as of 4.0 and ** will be removed in 5.0. Please see the 'sourmash sketch' command instead. 
-""") +""" + ) return compute(args) diff --git a/src/sourmash/cli/gather.py b/src/sourmash/cli/gather.py index 0b0115efd2..88860a50cd 100644 --- a/src/sourmash/cli/gather.py +++ b/src/sourmash/cli/gather.py @@ -1,6 +1,6 @@ """search a metagenome signature against dbs""" -usage=""" +usage = """ The `gather` subcommand selects the best reference genomes to use for a metagenome analysis, by finding the smallest set of non-overlapping @@ -62,103 +62,133 @@ --- """ -from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, - add_picklist_args, add_scaled_arg, - add_pattern_args) +from sourmash.cli.utils import ( + add_ksize_arg, + add_moltype_args, + add_picklist_args, + add_scaled_arg, + add_pattern_args, +) def subparser(subparsers): - subparser = subparsers.add_parser('gather', description=__doc__, usage=usage) - subparser.add_argument('query', help='query signature') + subparser = subparsers.add_parser("gather", description=__doc__, usage=usage) + subparser.add_argument("query", help="query signature") subparser.add_argument( - 'databases', nargs='+', - help='signatures/SBTs to search', + "databases", + nargs="+", + help="signatures/SBTs to search", ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) + subparser.add_argument("-d", "--debug", action="store_true") subparser.add_argument( - '-d', '--debug', action='store_true' + "-n", + "--num-results", + default=None, + type=int, + metavar="N", + help="number of results to report (default: terminate at --threshold-bp)", ) subparser.add_argument( - '-n', '--num-results', default=None, type=int, metavar='N', - help='number of results to report (default: terminate at --threshold-bp)' + "-o", + "--output", + metavar="FILE", + help="output CSV containing matches to this file", ) subparser.add_argument( - '-o', '--output', metavar='FILE', - help='output CSV containing matches to this file' + "--save-matches", + metavar="FILE", + help="save gather matched signatures from the database to the " + "specified file", ) subparser.add_argument( - '--save-matches', metavar='FILE', - help='save gather matched signatures from the database to the ' - 'specified file' + "--save-prefetch", + metavar="FILE", + help="save all prefetch-matched signatures from the databases to the " + "specified file or directory", ) subparser.add_argument( - '--save-prefetch', metavar='FILE', - help='save all prefetch-matched signatures from the databases to the ' - 'specified file or directory' + "--save-prefetch-csv", + metavar="FILE", + help="save a csv with information from all prefetch-matched signatures " + "to the specified file", ) subparser.add_argument( - '--save-prefetch-csv', metavar='FILE', - help='save a csv with information from all prefetch-matched signatures ' - 'to the specified file' + "--threshold-bp", + metavar="REAL", + type=float, + default=5e4, + help="reporting threshold (in bp) for estimated overlap with remaining query (default=50kb)", ) subparser.add_argument( - '--threshold-bp', metavar='REAL', type=float, default=5e4, - help='reporting threshold (in bp) for estimated overlap with remaining query (default=50kb)' + "--output-unassigned", + metavar="FILE", + help="output unassigned portions of the query as a signature to the " + "specified file", ) subparser.add_argument( - '--output-unassigned', metavar='FILE', - help='output unassigned portions of the query as a signature to the ' - 'specified file' + "--ignore-abundance", + 
action="store_true", + help="do NOT use k-mer abundances if present", ) subparser.add_argument( - '--ignore-abundance', action='store_true', - help='do NOT use k-mer abundances if present' + "--md5", default=None, help="select the signature with this md5 as query" ) subparser.add_argument( - '--md5', default=None, - help='select the signature with this md5 as query' - ) - subparser.add_argument( - '--cache-size', default=0, type=int, metavar='N', - help='number of internal SBT nodes to cache in memory (default: 0, cache all nodes)' + "--cache-size", + default=0, + type=int, + metavar="N", + help="number of internal SBT nodes to cache in memory (default: 0, cache all nodes)", ) # advanced parameters subparser.add_argument( - '--linear', dest="linear", action='store_true', + "--linear", + dest="linear", + action="store_true", help="force a low-memory but maybe slower database search", ) subparser.add_argument( - '--no-linear', dest="linear", action='store_false', + "--no-linear", + dest="linear", + action="store_false", ) subparser.add_argument( - '--no-prefetch', dest="prefetch", action='store_false', + "--no-prefetch", + dest="prefetch", + action="store_false", help="do not use prefetch before gather; see documentation", ) subparser.add_argument( - '--prefetch', dest="prefetch", action='store_true', + "--prefetch", + dest="prefetch", + action="store_true", help="use prefetch before gather; see documentation", ) subparser.add_argument( - '--estimate-ani-ci', action='store_true', - help='also output confidence intervals for ANI estimates' + "--estimate-ani-ci", + action="store_true", + help="also output confidence intervals for ANI estimates", ) subparser.add_argument( - '--fail-on-empty-database', action='store_true', - help='stop at databases that contain no compatible signatures' + "--fail-on-empty-database", + action="store_true", + help="stop at databases that contain no compatible signatures", ) subparser.add_argument( - '--no-fail-on-empty-database', action='store_false', - dest='fail_on_empty_database', - help='continue past databases that contain no compatible signatures' + "--no-fail-on-empty-database", + action="store_false", + dest="fail_on_empty_database", + help="continue past databases that contain no compatible signatures", ) subparser.set_defaults(fail_on_empty_database=True) subparser.add_argument( - '--create-empty-results', action='store_true', - help='create an empty results file even if no matches.' 
+ "--create-empty-results", + action="store_true", + help="create an empty results file even if no matches.", ) add_ksize_arg(subparser) @@ -170,4 +200,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.commands.gather(args) diff --git a/src/sourmash/cli/import_csv.py b/src/sourmash/cli/import_csv.py index 77fcbd14f8..6e0964678a 100644 --- a/src/sourmash/cli/import_csv.py +++ b/src/sourmash/cli/import_csv.py @@ -4,17 +4,19 @@ def subparser(subparsers): - subparser = subparsers.add_parser('import_csv') - subparser.add_argument('mash_csvfile', help='CSV file with mash sketches') + subparser = subparsers.add_parser("import_csv") + subparser.add_argument("mash_csvfile", help="CSV file with mash sketches") subparser.add_argument( - '-o', '--output', - help='save signature generated from data to this file (default stdout)' + "-o", + "--output", + help="save signature generated from data to this file (default stdout)", ) def main(args): import sourmash + notify("** WARNING: 'import_csv' is deprecated as of sourmash 4.0, and will") notify("** be removed in sourmash 5.0; use 'sourmash sig import --csv' instead.") - notify('') + notify("") return sourmash.commands.import_csv(args) diff --git a/src/sourmash/cli/index.py b/src/sourmash/cli/index.py index dcd8572ca0..4fb0fc7ab8 100644 --- a/src/sourmash/cli/index.py +++ b/src/sourmash/cli/index.py @@ -1,6 +1,6 @@ """index signatures for rapid search""" -usage=""" +usage = """ sourmash index -k 31 dbname *.sig @@ -25,46 +25,63 @@ --- """ -from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, - add_picklist_args, add_scaled_arg) +from sourmash.cli.utils import ( + add_ksize_arg, + add_moltype_args, + add_picklist_args, + add_scaled_arg, +) def subparser(subparsers): - subparser = subparsers.add_parser('index', description=__doc__, - usage=usage) - subparser.add_argument('sbt_name', help='name to save index into; .sbt.zip or .sbt.json file') + subparser = subparsers.add_parser("index", description=__doc__, usage=usage) subparser.add_argument( - 'signatures', nargs='*', - help='signatures to load into SBT' + "sbt_name", help="name to save index into; .sbt.zip or .sbt.json file" ) + subparser.add_argument("signatures", nargs="*", help="signatures to load into SBT") subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-d', '--n_children', metavar='D', type=int, default=2, - help='number of children for internal nodes; default=2' + "-d", + "--n_children", + metavar="D", + type=int, + default=2, + help="number of children for internal nodes; default=2", ) subparser.add_argument( - '--append', action='store_true', default=False, - help='add signatures to an existing SBT' + "--append", + action="store_true", + default=False, + help="add signatures to an existing SBT", ) subparser.add_argument( - '-x', '--bf-size', metavar='S', type=float, default=1e5, - help='Bloom filter size used for internal nodes' + "-x", + "--bf-size", + metavar="S", + type=float, + default=1e5, + help="Bloom filter size used for internal nodes", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try loading *all* files in provided subdirectories, not just .sig files"' + "-f", + 
"--force", + action="store_true", + help='try loading *all* files in provided subdirectories, not just .sig files"', ) subparser.add_argument( - '-s', '--sparseness', metavar='FLOAT', type=float, default=.0, - help='What percentage of internal nodes will not be saved; ranges ' - 'from 0.0 (save all nodes) to 1.0 (no nodes saved)' + "-s", + "--sparseness", + metavar="FLOAT", + type=float, + default=0.0, + help="What percentage of internal nodes will not be saved; ranges " + "from 0.0 (save all nodes) to 1.0 (no nodes saved)", ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -74,4 +91,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.commands.index(args) diff --git a/src/sourmash/cli/info.py b/src/sourmash/cli/info.py index b607112b7c..5d79790389 100644 --- a/src/sourmash/cli/info.py +++ b/src/sourmash/cli/info.py @@ -6,26 +6,29 @@ from sourmash.logging import notify from sourmash.plugins import list_all_plugins + def subparser(subparsers): - subparser = subparsers.add_parser('info') + subparser = subparsers.add_parser("info") subparser.add_argument( - '-v', '--verbose', action='store_true', - help='report versions of khmer and screed' + "-v", + "--verbose", + action="store_true", + help="report versions of khmer and screed", ) def info(verbose=False): "Report sourmash version + version of installed dependencies." - notify(f'sourmash version {sourmash.VERSION}') - notify(f'- loaded from path: {os.path.dirname(__file__)}') - notify('') + notify(f"sourmash version {sourmash.VERSION}") + notify(f"- loaded from path: {os.path.dirname(__file__)}") + notify("") if verbose: - notify('khmer version: None (internal Nodegraph)') - notify('') + notify("khmer version: None (internal Nodegraph)") + notify("") - notify(f'screed version {screed.__version__}') - notify(f'- loaded from path: {os.path.dirname(screed.__file__)}') + notify(f"screed version {screed.__version__}") + notify(f"- loaded from path: {os.path.dirname(screed.__file__)}") list_all_plugins() diff --git a/src/sourmash/cli/lca/__init__.py b/src/sourmash/cli/lca/__init__.py index a403876d02..6fbb73619c 100644 --- a/src/sourmash/cli/lca/__init__.py +++ b/src/sourmash/cli/lca/__init__.py @@ -16,19 +16,24 @@ def subparser(subparsers): - subparser = subparsers.add_parser('lca', formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS) - desc = 'Operations\n' + subparser = subparsers.add_parser( + "lca", formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS + ) + desc = "Operations\n" clidir = os.path.dirname(__file__) ops = command_list(clidir) for subcmd in ops: docstring = getattr(sys.modules[__name__], subcmd).__doc__ - helpstring = 'sourmash lca {op:s} --help'.format(op=subcmd) - desc += ' {hs:33s} {ds:s}\n'.format(hs=helpstring, ds=docstring) + helpstring = f"sourmash lca {subcmd:s} --help" + desc += f" {helpstring:33s} {docstring:s}\n" s = subparser.add_subparsers( - title='Taxonomic utilities', dest='subcmd', metavar='subcmd', help=SUPPRESS, - description=desc + title="Taxonomic utilities", + dest="subcmd", + metavar="subcmd", + help=SUPPRESS, + description=desc, ) for subcmd in ops: getattr(sys.modules[__name__], subcmd).subparser(s) subparser._action_groups.reverse() - subparser._optionals.title = 'Options' + subparser._optionals.title = "Options" diff --git a/src/sourmash/cli/lca/classify.py b/src/sourmash/cli/lca/classify.py index 7efe112bd8..55c6134f07 100644 --- a/src/sourmash/cli/lca/classify.py +++ b/src/sourmash/cli/lca/classify.py @@ -2,34 +2,49 @@ def subparser(subparsers): - 
subparser = subparsers.add_parser('classify') - subparser.add_argument('--db', nargs='+', action='append', - help='databases to use to classify') - subparser.add_argument('--query', nargs='*', default=[], action='append', - help='query signatures to classify') - subparser.add_argument('--query-from-file', - help='file containing list of signature files to query') - subparser.add_argument('--threshold', metavar='T', type=int, default=5, - help="minimum number of hashes needed for a taxonomic classification (default: 5)") + subparser = subparsers.add_parser("classify") subparser.add_argument( - '--majority', action='store_true', - help='use majority vote classification instead of lca' + "--db", nargs="+", action="append", help="databases to use to classify" ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "--query", + nargs="*", + default=[], + action="append", + help="query signatures to classify", ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='output debugging output' + "--query-from-file", help="file containing list of signature files to query" ) subparser.add_argument( - '-o', '--output', metavar='FILE', default='-', - help='output CSV to the specified file; by default output to stdout' + "--threshold", + metavar="T", + type=int, + default=5, + help="minimum number of hashes needed for a taxonomic classification (default: 5)", ) - subparser.add_argument('--scaled', type=float) + subparser.add_argument( + "--majority", + action="store_true", + help="use majority vote classification instead of lca", + ) + subparser.add_argument( + "-q", "--quiet", action="store_true", help="suppress non-error output" + ) + subparser.add_argument( + "-d", "--debug", action="store_true", help="output debugging output" + ) + subparser.add_argument( + "-o", + "--output", + metavar="FILE", + default="-", + help="output CSV to the specified file; by default output to stdout", + ) + subparser.add_argument("--scaled", type=float) def main(args): import sourmash + return sourmash.lca.command_classify.classify(args) diff --git a/src/sourmash/cli/lca/compare_csv.py b/src/sourmash/cli/lca/compare_csv.py index 1f62fe4aa0..6732940325 100644 --- a/src/sourmash/cli/lca/compare_csv.py +++ b/src/sourmash/cli/lca/compare_csv.py @@ -1,35 +1,41 @@ """compare spreadsheets""" + def subparser(subparsers): # Dirty hack to simultaneously support new and previous interface # If desired, this function can be removed with a major version bump. 
- for cmd in ('compare', 'compare_csv'): + for cmd in ("compare", "compare_csv"): subparser = subparsers.add_parser(cmd) - subparser.add_argument('csv1', help='taxonomy spreadsheet output by classify') - subparser.add_argument('csv2', help='custom taxonomy spreadsheet') + subparser.add_argument("csv1", help="taxonomy spreadsheet output by classify") + subparser.add_argument("csv2", help="custom taxonomy spreadsheet") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='output debugging output' + "-d", "--debug", action="store_true", help="output debugging output" ) subparser.add_argument( - '-C', '--start-column', metavar='C', default=2, type=int, - help='column at which taxonomic assignments start; default=2' + "-C", + "--start-column", + metavar="C", + default=2, + type=int, + help="column at which taxonomic assignments start; default=2", ) subparser.add_argument( - '--tabs', action='store_true', - help='input spreadsheet is tab-delimited; default is commas' + "--tabs", + action="store_true", + help="input spreadsheet is tab-delimited; default is commas", ) subparser.add_argument( - '--no-headers', action='store_true', - help='no headers present in taxonomy spreadsheet' + "--no-headers", + action="store_true", + help="no headers present in taxonomy spreadsheet", ) - subparser.add_argument('-f', '--force', action='store_true') + subparser.add_argument("-f", "--force", action="store_true") def main(args): import sourmash + return sourmash.lca.command_compare_csv.compare_csv(args) diff --git a/src/sourmash/cli/lca/index.py b/src/sourmash/cli/lca/index.py index 3e1e456273..afc0702e9f 100644 --- a/src/sourmash/cli/lca/index.py +++ b/src/sourmash/cli/lca/index.py @@ -1,69 +1,74 @@ """create LCA database""" -from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, - add_picklist_args) +from sourmash.cli.utils import add_ksize_arg, add_moltype_args, add_picklist_args def subparser(subparsers): - subparser = subparsers.add_parser('index') - subparser.add_argument('csv', help='taxonomy spreadsheet') - subparser.add_argument('lca_db_out', help='output database name') + subparser = subparsers.add_parser("index") + subparser.add_argument("csv", help="taxonomy spreadsheet") + subparser.add_argument("lca_db_out", help="output database name") subparser.add_argument( - 'signatures', nargs='*', - help='signatures or directory of signatures to index (optional if provided via --from-file)' + "signatures", + nargs="*", + help="signatures or directory of signatures to index (optional if provided via --from-file)", ) subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) + subparser.add_argument("--scaled", metavar="S", default=10000, type=float) subparser.add_argument( - '--scaled', metavar='S', default=10000, type=float + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-d", "--debug", action="store_true", help="output debugging output" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='output debugging output' + "-C", + "--start-column", + metavar="C", + default=2, + type=int, + help="column at which taxonomic 
assignments start; default=2", ) subparser.add_argument( - '-C', '--start-column', metavar='C', default=2, type=int, - help='column at which taxonomic assignments start; default=2' + "--tabs", + action="store_true", + help="input spreadsheet is tab-delimited; default is commas", ) subparser.add_argument( - '--tabs', action='store_true', - help='input spreadsheet is tab-delimited; default is commas' + "--no-headers", + action="store_true", + help="no headers present in taxonomy spreadsheet", ) subparser.add_argument( - '--no-headers', action='store_true', - help='no headers present in taxonomy spreadsheet' + "--split-identifiers", + action="store_true", + help="split names in signatures on whitespace", ) subparser.add_argument( - '--split-identifiers', action='store_true', - help='split names in signatures on whitespace' + "--keep-identifier-versions", + action="store_true", + help="do not remove accession versions", ) + subparser.add_argument("-f", "--force", action="store_true") + subparser.add_argument("--report", help="output a report on anomalies, if any") subparser.add_argument( - '--keep-identifier-versions', action='store_true', - help='do not remove accession versions' + "--require-taxonomy", + action="store_true", + help="ignore signatures with no taxonomy entry", ) - subparser.add_argument('-f', '--force', action='store_true') subparser.add_argument( - '--report', help='output a report on anomalies, if any' + "--fail-on-missing-taxonomy", + action="store_true", + help="fail quickly if taxonomy is not available for an identifier", ) subparser.add_argument( - '--require-taxonomy', action='store_true', - help='ignore signatures with no taxonomy entry' - ) - subparser.add_argument( - '--fail-on-missing-taxonomy', action='store_true', - help='fail quickly if taxonomy is not available for an identifier', - ) - subparser.add_argument( - '-F', '--database-format', + "-F", + "--database-format", help="format of output database; default is 'json'", - default='json', - choices=['json', 'sql'], + default="json", + choices=["json", "sql"], ) add_ksize_arg(subparser, default=31) @@ -73,4 +78,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.lca.command_index.index(args) diff --git a/src/sourmash/cli/lca/rankinfo.py b/src/sourmash/cli/lca/rankinfo.py index 6108dcdf4f..5d89612942 100644 --- a/src/sourmash/cli/lca/rankinfo.py +++ b/src/sourmash/cli/lca/rankinfo.py @@ -1,23 +1,25 @@ """database rank info""" + def subparser(subparsers): - subparser = subparsers.add_parser('rankinfo') - subparser.add_argument('db', nargs='+') + subparser = subparsers.add_parser("rankinfo") + subparser.add_argument("db", nargs="+") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='output debugging output' + "-d", "--debug", action="store_true", help="output debugging output" ) - subparser.add_argument('--scaled', metavar='FLOAT', type=float) + subparser.add_argument("--scaled", metavar="FLOAT", type=float) subparser.add_argument( - '--minimum-num', type=int, default=0, - help='Minimum number of different lineages a k-mer must be in to be counted' + "--minimum-num", + type=int, + default=0, + help="Minimum number of different lineages a k-mer must be in to be counted", ) def main(args): import sourmash + return sourmash.lca.command_rankinfo.rankinfo_main(args) diff --git
a/src/sourmash/cli/lca/summarize.py b/src/sourmash/cli/lca/summarize.py index a3a8809e73..d9411a7f5b 100644 --- a/src/sourmash/cli/lca/summarize.py +++ b/src/sourmash/cli/lca/summarize.py @@ -2,35 +2,52 @@ def subparser(subparsers): - subparser = subparsers.add_parser('summarize') - subparser.add_argument('--db', nargs='+', action='append', - help='one or more LCA databases to use') - subparser.add_argument('--query', nargs='*', default=[], action='append', - help='one or more signature files to use as queries') - subparser.add_argument('--query-from-file', - help='file containing list of signature files to query') - subparser.add_argument('--threshold', metavar='T', type=int, default=5, - help='minimum number of hashes to require for a match') - subparser.add_argument( - '-o', '--output', metavar='FILE', - help='file to which CSV output will be written' - ) - subparser.add_argument('--scaled', metavar='FLOAT', type=float, - help='scaled value to downsample to') + subparser = subparsers.add_parser("summarize") + subparser.add_argument( + "--db", nargs="+", action="append", help="one or more LCA databases to use" + ) + subparser.add_argument( + "--query", + nargs="*", + default=[], + action="append", + help="one or more signature files to use as queries", + ) + subparser.add_argument( + "--query-from-file", help="file containing list of signature files to query" + ) + subparser.add_argument( + "--threshold", + metavar="T", + type=int, + default=5, + help="minimum number of hashes to require for a match", + ) + subparser.add_argument( + "-o", + "--output", + metavar="FILE", + help="file to which CSV output will be written", + ) + subparser.add_argument( + "--scaled", metavar="FLOAT", type=float, help="scaled value to downsample to" + ) - subparser.add_argument('--ignore-abundance', action='store_true', - help='ignore hash abundances in query signatures do not weight results') + subparser.add_argument( + "--ignore-abundance", + action="store_true", + help="ignore hash abundances in query signatures; do not weight results", + ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='output debugging output' + "-d", "--debug", action="store_true", help="output debugging output" ) def main(args): import sourmash + return sourmash.lca.command_summarize.summarize_main(args) diff --git a/src/sourmash/cli/migrate.py b/src/sourmash/cli/migrate.py index fc5ebe1560..b4140c5afe 100644 --- a/src/sourmash/cli/migrate.py +++ b/src/sourmash/cli/migrate.py @@ -1,10 +1,12 @@ "'sourmash migrate' - migrate an SBT database to the latest version." + def subparser(subparsers): - subparser = subparsers.add_parser('migrate') - subparser.add_argument('sbt_name', help='name to save SBT into') + subparser = subparsers.add_parser("migrate") + subparser.add_argument("sbt_name", help="name to save SBT into") def main(args): import sourmash + return sourmash.commands.migrate(args) diff --git a/src/sourmash/cli/multigather.py b/src/sourmash/cli/multigather.py index cf20a32cd2..15f7f1fc71 100644 --- a/src/sourmash/cli/multigather.py +++ b/src/sourmash/cli/multigather.py @@ -1,6 +1,6 @@ "'sourmash multigather' - gather many signatures against multiple databases." -usage=""" +usage = """ The `multigather` subcommand runs 'gather' for multiple query sequences against the same collection of sequences.
The main use for multigather @@ -40,52 +40,57 @@ def subparser(subparsers): - subparser = subparsers.add_parser('multigather') + subparser = subparsers.add_parser("multigather") subparser.add_argument( - '--query', nargs='*', default=[], action='append', - help='query signature' + "--query", nargs="*", default=[], action="append", help="query signature" ) subparser.add_argument( - '--query-from-file', - help='file containing list of signature files to query' + "--query-from-file", help="file containing list of signature files to query" ) subparser.add_argument( - '--db', nargs='+', action='append', - help='signatures/SBTs to search', + "--db", + nargs="+", + action="append", + help="signatures/SBTs to search", ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) + subparser.add_argument("-d", "--debug", action="store_true") subparser.add_argument( - '-d', '--debug', action='store_true' + "--threshold-bp", + metavar="REAL", + type=float, + default=5e4, + help="threshold (in bp) for reporting results (default=50,000)", ) subparser.add_argument( - '--threshold-bp', metavar='REAL', type=float, default=5e4, - help='threshold (in bp) for reporting results (default=50,000)' + "--ignore-abundance", + action="store_true", + help="do NOT use k-mer abundances if present", ) subparser.add_argument( - '--ignore-abundance', action='store_true', - help='do NOT use k-mer abundances if present' + "--estimate-ani-ci", + action="store_true", + help="also output confidence intervals for ANI estimates", ) subparser.add_argument( - '--estimate-ani-ci', action='store_true', - help='also output confidence intervals for ANI estimates' + "--fail-on-empty-database", + action="store_true", + help="stop at databases that contain no compatible signatures", ) subparser.add_argument( - '--fail-on-empty-database', action='store_true', - help='stop at databases that contain no compatible signatures' - ) - subparser.add_argument( - '--no-fail-on-empty-database', action='store_false', - dest='fail_on_empty_database', - help='continue past databases that contain no compatible signatures' + "--no-fail-on-empty-database", + action="store_false", + dest="fail_on_empty_database", + help="continue past databases that contain no compatible signatures", ) subparser.set_defaults(fail_on_empty_database=True) subparser.add_argument( - '--output-dir', '--outdir', - help='output CSV results to this directory', + "--output-dir", + "--outdir", + help="output CSV results to this directory", ) add_ksize_arg(subparser) @@ -95,4 +100,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.commands.multigather(args) diff --git a/src/sourmash/cli/plot.py b/src/sourmash/cli/plot.py index a548683c39..718a5c8528 100644 --- a/src/sourmash/cli/plot.py +++ b/src/sourmash/cli/plot.py @@ -1,64 +1,80 @@ """plot distance matrix made by 'compare'""" + def subparser(subparsers): - subparser = subparsers.add_parser('plot') - subparser.add_argument( - 'distances', help='output from "sourmash compare"' - ) + subparser = subparsers.add_parser("plot") + subparser.add_argument("distances", help='output from "sourmash compare"') subparser.add_argument( - '--pdf', action='store_true', - help='output PDF; default is PNG' + "--pdf", action="store_true", help="output PDF; default is PNG" ) subparser.add_argument( - '--labels', action='store_true', default=None, - help='show sample labels on dendrogram/matrix' + 
"--labels", + action="store_true", + default=None, + help="show sample labels on dendrogram/matrix", ) subparser.add_argument( - '--no-labels', action='store_false', dest='labels', - help='do not show sample labels' + "--no-labels", + action="store_false", + dest="labels", + help="do not show sample labels", ) subparser.add_argument( - '--labeltext', - help='filename containing list of labels (overrides signature names); implies --labels' + "--labeltext", + help="filename containing list of labels (overrides signature names); implies --labels", ) subparser.add_argument( - '--indices', action='store_true', default=None, - help='show sample indices but not labels; overridden by --labels' + "--indices", + action="store_true", + default=None, + help="show sample indices but not labels; overridden by --labels", ) subparser.add_argument( - '--no-indices', action='store_false', dest='indices', - help='do not show sample indices' + "--no-indices", + action="store_false", + dest="indices", + help="do not show sample indices", ) subparser.add_argument( - '--vmin', default=0.0, type=float, - help='lower limit of heatmap scale; default=%(default)f' + "--vmin", + default=0.0, + type=float, + help="lower limit of heatmap scale; default=%(default)f", ) subparser.add_argument( - '--vmax', default=1.0, type=float, - help='upper limit of heatmap scale; default=%(default)f' + "--vmax", + default=1.0, + type=float, + help="upper limit of heatmap scale; default=%(default)f", ) subparser.add_argument( - '--subsample', type=int, metavar='N', - help='randomly downsample to this many samples, max' + "--subsample", + type=int, + metavar="N", + help="randomly downsample to this many samples, max", ) subparser.add_argument( - '--subsample-seed', type=int, default=1, metavar='S', - help='random seed for --subsample; default=1' + "--subsample-seed", + type=int, + default=1, + metavar="S", + help="random seed for --subsample; default=1", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='forcibly plot non-distance matrices' + "-f", "--force", action="store_true", help="forcibly plot non-distance matrices" ) subparser.add_argument( - '--output-dir', metavar='DIR', help='directory for output plots' + "--output-dir", metavar="DIR", help="directory for output plots" ) subparser.add_argument( - '--csv', metavar='F', - help='write clustered matrix and labels out in CSV format (with column' - ' headers) to this file' + "--csv", + metavar="F", + help="write clustered matrix and labels out in CSV format (with column" + " headers) to this file", ) def main(args): import sourmash + return sourmash.commands.plot(args) diff --git a/src/sourmash/cli/prefetch.py b/src/sourmash/cli/prefetch.py index 3727960292..55ee063d0b 100644 --- a/src/sourmash/cli/prefetch.py +++ b/src/sourmash/cli/prefetch.py @@ -1,66 +1,77 @@ """search a signature against dbs, find all overlaps""" -from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, - add_picklist_args, add_scaled_arg, - add_pattern_args) +from sourmash.cli.utils import ( + add_ksize_arg, + add_moltype_args, + add_picklist_args, + add_scaled_arg, + add_pattern_args, +) def subparser(subparsers): - subparser = subparsers.add_parser('prefetch') - subparser.add_argument('query', help='query signature') - subparser.add_argument("databases", + subparser = subparsers.add_parser("prefetch") + subparser.add_argument("query", help="query signature") + subparser.add_argument( + "databases", nargs="*", help="one or more databases to search", ) subparser.add_argument( 
"--db-from-file", default=None, - help="list of paths containing signatures to search" - ) - subparser.add_argument( - "--linear", action='store_true', - help="force linear traversal of indexes to minimize loading time and memory use" + help="list of paths containing signatures to search", ) subparser.add_argument( - '--no-linear', dest="linear", action='store_false', + "--linear", + action="store_true", + help="force linear traversal of indexes to minimize loading time and memory use", ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "--no-linear", + dest="linear", + action="store_false", ) subparser.add_argument( - '-d', '--debug', action='store_true' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) + subparser.add_argument("-d", "--debug", action="store_true") subparser.add_argument( - '-o', '--output', metavar='FILE', - help='output CSV containing matches to this file' + "-o", + "--output", + metavar="FILE", + help="output CSV containing matches to this file", ) subparser.add_argument( - '--save-matches', metavar='FILE', - help='save all matching signatures from the databases to the ' - 'specified file or directory' + "--save-matches", + metavar="FILE", + help="save all matching signatures from the databases to the " + "specified file or directory", ) subparser.add_argument( - '--threshold-bp', metavar='REAL', type=float, default=5e4, - help='reporting threshold (in bp) for estimated overlap with remaining query hashes (default=50kb)' + "--threshold-bp", + metavar="REAL", + type=float, + default=5e4, + help="reporting threshold (in bp) for estimated overlap with remaining query hashes (default=50kb)", ) subparser.add_argument( - '--save-unmatched-hashes', metavar='FILE', - help='output unmatched query hashes as a signature to the ' - 'specified file' + "--save-unmatched-hashes", + metavar="FILE", + help="output unmatched query hashes as a signature to the " "specified file", ) subparser.add_argument( - '--save-matching-hashes', metavar='FILE', - help='output matching query hashes as a signature to the ' - 'specified file' + "--save-matching-hashes", + metavar="FILE", + help="output matching query hashes as a signature to the " "specified file", ) subparser.add_argument( - '--md5', default=None, - help='select the signature with this md5 as query' + "--md5", default=None, help="select the signature with this md5 as query" ) subparser.add_argument( - '--estimate-ani-ci', action='store_true', - help='also output confidence intervals for ANI estimates' + "--estimate-ani-ci", + action="store_true", + help="also output confidence intervals for ANI estimates", ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -71,4 +82,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.commands.prefetch(args) diff --git a/src/sourmash/cli/sbt_combine.py b/src/sourmash/cli/sbt_combine.py index 1b5ce0febf..20c09fe57a 100644 --- a/src/sourmash/cli/sbt_combine.py +++ b/src/sourmash/cli/sbt_combine.py @@ -1,17 +1,14 @@ """combine multiple Sequence Bloom Trees""" + def subparser(subparsers): - subparser = subparsers.add_parser('sbt_combine') - subparser.add_argument('sbt_name', help='name to save SBT into') - subparser.add_argument( - 'sbts', nargs='+', - help='SBTs to combine to form a new SBT' - ) - subparser.add_argument( - '-x', '--bf-size', metavar='S', type=float, default=1e5 - ) + subparser = subparsers.add_parser("sbt_combine") + subparser.add_argument("sbt_name", help="name to save SBT 
into") + subparser.add_argument("sbts", nargs="+", help="SBTs to combine to form a new SBT") + subparser.add_argument("-x", "--bf-size", metavar="S", type=float, default=1e5) def main(args): import sourmash + return sourmash.commands.sbt_combine(args) diff --git a/src/sourmash/cli/scripts/__init__.py b/src/sourmash/cli/scripts/__init__.py index 7062ff6c71..9655f05c98 100644 --- a/src/sourmash/cli/scripts/__init__.py +++ b/src/sourmash/cli/scripts/__init__.py @@ -21,16 +21,20 @@ # by sourmash.plugins.add_cli_scripts. _extension_dict = {} + def __getattr__(name): if name in _extension_dict: return _extension_dict[name] raise AttributeError(name) + def subparser(subparsers): - subparser = subparsers.add_parser('scripts', - usage=argparse.SUPPRESS, - formatter_class=argparse.RawDescriptionHelpFormatter, - aliases=['ext']) + subparser = subparsers.add_parser( + "scripts", + usage=argparse.SUPPRESS, + formatter_class=argparse.RawDescriptionHelpFormatter, + aliases=["ext"], + ) # get individual help strings: descrs = list(sourmash.plugins.get_cli_scripts_descriptions()) @@ -39,10 +43,12 @@ def subparser(subparsers): else: description = "(No script plugins detected!)" - s = subparser.add_subparsers(title="available plugin/extension commands", - dest='subcmd', - metavar='subcmd', - help=argparse.SUPPRESS, - description=description) + s = subparser.add_subparsers( + title="available plugin/extension commands", + dest="subcmd", + metavar="subcmd", + help=argparse.SUPPRESS, + description=description, + ) _extension_dict.update(sourmash.plugins.add_cli_scripts(s)) diff --git a/src/sourmash/cli/search.py b/src/sourmash/cli/search.py index 2c11873963..46bf46723b 100644 --- a/src/sourmash/cli/search.py +++ b/src/sourmash/cli/search.py @@ -1,6 +1,6 @@ """search a signature against other signatures""" -usage=""" +usage = """ The `search` subcommand searches a collection of signatures or SBTs for matches to the query signature. 
It can search for matches with @@ -41,77 +41,95 @@ --- """ -from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, - add_picklist_args, add_scaled_arg, - add_pattern_args) +from sourmash.cli.utils import ( + add_ksize_arg, + add_moltype_args, + add_picklist_args, + add_scaled_arg, + add_pattern_args, +) def subparser(subparsers): - subparser = subparsers.add_parser('search', description=__doc__, usage=usage) + subparser = subparsers.add_parser("search", description=__doc__, usage=usage) + subparser.add_argument("query", help="query signature") subparser.add_argument( - 'query', help='query signature' + "databases", + nargs="+", + help="signatures/SBTs to search", ) subparser.add_argument( - 'databases', nargs='+', - help='signatures/SBTs to search', + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-d", "--debug", action="store_true", help="output debug information" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='output debug information' + "-t", + "--threshold", + metavar="T", + default=0.08, + type=float, + help="minimum threshold for reporting matches; default=0.08", ) subparser.add_argument( - '-t', '--threshold', metavar='T', default=0.08, type=float, - help='minimum threshold for reporting matches; default=0.08' + "--save-matches", + metavar="FILE", + help="output matching signatures to the specified file", ) subparser.add_argument( - '--save-matches', metavar='FILE', - help='output matching signatures to the specified file' + "--best-only", + action="store_true", + help="report only the best match (with greater speed)", ) subparser.add_argument( - '--best-only', action='store_true', - help='report only the best match (with greater speed)' + "-n", + "--num-results", + default=3, + type=int, + metavar="N", + help="number of results to display to user; 0 to report all", ) subparser.add_argument( - '-n', '--num-results', default=3, type=int, metavar='N', - help='number of results to display to user; 0 to report all' + "--containment", + action="store_true", + help="score based on containment rather than similarity", ) subparser.add_argument( - '--containment', action='store_true', - help='score based on containment rather than similarity' + "--max-containment", + action="store_true", + help="score based on max containment rather than similarity", ) subparser.add_argument( - '--max-containment', action='store_true', - help='score based on max containment rather than similarity' + "--estimate-ani-ci", + action="store_true", + help="for containment searches, also output confidence intervals for ANI estimates", ) subparser.add_argument( - '--estimate-ani-ci', action='store_true', - help='for containment searches, also output confidence intervals for ANI estimates' + "--ignore-abundance", + action="store_true", + help="do NOT use k-mer abundances if present; note: has no effect if " + "--containment or --max-containment is specified", ) subparser.add_argument( - '--ignore-abundance', action='store_true', - help='do NOT use k-mer abundances if present; note: has no effect if ' - '--containment or --max-containment is specified' + "-o", + "--output", + metavar="FILE", + help="output CSV containing matches to this file", ) subparser.add_argument( - '-o', '--output', metavar='FILE', - help='output CSV containing matches to this file' + "--md5", default=None, help="select the signature with this md5 as query" ) subparser.add_argument( - 
'--md5', default=None, - help='select the signature with this md5 as query' + "--fail-on-empty-database", + action="store_true", + help="stop at databases that contain no compatible signatures", ) subparser.add_argument( - '--fail-on-empty-database', action='store_true', - help='stop at databases that contain no compatible signatures' - ) - subparser.add_argument( - '--no-fail-on-empty-database', action='store_false', - dest='fail_on_empty_database', - help='continue past databases that contain no compatible signatures' + "--no-fail-on-empty-database", + action="store_false", + dest="fail_on_empty_database", + help="continue past databases that contain no compatible signatures", ) subparser.set_defaults(fail_on_empty_database=True) @@ -124,4 +142,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.commands.search(args) diff --git a/src/sourmash/cli/sig/__init__.py b/src/sourmash/cli/sig/__init__.py index f256a7473d..2ea27abf1d 100644 --- a/src/sourmash/cli/sig/__init__.py +++ b/src/sourmash/cli/sig/__init__.py @@ -33,19 +33,27 @@ def subparser(subparsers): - subparser = subparsers.add_parser('sig', formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS, aliases=['signature']) - desc = 'Operations\n' + subparser = subparsers.add_parser( + "sig", + formatter_class=RawDescriptionHelpFormatter, + usage=SUPPRESS, + aliases=["signature"], + ) + desc = "Operations\n" clidir = os.path.dirname(__file__) ops = command_list(clidir) for subcmd in ops: docstring = getattr(sys.modules[__name__], subcmd).__doc__ - helpstring = 'sourmash sig {op:s} --help'.format(op=subcmd) - desc += ' {hs:33s} {ds:s}\n'.format(hs=helpstring, ds=docstring) + helpstring = f"sourmash sig {subcmd:s} --help" + desc += f" {helpstring:33s} {docstring:s}\n" s = subparser.add_subparsers( - title='Manipulate signature files', dest='subcmd', metavar='subcmd', help=SUPPRESS, - description=desc + title="Manipulate signature files", + dest="subcmd", + metavar="subcmd", + help=SUPPRESS, + description=desc, ) for subcmd in ops: getattr(sys.modules[__name__], subcmd).subparser(s) subparser._action_groups.reverse() - subparser._optionals.title = 'Options' + subparser._optionals.title = "Options" diff --git a/src/sourmash/cli/sig/cat.py b/src/sourmash/cli/sig/cat.py index ed85932f5f..b84905f254 100644 --- a/src/sourmash/cli/sig/cat.py +++ b/src/sourmash/cli/sig/cat.py @@ -1,6 +1,6 @@ """concatenate signature files""" -usage=""" +usage = """ ### `sourmash signature cat` - concatenate multiple signatures together @@ -15,37 +15,43 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args, add_pattern_args) +from sourmash.cli.utils import ( + add_moltype_args, + add_ksize_arg, + add_picklist_args, + add_pattern_args, +) def subparser(subparsers): # working on this - subparser = subparsers.add_parser('cat', description=__doc__, usage=usage) - subparser.add_argument('signatures', nargs='*') + subparser = subparsers.add_parser("cat", description=__doc__, usage=usage) + subparser.add_argument("signatures", nargs="*") subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='provide debugging 
output' + "-d", "--debug", action="store_true", help="provide debugging output" ) subparser.add_argument( - '-o', '--output', metavar='FILE', default='-', - help='output signature to this file (default stdout)' + "-o", + "--output", + metavar="FILE", + default="-", + help="output signature to this file (default stdout)", ) subparser.add_argument( - '-u', '--unique', action='store_true', - help='keep only distinct signatures, removing duplicates (based on md5sum)' + "-u", + "--unique", + action="store_true", + help="keep only distinct signatures, removing duplicates (based on md5sum)", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -55,4 +61,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.cat(args) diff --git a/src/sourmash/cli/sig/check.py b/src/sourmash/cli/sig/check.py index b9dd353501..a4c940eecb 100644 --- a/src/sourmash/cli/sig/check.py +++ b/src/sourmash/cli/sig/check.py @@ -1,6 +1,6 @@ """check signature collections against a picklist""" -usage=""" +usage = """ sourmash sig check --picklist ... -o miss.csv -m manifest.csv @@ -15,51 +15,57 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args, add_pattern_args) +from sourmash.cli.utils import ( + add_moltype_args, + add_ksize_arg, + add_picklist_args, + add_pattern_args, +) def subparser(subparsers): - subparser = subparsers.add_parser('check', usage=usage) - subparser.add_argument('signatures', nargs='*') + subparser = subparsers.add_parser("check", usage=usage) + subparser.add_argument("signatures", nargs="*") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='provide debugging output' + "-d", "--debug", action="store_true", help="provide debugging output" ) subparser.add_argument( - '-o', '--output-missing', metavar='FILE', - help='output picklist with remaining unmatched entries to this file', + "-o", + "--output-missing", + metavar="FILE", + help="output picklist with remaining unmatched entries to this file", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) subparser.add_argument( - '-m', '--save-manifest-matching', - help='save a manifest of the matching entries to this file.' + "-m", + "--save-manifest-matching", + help="save a manifest of the matching entries to this file.", ) subparser.add_argument( - '--fail-if-missing', action='store_true', - help='exit with an error code (-1) if there are any missing picklist values.' 
+ "--fail-if-missing", + action="store_true", + help="exit with an error code (-1) if there are any missing picklist values.", ) subparser.add_argument( - '--no-require-manifest', - help='do not require a manifest; generate dynamically if needed', - action='store_true' + "--no-require-manifest", + help="do not require a manifest; generate dynamically if needed", + action="store_true", ) subparser.add_argument( - '-F', '--manifest-format', + "-F", + "--manifest-format", help="format of manifest output file; default is 'csv')", - default='csv', - choices=['csv', 'sql'], + default="csv", + choices=["csv", "sql"], ) add_ksize_arg(subparser) @@ -70,4 +76,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.check(args) diff --git a/src/sourmash/cli/sig/collect.py b/src/sourmash/cli/sig/collect.py index 397b0bf34e..1e5d8ded2f 100644 --- a/src/sourmash/cli/sig/collect.py +++ b/src/sourmash/cli/sig/collect.py @@ -1,6 +1,6 @@ """collect manifest information across many files""" -usage=""" +usage = """ sourmash sig collect -o all.sqlmf @@ -13,45 +13,49 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args, add_pattern_args) +from sourmash.cli.utils import ( + add_moltype_args, + add_ksize_arg, + add_picklist_args, + add_pattern_args, +) def subparser(subparsers): - subparser = subparsers.add_parser('collect', usage=usage) - subparser.add_argument('locations', nargs='*', - help='locations of input signatures') - subparser.add_argument('-o', '--output', help='manifest output file', - required=True) + subparser = subparsers.add_parser("collect", usage=usage) + subparser.add_argument("locations", nargs="*", help="locations of input signatures") + subparser.add_argument("-o", "--output", help="manifest output file", required=True) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='provide debugging output' + "-d", "--debug", action="store_true", help="provide debugging output" ) subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) subparser.add_argument( - '--no-require-manifest', - help='do not require a manifest; generate dynamically if needed', - action='store_true' + "--no-require-manifest", + help="do not require a manifest; generate dynamically if needed", + action="store_true", ) subparser.add_argument( - '-F', '--manifest-format', + "-F", + "--manifest-format", help="format of manifest output file; default is 'csv')", - default='sql', - choices=['csv', 'sql'], + default="sql", + choices=["csv", "sql"], ) - subparser.add_argument('--merge-previous', action='store_true', - help='merge new manifests into existing') - subparser.add_argument('--abspath', - help="convert all locations to absolute paths", - action='store_true') + subparser.add_argument( + "--merge-previous", + action="store_true", + help="merge new manifests into existing", + ) + subparser.add_argument( + "--abspath", help="convert all locations to absolute paths", action="store_true" + ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -59,4 +63,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.collect(args) diff --git a/src/sourmash/cli/sig/describe.py 
b/src/sourmash/cli/sig/describe.py index c59ea1fede..a7984e89d3 100644 --- a/src/sourmash/cli/sig/describe.py +++ b/src/sourmash/cli/sig/describe.py @@ -1,6 +1,6 @@ """show details of signature""" -usage=""" +usage = """ ### `sourmash signature describe` - display detailed information about signatures @@ -22,32 +22,32 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args, add_pattern_args) +from sourmash.cli.utils import ( + add_moltype_args, + add_ksize_arg, + add_picklist_args, + add_pattern_args, +) def subparser(subparsers): - subparser = subparsers.add_parser('describe', description=__doc__, usage=usage) - subparser.add_argument('signatures', nargs='*') + subparser = subparsers.add_parser("describe", description=__doc__, usage=usage) + subparser.add_argument("signatures", nargs="*") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='provide debugging output' + "-d", "--debug", action="store_true", help="provide debugging output" ) subparser.add_argument( - '--csv', metavar='FILE', - help='output information to a CSV file' + "--csv", metavar="FILE", help="output information to a CSV file" ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -57,4 +57,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.describe(args) diff --git a/src/sourmash/cli/sig/downsample.py b/src/sourmash/cli/sig/downsample.py index a06b7d2eb5..7a39221d29 100644 --- a/src/sourmash/cli/sig/downsample.py +++ b/src/sourmash/cli/sig/downsample.py @@ -1,6 +1,6 @@ """downsample one or more signatures""" -usage=""" +usage = """ ### `sourmash signature downsample` - decrease the size of a signature @@ -26,33 +26,36 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args, add_num_arg) +from sourmash.cli.utils import ( + add_moltype_args, + add_ksize_arg, + add_picklist_args, + add_num_arg, +) def subparser(subparsers): - subparser = subparsers.add_parser('downsample', description=__doc__, usage=usage) - subparser.add_argument('signatures', nargs="*") + subparser = subparsers.add_parser("downsample", description=__doc__, usage=usage) + subparser.add_argument("signatures", nargs="*") subparser.add_argument( - '--scaled', type=int, default=0, - help='scaled value to downsample to' + "--scaled", type=int, default=0, help="scaled value to downsample to" ) subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-o', '--output', metavar='FILE', - help='output signature to this file (default stdout)', - default='-', + "-o", + "--output", + metavar="FILE", + help="output signature to 
this file (default stdout)", + default="-", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -62,4 +65,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.downsample(args) diff --git a/src/sourmash/cli/sig/export.py b/src/sourmash/cli/sig/export.py index 0299dba5d1..b6a4142d39 100644 --- a/src/sourmash/cli/sig/export.py +++ b/src/sourmash/cli/sig/export.py @@ -1,6 +1,6 @@ """export a signature, e.g. to mash""" -usage=""" +usage = """ ### `sourmash signature export` - export signatures to mash. @@ -17,19 +17,19 @@ def subparser(subparsers): - subparser = subparsers.add_parser('export', description=__doc__, usage=usage) - subparser.add_argument('filename') + subparser = subparsers.add_parser("export", description=__doc__, usage=usage) + subparser.add_argument("filename") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-o', '--output', metavar='FILE', - help='output signature to this file (default stdout)' + "-o", + "--output", + metavar="FILE", + help="output signature to this file (default stdout)", ) subparser.add_argument( - '--md5', default=None, - help='select the signature with this md5 as query' + "--md5", default=None, help="select the signature with this md5 as query" ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -37,4 +37,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.export(args) diff --git a/src/sourmash/cli/sig/extract.py b/src/sourmash/cli/sig/extract.py index a482526290..d3c483bb5e 100644 --- a/src/sourmash/cli/sig/extract.py +++ b/src/sourmash/cli/sig/extract.py @@ -1,6 +1,6 @@ """extract one or more signatures""" -usage=""" +usage = """ ### `sourmash signature extract` - extract signatures from a collection @@ -37,37 +37,43 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args, add_pattern_args) +from sourmash.cli.utils import ( + add_moltype_args, + add_ksize_arg, + add_picklist_args, + add_pattern_args, +) def subparser(subparsers): - subparser = subparsers.add_parser('extract', description=__doc__, usage=usage) - subparser.add_argument('signatures', nargs='*') + subparser = subparsers.add_parser("extract", description=__doc__, usage=usage) + subparser.add_argument("signatures", nargs="*") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-o', '--output', metavar='FILE', - help='output signature to this file (default stdout)', - default='-', + "-o", + "--output", + metavar="FILE", + help="output signature to this file (default stdout)", + default="-", ) subparser.add_argument( - '--md5', default=None, - help='select signatures whose md5 contains this substring' + "--md5", + default=None, + help="select signatures whose md5 contains this substring", ) subparser.add_argument( - '--name', default=None, - help='select signatures whose name contains this substring' + "--name", + default=None, + help="select signatures whose name contains this substring", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load 
all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -77,4 +83,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.extract(args) diff --git a/src/sourmash/cli/sig/fileinfo.py b/src/sourmash/cli/sig/fileinfo.py index 0b5e71df71..52a894fafb 100644 --- a/src/sourmash/cli/sig/fileinfo.py +++ b/src/sourmash/cli/sig/fileinfo.py @@ -1,6 +1,6 @@ """provide summary information on the given file""" -usage=""" +usage = """ sourmash sig fileinfo @@ -14,33 +14,27 @@ """ - def subparser(subparsers): - subparser = subparsers.add_parser('fileinfo', aliases=['summarize'], - usage=usage) - subparser.add_argument('path') + subparser = subparsers.add_parser("fileinfo", aliases=["summarize"], usage=usage) + subparser.add_argument("path") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='output debug information' + "-d", "--debug", action="store_true", help="output debug information" ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) subparser.add_argument( - '--rebuild-manifest', help='forcibly rebuild the manifest', - action='store_true' + "--rebuild-manifest", help="forcibly rebuild the manifest", action="store_true" ) subparser.add_argument( - '--json-out', help='output information in JSON format only', - action='store_true' + "--json-out", help="output information in JSON format only", action="store_true" ) def main(args): import sourmash + return sourmash.sig.__main__.fileinfo(args) diff --git a/src/sourmash/cli/sig/filter.py b/src/sourmash/cli/sig/filter.py index 4f5f020d83..3cfaa2c7a2 100644 --- a/src/sourmash/cli/sig/filter.py +++ b/src/sourmash/cli/sig/filter.py @@ -1,6 +1,6 @@ """filter k-mers on abundance""" -usage=""" +usage = """ ### `sourmash signature filter` - remove hashes based on abundance @@ -25,32 +25,43 @@ def subparser(subparsers): - subparser = subparsers.add_parser('filter', description=__doc__, usage=usage) - subparser.add_argument('signatures', nargs='+') + subparser = subparsers.add_parser("filter", description=__doc__, usage=usage) + subparser.add_argument("signatures", nargs="+") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-o', '--output', metavar='FILE', - help='output signature to this file (default stdout)', - default='-' + "-o", + "--output", + metavar="FILE", + help="output signature to this file (default stdout)", + default="-", ) subparser.add_argument( - '--md5', type=str, default=None, - help='select signatures whose md5 contains this substring' + "--md5", + type=str, + default=None, + help="select signatures whose md5 contains this substring", ) subparser.add_argument( - '--name', type=str, default=None, - help='select signatures whose name contains this substring' + "--name", + type=str, + default=None, + help="select 
signatures whose name contains this substring", ) subparser.add_argument( - '-m', '--min-abundance', type=int, default=1, - help='keep hashes >= this minimum abundance' + "-m", + "--min-abundance", + type=int, + default=1, + help="keep hashes >= this minimum abundance", ) subparser.add_argument( - '-M', '--max-abundance', type=int, default=None, - help='keep hashes <= this maximum abundance' + "-M", + "--max-abundance", + type=int, + default=None, + help="keep hashes <= this maximum abundance", ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -58,4 +69,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.filter(args) diff --git a/src/sourmash/cli/sig/flatten.py b/src/sourmash/cli/sig/flatten.py index ca87b171c1..fa75f3434c 100644 --- a/src/sourmash/cli/sig/flatten.py +++ b/src/sourmash/cli/sig/flatten.py @@ -1,6 +1,6 @@ """remove abundances""" -usage=""" +usage = """ ### `sourmash signature flatten` - remove abundance information from signatures @@ -18,37 +18,38 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args) +from sourmash.cli.utils import add_moltype_args, add_ksize_arg, add_picklist_args def subparser(subparsers): - subparser = subparsers.add_parser('flatten', description=__doc__, usage=usage) - subparser.add_argument('signatures', nargs='*') + subparser = subparsers.add_parser("flatten", description=__doc__, usage=usage) + subparser.add_argument("signatures", nargs="*") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-o', '--output', metavar='FILE', - help='output signature to this file (default stdout)', - default='-', + "-o", + "--output", + metavar="FILE", + help="output signature to this file (default stdout)", + default="-", ) subparser.add_argument( - '--md5', default=None, - help='select signatures whose md5 contains this substring' + "--md5", + default=None, + help="select signatures whose md5 contains this substring", ) subparser.add_argument( - '--name', default=None, - help='select signatures whose name contains this substring' + "--name", + default=None, + help="select signatures whose name contains this substring", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -57,4 +58,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.flatten(args) diff --git a/src/sourmash/cli/sig/grep.py b/src/sourmash/cli/sig/grep.py index 03d93299da..bf1c5ccf4a 100644 --- a/src/sourmash/cli/sig/grep.py +++ b/src/sourmash/cli/sig/grep.py @@ -1,6 +1,6 @@ """extract one or more signatures by substr/regex match""" -usage=""" +usage = """ sourmash sig grep [... 
] This will search for the provided pattern in the files or databases, @@ -26,63 +26,67 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args) +from sourmash.cli.utils import add_moltype_args, add_ksize_arg, add_picklist_args def subparser(subparsers): - subparser = subparsers.add_parser('grep', usage=usage) - subparser.add_argument('pattern', help='search pattern (string/regex)') - subparser.add_argument('signatures', nargs='*') + subparser = subparsers.add_parser("grep", usage=usage) + subparser.add_argument("pattern", help="search pattern (string/regex)") + subparser.add_argument("signatures", nargs="*") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='output debug information' + "-d", "--debug", action="store_true", help="output debug information" ) subparser.add_argument( - '-o', '--output', metavar='FILE', - help='output matching signatures to this file (default stdout)', - default='-', + "-o", + "--output", + metavar="FILE", + help="output matching signatures to this file (default stdout)", + default="-", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures, independent of filename' + "-f", + "--force", + action="store_true", + help="try to load all files as signatures, independent of filename", ) subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) subparser.add_argument( - '-v', '--invert-match', + "-v", + "--invert-match", help="select non-matching signatures", - action="store_true" + action="store_true", ) subparser.add_argument( - '-i', '--ignore-case', + "-i", + "--ignore-case", help="ignore case distinctions (search lower and upper case both)", - action="store_true" + action="store_true", ) subparser.add_argument( - '--no-require-manifest', - help='do not require a manifest; generate dynamically if needed', - action='store_true' + "--no-require-manifest", + help="do not require a manifest; generate dynamically if needed", + action="store_true", ) subparser.add_argument( - '--csv', - help='save CSV file containing signature data in manifest format' + "--csv", help="save CSV file containing signature data in manifest format" ) subparser.add_argument( - '--silent', '--no-signatures-output', + "--silent", + "--no-signatures-output", help="do not output signatures", - action='store_true', + action="store_true", ) subparser.add_argument( - '-c', '--count', + "-c", + "--count", help="only output a count of discovered signatures; implies --silent", - action='store_true' + action="store_true", ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -91,4 +95,5 @@ def subparser(subparsers): def main(args): import sourmash.sig.grep + return sourmash.sig.grep.main(args) diff --git a/src/sourmash/cli/sig/inflate.py b/src/sourmash/cli/sig/inflate.py index c5a247727a..50b86e6dcf 100644 --- a/src/sourmash/cli/sig/inflate.py +++ b/src/sourmash/cli/sig/inflate.py @@ -1,24 +1,24 @@ """borrow abundances from one signature => one or more other signatures""" -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args) +from sourmash.cli.utils import add_moltype_args, add_ksize_arg, add_picklist_args def subparser(subparsers): - 
subparser = subparsers.add_parser('inflate') - subparser.add_argument('signature_from') - subparser.add_argument('other_sigs', nargs='+') + subparser = subparsers.add_parser("inflate") + subparser.add_argument("signature_from") + subparser.add_argument("other_sigs", nargs="+") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-o', '--output', metavar='FILE', default='-', - help='output signature to this file (default stdout)' + "-o", + "--output", + metavar="FILE", + default="-", + help="output signature to this file (default stdout)", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -27,4 +27,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.inflate(args) diff --git a/src/sourmash/cli/sig/ingest.py b/src/sourmash/cli/sig/ingest.py index 9c7d9e0547..99e84f7a63 100644 --- a/src/sourmash/cli/sig/ingest.py +++ b/src/sourmash/cli/sig/ingest.py @@ -1,6 +1,6 @@ """ingest/import a mash or other signature""" -usage=""" +usage = """ sourmash sig ingest --csv [ ] -o @@ -16,21 +16,25 @@ def subparser(subparsers): # Dirty hack to simultaneously support new and previous interface # If desired, this function can be removed with a major version bump. - for cmd in ('ingest', 'import'): + for cmd in ("ingest", "import"): subparser = subparsers.add_parser(cmd, usage=usage) - subparser.add_argument('--csv', action='store_true', - help='import in Mash CSV format') - subparser.add_argument('filenames', nargs='+') subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "--csv", action="store_true", help="import in Mash CSV format" ) + subparser.add_argument("filenames", nargs="+") subparser.add_argument( - '-o', '--output', metavar='FILE', default='-', - help='output signature to this file (default stdout)' + "-q", "--quiet", action="store_true", help="suppress non-error output" + ) + subparser.add_argument( + "-o", + "--output", + metavar="FILE", + default="-", + help="output signature to this file (default stdout)", ) def main(args): import sourmash + return sourmash.sig.__main__.ingest(args) diff --git a/src/sourmash/cli/sig/intersect.py b/src/sourmash/cli/sig/intersect.py index 4a5ea4db23..521e83f10f 100644 --- a/src/sourmash/cli/sig/intersect.py +++ b/src/sourmash/cli/sig/intersect.py @@ -1,6 +1,6 @@ """intersect two or more signatures""" -usage=""" +usage = """ ### `sourmash signature intersect` - intersect two (or more) signatures @@ -22,32 +22,34 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args) +from sourmash.cli.utils import add_moltype_args, add_ksize_arg, add_picklist_args def subparser(subparsers): - subparser = subparsers.add_parser('intersect', description=__doc__, usage=usage) - subparser.add_argument('signatures', nargs='*') + subparser = subparsers.add_parser("intersect", description=__doc__, usage=usage) + subparser.add_argument("signatures", nargs="*") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-o', '--output', metavar='FILE', default='-', - help='output 
signature to this file (default stdout)' + "-o", + "--output", + metavar="FILE", + default="-", + help="output signature to this file (default stdout)", ) subparser.add_argument( - '-A', '--abundances-from', metavar='FILE', - help='intersect with & take abundances from this signature' + "-A", + "--abundances-from", + metavar="FILE", + help="intersect with & take abundances from this signature", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -56,4 +58,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.intersect(args) diff --git a/src/sourmash/cli/sig/kmers.py b/src/sourmash/cli/sig/kmers.py index 08863f33c9..98d7ee9d8d 100644 --- a/src/sourmash/cli/sig/kmers.py +++ b/src/sourmash/cli/sig/kmers.py @@ -1,6 +1,6 @@ """show k-mers/sequences matching the signature hashes""" -usage=""" +usage = """ ### `sourmash signature kmers` - extract k-mers and/or sequences that match to signatures @@ -48,44 +48,52 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args) +from sourmash.cli.utils import add_moltype_args, add_ksize_arg, add_picklist_args def subparser(subparsers): - subparser = subparsers.add_parser('kmers', description=__doc__, usage=usage) - subparser.add_argument('--signatures', nargs='*', default=[]) + subparser = subparsers.add_parser("kmers", description=__doc__, usage=usage) + subparser.add_argument("--signatures", nargs="*", default=[]) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) add_ksize_arg(subparser) add_moltype_args(subparser) add_picklist_args(subparser) - subparser.add_argument('--sequences', nargs='+', required=True, - help="FASTA/FASTQ/bz2/gz files with sequences") + subparser.add_argument( + "--sequences", + nargs="+", + required=True, + help="FASTA/FASTQ/bz2/gz files with sequences", + ) - subparser.add_argument('--save-kmers', - help="save k-mers and hash values to a CSV file") - subparser.add_argument('--save-sequences', - help="save sequences with matching hashes to a FASTA file") - subparser.add_argument('--translate', action="store_true", - help="translate DNA k-mers into amino acids (for protein, dayhoff, and hp sketches)") subparser.add_argument( - '--check-sequence', action='store_true', - help='complain if input sequence is invalid (NOTE: only checks DNA)' + "--save-kmers", help="save k-mers and hash values to a CSV file" + ) + subparser.add_argument( + "--save-sequences", help="save sequences with matching hashes to a FASTA file" + ) + subparser.add_argument( + "--translate", + action="store_true", + help="translate DNA k-mers into amino acids (for protein, 
dayhoff, and hp sketches)", + ) + subparser.add_argument( + "--check-sequence", + action="store_true", + help="complain if input sequence is invalid (NOTE: only checks DNA)", ) def main(args): import sourmash + return sourmash.sig.__main__.kmers(args) diff --git a/src/sourmash/cli/sig/manifest.py b/src/sourmash/cli/sig/manifest.py index e066dbda67..72f00500c4 100644 --- a/src/sourmash/cli/sig/manifest.py +++ b/src/sourmash/cli/sig/manifest.py @@ -1,6 +1,6 @@ """create a manifest for a collection of signatures""" -usage=""" +usage = """ sourmash sig manifest -o manifest.csv @@ -17,36 +17,40 @@ def subparser(subparsers): - subparser = subparsers.add_parser('manifest', usage=usage) - subparser.add_argument('location') + subparser = subparsers.add_parser("manifest", usage=usage) + subparser.add_argument("location") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='output debug information' + "-d", "--debug", action="store_true", help="output debug information" ) subparser.add_argument( - '-o', '--output', '--csv', metavar='FILE', - help='output information to a CSV file', + "-o", + "--output", + "--csv", + metavar="FILE", + help="output information to a CSV file", required=True, ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) subparser.add_argument( - '--no-rebuild-manifest', help='use existing manifest if available', - action='store_true' + "--no-rebuild-manifest", + help="use existing manifest if available", + action="store_true", ) subparser.add_argument( - '-F', '--manifest-format', + "-F", + "--manifest-format", help="format of manifest output file; default is 'csv')", - default='csv', - choices=['csv', 'sql'], + default="csv", + choices=["csv", "sql"], ) + def main(args): import sourmash + return sourmash.sig.__main__.manifest(args) diff --git a/src/sourmash/cli/sig/merge.py b/src/sourmash/cli/sig/merge.py index 6de8b77d16..026749a5f0 100644 --- a/src/sourmash/cli/sig/merge.py +++ b/src/sourmash/cli/sig/merge.py @@ -1,6 +1,6 @@ """merge one or more signatures""" -usage=""" +usage = """ ### `sourmash signature merge` - merge two or more signatures into one @@ -24,36 +24,32 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args) +from sourmash.cli.utils import add_moltype_args, add_ksize_arg, add_picklist_args def subparser(subparsers): - subparser = subparsers.add_parser('merge', description=__doc__, usage=usage) - subparser.add_argument('signatures', nargs='*') + subparser = subparsers.add_parser("merge", description=__doc__, usage=usage) + subparser.add_argument("signatures", nargs="*") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-o', '--output', metavar='FILE', default='-', - help='output signature to this file (default stdout)' + "-o", + "--output", + metavar="FILE", + default="-", + help="output signature to this file (default stdout)", ) subparser.add_argument( - '--flatten', action='store_true', - help='remove abundances from all signatures' + "--flatten", action="store_true", help="remove abundances from all signatures" ) + 
subparser.add_argument("--name", help="rename merged signature") subparser.add_argument( - '--name', - help='rename merged signature' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' - ) - subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -62,4 +58,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.merge(args) diff --git a/src/sourmash/cli/sig/overlap.py b/src/sourmash/cli/sig/overlap.py index 373336253c..c268e62a85 100644 --- a/src/sourmash/cli/sig/overlap.py +++ b/src/sourmash/cli/sig/overlap.py @@ -1,6 +1,6 @@ """see detailed comparison of signatures""" -usage=""" +usage = """ ### `sourmash signature overlap` - detailed comparison of two signatures' overlap @@ -28,12 +28,11 @@ def subparser(subparsers): - subparser = subparsers.add_parser('overlap', description=__doc__, usage=usage) - subparser.add_argument('signature1') - subparser.add_argument('signature2') + subparser = subparsers.add_parser("overlap", description=__doc__, usage=usage) + subparser.add_argument("signature1") + subparser.add_argument("signature2") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -41,4 +40,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.overlap(args) diff --git a/src/sourmash/cli/sig/rename.py b/src/sourmash/cli/sig/rename.py index 2b360fa8d3..4ed25612fc 100644 --- a/src/sourmash/cli/sig/rename.py +++ b/src/sourmash/cli/sig/rename.py @@ -1,6 +1,6 @@ """rename signature""" -usage=""" +usage = """ ### `sourmash signature rename` - rename a signature @@ -17,34 +17,37 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args, add_pattern_args) +from sourmash.cli.utils import ( + add_moltype_args, + add_ksize_arg, + add_picklist_args, + add_pattern_args, +) def subparser(subparsers): - subparser = subparsers.add_parser('rename', description=__doc__, usage=usage) - subparser.add_argument('signatures', nargs='*') - subparser.add_argument('name') + subparser = subparsers.add_parser("rename", description=__doc__, usage=usage) + subparser.add_argument("signatures", nargs="*") + subparser.add_argument("name") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-d', '--debug', action='store_true', - help='print debugging output' + "-d", "--debug", action="store_true", help="print debugging output" ) subparser.add_argument( - '-o', '--output', metavar='FILE', - help='output renamed signature to this file (default stdout)', - default='-' + "-o", + "--output", + metavar="FILE", + help="output renamed signature to this file (default stdout)", + default="-", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) subparser.add_argument( - '--from-file', - help='a text file containing a 
list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -54,4 +57,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.rename(args) diff --git a/src/sourmash/cli/sig/split.py b/src/sourmash/cli/sig/split.py index e4587b3e0f..bf98fc71fe 100644 --- a/src/sourmash/cli/sig/split.py +++ b/src/sourmash/cli/sig/split.py @@ -1,6 +1,6 @@ """split signature files""" -usage=""" +usage = """ ### `sourmash signature split` - split signatures into individual files @@ -36,32 +36,33 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, - add_picklist_args) +from sourmash.cli.utils import add_moltype_args, add_ksize_arg, add_picklist_args def subparser(subparsers): - subparser = subparsers.add_parser('split', description=__doc__, usage=usage) - subparser.add_argument('signatures', nargs='*') + subparser = subparsers.add_parser("split", description=__doc__, usage=usage) + subparser.add_argument("signatures", nargs="*") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '--output-dir', '--outdir', - help='output signatures to this directory', + "--output-dir", + "--outdir", + help="output signatures to this directory", ) subparser.add_argument( - '-f', '--force', action='store_true', - help='try to load all files as signatures' + "-f", "--force", action="store_true", help="try to load all files as signatures" ) subparser.add_argument( - '--from-file', - help='a text file containing a list of files to load signatures from' + "--from-file", + help="a text file containing a list of files to load signatures from", ) subparser.add_argument( - '-E', '--extension', type=str, default='.sig', - help="write files with this extension ('.sig' by default)" + "-E", + "--extension", + type=str, + default=".sig", + help="write files with this extension ('.sig' by default)", ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -70,4 +71,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.split(args) diff --git a/src/sourmash/cli/sig/subtract.py b/src/sourmash/cli/sig/subtract.py index 118d91fe41..69a349ace3 100644 --- a/src/sourmash/cli/sig/subtract.py +++ b/src/sourmash/cli/sig/subtract.py @@ -1,6 +1,6 @@ """subtract one or more signatures""" -usage=""" +usage = """ ### `sourmash signature subtract` - subtract other signatures from a signature @@ -22,28 +22,33 @@ """ -from sourmash.cli.utils import (add_moltype_args, add_ksize_arg) +from sourmash.cli.utils import add_moltype_args, add_ksize_arg def subparser(subparsers): - subparser = subparsers.add_parser('subtract', description=__doc__, usage=usage) - subparser.add_argument('signature_from') - subparser.add_argument('subtraction_sigs', nargs='+') + subparser = subparsers.add_parser("subtract", description=__doc__, usage=usage) + subparser.add_argument("signature_from") + subparser.add_argument("subtraction_sigs", nargs="+") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-o', '--output', metavar='FILE', default='-', - help='output signature to this file (default stdout)' + "-o", + "--output", + metavar="FILE", + default="-", + help="output 
signature to this file (default stdout)", ) subparser.add_argument( - '--flatten', action='store_true', - help='remove abundance from signatures before subtracting' + "--flatten", + action="store_true", + help="remove abundance from signatures before subtracting", ) subparser.add_argument( - '-A', '--abundances-from', metavar='FILE', - help='intersect with & take abundances from this signature' + "-A", + "--abundances-from", + metavar="FILE", + help="intersect with & take abundances from this signature", ) add_ksize_arg(subparser) add_moltype_args(subparser) @@ -51,4 +56,5 @@ def subparser(subparsers): def main(args): import sourmash + return sourmash.sig.__main__.subtract(args) diff --git a/src/sourmash/cli/sketch/__init__.py b/src/sourmash/cli/sketch/__init__.py index 22abf26ed1..999ce1d3b9 100644 --- a/src/sourmash/cli/sketch/__init__.py +++ b/src/sourmash/cli/sketch/__init__.py @@ -18,19 +18,24 @@ def subparser(subparsers): - subparser = subparsers.add_parser('sketch', formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS) - desc = 'Operations\n' + subparser = subparsers.add_parser( + "sketch", formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS + ) + desc = "Operations\n" clidir = os.path.dirname(__file__) ops = command_list(clidir) for subcmd in ops: docstring = getattr(sys.modules[__name__], subcmd).__doc__ - helpstring = 'sourmash sketch {op:s} --help'.format(op=subcmd) - desc += ' {hs:33s} {ds:s}\n'.format(hs=helpstring, ds=docstring) + helpstring = f"sourmash sketch {subcmd:s} --help" + desc += f" {helpstring:33s} {docstring:s}\n" s = subparser.add_subparsers( - title='Create signatures', dest='subcmd', metavar='subcmd', help=SUPPRESS, - description=desc + title="Create signatures", + dest="subcmd", + metavar="subcmd", + help=SUPPRESS, + description=desc, ) for subcmd in ops: getattr(sys.modules[__name__], subcmd).subparser(s) subparser._action_groups.reverse() - subparser._optionals.title = 'Options' + subparser._optionals.title = "Options" diff --git a/src/sourmash/cli/sketch/dna.py b/src/sourmash/cli/sketch/dna.py index 1d82f9df65..19f6de7509 100644 --- a/src/sourmash/cli/sketch/dna.py +++ b/src/sourmash/cli/sketch/dna.py @@ -1,6 +1,6 @@ """create DNA signatures""" -usage=""" +usage = """ sourmash sketch dna data/*.fna.gz @@ -25,66 +25,79 @@ from sourmash.logging import notify, print_results, error from sourmash import command_sketch -assert command_sketch.DEFAULTS['dna'] == 'k=31,scaled=1000,noabund' + +assert command_sketch.DEFAULTS["dna"] == "k=31,scaled=1000,noabund" def subparser(subparsers): - subparser = subparsers.add_parser('dna', - aliases=['rna', 'nucleotide', 'nt'], - usage=usage) - subparser.add_argument( - '--license', default='CC0', type=str, - help='signature license. Currently only CC0 is supported.' + subparser = subparsers.add_parser( + "dna", aliases=["rna", "nucleotide", "nt"], usage=usage ) subparser.add_argument( - '--check-sequence', action='store_true', - help='complain if input sequence is invalid DNA' + "--license", + default="CC0", + type=str, + help="signature license. 
Currently only CC0 is supported.", ) subparser.add_argument( - '-p', '--param-string', default=[], - help='signature parameters to use.', action='append', + "--check-sequence", + action="store_true", + help="complain if input sequence is invalid DNA", ) - subparser.add_argument( - 'filenames', nargs='*', help='file(s) of sequences' + "-p", + "--param-string", + default=[], + help="signature parameters to use.", + action="append", ) - file_args = subparser.add_argument_group('File handling options') + + subparser.add_argument("filenames", nargs="*", help="file(s) of sequences") + file_args = subparser.add_argument_group("File handling options") file_args.add_argument( - '-f', '--force', action='store_true', - help='recompute signatures even if the file exists' + "-f", + "--force", + action="store_true", + help="recompute signatures even if the file exists", ) subparser.add_argument( - '--from-file', - help='a text file containing a list of sequence files to load' + "--from-file", help="a text file containing a list of sequence files to load" ) file_args.add_argument( - '-o', '--output', - help='output computed signatures to this file' + "-o", "--output", help="output computed signatures to this file" ) file_args.add_argument( - '--merge', '--name', type=str, default='', metavar="FILE", - help='merge all input files into one signature file with the ' - 'specified name' + "--merge", + "--name", + type=str, + default="", + metavar="FILE", + help="merge all input files into one signature file with the " "specified name", ) file_args.add_argument( - '--output-dir', '--outdir', - help='output computed signatures to this directory', + "--output-dir", + "--outdir", + help="output computed signatures to this directory", ) file_args.add_argument( - '--singleton', action='store_true', - help='compute a signature for each sequence record individually' + "--singleton", + action="store_true", + help="compute a signature for each sequence record individually", ) file_args.add_argument( - '--name-from-first', action='store_true', - help='name the signature generated from each file after the first ' - 'record in the file' + "--name-from-first", + action="store_true", + help="name the signature generated from each file after the first " + "record in the file", ) file_args.add_argument( - '--randomize', action='store_true', - help='shuffle the list of input filenames randomly' + "--randomize", + action="store_true", + help="shuffle the list of input filenames randomly", ) def main(args): import sourmash.command_sketch + return sourmash.command_sketch.dna(args) diff --git a/src/sourmash/cli/sketch/fromfile.py b/src/sourmash/cli/sketch/fromfile.py index 08a3e44661..6bd57d26ad 100644 --- a/src/sourmash/cli/sketch/fromfile.py +++ b/src/sourmash/cli/sketch/fromfile.py @@ -1,6 +1,6 @@ """create signatures from a CSV file""" -usage=""" +usage = """ sourmash sketch fromfile --output-signatures -p <...> @@ -28,55 +28,66 @@ def subparser(subparsers): - subparser = subparsers.add_parser('fromfile', - usage=usage) + subparser = subparsers.add_parser("fromfile", usage=usage) subparser.add_argument( - 'csvs', nargs='+', - help="input CSVs providing 'name', 'genome_filename', and 'protein_filename'" + "csvs", + nargs="+", + help="input CSVs providing 'name', 'genome_filename', and 'protein_filename'", ) subparser.add_argument( - '-p', '--param-string', default=[], - help='signature parameters to use.', action='append', + "-p", + "--param-string", + default=[], + help="signature parameters to use.", + action="append", ) 
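[Editor's note] The `-p/--param-string` argument reformatted just above recurs across the sketch subcommands: `action="append"` with `default=[]` lets a user pass several parameter strings in one invocation. A minimal, self-contained sketch of that behavior (the parameter values are illustrative, not quoted from the patch):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-p", "--param-string", default=[], action="append")

# each repeated -p appends another parameter string to args.param_string
args = parser.parse_args(["-p", "k=31,scaled=1000", "-p", "k=21,scaled=1000"])
assert args.param_string == ["k=31,scaled=1000", "k=21,scaled=1000"]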
subparser.add_argument( - '--already-done', nargs='+', default=[], - help='one or more collections of existing signatures to avoid recalculating' + "--already-done", + nargs="+", + default=[], + help="one or more collections of existing signatures to avoid recalculating", ) subparser.add_argument( - '--license', default='CC0', type=str, - help='signature license. Currently only CC0 is supported.' + "--license", + default="CC0", + type=str, + help="signature license. Currently only CC0 is supported.", ) subparser.add_argument( - '--check-sequence', action='store_true', - help='complain if input sequence is invalid (NOTE: only checks DNA)' + "--check-sequence", + action="store_true", + help="complain if input sequence is invalid (NOTE: only checks DNA)", ) - file_args = subparser.add_argument_group('File handling options') + file_args = subparser.add_argument_group("File handling options") file_args.add_argument( - '-o', '--output-signatures', - help='output computed signatures to this file', + "-o", + "--output-signatures", + help="output computed signatures to this file", ) file_args.add_argument( - '--force-output-already-exists', action='store_true', - help='overwrite/append to --output-signatures location' + "--force-output-already-exists", + action="store_true", + help="overwrite/append to --output-signatures location", ) file_args.add_argument( - '--ignore-missing', action='store_true', - help='proceed with building possible signatures, even if some input files are missing' + "--ignore-missing", + action="store_true", + help="proceed with building possible signatures, even if some input files are missing", ) file_args.add_argument( - '--output-csv-info', - help='output information about what signatures need to be generated' + "--output-csv-info", + help="output information about what signatures need to be generated", ) file_args.add_argument( - '--output-manifest-matching', - help='output a manifest file of already-existing signatures' + "--output-manifest-matching", + help="output a manifest file of already-existing signatures", ) file_args.add_argument( - '--report-duplicated', action='store_true', - help='report duplicated names' + "--report-duplicated", action="store_true", help="report duplicated names" ) def main(args): import sourmash.command_sketch + return sourmash.command_sketch.fromfile(args) diff --git a/src/sourmash/cli/sketch/protein.py b/src/sourmash/cli/sketch/protein.py index 24324ea905..3092d35367 100644 --- a/src/sourmash/cli/sketch/protein.py +++ b/src/sourmash/cli/sketch/protein.py @@ -1,6 +1,6 @@ """create protein signatures""" -usage=""" +usage = """ sourmash sketch protein data/*.fna.gz @@ -26,69 +26,82 @@ from sourmash.logging import notify, print_results, error from sourmash import command_sketch -assert command_sketch.DEFAULTS['protein'] == 'k=10,scaled=200,noabund' + +assert command_sketch.DEFAULTS["protein"] == "k=10,scaled=200,noabund" def subparser(subparsers): - subparser = subparsers.add_parser('protein', aliases=['aa', 'prot'], - usage=usage) - subparser.add_argument( - '--license', default='CC0', type=str, - help='signature license. Currently only CC0 is supported.' - ) + subparser = subparsers.add_parser("protein", aliases=["aa", "prot"], usage=usage) subparser.add_argument( - '-p', '--param-string', default=[], - help='signature parameters to use.', action='append', + "--license", + default="CC0", + type=str, + help="signature license. 
Currently only CC0 is supported.", ) - subparser.add_argument( + "-p", + "--param-string", + default=[], + help="signature parameters to use.", + action="append", ) - subparser.add_argument( - 'filenames', nargs='*', help='file(s) of sequences' + + subparser.add_argument("filenames", nargs="*", help="file(s) of sequences") - file_args = subparser.add_argument_group('File handling options') + file_args = subparser.add_argument_group("File handling options") file_args.add_argument( - '-f', '--force', action='store_true', - help='recompute signatures even if the file exists' + "-f", + "--force", + action="store_true", + help="recompute signatures even if the file exists", ) file_args.add_argument( - '-o', '--output', - help='output computed signatures to this file' + "-o", "--output", help="output computed signatures to this file" ) subparser.add_argument( - '--from-file', - help='a text file containing a list of sequence files to load' + "--from-file", help="a text file containing a list of sequence files to load" ) file_args.add_argument( - '--merge', '--name', type=str, default='', metavar="FILE", - help='merge all input files into one signature file with the ' - 'specified name' + "--merge", + "--name", + type=str, + default="", + metavar="FILE", + help="merge all input files into one signature file with the " "specified name", ) file_args.add_argument( - '--output-dir', '--outdir', - help='output computed signatures to this directory', + "--output-dir", + "--outdir", + help="output computed signatures to this directory", ) file_args.add_argument( - '--singleton', action='store_true', - help='compute a signature for each sequence record individually' + "--singleton", + action="store_true", + help="compute a signature for each sequence record individually", ) file_args.add_argument( - '--name-from-first', action='store_true', - help='name the signature generated from each file after the first ' - 'record in the file' + "--name-from-first", + action="store_true", + help="name the signature generated from each file after the first " + "record in the file", ) file_args.add_argument( - '--randomize', action='store_true', - help='shuffle the list of input filenames randomly' + "--randomize", + action="store_true", + help="shuffle the list of input filenames randomly", ) file_args.add_argument( - '--dayhoff', action='store_true', - help='compute sketches using the dayhoff alphabet instead' + "--dayhoff", + action="store_true", + help="compute sketches using the dayhoff alphabet instead", ) file_args.add_argument( - '--hp', action='store_true', - help='compute sketches using the dayhoff alphabet instead' + "--hp", + action="store_true", + help="compute sketches using the hp (hydrophobic-polar) alphabet instead", ) def main(args): import sourmash.command_sketch + return sourmash.command_sketch.protein(args) diff --git a/src/sourmash/cli/sketch/translate.py b/src/sourmash/cli/sketch/translate.py index df48d4818a..f5bccab46f 100644 --- a/src/sourmash/cli/sketch/translate.py +++ b/src/sourmash/cli/sketch/translate.py @@ -1,6 +1,6 @@ """create protein signature from DNA/RNA sequence""" -usage=""" +usage = """ sourmash sketch translate data/*.fna.gz @@ -24,75 +24,90 @@ """ from sourmash import command_sketch -assert command_sketch.DEFAULTS['protein'] == 'k=10,scaled=200,noabund' + +assert command_sketch.DEFAULTS["protein"] == "k=10,scaled=200,noabund" import sourmash from sourmash.logging import notify, print_results, error def subparser(subparsers): - subparser = subparsers.add_parser('translate', usage=usage) - subparser.add_argument( - 
'--license', default='CC0', type=str, - help='signature license. Currently only CC0 is supported.' - ) + subparser = subparsers.add_parser("translate", usage=usage) subparser.add_argument( - '--check-sequence', action='store_true', - help='complain if input sequence is invalid DNA' + "--license", + default="CC0", + type=str, + help="signature license. Currently only CC0 is supported.", ) subparser.add_argument( - '-p', '--param-string', default=[], - help='signature parameters to use.', action='append', + "--check-sequence", + action="store_true", + help="complain if input sequence is invalid DNA", ) - subparser.add_argument( - 'filenames', nargs='*', help='file(s) of sequences' + "-p", + "--param-string", + default=[], + help="signature parameters to use.", + action="append", ) - file_args = subparser.add_argument_group('File handling options') + + subparser.add_argument("filenames", nargs="*", help="file(s) of sequences") + file_args = subparser.add_argument_group("File handling options") file_args.add_argument( - '-f', '--force', action='store_true', - help='recompute signatures even if the file exists' + "-f", + "--force", + action="store_true", + help="recompute signatures even if the file exists", ) file_args.add_argument( - '-o', '--output', - help='output computed signatures to this file' + "-o", "--output", help="output computed signatures to this file" ) subparser.add_argument( - '--from-file', - help='a text file containing a list of sequence files to load' + "--from-file", help="a text file containing a list of sequence files to load" ) file_args.add_argument( - '--merge', '--name', type=str, default='', metavar="FILE", - help='merge all input files into one signature file with the ' - 'specified name' + "--merge", + "--name", + type=str, + default="", + metavar="FILE", + help="merge all input files into one signature file with the " "specified name", ) file_args.add_argument( - '--output-dir', '--outdir', - help='output computed signatures to this directory', + "--output-dir", + "--outdir", + help="output computed signatures to this directory", ) file_args.add_argument( - '--singleton', action='store_true', - help='compute a signature for each sequence record individually' + "--singleton", + action="store_true", + help="compute a signature for each sequence record individually", ) file_args.add_argument( - '--name-from-first', action='store_true', - help='name the signature generated from each file after the first ' - 'record in the file' + "--name-from-first", + action="store_true", + help="name the signature generated from each file after the first " + "record in the file", ) file_args.add_argument( - '--randomize', action='store_true', - help='shuffle the list of input filenames randomly' + "--randomize", + action="store_true", + help="shuffle the list of input filenames randomly", ) file_args.add_argument( - '--dayhoff', action='store_true', - help='compute sketches using the dayhoff alphabet instead' + "--dayhoff", + action="store_true", + help="compute sketches using the dayhoff alphabet instead", ) file_args.add_argument( - '--hp', action='store_true', - help='compute sketches using the dayhoff alphabet instead' + "--hp", + action="store_true", + help="compute sketches using the hp (hydrophobic-polar) alphabet instead", ) def main(args): import sourmash.command_sketch + return sourmash.command_sketch.translate(args) diff --git a/src/sourmash/cli/storage/__init__.py b/src/sourmash/cli/storage/__init__.py index 8ad0b2ada1..42f1a292b2 100644 --- a/src/sourmash/cli/storage/__init__.py +++ 
b/src/sourmash/cli/storage/__init__.py @@ -12,19 +12,24 @@ def subparser(subparsers): - subparser = subparsers.add_parser('storage', formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS) - desc = 'Operations\n' + subparser = subparsers.add_parser( + "storage", formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS + ) + desc = "Operations\n" clidir = os.path.dirname(__file__) ops = command_list(clidir) for subcmd in ops: docstring = getattr(sys.modules[__name__], subcmd).__doc__ - helpstring = 'sourmash storage {op:s} --help'.format(op=subcmd) - desc += ' {hs:33s} {ds:s}\n'.format(hs=helpstring, ds=docstring) + helpstring = f"sourmash storage {subcmd:s} --help" + desc += f" {helpstring:33s} {docstring:s}\n" s = subparser.add_subparsers( - title='Storage utilities', dest='subcmd', metavar='subcmd', help=SUPPRESS, - description=desc + title="Storage utilities", + dest="subcmd", + metavar="subcmd", + help=SUPPRESS, + description=desc, ) for subcmd in ops: getattr(sys.modules[__name__], subcmd).subparser(s) subparser._action_groups.reverse() - subparser._optionals.title = 'Options' + subparser._optionals.title = "Options" diff --git a/src/sourmash/cli/storage/convert.py b/src/sourmash/cli/storage/convert.py index 0aa5c23fa5..7efbc2e7ff 100644 --- a/src/sourmash/cli/storage/convert.py +++ b/src/sourmash/cli/storage/convert.py @@ -1,16 +1,13 @@ "'sourmash storage convert' - convert an SBT to use a different back end." + def subparser(subparsers): - subparser = subparsers.add_parser('convert') - subparser.add_argument( - 'sbt', help='name to save SBT into' - ) - subparser.add_argument( - '-b', '--backend', type=str, - help='Backend to convert to' - ) + subparser = subparsers.add_parser("convert") + subparser.add_argument("sbt", help="name to save SBT into") + subparser.add_argument("-b", "--backend", type=str, help="Backend to convert to") def main(args): import sourmash + return sourmash.sbt.convert_cmd(args.sbt, args.backend) diff --git a/src/sourmash/cli/tax/__init__.py b/src/sourmash/cli/tax/__init__.py index b8bf95f8d8..0b58299f56 100644 --- a/src/sourmash/cli/tax/__init__.py +++ b/src/sourmash/cli/tax/__init__.py @@ -18,19 +18,27 @@ def subparser(subparsers): - subparser = subparsers.add_parser('tax', formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS, aliases=['taxonomy']) - desc = 'Operations\n' + subparser = subparsers.add_parser( + "tax", + formatter_class=RawDescriptionHelpFormatter, + usage=SUPPRESS, + aliases=["taxonomy"], + ) + desc = "Operations\n" clidir = os.path.dirname(__file__) ops = command_list(clidir) for subcmd in ops: docstring = getattr(sys.modules[__name__], subcmd).__doc__ - helpstring = 'sourmash tax {op:s} --help'.format(op=subcmd) - desc += ' {hs:33s} {ds:s}\n'.format(hs=helpstring, ds=docstring) + helpstring = f"sourmash tax {subcmd:s} --help" + desc += f" {helpstring:33s} {docstring:s}\n" s = subparser.add_subparsers( - title="Integrate taxonomy information based on 'gather' results", dest='subcmd', metavar='subcmd', help=SUPPRESS, - description=desc + title="Integrate taxonomy information based on 'gather' results", + dest="subcmd", + metavar="subcmd", + help=SUPPRESS, + description=desc, ) for subcmd in ops: getattr(sys.modules[__name__], subcmd).subparser(s) subparser._action_groups.reverse() - subparser._optionals.title = 'Options' + subparser._optionals.title = "Options" diff --git a/src/sourmash/cli/tax/annotate.py b/src/sourmash/cli/tax/annotate.py index 501a02fd58..7541440fc2 100644 --- a/src/sourmash/cli/tax/annotate.py +++ 
b/src/sourmash/cli/tax/annotate.py @@ -1,6 +1,6 @@ """annotate gather results with taxonomy information""" -usage=""" +usage = """ sourmash tax annotate --gather-csv [ ... ] --taxonomy-csv [ ... ] @@ -19,53 +19,70 @@ def subparser(subparsers): - subparser = subparsers.add_parser('annotate', - aliases=['annotate'], - usage=usage) + subparser = subparsers.add_parser("annotate", aliases=["annotate"], usage=usage) subparser.add_argument( - '-g', '--gather-csv', nargs='*', default = [], action='extend', - help='CSV output files from sourmash gather' + "-g", + "--gather-csv", + nargs="*", + default=[], + action="extend", + help="CSV output files from sourmash gather", ) subparser.add_argument( - '--from-file', metavar='FILE', default=None, - help='input many gather results as a text file, with one gather CSV per line' + "--from-file", + metavar="FILE", + default=None, + help="input many gather results as a text file, with one gather CSV per line", ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE', - nargs='*', required=True, action="extend", - help='database lineages CSV' + "-t", + "--taxonomy-csv", + "--taxonomy", + metavar="FILE", + nargs="*", + required=True, + action="extend", + help="database lineages CSV", ) subparser.add_argument( - '-o', '--output-dir', default= "", - help='directory for output files' + "-o", "--output-dir", default="", help="directory for output files" ) subparser.add_argument( - '--keep-full-identifiers', action='store_true', - help='do not split identifiers on whitespace' + "--keep-full-identifiers", + action="store_true", + help="do not split identifiers on whitespace", ) subparser.add_argument( - '--keep-identifier-versions', action='store_true', - help='after splitting identifiers, do not remove accession versions' + "--keep-identifier-versions", + action="store_true", + help="after splitting identifiers, do not remove accession versions", ) subparser.add_argument( - '--fail-on-missing-taxonomy', action='store_true', - help='fail quickly if taxonomy is not available for an identifier', + "--fail-on-missing-taxonomy", + action="store_true", + help="fail quickly if taxonomy is not available for an identifier", ) subparser.add_argument( - '-f', '--force', action = 'store_true', - help='continue past errors in file and taxonomy loading', + "-f", + "--force", + action="store_true", + help="continue past errors in file and taxonomy loading", ) subparser.add_argument( - '--lins', '--lin-taxonomy', action='store_true', default=False, - help='use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain LIN lineage information.' + "--lins", + "--lin-taxonomy", + action="store_true", + default=False, + help="use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain LIN lineage information.", ) + def main(args): - import sourmash if not args.gather_csv and not args.from_file: - raise ValueError(f"No gather CSVs found! Please input via '-g' or '--from-file'.") + raise ValueError( + "No gather CSVs found! Please input via '-g' or '--from-file'." 
+ ) return sourmash.tax.__main__.annotate(args) diff --git a/src/sourmash/cli/tax/genome.py b/src/sourmash/cli/tax/genome.py index 3f3ee41578..b9712658a4 100644 --- a/src/sourmash/cli/tax/genome.py +++ b/src/sourmash/cli/tax/genome.py @@ -1,6 +1,6 @@ """classify genomes from gather results""" -usage=""" +usage = """ sourmash tax genome --gather-csv [ ... ] --taxonomy-csv [ ... ] @@ -34,81 +34,114 @@ import argparse import sourmash from sourmash.logging import notify, print_results, error -from sourmash.cli.utils import add_tax_threshold_arg, check_rank, check_tax_outputs, add_rank_arg +from sourmash.cli.utils import ( + add_tax_threshold_arg, + check_rank, + check_tax_outputs, + add_rank_arg, +) + def subparser(subparsers): - subparser = subparsers.add_parser('genome', - aliases=['classify'], - usage=usage) + subparser = subparsers.add_parser("genome", aliases=["classify"], usage=usage) subparser.add_argument( - '-g', '--gather-csv', action='extend', nargs='*', default = [], - help='CSVs output by sourmash gather for this sample' + "-g", + "--gather-csv", + action="extend", + nargs="*", + default=[], + help="CSVs output by sourmash gather for this sample", ) subparser.add_argument( - '--from-file', metavar='FILE', default=None, - help='input many gather results as a text file, with one gather CSV per line' + "--from-file", + metavar="FILE", + default=None, + help="input many gather results as a text file, with one gather CSV per line", ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE', - nargs='*', required=True, action='extend', - help='database lineages CSV' + "-t", + "--taxonomy-csv", + "--taxonomy", + metavar="FILE", + nargs="*", + required=True, + action="extend", + help="database lineages CSV", ) subparser.add_argument( - '-o', '--output-base', default='-', - help='base filepath for output file(s) (default stdout)' + "-o", + "--output-base", + default="-", + help="base filepath for output file(s) (default stdout)", ) subparser.add_argument( - '--output-dir', default= "", - help='directory for output files' + "--output-dir", default="", help="directory for output files" ) subparser.add_argument( - '--keep-full-identifiers', action='store_true', - help='do not split identifiers on whitespace' + "--keep-full-identifiers", + action="store_true", + help="do not split identifiers on whitespace", ) subparser.add_argument( - '--keep-identifier-versions', action='store_true', - help='after splitting identifiers, do not remove accession versions' + "--keep-identifier-versions", + action="store_true", + help="after splitting identifiers, do not remove accession versions", ) subparser.add_argument( - '--fail-on-missing-taxonomy', action='store_true', - help='fail quickly if taxonomy is not available for an identifier', + "--fail-on-missing-taxonomy", + action="store_true", + help="fail quickly if taxonomy is not available for an identifier", ) subparser.add_argument( - '-F', '--output-format', default=[], nargs='*', action='extend', + "-F", + "--output-format", + default=[], + nargs="*", + action="extend", choices=["csv_summary", "krona", "human", "lineage_csv"], - help='choose output format(s)', + help="choose output format(s)", ) subparser.add_argument( - '-f', '--force', action = 'store_true', - help='continue past survivable errors in loading taxonomy database or gather results', + "-f", + 
"--force", + action="store_true", + help="continue past survivable errors in loading taxonomy database or gather results", ) subparser.add_argument( - '--lins', '--lin-taxonomy', action='store_true', default=False, - help="use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain 'lin' lineage information." + "--lins", + "--lin-taxonomy", + action="store_true", + default=False, + help="use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain 'lin' lineage information.", ) subparser.add_argument( - '--lingroup', '--lingroups', metavar='FILE', default=None, - help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. Will restrict classification to these groups." + "--lingroup", + "--lingroups", + metavar="FILE", + default=None, + help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. Will restrict classification to these groups.", ) add_tax_threshold_arg(subparser, 0.1) add_rank_arg(subparser) def main(args): - import sourmash try: if not args.gather_csv and not args.from_file: - raise ValueError(f"No gather CSVs found! Please input via '-g' or '--from-file'.") + raise ValueError( + "No gather CSVs found! Please input via '-g' or '--from-file'." + ) if args.rank: args.rank = check_rank(args) - args.output_format = check_tax_outputs(args, rank_required = ['krona']) + args.output_format = check_tax_outputs(args, rank_required=["krona"]) except ValueError as exc: error(f"ERROR: {str(exc)}") - import sys; sys.exit(-1) + import sys + + sys.exit(-1) return sourmash.tax.__main__.genome(args) diff --git a/src/sourmash/cli/tax/grep.py b/src/sourmash/cli/tax/grep.py index 9aa5db3b89..13c25783fa 100644 --- a/src/sourmash/cli/tax/grep.py +++ b/src/sourmash/cli/tax/grep.py @@ -1,6 +1,6 @@ """search taxonomies and output picklists.""" -usage=""" +usage = """ sourmash tax grep --taxonomy-csv [ ... 
] @@ -21,55 +21,69 @@ def subparser(subparsers): - subparser = subparsers.add_parser('grep', usage=usage) - subparser.add_argument('pattern') - subparser.add_argument('-r', '--rank', - help="search only this rank", - choices=['superkingdom', - 'phylum', - 'class', - 'order', - 'family', - 'genus', - 'species']) + subparser = subparsers.add_parser("grep", usage=usage) + subparser.add_argument("pattern") subparser.add_argument( - '-v', '--invert-match', - help="select non-matching lineages", - action="store_true" + "-r", + "--rank", + help="search only this rank", + choices=[ + "superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + ], ) subparser.add_argument( - '-i', '--ignore-case', + "-v", "--invert-match", help="select non-matching lineages", action="store_true" + ) + subparser.add_argument( + "-i", + "--ignore-case", help="ignore case distinctions (search lower and upper case both)", - action="store_true" + action="store_true", ) subparser.add_argument( - '--silent', '--no-picklist-output', + "--silent", + "--no-picklist-output", help="do not output picklist", - action='store_true', + action="store_true", ) subparser.add_argument( - '-c', '--count', + "-c", + "--count", help="only output a count of discovered lineages; implies --silent", - action='store_true' + action="store_true", ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE', - nargs="+", required=True, action="extend", - help='database lineages' + "-t", + "--taxonomy-csv", + "--taxonomy", + metavar="FILE", + nargs="+", + required=True, + action="extend", + help="database lineages", ) subparser.add_argument( - '-o', '--output', default='-', - help='output file (defaults to stdout)', + "-o", + "--output", + default="-", + help="output file (defaults to stdout)", ) subparser.add_argument( - '-f', '--force', action = 'store_true', - help='continue past errors in file and taxonomy loading', + "-f", + "--force", + action="store_true", + help="continue past errors in file and taxonomy loading", ) + def main(args): - import sourmash return sourmash.tax.__main__.grep(args) diff --git a/src/sourmash/cli/tax/metagenome.py b/src/sourmash/cli/tax/metagenome.py index 1e3cd50313..563c6c3d81 100644 --- a/src/sourmash/cli/tax/metagenome.py +++ b/src/sourmash/cli/tax/metagenome.py @@ -1,6 +1,6 @@ """summarize metagenome gather results""" -usage=""" +usage = """ sourmash tax metagenome --gather-csv [ ... ] --taxonomy-csv [ ... 
] @@ -26,77 +26,118 @@ from sourmash.cli.utils import add_rank_arg, check_rank, check_tax_outputs - def subparser(subparsers): - subparser = subparsers.add_parser('metagenome', - usage=usage) + subparser = subparsers.add_parser("metagenome", usage=usage) subparser.add_argument( - '-g', '--gather-csv', action="extend", nargs='*', default = [], - help='CSVs from sourmash gather' + "-g", + "--gather-csv", + action="extend", + nargs="*", + default=[], + help="CSVs from sourmash gather", ) subparser.add_argument( - '--from-file', metavar='FILE', default = None, - help='input many gather results as a text file, with one gather CSV per line' + "--from-file", + metavar="FILE", + default=None, + help="input many gather results as a text file, with one gather CSV per line", ) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-o', '--output-base', default='-', - help='base filepath for output file(s) (default stdout)' + "-o", + "--output-base", + default="-", + help="base filepath for output file(s) (default stdout)", ) subparser.add_argument( - '--output-dir', default= "", - help='directory for output files' + "--output-dir", default="", help="directory for output files" ) subparser.add_argument( - '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE', - action="extend", nargs='+', required=True, - help='database lineages CSV' + "-t", + "--taxonomy-csv", + "--taxonomy", + metavar="FILE", + action="extend", + nargs="+", + required=True, + help="database lineages CSV", ) subparser.add_argument( - '--keep-full-identifiers', action='store_true', - help='do not split identifiers on whitespace' + "--keep-full-identifiers", + action="store_true", + help="do not split identifiers on whitespace", ) subparser.add_argument( - '--keep-identifier-versions', action='store_true', - help='after splitting identifiers, do not remove accession versions' + "--keep-identifier-versions", + action="store_true", + help="after splitting identifiers, do not remove accession versions", ) subparser.add_argument( - '--fail-on-missing-taxonomy', action='store_true', - help='fail quickly if taxonomy is not available for an identifier', + "--fail-on-missing-taxonomy", + action="store_true", + help="fail quickly if taxonomy is not available for an identifier", ) subparser.add_argument( - '-F', '--output-format', default=[], nargs='*', action="extend", - choices=["human", "csv_summary", "krona", "lineage_summary", "kreport", "lingroup", "bioboxes"], - help='choose output format(s)', + "-F", + "--output-format", + default=[], + nargs="*", + action="extend", + choices=[ + "human", + "csv_summary", + "krona", + "lineage_summary", + "kreport", + "lingroup", + "bioboxes", + ], + help="choose output format(s)", ) subparser.add_argument( - '-f', '--force', action = 'store_true', - help='continue past errors in taxonomy database loading', + "-f", + "--force", + action="store_true", + help="continue past errors in taxonomy database loading", ) subparser.add_argument( - '--lins', '--lin-taxonomy', action='store_true', default=False, - help="use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain 'lin' lineage information." + "--lins", + "--lin-taxonomy", + action="store_true", + default=False, + help="use LIN taxonomy in place of standard taxonomic ranks. 
Note that the taxonomy CSV must contain 'lin' lineage information.", ) subparser.add_argument( - '--lingroup', '--lingroups', metavar='FILE', default=None, - help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. Will produce a 'lingroup' report containing taxonomic summarization for each group." + "--lingroup", + "--lingroups", + metavar="FILE", + default=None, + help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. Will produce a 'lingroup' report containing taxonomic summarization for each group.", ) add_rank_arg(subparser) + def main(args): - import sourmash try: if not args.gather_csv and not args.from_file: - raise ValueError(f"No gather CSVs found! Please input via '-g' or '--from-file'.") + raise ValueError( + "No gather CSVs found! Please input via '-g' or '--from-file'." + ) if args.rank: args.rank = check_rank(args) - args.output_format = check_tax_outputs(args, rank_required = ['krona', 'lineage_summary'], incompatible_with_lins = ['bioboxes', 'kreport'], use_lingroup_format=True) + args.output_format = check_tax_outputs( + args, + rank_required=["krona", "lineage_summary"], + incompatible_with_lins=["bioboxes", "kreport"], + use_lingroup_format=True, + ) except ValueError as exc: error(f"ERROR: {str(exc)}") - import sys; sys.exit(-1) + import sys + + sys.exit(-1) return sourmash.tax.__main__.metagenome(args) diff --git a/src/sourmash/cli/tax/prepare.py b/src/sourmash/cli/tax/prepare.py index de2e58521b..88e4a9f504 100644 --- a/src/sourmash/cli/tax/prepare.py +++ b/src/sourmash/cli/tax/prepare.py @@ -1,6 +1,6 @@ """combine multiple taxonomy databases into one.""" -usage=""" +usage = """ sourmash tax prepare --taxonomy-csv [ ... ] -o @@ -17,44 +17,55 @@ def subparser(subparsers): - subparser = subparsers.add_parser('prepare', - usage=usage) + subparser = subparsers.add_parser("prepare", usage=usage) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE', - nargs="+", required=True, action="extend", - help='database lineages' + "-t", + "--taxonomy-csv", + "--taxonomy", + metavar="FILE", + nargs="+", + required=True, + action="extend", + help="database lineages", ) subparser.add_argument( - '-o', '--output', required=True, - help='output file', + "-o", + "--output", + required=True, + help="output file", ) subparser.add_argument( - '-F', '--database-format', + "-F", + "--database-format", help="format of output file; default is 'sql')", - default='sql', - choices=['csv', 'sql'], + default="sql", + choices=["csv", "sql"], ) subparser.add_argument( - '--keep-full-identifiers', action='store_true', - help='do not split identifiers on whitespace' + "--keep-full-identifiers", + action="store_true", + help="do not split identifiers on whitespace", ) subparser.add_argument( - '--keep-identifier-versions', action='store_true', - help='after splitting identifiers, do not remove accession versions' + "--keep-identifier-versions", + action="store_true", + help="after splitting identifiers, do not remove accession versions", ) subparser.add_argument( - '--fail-on-missing-taxonomy', action='store_true', - help='fail quickly if taxonomy is not available for an identifier', + "--fail-on-missing-taxonomy", + action="store_true", + help="fail quickly if taxonomy is not available for an identifier", ) subparser.add_argument( - '-f', '--force', action = 
'store_true', - help='continue past errors in file and taxonomy loading', + "-f", + "--force", + action="store_true", + help="continue past errors in file and taxonomy loading", ) + def main(args): - import sourmash return sourmash.tax.__main__.prepare(args) diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index 06a109e95c..d430677b8f 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -1,6 +1,6 @@ """summarize taxonomy/lineage information""" -usage=""" +usage = """ sourmash tax summarize [ ... ] @@ -18,39 +18,46 @@ def subparser(subparsers): - subparser = subparsers.add_parser('summarize', - usage=usage) + subparser = subparsers.add_parser("summarize", usage=usage) subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - 'taxonomy_files', - metavar='FILE', - nargs="+", action="extend", - help='database lineages' + "taxonomy_files", + metavar="FILE", + nargs="+", + action="extend", + help="database lineages", ) subparser.add_argument( - '-o', '--output-lineage-information', - help='output a CSV file containing individual lineage counts', + "-o", + "--output-lineage-information", + help="output a CSV file containing individual lineage counts", ) subparser.add_argument( - '--keep-full-identifiers', action='store_true', - help='do not split identifiers on whitespace' + "--keep-full-identifiers", + action="store_true", + help="do not split identifiers on whitespace", ) subparser.add_argument( - '--keep-identifier-versions', action='store_true', - help='after splitting identifiers, do not remove accession versions' + "--keep-identifier-versions", + action="store_true", + help="after splitting identifiers, do not remove accession versions", ) subparser.add_argument( - '-f', '--force', action = 'store_true', - help='continue past errors in file and taxonomy loading', + "-f", + "--force", + action="store_true", + help="continue past errors in file and taxonomy loading", ) subparser.add_argument( - '--lins', '--lin-taxonomy', action='store_true', default=False, - help='use LIN taxonomy in place of standard taxonomic ranks.' 
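# A minimal standalone sketch of the argparse "extend" behavior the tax
# subcommands above rely on (illustrative only, not sourmash code; "extend"
# needs Python 3.8+): repeated -t/-g flags accumulate into one flat list
# rather than overwriting each other.
import argparse

parser = argparse.ArgumentParser(prog="tax-demo")
parser.add_argument(
    "-t", "--taxonomy-csv", "--taxonomy",
    metavar="FILE", action="extend", nargs="+", default=[],
    help="database lineages CSV",
)

# "-t a.csv b.csv -t c.csv" flattens into a single list:
args = parser.parse_args(["-t", "a.csv", "b.csv", "-t", "c.csv"])
assert args.taxonomy_csv == ["a.csv", "b.csv", "c.csv"]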
+ "--lins", + "--lin-taxonomy", + action="store_true", + default=False, + help="use LIN taxonomy in place of standard taxonomic ranks.", ) + def main(args): - import sourmash return sourmash.tax.__main__.summarize(args) diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py index e0d8975b09..26da5ead5f 100644 --- a/src/sourmash/cli/utils.py +++ b/src/sourmash/cli/utils.py @@ -7,38 +7,66 @@ def add_moltype_args(parser): parser.add_argument( - '--protein', dest='protein', action='store_true', - help='choose a protein signature; by default, a nucleotide signature is used' + "--protein", + dest="protein", + action="store_true", + help="choose a protein signature; by default, a nucleotide signature is used", ) parser.add_argument( - '--no-protein', dest='protein', action='store_false', - help='do not choose a protein signature') + "--no-protein", + dest="protein", + action="store_false", + help="do not choose a protein signature", + ) parser.set_defaults(protein=False) parser.add_argument( - '--dayhoff', dest='dayhoff', action='store_true', - help='choose Dayhoff-encoded amino acid signatures' + "--dayhoff", + dest="dayhoff", + action="store_true", + help="choose Dayhoff-encoded amino acid signatures", ) parser.add_argument( - '--no-dayhoff', dest='dayhoff', action='store_false', - help='do not choose Dayhoff-encoded amino acid signatures') + "--no-dayhoff", + dest="dayhoff", + action="store_false", + help="do not choose Dayhoff-encoded amino acid signatures", + ) parser.set_defaults(dayhoff=False) parser.add_argument( - '--hp', '--hydrophobic-polar', dest='hp', action='store_true', - help='choose hydrophobic-polar-encoded amino acid signatures' + "--hp", + "--hydrophobic-polar", + dest="hp", + action="store_true", + help="choose hydrophobic-polar-encoded amino acid signatures", ) parser.add_argument( - '--no-hp', '--no-hydrophobic-polar', dest='hp', action='store_false', - help='do not choose hydrophobic-polar-encoded amino acid signatures') + "--no-hp", + "--no-hydrophobic-polar", + dest="hp", + action="store_false", + help="do not choose hydrophobic-polar-encoded amino acid signatures", + ) parser.set_defaults(hp=False) parser.add_argument( - '--dna', '--rna', '--nucleotide', dest='dna', default=None, action='store_true', - help='choose a nucleotide signature (default: True)') + "--dna", + "--rna", + "--nucleotide", + dest="dna", + default=None, + action="store_true", + help="choose a nucleotide signature (default: True)", + ) parser.add_argument( - '--no-dna', '--no-rna', '--no-nucleotide', dest='dna', action='store_false', - help='do not choose a nucleotide signature') + "--no-dna", + "--no-rna", + "--no-nucleotide", + dest="dna", + action="store_false", + help="do not choose a nucleotide signature", + ) parser.set_defaults(dna=None) @@ -52,16 +80,21 @@ def add_ksize_arg(parser, *, default=None): if default: message = f"k-mer size to select; default={default}" else: - message = f"k-mer size to select; no default." + message = "k-mer size to select; no default." 
parser.add_argument( - '-k', '--ksize', metavar='K', default=default, type=int, + "-k", + "--ksize", + metavar="K", + default=default, + type=int, help=message, ) -#https://stackoverflow.com/questions/55324449/how-to-specify-a-minimum-or-maximum-float-value-with-argparse#55410582 + +# https://stackoverflow.com/questions/55324449/how-to-specify-a-minimum-or-maximum-float-value-with-argparse#55410582 def range_limited_float_type(arg): - """ Type function for argparse - a float within some predefined bounds """ + """Type function for argparse - a float within some predefined bounds""" min_val = 0 max_val = 1 try: @@ -69,119 +102,168 @@ def range_limited_float_type(arg): except ValueError: raise argparse.ArgumentTypeError("\n\tERROR: Must be a floating point number.") if f < min_val or f > max_val: - raise argparse.ArgumentTypeError(f"\n\tERROR: Argument must be >{str(min_val)} and <{str(max_val)}.") + raise argparse.ArgumentTypeError( + f"\n\tERROR: Argument must be >{str(min_val)} and <{str(max_val)}." + ) return f def add_tax_threshold_arg(parser, containment_default=0.1, ani_default=None): parser.add_argument( - '--containment-threshold', default=containment_default, type=range_limited_float_type, - help=f'minimum containment threshold for classification; default={containment_default}', + "--containment-threshold", + default=containment_default, + type=range_limited_float_type, + help=f"minimum containment threshold for classification; default={containment_default}", ) parser.add_argument( - '--ani-threshold', '--aai-threshold', default=ani_default, type=range_limited_float_type, - help=f'minimum ANI threshold (nucleotide gather) or AAI threshold (protein gather) for classification; default={ani_default}', + "--ani-threshold", + "--aai-threshold", + default=ani_default, + type=range_limited_float_type, + help=f"minimum ANI threshold (nucleotide gather) or AAI threshold (protein gather) for classification; default={ani_default}", ) def add_picklist_args(parser): parser.add_argument( - '--picklist', default=None, - help="select signatures based on a picklist, i.e. 'file.csv:colname:coltype'" + "--picklist", + default=None, + help="select signatures based on a picklist, i.e. 
'file.csv:colname:coltype'", ) parser.add_argument( - '--picklist-require-all', default=False, action='store_true', - help="require that all picklist values be found or else fail" + "--picklist-require-all", + default=False, + action="store_true", + help="require that all picklist values be found or else fail", ) def add_pattern_args(parser): parser.add_argument( - '--include-db-pattern', + "--include-db-pattern", default=None, - help='search only signatures that match this pattern in name, filename, or md5' + help="search only signatures that match this pattern in name, filename, or md5", ) parser.add_argument( - '--exclude-db-pattern', + "--exclude-db-pattern", default=None, - help='search only signatures that do not match this pattern in name, filename, or md5' + help="search only signatures that do not match this pattern in name, filename, or md5", ) def opfilter(path): - return not path.startswith('__') and path not in ['utils'] + return not path.startswith("__") and path not in ["utils"] def command_list(dirpath): - paths = glob(os.path.join(dirpath, '*.py')) + paths = glob(os.path.join(dirpath, "*.py")) filenames = [os.path.basename(path) for path in paths] - basenames = [os.path.splitext(path)[0] for path in filenames if not path.startswith('__')] + basenames = [ + os.path.splitext(path)[0] for path in filenames if not path.startswith("__") + ] basenames = filter(opfilter, basenames) return sorted(basenames) def add_scaled_arg(parser, default=None): parser.add_argument( - '--scaled', metavar='FLOAT', type=check_scaled_bounds, - help='downsample to this scaled; value should be between 100 and 1e6' + "--scaled", + metavar="FLOAT", + type=check_scaled_bounds, + help="downsample to this scaled; value should be between 100 and 1e6", ) def add_num_arg(parser, default=0): parser.add_argument( - '-n', '--num-hashes', '--num', metavar='N', type=check_num_bounds, default=default, - help='num value should be between 50 and 50000' + "-n", + "--num-hashes", + "--num", + metavar="N", + type=check_num_bounds, + default=default, + help="num value should be between 50 and 50000", ) def check_rank(args): - """ Check '--rank'/'--position'/'--lin-position' argument matches selected taxonomy.""" - standard_ranks =['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'] + """Check '--rank'/'--position'/'--lin-position' argument matches selected taxonomy.""" + standard_ranks = [ + "strain", + "species", + "genus", + "family", + "order", + "class", + "phylum", + "superkingdom", + ] if args.lins: - if args.rank.isdigit(): + if args.rank.isdigit(): return str(args.rank) - raise argparse.ArgumentTypeError(f"Invalid '--rank'/'--position' input: '{args.rank}'. '--lins' is specified. Rank must be an integer corresponding to a LIN position.") + raise argparse.ArgumentTypeError( + f"Invalid '--rank'/'--position' input: '{args.rank}'. '--lins' is specified. Rank must be an integer corresponding to a LIN position." + ) elif args.rank in standard_ranks: return args.rank else: - raise argparse.ArgumentTypeError(f"Invalid '--rank'/'--position' input: '{args.rank}'. Please choose: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'") + raise argparse.ArgumentTypeError( + f"Invalid '--rank'/'--position' input: '{args.rank}'. 
Please choose: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'" + ) def add_rank_arg(parser): parser.add_argument( - '-r', '--rank', - '--position', '--lin-position', + "-r", + "--rank", + "--position", + "--lin-position", help="For non-default output formats. Classify to this rank (tax genome) or summarize taxonomy at this rank and above (tax metagenome). \ Note that the taxonomy CSV must contain lineage information at this rank, and that LIN positions start at 0. \ - Choices: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom' or an integer LIN position" + Choices: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom' or an integer LIN position", ) -def check_tax_outputs(args, rank_required = ["krona"], incompatible_with_lins = None, use_lingroup_format=False): +def check_tax_outputs( + args, + rank_required=["krona"], + incompatible_with_lins=None, + use_lingroup_format=False, +): "Handle ouput format combinations" # check that rank is passed for formats requiring rank. if not args.rank: if any(x in rank_required for x in args.output_format): - raise ValueError(f"Rank (--rank) is required for {', '.join(rank_required)} output formats.") + raise ValueError( + f"Rank (--rank) is required for {', '.join(rank_required)} output formats." + ) if args.lins: # check for outputs incompatible with lins if incompatible_with_lins: if any(x in args.output_format for x in incompatible_with_lins): - raise ValueError(f"The following outputs are incompatible with '--lins': : {', '.join(incompatible_with_lins)}") + raise ValueError( + f"The following outputs are incompatible with '--lins': : {', '.join(incompatible_with_lins)}" + ) # check that lingroup file exists if needed if args.lingroup: if use_lingroup_format and "lingroup" not in args.output_format: args.output_format.append("lingroup") elif "lingroup" in args.output_format: - raise ValueError(f"Must provide lingroup csv via '--lingroup' in order to output a lingroup report.") + raise ValueError( + "Must provide lingroup csv via '--lingroup' in order to output a lingroup report." + ) elif args.lingroup or "lingroup" in args.output_format: - raise ValueError(f"Must enable LIN taxonomy via '--lins' in order to use lingroups.") + raise ValueError( + "Must enable LIN taxonomy via '--lins' in order to use lingroups." 
+ ) # check that only one output format is specified if writing to stdout if len(args.output_format) > 1: if args.output_base == "-": - raise ValueError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}") + raise ValueError( + f"Writing to stdout is incompatible with multiple output formats {args.output_format}" + ) elif not args.output_format: # change to "human" for 5.0 args.output_format = ["csv_summary"] diff --git a/src/sourmash/cli/watch.py b/src/sourmash/cli/watch.py index 7828d376e2..a82c06d1a3 100644 --- a/src/sourmash/cli/watch.py +++ b/src/sourmash/cli/watch.py @@ -4,33 +4,36 @@ def subparser(subparsers): - subparser = subparsers.add_parser('watch') - subparser.add_argument('sbt_name', help='name of SBT to search') - subparser.add_argument('inp_file', nargs='?', default='/dev/stdin') + subparser = subparsers.add_parser("watch") + subparser.add_argument("sbt_name", help="name of SBT to search") + subparser.add_argument("inp_file", nargs="?", default="/dev/stdin") subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) subparser.add_argument( - '-o', '--output', - help='save signature generated from data here' + "-o", "--output", help="save signature generated from data here" ) subparser.add_argument( - '--threshold', metavar='T', default=0.05, type=float, - help='minimum threshold for matches (default=0.05)' + "--threshold", + metavar="T", + default=0.05, + type=float, + help="minimum threshold for matches (default=0.05)", ) subparser.add_argument( - '--input-is-protein', action='store_true', - help='Consume protein sequences - no translation needed' + "--input-is-protein", + action="store_true", + help="Consume protein sequences - no translation needed", ) add_moltype_args(subparser) subparser.add_argument( - '--name', type=str, default='stdin', - help='name to use for generated signature' + "--name", type=str, default="stdin", help="name to use for generated signature" ) add_ksize_arg(subparser) add_num_arg(subparser, 500) + def main(args): import sourmash + return sourmash.commands.watch(args) diff --git a/src/sourmash/command_compute.py b/src/sourmash/command_compute.py index 2dca0ae936..46c4f455f6 100644 --- a/src/sourmash/command_compute.py +++ b/src/sourmash/command_compute.py @@ -13,7 +13,7 @@ from .utils import RustObject from ._lowlevel import ffi, lib -DEFAULT_COMPUTE_K = '21,31,51' +DEFAULT_COMPUTE_K = "21,31,51" DEFAULT_MMHASH_SEED = 42 DEFAULT_LINE_COUNT = 1500 @@ -33,82 +33,82 @@ def compute(args): """ set_quiet(args.quiet) - if args.license != 'CC0': - error('error: sourmash only supports CC0-licensed signatures. sorry!') + if args.license != "CC0": + error("error: sourmash only supports CC0-licensed signatures. sorry!") sys.exit(-1) if args.input_is_protein and args.dna: - notify('WARNING: input is protein, turning off nucleotide hashing') + notify("WARNING: input is protein, turning off nucleotide hashing") args.dna = False args.protein = True if args.scaled: if args.scaled < 1: - error('ERROR: --scaled value must be >= 1') + error("ERROR: --scaled value must be >= 1") sys.exit(-1) if args.scaled != round(args.scaled, 0): - error('ERROR: --scaled value must be integer value') + error("ERROR: --scaled value must be integer value") sys.exit(-1) if args.scaled >= 1e9: - notify('WARNING: scaled value is nonsensical!? Continuing anyway.') + notify("WARNING: scaled value is nonsensical!? 
Continuing anyway.") if args.num_hashes != 0: - notify('setting num_hashes to 0 because --scaled is set') + notify("setting num_hashes to 0 because --scaled is set") args.num_hashes = 0 - notify('computing signatures for files: {}', ", ".join(args.filenames)) + notify("computing signatures for files: {}", ", ".join(args.filenames)) if args.randomize: - notify('randomizing file list because of --randomize') + notify("randomizing file list because of --randomize") random.shuffle(args.filenames) # get list of k-mer sizes for which to compute sketches ksizes = args.ksizes - notify('Computing signature for ksizes: {}', str(ksizes)) + notify("Computing signature for ksizes: {}", str(ksizes)) num_sigs = 0 if args.dna and args.protein: - notify('Computing both nucleotide and protein signatures.') - num_sigs = 2*len(ksizes) + notify("Computing both nucleotide and protein signatures.") + num_sigs = 2 * len(ksizes) elif args.dna and args.dayhoff: - notify('Computing both nucleotide and Dayhoff-encoded protein ' - 'signatures.') - num_sigs = 2*len(ksizes) + notify("Computing both nucleotide and Dayhoff-encoded protein " "signatures.") + num_sigs = 2 * len(ksizes) elif args.dna and args.hp: - notify('Computing both nucleotide and hp-encoded protein ' - 'signatures.') - num_sigs = 2*len(ksizes) + notify("Computing both nucleotide and hp-encoded protein " "signatures.") + num_sigs = 2 * len(ksizes) elif args.dna: - notify('Computing only nucleotide (and not protein) signatures.') + notify("Computing only nucleotide (and not protein) signatures.") num_sigs = len(ksizes) elif args.protein: - notify('Computing only protein (and not nucleotide) signatures.') + notify("Computing only protein (and not nucleotide) signatures.") num_sigs = len(ksizes) elif args.dayhoff: - notify('Computing only Dayhoff-encoded protein (and not nucleotide) ' - 'signatures.') + notify( + "Computing only Dayhoff-encoded protein (and not nucleotide) " "signatures." + ) num_sigs = len(ksizes) elif args.hp: - notify('Computing only hp-encoded protein (and not nucleotide) ' - 'signatures.') + notify("Computing only hp-encoded protein (and not nucleotide) " "signatures.") num_sigs = len(ksizes) - if (args.protein or args.dayhoff or args.hp): + if args.protein or args.dayhoff or args.hp: notify("") - notify("WARNING: you are using 'compute' to make a protein/dayhoff/hp signature,") + notify( + "WARNING: you are using 'compute' to make a protein/dayhoff/hp signature," + ) notify("WARNING: but the meaning of ksize has changed in 4.0. Please see the") notify("WARNING: migration guide to sourmash v4.0 at http://sourmash.rtfd.io/") notify("") - bad_ksizes = [ str(k) for k in ksizes if k % 3 != 0 ] + bad_ksizes = [str(k) for k in ksizes if k % 3 != 0] if bad_ksizes: - error('protein ksizes must be divisible by 3, sorry!') - error('bad ksizes: {}', ", ".join(bad_ksizes)) + error("protein ksizes must be divisible by 3, sorry!") + error("bad ksizes: {}", ", ".join(bad_ksizes)) sys.exit(-1) - notify('Computing a total of {} signature(s) for each input.', num_sigs) + notify("Computing a total of {} signature(s) for each input.", num_sigs) if num_sigs == 0: - error('...nothing to calculate!? Exiting!') + error("...nothing to calculate!? 
Exiting!") sys.exit(-1) if args.merge and not args.output: @@ -120,32 +120,35 @@ def compute(args): sys.exit(-1) if args.track_abundance: - notify('Tracking abundance of input k-mers.') + notify("Tracking abundance of input k-mers.") signatures_factory = _signatures_for_compute_factory(args) - if args.merge: # single name specified - combine all + if args.merge: # single name specified - combine all _compute_merged(args, signatures_factory) - else: # compute individual signatures + else: # compute individual signatures _compute_individual(args, signatures_factory) class _signatures_for_compute_factory: "Build signatures on demand, based on args input to 'compute'." + def __init__(self, args): self.args = args def __call__(self): args = self.args - params = ComputeParameters(ksizes=args.ksizes, - seed=args.seed, - protein=args.protein, - dayhoff=args.dayhoff, - hp=args.hp, - dna=args.dna, - num_hashes=args.num_hashes, - track_abundance=args.track_abundance, - scaled=args.scaled) + params = ComputeParameters( + ksizes=args.ksizes, + seed=args.seed, + protein=args.protein, + dayhoff=args.dayhoff, + hp=args.hp, + dna=args.dna, + num_hashes=args.num_hashes, + track_abundance=args.track_abundance, + scaled=args.scaled, + ) sig = SourmashSignature.from_params(params) return [sig] @@ -167,14 +170,14 @@ def _compute_individual(args, signatures_factory): for filename in args.filenames: if open_output_each_time: # for each input file, construct output filename - sigfile = os.path.basename(filename) + '.sig' + sigfile = os.path.basename(filename) + ".sig" if args.output_dir: sigfile = os.path.join(args.output_dir, sigfile) # does it already exist? skip if so. if os.path.exists(sigfile) and not args.force: - notify('skipping {} - already done', filename) - continue # go on to next file. + notify("skipping {} - already done", filename) + continue # go on to next file. # nope? ok, let's save to it. assert not save_sigs @@ -204,8 +207,12 @@ def _compute_individual(args, signatures_factory): for n, record in enumerate(screed_iter): sigs = signatures_factory() try: - add_seq(sigs, record.sequence, - args.input_is_protein, args.check_sequence) + add_seq( + sigs, + record.sequence, + args.input_is_protein, + args.check_sequence, + ) except ValueError as exc: error(f"ERROR when reading from '{filename}' - ") error(str(exc)) @@ -214,50 +221,63 @@ def _compute_individual(args, signatures_factory): set_sig_name(sigs, filename, name=record.name) save_sigs_to_location(sigs, save_sigs) - notify('calculated {} signatures for {} sequences in {}', - len(save_sigs), n + 1, filename) + notify( + "calculated {} signatures for {} sequences in {}", + len(save_sigs), + n + 1, + filename, + ) # nope; make a single sig for the whole file else: sigs = signatures_factory() # consume & calculate signatures - notify(f'... reading sequences from {filename}') + notify(f"... 
reading sequences from {filename}") name = None for n, record in enumerate(screed_iter): if n % 10000 == 0: if n: - notify('\r...{} {}', filename, n, end='') + notify("\r...{} {}", filename, n, end="") elif args.name_from_first: name = record.name try: - add_seq(sigs, record.sequence, - args.input_is_protein, args.check_sequence) + add_seq( + sigs, + record.sequence, + args.input_is_protein, + args.check_sequence, + ) except ValueError as exc: error(f"ERROR when reading from '{filename}' - ") error(str(exc)) sys.exit(-1) - notify('...{} {} sequences', filename, n, end='') + notify("...{} {} sequences", filename, n, end="") set_sig_name(sigs, filename, name) save_sigs_to_location(sigs, save_sigs) - notify(f'calculated {len(sigs)} signatures for {n+1} sequences in {filename}') + notify( + f"calculated {len(sigs)} signatures for {n+1} sequences in {filename}" + ) # if not args.output, close output for every input filename. if open_output_each_time: save_sigs.close() - notify(f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'. Note: signature license is CC0.") + notify( + f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'. Note: signature license is CC0." + ) save_sigs = None - # if --output-dir specified, all collected signatures => args.output, # and we need to close here. if args.output and save_sigs is not None: save_sigs.close() - notify(f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'. Note: signature license is CC0.") + notify( + f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'. Note: signature license is CC0." + ) def _compute_merged(args, signatures_factory): @@ -267,26 +287,30 @@ def _compute_merged(args, signatures_factory): total_seq = 0 for filename in args.filenames: # consume & calculate signatures - notify('... reading sequences from {}', filename) + notify("... reading sequences from {}", filename) n = None with screed.open(filename) as f: for n, record in enumerate(f): if n % 10000 == 0 and n: - notify('\r... {} {}', filename, n, end='') + notify("\r... {} {}", filename, n, end="") - add_seq(sigs, record.sequence, - args.input_is_protein, args.check_sequence) + add_seq( + sigs, record.sequence, args.input_is_protein, args.check_sequence + ) if n is not None: - notify('... {} {} sequences', filename, n + 1) + notify("... {} {} sequences", filename, n + 1) total_seq += n + 1 else: notify(f"no sequences found in '{filename}'?!") if total_seq: set_sig_name(sigs, filename, name=args.merge) - notify('calculated 1 signature for {} sequences taken from {} files', - total_seq, len(args.filenames)) + notify( + "calculated 1 signature for {} sequences taken from {} files", + total_seq, + len(args.filenames), + ) # at end, save! save_siglist(sigs, args.output) @@ -301,8 +325,8 @@ def add_seq(sigs, seq, input_is_protein, check_sequence): def set_sig_name(sigs, filename, name=None): - if filename == '-': # if stdin, set filename to empty. - filename = '' + if filename == "-": # if stdin, set filename to empty. 
+ filename = "" for sig in sigs: if name is not None: sig._name = name @@ -332,17 +356,19 @@ def save_sigs_to_location(siglist, save_sig): class ComputeParameters(RustObject): __dealloc_func__ = lib.computeparams_free - def __init__(self, - *, - ksizes=(21, 31, 51), - seed=42, - protein=False, - dayhoff=False, - hp=False, - dna=True, - num_hashes=500, - track_abundance=False, - scaled=0): + def __init__( + self, + *, + ksizes=(21, 31, 51), + seed=42, + protein=False, + dayhoff=False, + hp=False, + dna=True, + num_hashes=500, + track_abundance=False, + scaled=0, + ): self._objptr = lib.computeparams_new() self.seed = seed @@ -359,31 +385,33 @@ def __init__(self, def from_manifest_row(cls, row): "convert a CollectionManifest row into a ComputeParameters object" is_dna = is_protein = is_dayhoff = is_hp = False - if row['moltype'] == 'DNA': + if row["moltype"] == "DNA": is_dna = True - elif row['moltype'] == 'protein': + elif row["moltype"] == "protein": is_protein = True - elif row['moltype'] == 'hp': + elif row["moltype"] == "hp": is_hp = True - elif row['moltype'] == 'dayhoff': + elif row["moltype"] == "dayhoff": is_dayhoff = True else: assert 0 if is_dna: - ksize = row['ksize'] + ksize = row["ksize"] else: - ksize = row['ksize'] * 3 - - p = cls(ksizes=[ksize], - seed=DEFAULT_MMHASH_SEED, - protein=is_protein, - dayhoff=is_dayhoff, - hp=is_hp, - dna=is_dna, - num_hashes=row['num'], - track_abundance=row['with_abundance'], - scaled=row['scaled']) + ksize = row["ksize"] * 3 + + p = cls( + ksizes=[ksize], + seed=DEFAULT_MMHASH_SEED, + protein=is_protein, + dayhoff=is_dayhoff, + hp=is_hp, + dna=is_dna, + num_hashes=row["num"], + track_abundance=row["with_abundance"], + scaled=row["scaled"], + ) return p @@ -400,7 +428,7 @@ def to_param_str(self): elif self.dayhoff: pi.append("dayhoff") else: - assert 0 # must be one of the previous + assert 0 # must be one of the previous if self.dna: kstr = [f"k={k}" for k in self.ksizes] @@ -431,15 +459,17 @@ def __repr__(self): return f"ComputeParameters(ksizes={self.ksizes}, seed={self.seed}, protein={self.protein}, dayhoff={self.dayhoff}, hp={self.hp}, dna={self.dna}, num_hashes={self.num_hashes}, track_abundance={self.track_abundance}, scaled={self.scaled})" def __eq__(self, other): - return (self.ksizes == other.ksizes and - self.seed == other.seed and - self.protein == other.protein and - self.dayhoff == other.dayhoff and - self.hp == other.hp and - self.dna == other.dna and - self.num_hashes == other.num_hashes and - self.track_abundance == other.track_abundance and - self.scaled == other.scaled) + return ( + self.ksizes == other.ksizes + and self.seed == other.seed + and self.protein == other.protein + and self.dayhoff == other.dayhoff + and self.hp == other.hp + and self.dna == other.dna + and self.num_hashes == other.num_hashes + and self.track_abundance == other.track_abundance + and self.scaled == other.scaled + ) @staticmethod def from_args(args): @@ -509,11 +539,16 @@ def dna(self, v): @property def moltype(self): - if self.dna: moltype = 'DNA' - elif self.protein: moltype = 'protein' - elif self.hp: moltype = 'hp' - elif self.dayhoff: moltype = 'dayhoff' - else: assert 0 + if self.dna: + moltype = "DNA" + elif self.protein: + moltype = "protein" + elif self.hp: + moltype = "hp" + elif self.dayhoff: + moltype = "dayhoff" + else: + assert 0 return moltype diff --git a/src/sourmash/command_sketch.py b/src/sourmash/command_sketch.py index f79e3a5fc8..508cac7c01 100644 --- a/src/sourmash/command_sketch.py +++ b/src/sourmash/command_sketch.py @@ 
-12,19 +12,24 @@ import sourmash from .signature import SourmashSignature from .logging import notify, error, set_quiet, print_results -from .command_compute import (_compute_individual, _compute_merged, - ComputeParameters, add_seq, set_sig_name, - DEFAULT_MMHASH_SEED) +from .command_compute import ( + _compute_individual, + _compute_merged, + ComputeParameters, + add_seq, + set_sig_name, + DEFAULT_MMHASH_SEED, +) from sourmash import sourmash_args from sourmash.sourmash_args import check_scaled_bounds, check_num_bounds from sourmash.sig.__main__ import _summarize_manifest, _SketchInfo from sourmash.manifest import CollectionManifest DEFAULTS = dict( - dna='k=31,scaled=1000,noabund', - protein='k=10,scaled=200,noabund', - dayhoff='k=16,scaled=200,noabund', - hp='k=42,scaled=200,noabund' + dna="k=31,scaled=1000,noabund", + protein="k=10,scaled=200,noabund", + dayhoff="k=16,scaled=200,noabund", + hp="k=42,scaled=200,noabund", ) @@ -32,21 +37,21 @@ def _parse_params_str(params_str): "Parse a parameter string of the form 'k=ks,num=num,scaled=scaled,abund'." moltype = None params = {} - params['ksize'] = [] - items = params_str.split(',') + params["ksize"] = [] + items = params_str.split(",") for item in items: - if item == 'abund': - params['track_abundance'] = True - elif item == 'noabund': - params['track_abundance'] = False - elif item.startswith('k'): - if len(item) < 3 or item[1] != '=': + if item == "abund": + params["track_abundance"] = True + elif item == "noabund": + params["track_abundance"] = False + elif item.startswith("k"): + if len(item) < 3 or item[1] != "=": raise ValueError("k takes a parameter, e.g. 'k=31'") - params['ksize'].append(int(item[2:])) - elif item.startswith('num'): - if len(item) < 5 or item[3] != '=': + params["ksize"].append(int(item[2:])) + elif item.startswith("num"): + if len(item) < 5 or item[3] != "=": raise ValueError("num takes a parameter, e.g. 'num=500'") - if params.get('scaled'): + if params.get("scaled"): raise ValueError("cannot set both num and scaled in a single minhash") try: num = item[4:] @@ -56,12 +61,12 @@ def _parse_params_str(params_str): num = check_num_bounds(num) - params['num'] = int(item[4:]) - params['scaled'] = 0 - elif item.startswith('scaled'): - if len(item) < 8 or item[6] != '=': + params["num"] = int(item[4:]) + params["scaled"] = 0 + elif item.startswith("scaled"): + if len(item) < 8 or item[6] != "=": raise ValueError("scaled takes a parameter, e.g. 'scaled=1000'") - if params.get('num'): + if params.get("num"): raise ValueError("cannot set both num and scaled in a single minhash") try: scaled = item[7:] @@ -71,13 +76,13 @@ def _parse_params_str(params_str): scaled = check_scaled_bounds(scaled) - params['scaled'] = scaled - params['num'] = 0 - elif item.startswith('seed'): - if len(item) < 6 or item[4] != '=': + params["scaled"] = scaled + params["num"] = 0 + elif item.startswith("seed"): + if len(item) < 6 or item[4] != "=": raise ValueError("seed takes a parameter, e.g. 'seed=42'") - params['seed'] = int(item[5:]) - elif item in ('protein', 'dayhoff', 'hp', 'dna'): + params["seed"] = int(item[5:]) + elif item in ("protein", "dayhoff", "hp", "dna"): moltype = item else: raise ValueError(f"unknown component '{item}' in params string") @@ -87,12 +92,13 @@ def _parse_params_str(params_str): class _signatures_for_sketch_factory: "Build sigs on demand, based on args input to 'sketch'." 
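# A condensed, illustrative re-implementation of the core rules in
# _parse_params_str above (assumes the same "-p" grammar: comma-separated
# items, repeatable k=, and num/scaled mutually exclusive):
def parse_params(params_str):
    params = {"ksize": []}
    for item in params_str.split(","):
        if item == "abund":
            params["track_abundance"] = True
        elif item == "noabund":
            params["track_abundance"] = False
        elif item.startswith("k="):
            params["ksize"].append(int(item[2:]))
        elif item.startswith("num="):
            if params.get("scaled"):
                raise ValueError("cannot set both num and scaled")
            params["num"] = int(item[4:])
            params["scaled"] = 0
        elif item.startswith("scaled="):
            if params.get("num"):
                raise ValueError("cannot set both num and scaled")
            params["scaled"] = int(item[7:])
            params["num"] = 0
        else:
            raise ValueError(f"unknown component {item!r} in params string")
    return params

assert parse_params("k=21,k=31,scaled=1000,noabund") == {
    "ksize": [21, 31], "track_abundance": False, "num": 0, "scaled": 1000,
}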
+ def __init__(self, params_str_list, default_moltype): # first, set up defaults per-moltype defaults = {} for moltype, pstr in DEFAULTS.items(): mt, d = _parse_params_str(pstr) - assert mt is None # defaults cannot have moltype set! + assert mt is None # defaults cannot have moltype set! defaults[moltype] = d self.defaults = defaults @@ -105,19 +111,27 @@ def __init__(self, params_str_list, default_moltype): # provided. for params_str in params_str_list: moltype, params = _parse_params_str(params_str) - if moltype and moltype != 'dna' and default_moltype == 'dna': - raise ValueError(f"Incompatible sketch type ({default_moltype}) and parameter override ({moltype}) in '{params_str}'; maybe use 'sketch translate'?") - elif moltype == 'dna' and default_moltype and default_moltype != 'dna': - raise ValueError(f"Incompatible sketch type ({default_moltype}) and parameter override ({moltype}) in '{params_str}'") + if moltype and moltype != "dna" and default_moltype == "dna": + raise ValueError( + f"Incompatible sketch type ({default_moltype}) and parameter override ({moltype}) in '{params_str}'; maybe use 'sketch translate'?" + ) + elif moltype == "dna" and default_moltype and default_moltype != "dna": + raise ValueError( + f"Incompatible sketch type ({default_moltype}) and parameter override ({moltype}) in '{params_str}'" + ) elif moltype is None: if default_moltype is None: - raise ValueError(f"No default moltype and none specified in param string") + raise ValueError( + "No default moltype and none specified in param string" + ) moltype = default_moltype self.params_list.append((moltype, params)) else: if default_moltype is None: - raise ValueError(f"No default moltype and none specified in param string") + raise ValueError( + "No default moltype and none specified in param string" + ) # no params str? default to a single sig, using default_moltype. self.params_list.append((default_moltype, {})) @@ -125,38 +139,37 @@ def get_compute_params(self, *, split_ksizes=False): for moltype, params_d in self.params_list: # get defaults for this moltype from self.defaults: default_params = self.defaults[moltype] - def_seed = default_params.get('seed', DEFAULT_MMHASH_SEED) - def_num = default_params.get('num', 0) - def_abund = default_params['track_abundance'] - def_scaled = default_params.get('scaled', 0) - def_dna = default_params.get('is_dna', moltype == 'dna') - def_protein = default_params.get('is_protein', - moltype == 'protein') - def_dayhoff = default_params.get('is_dayhoff', - moltype == 'dayhoff') - def_hp = default_params.get('is_hp', moltype == 'hp') + def_seed = default_params.get("seed", DEFAULT_MMHASH_SEED) + def_num = default_params.get("num", 0) + def_abund = default_params["track_abundance"] + def_scaled = default_params.get("scaled", 0) + def_dna = default_params.get("is_dna", moltype == "dna") + def_protein = default_params.get("is_protein", moltype == "protein") + def_dayhoff = default_params.get("is_dayhoff", moltype == "dayhoff") + def_hp = default_params.get("is_hp", moltype == "hp") # handle ksize specially, for now - multiply by three? - def_ksizes = default_params['ksize'] - ksizes = params_d.get('ksize') + def_ksizes = default_params["ksize"] + ksizes = params_d.get("ksize") if not ksizes: ksizes = def_ksizes # 'command sketch' adjusts k-mer sizes by 3 if non-DNA sketch. 
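# The comment above marks the one non-obvious 'sketch' convention: protein,
# dayhoff, and hp k-mer sizes are multiplied by 3 internally (one residue
# spans three nucleotides), while DNA ksizes pass through unchanged; a
# standalone restatement (illustrative only):
def adjust_ksizes(ksizes, is_dna):
    return ksizes if is_dna else [k * 3 for k in ksizes]

assert adjust_ksizes([31], is_dna=True) == [31]
assert adjust_ksizes([10], is_dna=False) == [30]  # 'sketch protein -p k=10'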
if self.mult_ksize_by_3 and not def_dna: - ksizes = [ k*3 for k in ksizes ] - - make_param = lambda ksizes: ComputeParameters( - ksizes=ksizes, - seed=params_d.get('seed', def_seed), - protein=def_protein, - dayhoff=def_dayhoff, - hp=def_hp, - dna=def_dna, - num_hashes=params_d.get('num', def_num), - track_abundance=params_d.get('track_abundance', - def_abund), - scaled=params_d.get('scaled', def_scaled)) + ksizes = [k * 3 for k in ksizes] + + def make_param(ksizes): + return ComputeParameters( + ksizes=ksizes, + seed=params_d.get("seed", def_seed), + protein=def_protein, + dayhoff=def_dayhoff, + hp=def_hp, + dna=def_dna, + num_hashes=params_d.get("num", def_num), + track_abundance=params_d.get("track_abundance", def_abund), + scaled=params_d.get("scaled", def_scaled), + ) if split_ksizes: for ksize in ksizes: @@ -179,6 +192,7 @@ def __call__(self, *, split_ksizes=False): def _add_from_file_to_filenames(args): "Add filenames from --from-file to args.filenames" from .sourmash_args import load_pathlist_from_file + if args.from_file: file_list = load_pathlist_from_file(args.from_file) args.filenames.extend(file_list) @@ -189,11 +203,11 @@ def _execute_sketch(args, signatures_factory): set_quiet(args.quiet) if not args.filenames: - error('error: no input filenames provided! nothing to do - exiting.') + error("error: no input filenames provided! nothing to do - exiting.") sys.exit(-1) - if args.license != 'CC0': - error('error: sourmash only supports CC0-licensed signatures. sorry!') + if args.license != "CC0": + error("error: sourmash only supports CC0-licensed signatures. sorry!") sys.exit(-1) notify(f'computing signatures for files: {", ".join(args.filenames)}') @@ -208,15 +222,15 @@ def _execute_sketch(args, signatures_factory): # get number of output sigs: num_sigs = len(signatures_factory.params_list) - notify(f'Computing a total of {num_sigs} signature(s) for each input.') + notify(f"Computing a total of {num_sigs} signature(s) for each input.") if num_sigs == 0: - error('...nothing to calculate!? Exiting!') + error("...nothing to calculate!? 
Exiting!") sys.exit(-1) - if args.merge: # single name specified - combine all + if args.merge: # single name specified - combine all _compute_merged(args, signatures_factory) - else: # compute individual signatures + else: # compute individual signatures _compute_individual(args, signatures_factory) @@ -229,8 +243,7 @@ def dna(args): args.input_is_protein = False try: - signatures_factory = _signatures_for_sketch_factory(args.param_string, - 'dna') + signatures_factory = _signatures_for_sketch_factory(args.param_string, "dna") except ValueError as e: error(f"Error creating signatures: {str(e)}") sys.exit(-1) @@ -252,15 +265,14 @@ def protein(args): if args.dayhoff and args.hp: raise ValueError("cannot set both --dayhoff and --hp") if args.dayhoff: - moltype = 'dayhoff' + moltype = "dayhoff" elif args.hp: - moltype = 'hp' + moltype = "hp" else: - moltype = 'protein' + moltype = "protein" try: - signatures_factory = _signatures_for_sketch_factory(args.param_string, - moltype) + signatures_factory = _signatures_for_sketch_factory(args.param_string, moltype) except ValueError as e: error(f"Error creating signatures: {str(e)}") sys.exit(-1) @@ -281,15 +293,14 @@ def translate(args): if args.dayhoff and args.hp: raise ValueError("cannot set both --dayhoff and --hp") if args.dayhoff: - moltype = 'dayhoff' + moltype = "dayhoff" elif args.hp: - moltype = 'hp' + moltype = "hp" else: - moltype = 'protein' + moltype = "protein" try: - signatures_factory = _signatures_for_sketch_factory(args.param_string, - moltype) + signatures_factory = _signatures_for_sketch_factory(args.param_string, moltype) except ValueError as e: error(f"Error creating signatures: {str(e)}") sys.exit(-1) @@ -317,47 +328,51 @@ def _compute_sigs(to_build, output, *, check_sequence=False): is_dna = param_objs[0].dna for p in param_objs: - if p.dna: assert is_dna + if p.dna: + assert is_dna sig = SourmashSignature.from_params(p) sigs.append(sig) input_is_protein = not is_dna # read sequence records & sketch - notify(f'... reading sequences from {filename}') + notify(f"... reading sequences from {filename}") for n, record in enumerate(screed_iter): if n % 10000 == 0: if n: - notify('\r...{} {}', filename, n, end='') + notify("\r...{} {}", filename, n, end="") try: - add_seq(sigs, record.sequence, input_is_protein, - check_sequence) + add_seq(sigs, record.sequence, input_is_protein, check_sequence) except ValueError as exc: error(f"ERROR when reading from '{filename}' - ") error(str(exc)) sys.exit(-1) - notify('...{} {} sequences', filename, n, end='') + notify("...{} {} sequences", filename, n, end="") set_sig_name(sigs, filename, name) for sig in sigs: save_sigs.add(sig) - notify(f'calculated {len(sigs)} signatures for {n+1} sequences in {filename}') - + notify( + f"calculated {len(sigs)} signatures for {n+1} sequences in {filename}" + ) save_sigs.close() - notify(f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'. Note: signature license is CC0.") + notify( + f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'. Note: signature license is CC0." 
+ ) def _output_csv_info(filename, sigs_to_build): "output information about what signatures to build, in CSV format" output_n = 0 with sourmash_args.FileOutputCSV(filename) as csv_fp: - w = csv.DictWriter(csv_fp, fieldnames=['filename', 'sketchtype', - 'output_index', 'name', - 'param_strs']) + w = csv.DictWriter( + csv_fp, + fieldnames=["filename", "sketchtype", "output_index", "name", "param_strs"], + ) w.writeheader() output_n = 0 @@ -366,18 +381,22 @@ def _output_csv_info(filename, sigs_to_build): # should all be the same! if param_objs[0].dna: - assert all( ( p.dna for p in param_objs ) ) + assert all(p.dna for p in param_objs) sketchtype = "dna" else: - assert not any( ( p.dna for p in param_objs ) ) + assert not any(p.dna for p in param_objs) sketchtype = "protein" for p in param_objs: param_strs.append(p.to_param_str()) - row = dict(filename=filename, sketchtype=sketchtype, - param_strs="-p " + " -p ".join(param_strs), - name=name, output_index=output_n) + row = dict( + filename=filename, + sketchtype=sketchtype, + param_strs="-p " + " -p ".join(param_strs), + name=name, + output_index=output_n, + ) w.writerow(row) @@ -385,15 +404,19 @@ def _output_csv_info(filename, sigs_to_build): def fromfile(args): - if args.license != 'CC0': - error('error: sourmash only supports CC0-licensed signatures. sorry!') + if args.license != "CC0": + error("error: sourmash only supports CC0-licensed signatures. sorry!") sys.exit(-1) if args.output_signatures and os.path.exists(args.output_signatures): if not args.force_output_already_exists: - error(f"** ERROR: output location '{args.output_signatures}' already exists!") - error(f"** Not overwriting/appending.") - error(f"** Use --force-output-already-exists if you want to overwrite/append.") + error( + f"** ERROR: output location '{args.output_signatures}' already exists!" + ) + error("** Not overwriting/appending.") + error( + "** Use --force-output-already-exists if you want to overwrite/append." + ) sys.exit(-1) # now, create the set of desired sketch specs. @@ -429,13 +452,13 @@ def fromfile(args): for csvfile in args.csvs: with sourmash_args.FileInputCSV(csvfile) as r: for row in r: - name = row['name'] + name = row["name"] if not name: n_missing_name += 1 continue - genome = row['genome_filename'] - proteome = row['protein_filename'] + genome = row["genome_filename"] + proteome = row["protein_filename"] total_rows += 1 if name in all_names: @@ -447,8 +470,10 @@ def fromfile(args): fail_exit = False if n_duplicate_name: if args.report_duplicated: - notify("duplicated:\n" + '\n'.join(sorted(duplicate_names))) - error(f"** ERROR: {n_duplicate_name} entries have duplicate 'name' records. Exiting!") + notify("duplicated:\n" + "\n".join(sorted(duplicate_names))) + error( + f"** ERROR: {n_duplicate_name} entries have duplicate 'name' records. Exiting!" 
+ ) fail_exit = True if n_missing_name: @@ -470,7 +495,7 @@ def fromfile(args): # for each manifest row, for row in manifest.rows: - name = row['name'] + name = row["name"] if name: # build a ComputeParameters object for later comparison p = ComputeParameters.from_manifest_row(row) @@ -505,7 +530,7 @@ def fromfile(args): if p not in plist: # nope - figure out genome/proteome needed filename = genome if p.dna else proteome - filetype = 'genome' if p.dna else 'proteome' + filetype = "genome" if p.dna else "proteome" if filename: # add to build list @@ -524,77 +549,91 @@ def fromfile(args): if already_done_manifest: info_d = _summarize_manifest(already_done_manifest) - print_results('---') + print_results("---") print_results("summary of already-done sketches:") - for ski in info_d['sketch_info']: - mh_type = f"num={ski['num']}" if ski['num'] else f"scaled={ski['scaled']}" - mh_abund = ", abund" if ski['abund'] else "" + for ski in info_d["sketch_info"]: + mh_type = f"num={ski['num']}" if ski["num"] else f"scaled={ski['scaled']}" + mh_abund = ", abund" if ski["abund"] else "" sketch_str = f"{ski['count']} sketches with {ski['moltype']}, k={ski['ksize']}, {mh_type}{mh_abund}" print_results(f" {sketch_str: <50} {ski['n_hashes']} total hashes") - print_results('---') + print_results("---") if args.output_manifest_matching: already_done_manifest.write_to_filename(args.output_manifest_matching) - notify(f"output {len(already_done_manifest)} already-done signatures to '{args.output_manifest_matching}' in manifest format.") + notify( + f"output {len(already_done_manifest)} already-done signatures to '{args.output_manifest_matching}' in manifest format." + ) if missing: error("** ERROR: we cannot build some of the requested signatures.") - error(f"** {missing_count} total signatures (for {len(missing)} names) cannot be built.") + error( + f"** {missing_count} total signatures (for {len(missing)} names) cannot be built." + ) if args.ignore_missing: error("** (continuing past this error because --ignore-missing was set)") else: sys.exit(-1) - notify(f"** {total_sigs - skipped_sigs} new signatures to build from {len(to_build)} files;") + notify( + f"** {total_sigs - skipped_sigs} new signatures to build from {len(to_build)} files;" + ) if not to_build: - notify(f"** Nothing to build. Exiting!") + notify("** Nothing to build. 
Exiting!") sys.exit(0) if skipped_sigs: notify(f"** {skipped_sigs} already exist, so skipping those.") else: - notify(f"** we found no pre-existing signatures that match.") + notify("** we found no pre-existing signatures that match.") ## first, print out a summary of to_build: - print_results('---') + print_results("---") print_results("summary of sketches to build:") counter = Counter() - build_info_d = {} for filename, param_objs in to_build.items(): for p in param_objs: - moltype = p.moltype assert len(p.ksizes) == 1 ksize = p.ksizes[0] - if not p.dna: ksize //= 3 - - ski = _SketchInfo(ksize=ksize, moltype=p.moltype, - scaled=p.scaled, num=p.num_hashes, - abund=p.track_abundance) + if not p.dna: + ksize //= 3 + + ski = _SketchInfo( + ksize=ksize, + moltype=p.moltype, + scaled=p.scaled, + num=p.num_hashes, + abund=p.track_abundance, + ) counter[ski] += 1 for ski, count in counter.items(): mh_type = f"num={ski.num}" if ski.num else f"scaled={ski.scaled}" mh_abund = ", abund" if ski.abund else "" - sketch_str = f"{count} sketches with {ski.moltype}, k={ski.ksize}, {mh_type}{mh_abund}" + sketch_str = ( + f"{count} sketches with {ski.moltype}, k={ski.ksize}, {mh_type}{mh_abund}" + ) print_results(f" {sketch_str: <50}") - print_results('---') + print_results("---") ## now, onward ho - do we build anything, or output stuff, or just exit? - if args.output_signatures: # actually compute - _compute_sigs(to_build, args.output_signatures, - check_sequence=args.check_sequence) + if args.output_signatures: # actually compute + _compute_sigs( + to_build, args.output_signatures, check_sequence=args.check_sequence + ) - if args.output_csv_info: # output info necessary to construct + if args.output_csv_info: # output info necessary to construct _output_csv_info(args.output_csv_info, to_build) - notify(f"** {total_sigs} total requested; output {total_sigs - skipped_sigs}, skipped {skipped_sigs}") + notify( + f"** {total_sigs} total requested; output {total_sigs - skipped_sigs}, skipped {skipped_sigs}" + ) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 7de69c5621..e2d1a09a50 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -9,20 +9,24 @@ import io import screed -from .compare import (compare_all_pairs, compare_serial_containment, - compare_serial_max_containment, compare_serial_avg_containment) +from .compare import ( + compare_all_pairs, + compare_serial_containment, + compare_serial_max_containment, + compare_serial_avg_containment, +) from . import MinHash from .sbtmh import load_sbt_index, create_sbt_index from . import signature as sig from . 
import sourmash_args from .logging import notify, error, print_results, set_quiet -from .sourmash_args import (FileOutput, FileOutputCSV, - SaveSignaturesToLocation) +from .sourmash_args import FileOutput, FileOutputCSV, SaveSignaturesToLocation from .search import prefetch_database, PrefetchResult from .index import LazyLinearIndex WATERMARK_SIZE = 10000 + def _get_screen_width(): # default fallback is 80x24 (col, rows) = shutil.get_terminal_size() @@ -52,17 +56,21 @@ def compare(args): moltypes = set() size_may_be_inaccurate = False for filename in inp_files: - notify(f"loading '{filename}'", end='\r') - loaded = sourmash_args.load_file_as_signatures(filename, - ksize=args.ksize, - select_moltype=moltype, - picklist=picklist, - yield_all_files=args.force, - progress=progress, - pattern=pattern_search) + notify(f"loading '{filename}'", end="\r") + loaded = sourmash_args.load_file_as_signatures( + filename, + ksize=args.ksize, + select_moltype=moltype, + picklist=picklist, + yield_all_files=args.force, + progress=progress, + pattern=pattern_search, + ) loaded = list(loaded) if not loaded: - notify(f'\nwarning: no signatures loaded at given ksize/molecule type/picklist from {filename}') + notify( + f"\nwarning: no signatures loaded at given ksize/molecule type/picklist from {filename}" + ) siglist.extend(loaded) # track ksizes/moltypes @@ -75,22 +83,22 @@ def compare(args): break if not siglist: - error('no signatures found! exiting.') + error("no signatures found! exiting.") sys.exit(-1) # check ksizes and type if len(ksizes) > 1: - error('multiple k-mer sizes loaded; please specify one with -k.') + error("multiple k-mer sizes loaded; please specify one with -k.") ksizes = sorted(ksizes) - error('(saw k-mer sizes {})'.format(', '.join(map(str, ksizes)))) + error("(saw k-mer sizes {})".format(", ".join(map(str, ksizes)))) sys.exit(-1) if len(moltypes) > 1: - error('multiple molecule types loaded; please specify --dna, --protein') + error("multiple molecule types loaded; please specify --dna, --protein") sys.exit(-1) - notify(' '*79, end='\r') - notify(f'loaded {format(len(siglist))} signatures total.') + notify(" " * 79, end="\r") + notify(f"loaded {format(len(siglist))} signatures total.") if picklist: sourmash_args.report_picklist(args, picklist) @@ -103,21 +111,27 @@ def compare(args): # complain if it's not all one or the other if is_scaled != is_scaled_2: - error('ERROR: cannot mix scaled signatures with num signatures') + error("ERROR: cannot mix scaled signatures with num signatures") sys.exit(-1) is_containment = False if args.containment or args.max_containment or args.avg_containment: is_containment = True - containment_args = [args.containment, args.max_containment, args.avg_containment] + containment_args = [ + args.containment, + args.max_containment, + args.avg_containment, + ] if sum(containment_args) > 1: notify("ERROR: cannot specify more than one containment argument!") sys.exit(-1) # complain if --containment and not is_scaled if is_containment and not is_scaled: - error('must use scaled signatures with --containment, --max-containment, and --avg-containment') + error( + "must use scaled signatures with --containment, --max-containment, and --avg-containment" + ) sys.exit(-1) # complain if --ani and not is_scaled @@ -126,14 +140,16 @@ def compare(args): return_ani = True if return_ani and not is_scaled: - error('must use scaled signatures with --estimate-ani') + error("must use scaled signatures with --estimate-ani") sys.exit(-1) # notify about implicit 
--ignore-abundance: if is_containment or return_ani: - track_abundances = any(( s.minhash.track_abundance for s in siglist )) + track_abundances = any(s.minhash.track_abundance for s in siglist) if track_abundances: - notify('NOTE: --containment, --max-containment, --avg-containment, and --estimate-ani ignore signature abundances.') + notify( + "NOTE: --containment, --max-containment, --avg-containment, and --estimate-ani ignore signature abundances." + ) # if using scaled sketches or --scaled, downsample to common max scaled. printed_scaled_msg = False @@ -144,7 +160,9 @@ def compare(args): max_scaled = max(max_scaled, args.scaled) if max_scaled > args.scaled: - notify(f"WARNING: --scaled specified {args.scaled}, but max scaled of sketches is {max_scaled}") + notify( + f"WARNING: --scaled specified {args.scaled}, but max scaled of sketches is {max_scaled}" + ) notify(f"WARNING: continuing with scaled value of {max_scaled}.") new_siglist = [] @@ -153,7 +171,9 @@ def compare(args): size_may_be_inaccurate = True if s.minhash.scaled != max_scaled: if not printed_scaled_msg: - notify(f'NOTE: downsampling to scaled value of {format(max_scaled)}') + notify( + f"NOTE: downsampling to scaled value of {format(max_scaled)}" + ) printed_scaled_msg = True with s.update() as s: s.minhash = s.minhash.downsample(scaled=max_scaled) @@ -166,10 +186,10 @@ def compare(args): sys.exit(-1) if len(siglist) == 0: - error('no signatures!') + error("no signatures!") sys.exit(-1) - notify('') + notify("") # build the distance matrix numpy.set_printoptions(precision=3, suppress=True) @@ -184,8 +204,9 @@ def compare(args): elif args.avg_containment: similarity = compare_serial_avg_containment(siglist, return_ani=return_ani) else: - similarity = compare_all_pairs(siglist, args.ignore_abundance, - n_jobs=args.processes, return_ani=return_ani) + similarity = compare_all_pairs( + siglist, args.ignore_abundance, n_jobs=args.processes, return_ani=return_ani + ) # if distance matrix desired, switch to 1-similarity if args.distance_matrix: @@ -196,25 +217,33 @@ def compare(args): if len(siglist) < 30: for i, ss in enumerate(siglist): # for small matrices, pretty-print some output - name_num = '{}-{}'.format(i, str(ss)) + name_num = f"{i}-{str(ss)}" if len(name_num) > 20: - name_num = name_num[:17] + '...' - print_results('{:20s}\t{}'.format(name_num, matrix[i, :, ],)) + name_num = name_num[:17] + "..." + print_results( + "{:20s}\t{}".format( + name_num, + matrix[ + i, + :, + ], + ) + ) if args.distance_matrix: - print_results('max distance in matrix: {:.3f}', numpy.max(matrix)) + print_results("max distance in matrix: {:.3f}", numpy.max(matrix)) else: - print_results('min similarity in matrix: {:.3f}', numpy.min(matrix)) + print_results("min similarity in matrix: {:.3f}", numpy.min(matrix)) # shall we output a matrix to stdout? if args.output: - labeloutname = args.output + '.labels.txt' - notify(f'saving labels to: {labeloutname}') - with open(labeloutname, 'w') as fp: + labeloutname = args.output + ".labels.txt" + notify(f"saving labels to: {labeloutname}") + with open(labeloutname, "w") as fp: fp.write("\n".join(labeltext)) - notify(f'saving comparison matrix to: {args.output}') - with open(args.output, 'wb') as fp: + notify(f"saving comparison matrix to: {args.output}") + with open(args.output, "wb") as fp: numpy.save(fp, matrix) # output CSV? 
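# The downsampling rule compare() applies above, restated as a standalone
# helper (hypothetical name, illustrative only): mixed scaled values are
# reconciled to the largest scaled seen, or to --scaled if that is larger
# still, because a scaled-S MinHash can be thinned to any coarser S' >= S
# but never refined.
def common_scaled(sketch_scaleds, requested=0):
    return max(max(sketch_scaleds), requested)

assert common_scaled([1000, 2000, 5000]) == 5000
assert common_scaled([1000, 2000], requested=10000) == 10000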
@@ -231,15 +260,20 @@ def compare(args): if size_may_be_inaccurate: if args.distance_matrix: - notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI distances will be set to 1 for these comparisons.") + notify( + "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI distances will be set to 1 for these comparisons." + ) else: - notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 1 for these comparisons.") + notify( + "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 1 for these comparisons." + ) def plot(args): "Produce a clustering matrix and plot." import matplotlib as mpl - mpl.use('Agg') + + mpl.use("Agg") import numpy import pylab import scipy.cluster.hierarchy as sch @@ -248,16 +282,16 @@ def plot(args): # load files D_filename = args.distances - notify(f'loading comparison matrix from {D_filename}...') - with open(D_filename, 'rb') as f: + notify(f"loading comparison matrix from {D_filename}...") + with open(D_filename, "rb") as f: D = numpy.load(f) # not sure how to change this to use f-strings - notify('...got {} x {} matrix.', *D.shape) + notify("...got {} x {} matrix.", *D.shape) # see sourmash#2790 for details :) if args.labeltext or args.labels: display_labels = True - args.labels = True # override => labels always true + args.labels = True # override => labels always true elif args.labels is None and not args.indices: # default to labels args.labels = True @@ -273,14 +307,14 @@ def plot(args): if args.labeltext: labelfilename = args.labeltext else: - labelfilename = D_filename + '.labels.txt' + labelfilename = D_filename + ".labels.txt" - notify(f'loading labels from {labelfilename}') + notify(f"loading labels from {labelfilename}") with open(labelfilename) as f: - labeltext = [ x.strip() for x in f ] - + labeltext = [x.strip() for x in f] + if len(labeltext) != D.shape[0]: - error('{} labels != matrix size, exiting', len(labeltext)) + error("{} labels != matrix size, exiting", len(labeltext)) sys.exit(-1) elif args.indices: # construct integer labels @@ -290,14 +324,14 @@ def plot(args): labeltext = [""] * D.shape[0] if args.pdf: - ext = '.pdf' + ext = ".pdf" else: - ext = '.png' + ext = ".png" # build filenames, decide on PDF/PNG output - dendrogram_out = os.path.basename(D_filename) + '.dendro' + ext - matrix_out = os.path.basename(D_filename) + '.matrix' + ext - hist_out = os.path.basename(D_filename) + '.hist' + ext + dendrogram_out = os.path.basename(D_filename) + ".dendro" + ext + matrix_out = os.path.basename(D_filename) + ".matrix" + ext + hist_out = os.path.basename(D_filename) + ".hist" + ext # output to a different directory? 
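# How plot() names its three output files (sketch with a hypothetical helper;
# same suffix and extension rules as the hunk above, where --pdf flips the
# default .png to .pdf):
import os

def plot_filenames(d_filename, pdf=False, output_dir=None):
    ext = ".pdf" if pdf else ".png"
    base = os.path.basename(d_filename)
    names = [base + suffix + ext for suffix in (".dendro", ".matrix", ".hist")]
    if output_dir:
        names = [os.path.join(output_dir, name) for name in names]
    return names

assert plot_filenames("cmp") == ["cmp.dendro.png", "cmp.matrix.png", "cmp.hist.png"]
assert plot_filenames("d/cmp", pdf=True)[1] == "cmp.matrix.pdf"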
if args.output_dir: @@ -308,13 +342,13 @@ def plot(args): hist_out = os.path.join(args.output_dir, hist_out) # make the histogram - notify(f'saving histogram of matrix values => {hist_out}') - fig = pylab.figure(figsize=(8,5)) + notify(f"saving histogram of matrix values => {hist_out}") + fig = pylab.figure(figsize=(8, 5)) pylab.hist(numpy.array(D.flat), bins=100) fig.savefig(hist_out) ### make the dendrogram: - fig = pylab.figure(figsize=(8,5)) + fig = pylab.figure(figsize=(8, 5)) ax1 = fig.add_axes([0.1, 0.1, 0.7, 0.8]) ax1.set_xticks([]) ax1.set_yticks([]) @@ -325,32 +359,36 @@ def plot(args): sample_idx = list(range(len(labeltext))) numpy.random.shuffle(sample_idx) - sample_idx = sample_idx[:args.subsample] + sample_idx = sample_idx[: args.subsample] np_idx = numpy.array(sample_idx) D = D[numpy.ix_(np_idx, np_idx)] - labeltext = [ labeltext[idx] for idx in sample_idx ] + labeltext = [labeltext[idx] for idx in sample_idx] ### do clustering - Y = sch.linkage(D, method='single') - sch.dendrogram(Y, orientation='right', labels=labeltext, - no_labels=not display_labels) + Y = sch.linkage(D, method="single") + sch.dendrogram( + Y, orientation="right", labels=labeltext, no_labels=not display_labels + ) fig.savefig(dendrogram_out) - notify(f'wrote dendrogram to: {dendrogram_out}') + notify(f"wrote dendrogram to: {dendrogram_out}") ### make the dendrogram+matrix: - (fig, rlabels, rmat) = sourmash_fig.plot_composite_matrix(D, labeltext, - show_labels=display_labels, - vmin=args.vmin, - vmax=args.vmax, - force=args.force) + (fig, rlabels, rmat) = sourmash_fig.plot_composite_matrix( + D, + labeltext, + show_labels=display_labels, + vmin=args.vmin, + vmax=args.vmax, + force=args.force, + ) fig.savefig(matrix_out) - notify(f'wrote numpy distance matrix to: {matrix_out}') + notify(f"wrote numpy distance matrix to: {matrix_out}") if len(labeltext) < 30: # for small matrices, print out sample numbering for FYI. for i, name in enumerate(labeltext): - print_results('{}\t{}', i, name) + print_results("{}\t{}", i, name) # write out re-ordered matrix and labels if args.csv: @@ -361,15 +399,15 @@ def plot(args): for i in range(len(rlabels)): y = [] for j in range(len(rlabels)): - y.append('{}'.format(rmat[i][j])) + y.append(f"{rmat[i][j]}") w.writerow(y) - notify(f'Wrote clustered matrix and labels out to {args.csv}') + notify(f"Wrote clustered matrix and labels out to {args.csv}") def import_csv(args): "Import a CSV file full of signatures/hashes." 
- with open(args.mash_csvfile, newline='') as fp: + with open(args.mash_csvfile, newline="") as fp: reader = csv.reader(fp) siglist = [] for row in reader: @@ -377,29 +415,29 @@ def import_csv(args): hashseed = int(row[1]) # only support a limited import type, for now ;) - assert hashfn == 'murmur64' + assert hashfn == "murmur64" assert hashseed == 42 _, _, ksize, name, hashes = row ksize = int(ksize) hashes = hashes.strip() - hashes = list(map(int, hashes.split(' ' ))) + hashes = list(map(int, hashes.split(" "))) e = MinHash(len(hashes), ksize) e.add_many(hashes) s = sig.SourmashSignature(e, filename=name) siglist.append(s) - notify(f'loaded signature: {name} {s.md5sum()[:8]}') + notify(f"loaded signature: {name} {s.md5sum()[:8]}") - notify(f'saving {len(siglist)} signatures to JSON') + notify(f"saving {len(siglist)} signatures to JSON") with SaveSignaturesToLocation(args.output) as save_sig: save_sig.add_many(siglist) def sbt_combine(args): inp_files = list(args.sbts) - notify(f'combining {len(inp_files)} SBTs') + notify(f"combining {len(inp_files)} SBTs") tree = load_sbt_index(inp_files.pop(0)) @@ -426,11 +464,11 @@ def index(args): tree = create_sbt_index(args.bf_size, n_children=args.n_children) if args.sparseness < 0 or args.sparseness > 1.0: - error('sparseness must be in range [0.0, 1.0].') + error("sparseness must be in range [0.0, 1.0].") if args.scaled: args.scaled = int(args.scaled) - notify(f'downsampling signatures to scaled={args.scaled}') + notify(f"downsampling signatures to scaled={args.scaled}") inp_files = list(args.signatures) if args.from_file: @@ -441,7 +479,7 @@ def index(args): error("ERROR: no files to index!? Supply on command line or use --from-file") sys.exit(-1) - notify(f'loading {len(inp_files)} files into SBT') + notify(f"loading {len(inp_files)} files into SBT") progress = sourmash_args.SignatureLoadingProgress() @@ -451,12 +489,14 @@ def index(args): nums = set() scaleds = set() for f in inp_files: - siglist = sourmash_args.load_file_as_signatures(f, - ksize=args.ksize, - select_moltype=moltype, - yield_all_files=args.force, - picklist=picklist, - progress=progress) + siglist = sourmash_args.load_file_as_signatures( + f, + ksize=args.ksize, + select_moltype=moltype, + yield_all_files=args.force, + picklist=picklist, + progress=progress, + ) # load all matching signatures in this file ss = None @@ -481,26 +521,29 @@ def index(args): # check to make sure we aren't loading incompatible signatures if len(ksizes) > 1 or len(moltypes) > 1: - error('multiple k-mer sizes or molecule types present; fail.') - error('specify --dna/--protein and --ksize as necessary') - error('ksizes: {}; moltypes: {}', - ", ".join(map(str, ksizes)), ", ".join(moltypes)) + error("multiple k-mer sizes or molecule types present; fail.") + error("specify --dna/--protein and --ksize as necessary") + error( + "ksizes: {}; moltypes: {}", + ", ".join(map(str, ksizes)), + ", ".join(moltypes), + ) sys.exit(-1) - if nums == { 0 } and len(scaleds) == 1: - pass # good - elif scaleds == { 0 } and len(nums) == 1: - pass # also good + if nums == {0} and len(scaleds) == 1: + pass # good + elif scaleds == {0} and len(nums) == 1: + pass # also good else: - error('trying to build an SBT with incompatible signatures.') - error('nums = {}; scaleds = {}', repr(nums), repr(scaleds)) + error("trying to build an SBT with incompatible signatures.") + error("nums = {}; scaleds = {}", repr(nums), repr(scaleds)) sys.exit(-1) - notify('') + notify("") # did we load any!? 
if n == 0: - error('no signatures found to load into tree!? failing.') + error("no signatures found to load into tree!? failing.") sys.exit(-1) if picklist: @@ -513,8 +556,10 @@ def index(args): def search(args): - from .search import (search_databases_with_flat_query, - search_databases_with_abund_query) + from .search import ( + search_databases_with_flat_query, + search_databases_with_abund_query, + ) set_quiet(args.quiet, args.debug) moltype = sourmash_args.calculate_moltype(args) @@ -522,18 +567,21 @@ def search(args): pattern_search = sourmash_args.load_include_exclude_db_patterns(args) # set up the query. - query = sourmash_args.load_query_signature(args.query, - ksize=args.ksize, - select_moltype=moltype, - select_md5=args.md5) - notify(f'loaded query: {str(query)[:30]}... (k={query.minhash.ksize}, {sourmash_args.get_moltype(query)})') + query = sourmash_args.load_query_signature( + args.query, ksize=args.ksize, select_moltype=moltype, select_md5=args.md5 + ) + notify( + f"loaded query: {str(query)[:30]}... (k={query.minhash.ksize}, {sourmash_args.get_moltype(query)})" + ) if args.scaled: if not query.minhash.scaled: - error('cannot downsample a signature not created with --scaled') + error("cannot downsample a signature not created with --scaled") sys.exit(-1) if args.scaled != query.minhash.scaled: - notify(f'downsampling query from scaled={query.minhash.scaled} to {int(args.scaled)}') + notify( + f"downsampling query from scaled={query.minhash.scaled} to {int(args.scaled)}" + ) with query.update() as query: query.minhash = query.minhash.downsample(scaled=args.scaled) @@ -544,11 +592,14 @@ def search(args): notify("ERROR: cannot specify both --containment and --max-containment!") sys.exit(-1) - databases = sourmash_args.load_dbs_and_sigs(args.databases, query, - not is_containment, - picklist=picklist, - pattern=pattern_search, - fail_on_empty_database=args.fail_on_empty_database) + databases = sourmash_args.load_dbs_and_sigs( + args.databases, + query, + not is_containment, + picklist=picklist, + pattern=pattern_search, + fail_on_empty_database=args.fail_on_empty_database, + ) # handle signatures with abundance if query.minhash.track_abundance: @@ -559,7 +610,9 @@ def search(args): query.minhash = query.minhash.flatten() elif args.containment or args.max_containment: # abund sketch + keep abundance => no containment searches - notify("ERROR: cannot do containment searches on an abund signature; maybe specify --ignore-abundance?") + notify( + "ERROR: cannot do containment searches on an abund signature; maybe specify --ignore-abundance?" 
+ ) sys.exit(-1) else: # forcibly ignore abundances if query has no abundances @@ -568,32 +621,40 @@ def search(args): # do the actual search if query.minhash.track_abundance: try: - results = search_databases_with_abund_query(query, databases, - threshold=args.threshold, - do_containment=args.containment, - do_max_containment=args.max_containment, - best_only=args.best_only, - unload_data=True) + results = search_databases_with_abund_query( + query, + databases, + threshold=args.threshold, + do_containment=args.containment, + do_max_containment=args.max_containment, + best_only=args.best_only, + unload_data=True, + ) except TypeError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) else: - results = search_databases_with_flat_query(query, databases, - threshold=args.threshold, - do_containment=args.containment, - do_max_containment=args.max_containment, - best_only=args.best_only, - unload_data=True, - estimate_ani_ci=args.estimate_ani_ci) + results = search_databases_with_flat_query( + query, + databases, + threshold=args.threshold, + do_containment=args.containment, + do_max_containment=args.max_containment, + best_only=args.best_only, + unload_data=True, + estimate_ani_ci=args.estimate_ani_ci, + ) n_matches = len(results) if args.best_only: args.num_results = 1 if not args.num_results or n_matches <= args.num_results: - print_results(f'{len(results)} matches above threshold {args.threshold:0.3f}:') + print_results(f"{len(results)} matches above threshold {args.threshold:0.3f}:") else: - print_results(f'{len(results)} matches above threshold {args.threshold:0.3f}; showing first {args.num_results}:') + print_results( + f"{len(results)} matches above threshold {args.threshold:0.3f}; showing first {args.num_results}:" + ) n_matches = args.num_results @@ -604,9 +665,9 @@ def search(args): print_results("similarity match") print_results("---------- -----") for sr in results[:n_matches]: - pct = '{:.1f}%'.format(sr.similarity*100) + pct = f"{sr.similarity * 100:.1f}%" name = sr.match._display_name(60) - print_results('{:>6} {}', pct, name) + print_results("{:>6} {}", pct, name) if sr.cmp_scaled is not None: if not size_may_be_inaccurate and sr.size_may_be_inaccurate: size_may_be_inaccurate = True @@ -637,9 +698,13 @@ def search(args): sourmash_args.report_picklist(args, picklist) if size_may_be_inaccurate: - notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.") + notify( + "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons." + ) if jaccard_ani_untrustworthy: - notify("WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons.") + notify( + "WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons." 
+ ) def categorize(args): @@ -653,7 +718,7 @@ def categorize(args): # eliminate names we've already categorized already_names = set() if args.load_csv: - with open(args.load_csv, newline='') as fp: + with open(args.load_csv, newline="") as fp: r = csv.reader(fp) for row in r: already_names.add(row[0]) @@ -668,13 +733,12 @@ def _yield_all_sigs(queries, ksize, moltype): for filename in queries: mi = MultiIndex.load_from_path(filename, False) mi = mi.select(ksize=ksize, moltype=moltype) - for ss, loc in mi.signatures_with_location(): - yield ss, loc + yield from mi.signatures_with_location() csv_w = None csv_fp = None if args.csv: - csv_fp = open(args.csv, 'w', newline='') + csv_fp = open(args.csv, "w", newline="") csv_w = csv.writer(csv_fp) search_obj = make_jaccard_search_query(threshold=args.threshold) @@ -683,7 +747,9 @@ def _yield_all_sigs(queries, ksize, moltype): if loc in already_names: continue - notify(f'loaded query: {str(orig_query)[:30]}... (k={orig_query.minhash.ksize}, {orig_query.minhash.moltype})') + notify( + f"loaded query: {str(orig_query)[:30]}... (k={orig_query.minhash.ksize}, {orig_query.minhash.moltype})" + ) if args.ignore_abundance and orig_query.minhash.track_abundance: query = orig_query.copy() @@ -691,7 +757,9 @@ def _yield_all_sigs(queries, ksize, moltype): query.minhash = query.minhash.flatten() else: if orig_query.minhash.track_abundance: - notify("ERROR: this search cannot be done on signatures calculated with abundance.") + notify( + "ERROR: this search cannot be done on signatures calculated with abundance." + ) notify("ERROR: please specify --ignore-abundance.") sys.exit(-1) @@ -700,19 +768,18 @@ def _yield_all_sigs(queries, ksize, moltype): results = [] for sr in db.find(search_obj, query): match = sr.signature - if match.md5sum() != query.md5sum(): # ignore self. + if match.md5sum() != query.md5sum(): # ignore self. results.append((orig_query.similarity(match), match)) if results: - results.sort(key=lambda x: -x[0]) # reverse sort on similarity + results.sort(key=lambda x: -x[0]) # reverse sort on similarity best_hit_sim, best_hit_query = results[0] - notify(f'for {query}, found: {best_hit_sim:.2f} {best_hit_query}') + notify(f"for {query}, found: {best_hit_sim:.2f} {best_hit_query}") best_hit_query_name = best_hit_query.name if csv_w: - csv_w.writerow([loc, query, best_hit_query_name, - best_hit_sim]) + csv_w.writerow([loc, query, best_hit_query_name, best_hit_sim]) else: - notify(f'for {query}, no match found') + notify(f"for {query}, no match found") if csv_fp: csv_fp.close() @@ -727,43 +794,49 @@ def gather(args): pattern_search = sourmash_args.load_include_exclude_db_patterns(args) # load the query signature & figure out all the things - query = sourmash_args.load_query_signature(args.query, - ksize=args.ksize, - select_moltype=moltype, - select_md5=args.md5) - notify(f'loaded query: {str(query)[:30]}... (k={query.minhash.ksize}, {sourmash_args.get_moltype(query)})') + query = sourmash_args.load_query_signature( + args.query, ksize=args.ksize, select_moltype=moltype, select_md5=args.md5 + ) + notify( + f"loaded query: {str(query)[:30]}... (k={query.minhash.ksize}, {sourmash_args.get_moltype(query)})" + ) # verify signature was computed right. 
if not query.minhash.scaled: - error('query signature needs to be created with --scaled') + error("query signature needs to be created with --scaled") sys.exit(-1) if args.scaled and args.scaled != query.minhash.scaled: - notify(f'downsampling query from scaled={query.minhash.scaled} to {int(args.scaled)}') + notify( + f"downsampling query from scaled={query.minhash.scaled} to {int(args.scaled)}" + ) with query.update() as query: query.minhash = query.minhash.downsample(scaled=args.scaled) # empty? if not len(query.minhash): - error('no query hashes!? exiting.') + error("no query hashes!? exiting.") sys.exit(-1) # set up the search databases cache_size = args.cache_size if args.cache_size == 0: cache_size = None - databases = sourmash_args.load_dbs_and_sigs(args.databases, query, False, - cache_size=cache_size, - picklist=picklist, - pattern=pattern_search, - fail_on_empty_database=args.fail_on_empty_database) - - - if args.linear: # force linear traversal? - databases = [ LazyLinearIndex(db) for db in databases ] + databases = sourmash_args.load_dbs_and_sigs( + args.databases, + query, + False, + cache_size=cache_size, + picklist=picklist, + pattern=pattern_search, + fail_on_empty_database=args.fail_on_empty_database, + ) + + if args.linear: # force linear traversal? + databases = [LazyLinearIndex(db) for db in databases] size_may_be_inaccurate = False - if args.prefetch: # note: on by default! + if args.prefetch: # note: on by default! notify("Starting prefetch sweep across databases.") prefetch_query = query.copy() if prefetch_query.minhash.track_abundance: @@ -800,14 +873,21 @@ def gather(args): ident_mh.add_many(union_found) noident_mh.remove_many(union_found) - # optionally calculate and output prefetch info to csv + # optionally calculate and output prefetch info to csv if prefetch_csvout_fp: for found_sig in counter.signatures(): # calculate intersection stats and info - prefetch_result = PrefetchResult(prefetch_query, found_sig, cmp_scaled=scaled, - threshold_bp=args.threshold_bp, estimate_ani_ci=args.estimate_ani_ci) + prefetch_result = PrefetchResult( + prefetch_query, + found_sig, + cmp_scaled=scaled, + threshold_bp=args.threshold_bp, + estimate_ani_ci=args.estimate_ani_ci, + ) if prefetch_csvout_w is None: - prefetch_csvout_w = prefetch_result.init_dictwriter(prefetch_csvout_fp) + prefetch_csvout_w = prefetch_result.init_dictwriter( + prefetch_csvout_fp + ) prefetch_result.write(prefetch_csvout_w) counters.append(counter) @@ -817,7 +897,9 @@ def gather(args): prefetch_csvout_fp.flush() display_bp = format_bp(args.threshold_bp) - notify(f"Prefetch found {len(save_prefetch)} signatures with overlap >= {display_bp}.") + notify( + f"Prefetch found {len(save_prefetch)} signatures with overlap >= {display_bp}." 
+ ) save_prefetch.close() if prefetch_csvout_fp: prefetch_csvout_fp.close() @@ -831,20 +913,22 @@ def gather(args): notify("Doing gather to generate minimum metagenome cover.") found = 0 - weighted_missed = 1 is_abundance = query.minhash.track_abundance and not args.ignore_abundance orig_query_mh = query.minhash if not orig_query_mh.size_is_accurate(): size_may_be_inaccurate = True - gather_iter = GatherDatabases(query, counters, - threshold_bp=args.threshold_bp, - ignore_abundance=args.ignore_abundance, - noident_mh=noident_mh, - ident_mh=ident_mh, - estimate_ani_ci=args.estimate_ani_ci) + gather_iter = GatherDatabases( + query, + counters, + threshold_bp=args.threshold_bp, + ignore_abundance=args.ignore_abundance, + noident_mh=noident_mh, + ident_mh=ident_mh, + estimate_ani_ci=args.estimate_ani_ci, + ) screen_width = _get_screen_width() - sum_f_uniq_found = 0. + sum_f_uniq_found = 0.0 result = None ### open output handles as needed for (1) saving CSV (2) saving matches @@ -867,7 +951,7 @@ def gather(args): found += 1 sum_f_uniq_found += result.f_unique_to_query - if found == 1: # first result? print header. + if found == 1: # first result? print header. if is_abundance: print_results("") print_results("overlap p_query p_match avg_abund") @@ -877,22 +961,30 @@ def gather(args): print_results("overlap p_query p_match") print_results("--------- ------- -------") - # print interim result & save in `found` list for later use - pct_query = '{:.1f}%'.format(result.f_unique_weighted*100) - pct_genome = '{:.1f}%'.format(result.f_match*100) + pct_query = f"{result.f_unique_weighted * 100:.1f}%" + pct_genome = f"{result.f_match * 100:.1f}%" if is_abundance: name = result.match._display_name(screen_width - 41) - average_abund ='{:.1f}'.format(result.average_abund) - print_results('{:9} {:>7} {:>7} {:>9} {}', - format_bp(result.intersect_bp), pct_query, pct_genome, - average_abund, name) + average_abund = f"{result.average_abund:.1f}" + print_results( + "{:9} {:>7} {:>7} {:>9} {}", + format_bp(result.intersect_bp), + pct_query, + pct_genome, + average_abund, + name, + ) else: name = result.match._display_name(screen_width - 31) - print_results('{:9} {:>7} {:>7} {}', - format_bp(result.intersect_bp), pct_query, pct_genome, - name) + print_results( + "{:9} {:>7} {:>7} {}", + format_bp(result.intersect_bp), + pct_query, + pct_genome, + name, + ) # write out CSV if args.output: @@ -915,13 +1007,15 @@ def gather(args): # report on thresholding - if gather_iter.query: # if still a query, then we failed the threshold. - notify(f'found less than {format_bp(args.threshold_bp)} in common. => exiting') + notify(f"found less than {format_bp(args.threshold_bp)} in common. => exiting") # basic reporting: if found: - print_results(f'\nfound {found} matches total;') + print_results(f"\nfound {found} matches total;") if found == args.num_results: - print_results(f'(truncated gather because --num-results={args.num_results})') + print_results( + f"(truncated gather because --num-results={args.num_results})" + ) else: display_bp = format_bp(args.threshold_bp) notify(f"\nNo matches found for --threshold-bp at {display_bp}.") @@ -930,13 +1024,19 @@ def gather(args): if is_abundance and result: p_covered = result.sum_weighted_found / result.total_weighted_hashes p_covered *= 100 - print_results(f'the recovered matches hit {p_covered:.1f}% of the abundance-weighted query.') + print_results( + f"the recovered matches hit {p_covered:.1f}% of the abundance-weighted query." 
+ ) - print_results(f'the recovered matches hit {sum_f_uniq_found*100:.1f}% of the query k-mers (unweighted).') + print_results( + f"the recovered matches hit {sum_f_uniq_found*100:.1f}% of the query k-mers (unweighted)." + ) - print_results('') + print_results("") if gather_iter.scaled != query.minhash.scaled: - print_results(f'WARNING: final scaled was {gather_iter.scaled}, vs query scaled of {query.minhash.scaled}') + print_results( + f"WARNING: final scaled was {gather_iter.scaled}, vs query scaled of {query.minhash.scaled}" + ) # save CSV? if (found and args.output) or args.create_empty_results: @@ -947,7 +1047,7 @@ def gather(args): if args.output_unassigned: remaining_query = gather_iter.query if not (remaining_query.minhash or noident_mh): - notify('no unassigned hashes to save with --output-unassigned!') + notify("no unassigned hashes to save with --output-unassigned!") else: notify(f"saving unassigned hashes to '{args.output_unassigned}'") @@ -967,7 +1067,9 @@ def gather(args): sourmash_args.report_picklist(args, picklist) if size_may_be_inaccurate: - notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.") + notify( + "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons." + ) # DONE w/gather function. @@ -979,11 +1081,11 @@ def multigather(args): moltype = sourmash_args.calculate_moltype(args) if not args.db: - error('Error! must specify at least one database with --db') + error("Error! must specify at least one database with --db") sys.exit(-1) if not args.query and not args.query_from_file: - error('Error! must specify at least one query signature with --query') + error("Error! must specify at least one query signature with --query") sys.exit(-1) # flatten --db and --query @@ -994,36 +1096,49 @@ def multigather(args): inp_files.extend(more_files) # need a query to get ksize, moltype for db loading - query = next(iter(sourmash_args.load_file_as_signatures(inp_files[0], ksize=args.ksize, select_moltype=moltype))) - - notify(f'loaded first query: {str(query)[:30]}... (k={query.minhash.ksize}, {sourmash_args.get_moltype(query)})') - - databases = sourmash_args.load_dbs_and_sigs(args.db, query, False, - fail_on_empty_database=args.fail_on_empty_database) + query = next( + iter( + sourmash_args.load_file_as_signatures( + inp_files[0], ksize=args.ksize, select_moltype=moltype + ) + ) + ) + + notify( + f"loaded first query: {str(query)[:30]}... (k={query.minhash.ksize}, {sourmash_args.get_moltype(query)})" + ) + + databases = sourmash_args.load_dbs_and_sigs( + args.db, query, False, fail_on_empty_database=args.fail_on_empty_database + ) # run gather on all the queries. - n=0 + n = 0 size_may_be_inaccurate = False for queryfile in inp_files: # load the query signature(s) & figure out all the things - for query in sourmash_args.load_file_as_signatures(queryfile, - ksize=args.ksize, - select_moltype=moltype): - notify(f'loaded query: {str(query)[:30]}... (k={query.minhash.ksize}, {sourmash_args.get_moltype(query)})') + for query in sourmash_args.load_file_as_signatures( + queryfile, ksize=args.ksize, select_moltype=moltype + ): + notify( + f"loaded query: {str(query)[:30]}... (k={query.minhash.ksize}, {sourmash_args.get_moltype(query)})" + ) # verify signature was computed right. 
if not query.minhash.scaled: - error('query signature needs to be created with --scaled; skipping') + error("query signature needs to be created with --scaled; skipping") continue if args.scaled and args.scaled != query.minhash.scaled: - notify(f'downsampling query from scaled={query.minhash.scaled} to {int(args.scaled)}') + notify( + f"downsampling query from scaled={query.minhash.scaled} to {int(args.scaled)}" + ) with query.update() as query: query.minhash = query.minhash.downsample(scaled=args.scaled) # empty? if not len(query.minhash): - error('no query hashes!? skipping to next..') + error("no query hashes!? skipping to next..") continue counters = [] @@ -1050,17 +1165,19 @@ def multigather(args): ident_mh.add_many(union_found) found = 0 - weighted_missed = 1 is_abundance = query.minhash.track_abundance and not args.ignore_abundance orig_query_mh = query.minhash - gather_iter = GatherDatabases(query, counters, - threshold_bp=args.threshold_bp, - ignore_abundance=args.ignore_abundance, - noident_mh=noident_mh, - ident_mh=ident_mh) + gather_iter = GatherDatabases( + query, + counters, + threshold_bp=args.threshold_bp, + ignore_abundance=args.ignore_abundance, + noident_mh=noident_mh, + ident_mh=ident_mh, + ) screen_width = _get_screen_width() - sum_f_uniq_found = 0. + sum_f_uniq_found = 0.0 result = None query_filename = query.filename @@ -1071,9 +1188,9 @@ def multigather(args): output_base = os.path.basename(query_filename) if args.output_dir: output_base = os.path.join(args.output_dir, output_base) - output_csv = output_base + '.csv' + output_csv = output_base + ".csv" - output_matches = output_base + '.matches.sig' + output_matches = output_base + ".matches.sig" save_sig_obj = SaveSignaturesToLocation(output_matches) save_sig = save_sig_obj.__enter__() notify(f"saving all matching signatures to '{output_matches}'") @@ -1087,7 +1204,7 @@ def multigather(args): for result in gather_iter: found += 1 sum_f_uniq_found += result.f_unique_to_query - if found == 1: # first result? print header. + if found == 1: # first result? print header. if is_abundance: print_results("") print_results("overlap p_query p_match avg_abund") @@ -1097,22 +1214,30 @@ def multigather(args): print_results("overlap p_query p_match") print_results("--------- ------- -------") - # print interim result & save in a list for later use - pct_query = '{:.1f}%'.format(result.f_unique_weighted*100) - pct_genome = '{:.1f}%'.format(result.f_match*100) + pct_query = f"{result.f_unique_weighted * 100:.1f}%" + pct_genome = f"{result.f_match * 100:.1f}%" if is_abundance: name = result.match._display_name(screen_width - 41) - average_abund ='{:.1f}'.format(result.average_abund) - print_results('{:9} {:>7} {:>7} {:>9} {}', - format_bp(result.intersect_bp), pct_query, pct_genome, - average_abund, name) + average_abund = f"{result.average_abund:.1f}" + print_results( + "{:9} {:>7} {:>7} {:>9} {}", + format_bp(result.intersect_bp), + pct_query, + pct_genome, + average_abund, + name, + ) else: name = result.match._display_name(screen_width - 31) - print_results('{:9} {:>7} {:>7} {}', - format_bp(result.intersect_bp), pct_query, pct_genome, - name) + print_results( + "{:9} {:>7} {:>7} {}", + format_bp(result.intersect_bp), + pct_query, + pct_genome, + name, + ) ## @CTB if csv_writer is None: @@ -1128,10 +1253,12 @@ def multigather(args): # report on thresholding - if gather_iter.query.minhash: # if still a query, then we failed the threshold. - notify(f'found less than {format_bp(args.threshold_bp)} in common. 
=> exiting') + notify( + f"found less than {format_bp(args.threshold_bp)} in common. => exiting" + ) # basic reporting - print_results('\nfound {} matches total;', found) + print_results("\nfound {} matches total;", found) # close saving etc. save_sig_obj.close() @@ -1143,17 +1270,21 @@ def multigather(args): if is_abundance and result: p_covered = result.sum_weighted_found / result.total_weighted_hashes p_covered *= 100 - print_results(f'the recovered matches hit {p_covered:.1f}% of the abundance-weighted query.') + print_results( + f"the recovered matches hit {p_covered:.1f}% of the abundance-weighted query." + ) - print_results(f'the recovered matches hit {sum_f_uniq_found*100:.1f}% of the query k-mers (unweighted).') - print_results('') + print_results( + f"the recovered matches hit {sum_f_uniq_found*100:.1f}% of the query k-mers (unweighted)." + ) + print_results("") if found == 0: - notify('nothing found... skipping.') + notify("nothing found... skipping.") continue - output_unassigned = output_base + '.unassigned.sig' - with open(output_unassigned, 'wt') as fp: + output_unassigned = output_base + ".unassigned.sig" + with open(output_unassigned, "w"): remaining_query = gather_iter.query if noident_mh: remaining_mh = remaining_query.minhash.to_mutable() @@ -1165,9 +1296,9 @@ def multigather(args): remaining_query.minhash = abund_query_mh if found == 0: - notify('nothing found - entire query signature unassigned.') + notify("nothing found - entire query signature unassigned.") elif not remaining_query: - notify('no unassigned hashes! not saving.') + notify("no unassigned hashes! not saving.") else: notify(f'saving unassigned hashes to "{output_unassigned}"') @@ -1177,9 +1308,11 @@ def multigather(args): n += 1 # fini, next query! - notify(f'\nconducted gather searches on {n} signatures') + notify(f"\nconducted gather searches on {n} signatures") if size_may_be_inaccurate: - notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.") + notify( + "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons." 
+ ) def watch(args): @@ -1187,7 +1320,7 @@ def watch(args): set_quiet(args.quiet) if args.input_is_protein and args.dna: - notify('WARNING: input is protein, turning off nucleotide hashing.') + notify("WARNING: input is protein, turning off nucleotide hashing.") args.dna = False args.protein = True @@ -1195,22 +1328,22 @@ def watch(args): notify('ERROR: cannot use "watch" with both nucleotide and protein.') if args.dna: - moltype = 'DNA' + moltype = "DNA" is_protein = False dayhoff = False hp = False elif args.protein: - moltype = 'protein' + moltype = "protein" is_protein = True dayhoff = False hp = False elif args.dayhoff: - moltype = 'dayhoff' + moltype = "dayhoff" is_protein = True dayhoff = True hp = False else: - moltype = 'hp' + moltype = "hp" is_protein = True dayhoff = False hp = True @@ -1224,23 +1357,27 @@ def watch(args): tree_mh = leaf.data.minhash ksize = tree_mh.ksize - E = MinHash(ksize=ksize, n=args.num_hashes, is_protein=is_protein, dayhoff=dayhoff, hp=hp) + E = MinHash( + ksize=ksize, n=args.num_hashes, is_protein=is_protein, dayhoff=dayhoff, hp=hp + ) - notify(f'Computing signature for k={ksize}, {moltype} from stdin') + notify(f"Computing signature for k={ksize}, {moltype} from stdin") def do_search(): results = [] - streamsig = sig.SourmashSignature(E, filename='stdin', name=args.name) - for similarity, match, _ in tree.search(streamsig, - threshold=args.threshold, - best_only=True, - ignore_abundance=True, - do_containment=False): + streamsig = sig.SourmashSignature(E, filename="stdin", name=args.name) + for similarity, match, _ in tree.search( + streamsig, + threshold=args.threshold, + best_only=True, + ignore_abundance=True, + do_containment=False, + ): results.append((similarity, match)) return results - notify('reading sequences from stdin') + notify("reading sequences from stdin") watermark = WATERMARK_SIZE # iterate over input records @@ -1249,7 +1386,7 @@ def do_search(): for n, record in enumerate(screed_iter): # at each watermark, print status & check cardinality if n >= watermark: - notify(f'\r... read {n} sequences', end='') + notify(f"\r... read {n} sequences", end="") watermark += WATERMARK_SIZE if do_search(): @@ -1262,16 +1399,15 @@ def do_search(): results = do_search() if not results: - notify(f'... read {n} sequences, no matches found.') + notify(f"... read {n} sequences, no matches found.") else: - results.sort(key=lambda x: -x[0]) # take best + results.sort(key=lambda x: -x[0]) # take best similarity, found_sig = results[0] - print_results('FOUND: {}, at {:.3f}', found_sig, - similarity) + print_results("FOUND: {}, at {:.3f}", found_sig, similarity) if args.output: notify(f"saving signature to '{args.output}'") - streamsig = sig.SourmashSignature(E, filename='stdin', name=args.name) + streamsig = sig.SourmashSignature(E, filename="stdin", name=args.name) with SaveSignaturesToLocation(args.output) as save_sig: save_sig.add(streamsig) @@ -1296,9 +1432,15 @@ def prefetch(args): notify("ERROR: no databases or signatures to search!?") sys.exit(-1) - if not (args.save_unmatched_hashes or args.save_matching_hashes or - args.save_matches or args.output): - notify("WARNING: no output(s) specified! Nothing will be saved from this prefetch!") + if not ( + args.save_unmatched_hashes + or args.save_matching_hashes + or args.save_matches + or args.output + ): + notify( + "WARNING: no output(s) specified! Nothing will be saved from this prefetch!" 
+ ) # figure out what k-mer size and molecule type we're looking for here ksize = args.ksize @@ -1307,15 +1449,16 @@ def prefetch(args): pattern_search = sourmash_args.load_include_exclude_db_patterns(args) # load the query signature & figure out all the things - query = sourmash_args.load_query_signature(args.query, - ksize=args.ksize, - select_moltype=moltype, - select_md5=args.md5) - notify(f'loaded query: {str(query)[:30]}... (k={query.minhash.ksize}, {sourmash_args.get_moltype(query)})') + query = sourmash_args.load_query_signature( + args.query, ksize=args.ksize, select_moltype=moltype, select_md5=args.md5 + ) + notify( + f"loaded query: {str(query)[:30]}... (k={query.minhash.ksize}, {sourmash_args.get_moltype(query)})" + ) # verify signature was computed with scaled. if not query.minhash.scaled: - error('query signature needs to be created with --scaled') + error("query signature needs to be created with --scaled") sys.exit(-1) # if with track_abund, flatten me @@ -1325,15 +1468,19 @@ def prefetch(args): query_mh = query_mh.flatten() if args.scaled and args.scaled != query_mh.scaled: - notify(f'downsampling query from scaled={query_mh.scaled} to {int(args.scaled)}') + notify( + f"downsampling query from scaled={query_mh.scaled} to {int(args.scaled)}" + ) query_mh = query_mh.downsample(scaled=args.scaled) - notify(f"query sketch has scaled={query_mh.scaled}; will be dynamically downsampled as needed.") + notify( + f"query sketch has scaled={query_mh.scaled}; will be dynamically downsampled as needed." + ) common_scaled = query_mh.scaled # empty? if not len(query_mh): - error('no query hashes!? exiting.') + error("no query hashes!? exiting.") sys.exit(-1) with query.update() as query: @@ -1357,12 +1504,12 @@ def prefetch(args): ident_mh = query_mh.copy_and_clear() noident_mh = query_mh.to_mutable() - did_a_search = False # track whether we did _any_ search at all! + did_a_search = False # track whether we did _any_ search at all! 
size_may_be_inaccurate = False total_signatures_loaded = 0 sum_signatures_after_select = 0 for dbfilename in args.databases: - notify(f"loading signatures from '{dbfilename}'", end='\r') + notify(f"loading signatures from '{dbfilename}'", end="\r") db = sourmash_args.load_file_as_index(dbfilename) total_signatures_loaded += len(db) @@ -1371,24 +1518,25 @@ def prefetch(args): if args.linear: db = LazyLinearIndex(db) - db = db.select(ksize=ksize, moltype=moltype, - containment=True, scaled=True) + db = db.select(ksize=ksize, moltype=moltype, containment=True, scaled=True) sum_signatures_after_select += len(db) - db = sourmash_args.apply_picklist_and_pattern(db, picklist, - pattern_search) + db = sourmash_args.apply_picklist_and_pattern(db, picklist, pattern_search) if not db: notify(f"...no compatible signatures in '{dbfilename}'; skipping") continue - for result in prefetch_database(query, db, args.threshold_bp, estimate_ani_ci= args.estimate_ani_ci): + for result in prefetch_database( + query, db, args.threshold_bp, estimate_ani_ci=args.estimate_ani_ci + ): match = result.match # ensure we're all on the same page wrt scaled resolution: - common_scaled = max(match.minhash.scaled, query.minhash.scaled, - common_scaled) + common_scaled = max( + match.minhash.scaled, query.minhash.scaled, common_scaled + ) query_mh = query.minhash.downsample(scaled=common_scaled) match_mh = match.minhash.downsample(scaled=common_scaled) @@ -1412,8 +1560,10 @@ def prefetch(args): matches_out.add(match) if matches_out.count % 10 == 0: - notify(f"total of {matches_out.count} matching signatures so far.", - end="\r") + notify( + f"total of {matches_out.count} matching signatures so far.", + end="\r", + ) # keep track of inaccurate size estimation if not size_may_be_inaccurate and result.size_may_be_inaccurate: @@ -1429,11 +1579,17 @@ def prefetch(args): del db notify("--") - notify(f"loaded {total_signatures_loaded} total signatures from {len(args.databases)} locations.") - notify(f"after selecting signatures compatible with search, {sum_signatures_after_select} remain.") + notify( + f"loaded {total_signatures_loaded} total signatures from {len(args.databases)} locations." + ) + notify( + f"after selecting signatures compatible with search, {sum_signatures_after_select} remain." + ) if not did_a_search: - notify("ERROR in prefetch: after picklists and patterns, no signatures to search!?") + notify( + "ERROR in prefetch: after picklists and patterns, no signatures to search!?" + ) sys.exit(-1) notify("--") @@ -1445,7 +1601,9 @@ def prefetch(args): csvout_fp.close() assert len(query_mh) == len(ident_mh) + len(noident_mh) - notify(f"of {len(query_mh)} distinct query hashes, {len(ident_mh)} were found in matches above threshold.") + notify( + f"of {len(query_mh)} distinct query hashes, {len(ident_mh)} were found in matches above threshold." 
+ ) notify(f"a total of {len(noident_mh)} query hashes remain unmatched.") notify(f"final scaled value (max across query and all matches) is {common_scaled}") @@ -1453,7 +1611,7 @@ def prefetch(args): filename = args.save_matching_hashes notify(f"saving {len(ident_mh)} matched hashes to '{filename}'") - sig_name = '' + sig_name = "" if query.name: sig_name = f"{query.name}-known" @@ -1468,7 +1626,7 @@ def prefetch(args): if args.save_unmatched_hashes: filename = args.save_unmatched_hashes - sig_name = '' + sig_name = "" if query.name: sig_name = f"{query.name}-unknown" @@ -1486,6 +1644,8 @@ def prefetch(args): sourmash_args.report_picklist(args, picklist) if size_may_be_inaccurate: - notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.") + notify( + "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons." + ) return 0 diff --git a/src/sourmash/compare.py b/src/sourmash/compare.py index 35b8639cb5..85928dc8a4 100644 --- a/src/sourmash/compare.py +++ b/src/sourmash/compare.py @@ -39,22 +39,28 @@ def compare_serial(siglist, ignore_abundance, *, downsample=False, return_ani=Fa for i, j in iterator: if return_ani: - ani_result = siglist[i].jaccard_ani(siglist[j],downsample=downsample) + ani_result = siglist[i].jaccard_ani(siglist[j], downsample=downsample) if not potential_false_negatives and ani_result.p_exceeds_threshold: potential_false_negatives = True if not jaccard_ani_untrustworthy and ani_result.je_exceeds_threshold: jaccard_ani_untrustworthy = True ani = ani_result.ani - if ani == None: + if ani is None: ani = 0.0 similarities[i][j] = similarities[j][i] = ani else: - similarities[i][j] = similarities[j][i] = siglist[i].similarity(siglist[j], ignore_abundance=ignore_abundance, downsample=downsample) + similarities[i][j] = similarities[j][i] = siglist[i].similarity( + siglist[j], ignore_abundance=ignore_abundance, downsample=downsample + ) if jaccard_ani_untrustworthy: - notify("WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons.") + notify( + "WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons." + ) if potential_false_negatives: - notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.") + notify( + "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." 
+ ) return similarities @@ -78,19 +84,24 @@ def compare_serial_containment(siglist, *, downsample=False, return_ani=False): if i == j: containments[i][j] = 1 elif return_ani: - ani_result = siglist[j].containment_ani(siglist[i], downsample=downsample) + ani_result = siglist[j].containment_ani( + siglist[i], downsample=downsample + ) ani = ani_result.ani if not potential_false_negatives and ani_result.p_exceeds_threshold: potential_false_negatives = True - if ani == None: + if ani is None: ani = 0.0 containments[i][j] = ani else: - containments[i][j] = siglist[j].contained_by(siglist[i], - downsample=downsample) + containments[i][j] = siglist[j].contained_by( + siglist[i], downsample=downsample + ) if potential_false_negatives: - notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.") + notify( + "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." + ) return containments @@ -115,18 +126,23 @@ def compare_serial_max_containment(siglist, *, downsample=False, return_ani=Fals for i, j in iterator: if return_ani: - ani_result = siglist[j].max_containment_ani(siglist[i], downsample=downsample) + ani_result = siglist[j].max_containment_ani( + siglist[i], downsample=downsample + ) ani = ani_result.ani if not potential_false_negatives and ani_result.p_exceeds_threshold: potential_false_negatives = True - if ani == None: + if ani is None: ani = 0.0 containments[i][j] = containments[j][i] = ani else: - containments[i][j] = containments[j][i] = siglist[j].max_containment(siglist[i], - downsample=downsample) + containments[i][j] = containments[j][i] = siglist[j].max_containment( + siglist[i], downsample=downsample + ) if potential_false_negatives: - notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.") + notify( + "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." + ) return containments @@ -153,17 +169,20 @@ def compare_serial_avg_containment(siglist, *, downsample=False, return_ani=Fals if return_ani: cmp = FracMinHashComparison(siglist[j].minhash, siglist[i].minhash) ani = cmp.avg_containment_ani - if ani == None: + if ani is None: ani = 0.0 if not potential_false_negatives and cmp.potential_false_negative: potential_false_negatives = True containments[i][j] = containments[j][i] = ani else: - containments[i][j] = containments[j][i] = siglist[j].avg_containment(siglist[i], - downsample=downsample) + containments[i][j] = containments[j][i] = siglist[j].avg_containment( + siglist[i], downsample=downsample + ) if potential_false_negatives: - notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.") + notify( + "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." 
+ ) return containments @@ -174,16 +193,18 @@ def similarity_args_unpack(args, ignore_abundance, *, downsample, return_ani=Fal sig1, sig2 = args if return_ani: ani = sig1.jaccard_ani(sig2, downsample=downsample).ani - if ani == None: + if ani is None: ani = 0.0 return ani else: - return sig1.similarity(sig2, - ignore_abundance=ignore_abundance, - downsample=downsample) + return sig1.similarity( + sig2, ignore_abundance=ignore_abundance, downsample=downsample + ) -def get_similarities_at_index(index, ignore_abundance, downsample, siglist, *, return_ani=False): +def get_similarities_at_index( + index, ignore_abundance, downsample, siglist, *, return_ani=False +): """Returns similarities of all the combinations of signature at index in the siglist with the rest of the indices starting at index + 1. Doesn't redundantly calculate signatures with all the other indices prior to @@ -202,18 +223,24 @@ def get_similarities_at_index(index, ignore_abundance, downsample, siglist, *, r with rest of the signatures from index+1 """ startt = time.time() - sig_iterator = itertools.product([siglist[index]], siglist[index + 1:]) - func = partial(similarity_args_unpack, - ignore_abundance=ignore_abundance, - downsample=downsample, - return_ani=return_ani) + sig_iterator = itertools.product([siglist[index]], siglist[index + 1 :]) + func = partial( + similarity_args_unpack, + ignore_abundance=ignore_abundance, + downsample=downsample, + return_ani=return_ani, + ) similarity_list = list(map(func, sig_iterator)) notify( - f"comparison for index {index} done in {time.time() - startt:.5f} seconds", end='\r') + f"comparison for index {index} done in {time.time() - startt:.5f} seconds", + end="\r", + ) return similarity_list -def compare_parallel(siglist, ignore_abundance, downsample, n_jobs, *, return_ani=False): +def compare_parallel( + siglist, ignore_abundance, downsample, n_jobs, *, return_ani=False +): """Compare all combinations of signatures and return a matrix of similarities. 
Processes combinations parallely on number of processes given by n_jobs @@ -256,7 +283,8 @@ def compare_parallel(siglist, ignore_abundance, downsample, n_jobs, *, return_an siglist=siglist, ignore_abundance=ignore_abundance, downsample=downsample, - return_ani=return_ani) + return_ani=return_ani, + ) notify("Created similarity func") # Initialize multiprocess.pool @@ -279,19 +307,27 @@ def compare_parallel(siglist, ignore_abundance, downsample, n_jobs, *, return_an startt = time.time() col_idx = index + 1 for idx_condensed, item in enumerate(l): - memmap_similarities[index, col_idx + idx_condensed] = memmap_similarities[idx_condensed + col_idx, index] = item + memmap_similarities[index, col_idx + idx_condensed] = memmap_similarities[ + idx_condensed + col_idx, index + ] = item notify( - f"Setting similarities matrix for index {index} done in {time.time() - startt:.5f} seconds", end='\r') + f"Setting similarities matrix for index {index} done in {time.time() - startt:.5f} seconds", + end="\r", + ) notify("Setting similarities completed") pool.close() pool.join() - notify(f"Time taken to compare all pairs parallely is {time.time() - start_initial:.5f} seconds ") + notify( + f"Time taken to compare all pairs parallely is {time.time() - start_initial:.5f} seconds " + ) return np.memmap(filename, dtype=np.float64, shape=(length_siglist, length_siglist)) -def compare_all_pairs(siglist, ignore_abundance, downsample=False, n_jobs=None, return_ani=False): +def compare_all_pairs( + siglist, ignore_abundance, downsample=False, n_jobs=None, return_ani=False +): """Compare all combinations of signatures and return a matrix of similarities. Processes combinations either serially or based on parallely on number of processes given by n_jobs @@ -309,7 +345,14 @@ def compare_all_pairs(siglist, ignore_abundance, downsample=False, n_jobs=None, :return: np.array similarity matrix """ if n_jobs is None or n_jobs == 1: - similarities = compare_serial(siglist, ignore_abundance=ignore_abundance, downsample=downsample, return_ani=return_ani) + similarities = compare_serial( + siglist, + ignore_abundance=ignore_abundance, + downsample=downsample, + return_ani=return_ani, + ) else: - similarities = compare_parallel(siglist, ignore_abundance, downsample, n_jobs, return_ani=return_ani) + similarities = compare_parallel( + siglist, ignore_abundance, downsample, n_jobs, return_ani=return_ani + ) return similarities diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py index 66feb6259c..9106bd8812 100644 --- a/src/sourmash/distance_utils.py +++ b/src/sourmash/distance_utils.py @@ -12,12 +12,14 @@ from .logging import notify + def check_distance(dist): if not 0 <= dist <= 1: raise ValueError(f"Error: distance value {dist :.4f} is not between 0 and 1!") else: return dist + def check_prob_threshold(val, threshold=1e-3): """ Check likelihood of no shared hashes based on chance alone (false neg). 
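The `compare_parallel` hunks above only reflow the memmap bookkeeping, but the pattern is easy to miss in diff form: each worker computes one row tail of similarities, and the parent mirrors it across the diagonal of a disk-backed array. A self-contained toy version of that fill, assuming a float64 memmap; the file name and values here are made up:

```python
import numpy as np

n = 4
sims = np.memmap("similarities.mmap", dtype=np.float64, mode="w+", shape=(n, n))
index = 0
row_tail = [0.9, 0.5, 0.1]  # similarities of siglist[0] vs siglist[1:]
for offset, value in enumerate(row_tail):
    col = index + 1 + offset
    sims[index, col] = sims[col, index] = value  # mirror across the diagonal
sims.flush()
```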
@@ -29,15 +31,18 @@ def check_prob_threshold(val, threshold=1e-3): exceeds_threshold = True return val, exceeds_threshold + def check_jaccard_error(val, threshold=1e-4): exceeds_threshold = False if threshold is not None and val > threshold: exceeds_threshold = True return val, exceeds_threshold + @dataclass class ANIResult: """Base class for distance/ANI from k-mer containment.""" + dist: float p_nothing_in_common: float p_threshold: float = 1e-3 @@ -47,7 +52,9 @@ class ANIResult: def check_dist_and_p_threshold(self): # check values self.dist = check_distance(self.dist) - self.p_nothing_in_common, self.p_exceeds_threshold = check_prob_threshold(self.p_nothing_in_common, self.p_threshold) + self.p_nothing_in_common, self.p_exceeds_threshold = check_prob_threshold( + self.p_nothing_in_common, self.p_threshold + ) def __post_init__(self): self.check_dist_and_p_threshold() @@ -62,6 +69,7 @@ def ani(self): @dataclass class jaccardANIResult(ANIResult): """Class for distance/ANI from jaccard (includes jaccard_error).""" + jaccard_error: float = None je_threshold: float = 1e-4 @@ -70,7 +78,9 @@ def __post_init__(self): self.check_dist_and_p_threshold() # check jaccard error if self.jaccard_error is not None: - self.jaccard_error, self.je_exceeds_threshold = check_jaccard_error(self.jaccard_error, self.je_threshold) + self.jaccard_error, self.je_exceeds_threshold = check_jaccard_error( + self.jaccard_error, self.je_threshold + ) else: raise ValueError("Error: jaccard_error cannot be None.") @@ -89,6 +99,7 @@ class ciANIResult(ANIResult): Set CI defaults to None, just in case CI can't be estimated for given sample. """ + dist_low: float = None dist_high: float = None @@ -128,7 +139,7 @@ def var_n_mutated(L, k, r1, *, q=None): if r1 == 0: return 0.0 r1 = float(r1) - if q == None: # we assume that if q is provided, it is correct for r1 + if q is None: # we assume that if q is provided, it is correct for r1 q = r1_to_q(k, r1) varN = ( L * (1 - q) * (q * (2 * k + (2 / r1) - 1) - 2 * k) @@ -158,7 +169,9 @@ def handle_seqlen_nkmers(ksize, *, sequence_len_bp=None, n_unique_kmers=None): return n_unique_kmers elif sequence_len_bp is None: # both are None, raise ValueError - raise ValueError("Error: distance estimation requires input of either 'sequence_len_bp' or 'n_unique_kmers'") + raise ValueError( + "Error: distance estimation requires input of either 'sequence_len_bp' or 'n_unique_kmers'" + ) else: n_unique_kmers = sequence_len_bp - (ksize - 1) return n_unique_kmers @@ -175,7 +188,7 @@ def set_size_chernoff(set_size, scaled, *, relative_error=0.05): @param relative_error: the desired relative error (defaults to 5%) @return: float (the upper bound probability) """ - upper_bound = 1 - 2 * np.exp(- relative_error**2*set_size/(scaled * 3)) + upper_bound = 1 - 2 * np.exp(-(relative_error**2) * set_size / (scaled * 3)) return upper_bound @@ -190,14 +203,17 @@ def set_size_exact_prob(set_size, scaled, *, relative_error=0.05): @return: float (the upper bound probability) """ # Need to check if the edge case is an integer or not. 
If not, don't include it in the equation - pmf_arg = -set_size/scaled * (relative_error - 1) + pmf_arg = -set_size / scaled * (relative_error - 1) if pmf_arg == int(pmf_arg): - prob = binom.cdf(set_size/scaled * (relative_error + 1), set_size, 1/scaled) - \ - binom.cdf(-set_size/scaled * (relative_error - 1), set_size, 1/scaled) + \ - binom.pmf(-set_size/scaled * (relative_error - 1), set_size, 1/scaled) + prob = ( + binom.cdf(set_size / scaled * (relative_error + 1), set_size, 1 / scaled) + - binom.cdf(-set_size / scaled * (relative_error - 1), set_size, 1 / scaled) + + binom.pmf(-set_size / scaled * (relative_error - 1), set_size, 1 / scaled) + ) else: - prob = binom.cdf(set_size / scaled * (relative_error + 1), set_size, 1 / scaled) - \ - binom.cdf(-set_size / scaled * (relative_error - 1), set_size, 1 / scaled) + prob = binom.cdf( + set_size / scaled * (relative_error + 1), set_size, 1 / scaled + ) - binom.cdf(-set_size / scaled * (relative_error - 1), set_size, 1 / scaled) return prob @@ -225,7 +241,9 @@ def get_exp_probability_nothing_common( Arguments: n_unique_kmers, ksize, mutation_rate, scaled Returns: float - expected likelihood that nothing is common between sketches """ - n_unique_kmers = handle_seqlen_nkmers(ksize, sequence_len_bp=sequence_len_bp,n_unique_kmers=n_unique_kmers) + n_unique_kmers = handle_seqlen_nkmers( + ksize, sequence_len_bp=sequence_len_bp, n_unique_kmers=n_unique_kmers + ) f_scaled = 1.0 / float(scaled) if mutation_rate == 1.0: return 1.0 @@ -251,12 +269,14 @@ def containment_to_distance( Containment --> distance CI (one step) """ sol1, sol2, point_estimate = None, None, None - n_unique_kmers = handle_seqlen_nkmers(ksize, sequence_len_bp = sequence_len_bp, n_unique_kmers=n_unique_kmers) + n_unique_kmers = handle_seqlen_nkmers( + ksize, sequence_len_bp=sequence_len_bp, n_unique_kmers=n_unique_kmers + ) if containment == 0: - #point_estimate = 1.0 + # point_estimate = 1.0 point_estimate = sol1 = sol2 = 1.0 elif containment == 1: - #point_estimate = 0.0 + # point_estimate = 0.0 point_estimate = sol1 = sol2 = 0.0 else: point_estimate = 1.0 - containment ** (1.0 / ksize) @@ -273,25 +293,33 @@ def containment_to_distance( term_1 = (1.0 - f_scaled) / ( f_scaled * n_unique_kmers**3 * bias_factor**2 ) - term_2 = lambda pest: n_unique_kmers * exp_n_mutated( - n_unique_kmers, ksize, pest - ) - exp_n_mutated_squared(n_unique_kmers, ksize, pest) - term_3 = lambda pest: var_n_mutated(n_unique_kmers, ksize, pest) / ( - n_unique_kmers**2 - ) - var_direct = lambda pest: term_1 * term_2(pest) + term_3(pest) - - f1 = ( - lambda pest: (1 - pest) ** ksize - + z_alpha * np.sqrt(var_direct(pest)) - - containment - ) - f2 = ( - lambda pest: (1 - pest) ** ksize - - z_alpha * np.sqrt(var_direct(pest)) - - containment - ) + def term_2(pest): + return n_unique_kmers * exp_n_mutated( + n_unique_kmers, ksize, pest + ) - exp_n_mutated_squared(n_unique_kmers, ksize, pest) + + def term_3(pest): + return ( + var_n_mutated(n_unique_kmers, ksize, pest) / n_unique_kmers**2 + ) + + def var_direct(pest): + return term_1 * term_2(pest) + term_3(pest) + + def f1(pest): + return ( + (1 - pest) ** ksize + + z_alpha * np.sqrt(var_direct(pest)) + - containment + ) + + def f2(pest): + return ( + (1 - pest) ** ksize + - z_alpha * np.sqrt(var_direct(pest)) + - containment + ) sol1 = brentq(f1, 0.0000001, 0.9999999) sol2 = brentq(f2, 0.0000001, 0.9999999) @@ -308,7 +336,13 @@ def containment_to_distance( prob_nothing_in_common = get_exp_probability_nothing_common( point_estimate, ksize, scaled, 
n_unique_kmers=n_unique_kmers ) - return ciANIResult(point_estimate, prob_nothing_in_common, dist_low=sol2, dist_high=sol1, p_threshold=prob_threshold) + return ciANIResult( + point_estimate, + prob_nothing_in_common, + dist_low=sol2, + dist_high=sol1, + p_threshold=prob_threshold, + ) def jaccard_to_distance( @@ -341,7 +375,9 @@ def jaccard_to_distance( useful for determining whether scaled is sufficient for these comparisons. """ error_lower_bound = None - n_unique_kmers = handle_seqlen_nkmers(ksize, sequence_len_bp=sequence_len_bp, n_unique_kmers=n_unique_kmers) + n_unique_kmers = handle_seqlen_nkmers( + ksize, sequence_len_bp=sequence_len_bp, n_unique_kmers=n_unique_kmers + ) if jaccard == 0: point_estimate = 1.0 error_lower_bound = 0.0 @@ -361,4 +397,10 @@ def jaccard_to_distance( prob_nothing_in_common = get_exp_probability_nothing_common( point_estimate, ksize, scaled, n_unique_kmers=n_unique_kmers ) - return jaccardANIResult(point_estimate, prob_nothing_in_common, jaccard_error=error_lower_bound, p_threshold=prob_threshold, je_threshold=err_threshold) + return jaccardANIResult( + point_estimate, + prob_nothing_in_common, + jaccard_error=error_lower_bound, + p_threshold=prob_threshold, + je_threshold=err_threshold, + ) diff --git a/src/sourmash/exceptions.py b/src/sourmash/exceptions.py index b2f18c12d2..002fbafdfc 100644 --- a/src/sourmash/exceptions.py +++ b/src/sourmash/exceptions.py @@ -1,7 +1,7 @@ from ._lowlevel import lib -__all__ = ['SourmashError'] +__all__ = ["SourmashError"] exceptions_by_code = {} @@ -16,13 +16,15 @@ def __init__(self, msg): def __str__(self): rv = self.message if self.rust_info is not None: - return u'%s\n\n%s' % (rv, self.rust_info) + return f"{rv}\n\n{self.rust_info}" return rv class IndexNotSupported(SourmashError): def __init__(self): - SourmashError.__init__(self, "This index format is not supported in this version of sourmash") + SourmashError.__init__( + self, "This index format is not supported in this version of sourmash" + ) class IndexNotLoaded(SourmashError): @@ -55,7 +57,7 @@ def _get_error_base(error_name): def _make_exceptions(): for attr in dir(lib): - if not attr.startswith('SOURMASH_ERROR_CODE_'): + if not attr.startswith("SOURMASH_ERROR_CODE_"): continue code = getattr(lib, attr) @@ -69,4 +71,5 @@ def _make_exceptions(): else: exceptions_by_code[code] = ValueError + _make_exceptions() diff --git a/src/sourmash/fig.py b/src/sourmash/fig.py index 4454ef64d9..9ca96f1aab 100644 --- a/src/sourmash/fig.py +++ b/src/sourmash/fig.py @@ -3,6 +3,7 @@ Make plots using the distance matrix+labels output by `sourmash compare`. """ from .logging import error, notify + try: import numpy import pylab @@ -10,18 +11,20 @@ except (RuntimeError, ImportError): pass + def load_matrix_and_labels(basefile): """Load the comparison matrix and associated labels. Returns a square numpy matrix & list of labels. """ - D = numpy.load(open(basefile, 'rb')) - labeltext = [x.strip() for x in open(basefile + '.labels.txt')] + D = numpy.load(open(basefile, "rb")) + labeltext = [x.strip() for x in open(basefile + ".labels.txt")] return (D, labeltext) -def plot_composite_matrix(D, labeltext, show_labels=True, - vmax=1.0, vmin=0.0, force=False): +def plot_composite_matrix( + D, labeltext, show_labels=True, vmax=1.0, vmin=0.0, force=False +): """Build a composite plot showing dendrogram + distance matrix/heatmap. Returns a matplotlib figure. @@ -30,25 +33,34 @@ def plot_composite_matrix(D, labeltext, show_labels=True, shown on the plot. 
""" if D.max() > 1.0 or D.min() < 0.0: - error('This matrix doesn\'t look like a distance matrix - min value {}, max value {}', D.min(), D.max()) + error( + "This matrix doesn't look like a distance matrix - min value {}, max value {}", + D.min(), + D.max(), + ) if not force: raise ValueError("not a distance matrix") else: - notify('force is set; scaling to [0, 1]') + notify("force is set; scaling to [0, 1]") D -= D.min() D /= D.max() if show_labels: - show_indices = True + pass fig = pylab.figure(figsize=(11, 8)) ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6]) # plot dendrogram - Y = sch.linkage(D, method='single') # centroid - - Z1 = sch.dendrogram(Y, orientation='left', labels=labeltext, - no_labels=not show_labels, get_leaves=True) + Y = sch.linkage(D, method="single") # centroid + + Z1 = sch.dendrogram( + Y, + orientation="left", + labels=labeltext, + no_labels=not show_labels, + get_leaves=True, + ) ax1.set_xticks([]) xstart = 0.45 @@ -58,8 +70,8 @@ def plot_composite_matrix(D, labeltext, show_labels=True, scale_xstart = xstart + width + 0.01 # re-order labels along rows, top to bottom - idx1 = Z1['leaves'] - reordered_labels = [ labeltext[i] for i in idx1 ] + idx1 = Z1["leaves"] + reordered_labels = [labeltext[i] for i in idx1] # reorder D by the clustering in the dendrogram D = D[idx1, :] @@ -68,8 +80,9 @@ def plot_composite_matrix(D, labeltext, show_labels=True, # show matrix axmatrix = fig.add_axes([xstart, 0.1, width, 0.6]) - im = axmatrix.matshow(D, aspect='auto', origin='lower', - cmap=pylab.cm.YlGnBu, vmin=vmin, vmax=vmax) + im = axmatrix.matshow( + D, aspect="auto", origin="lower", cmap=pylab.cm.YlGnBu, vmin=vmin, vmax=vmax + ) axmatrix.set_xticks([]) axmatrix.set_yticks([]) diff --git a/src/sourmash/hll.py b/src/sourmash/hll.py index c98ded5e8b..8a78049b34 100644 --- a/src/sourmash/hll.py +++ b/src/sourmash/hll.py @@ -32,7 +32,7 @@ def add_sequence(self, sequence, force=False): def add_kmer(self, kmer): "Add a kmer into the sketch." if len(kmer) != self.ksize: - raise ValueError("kmer to add is not {} in length".format(self.ksize)) + raise ValueError(f"kmer to add is not {self.ksize} in length") self.add_sequence(kmer) def add(self, h): diff --git a/src/sourmash/index/__init__.py b/src/sourmash/index/__init__.py index 08068255e5..154f37c126 100644 --- a/src/sourmash/index/__init__.py +++ b/src/sourmash/index/__init__.py @@ -39,18 +39,23 @@ from abc import abstractmethod, ABC from collections import namedtuple, Counter -from sourmash.search import (make_jaccard_search_query, - make_containment_query, - calc_threshold_from_bp) +from sourmash.search import ( + make_jaccard_search_query, + make_containment_query, + calc_threshold_from_bp, +) from sourmash.manifest import CollectionManifest from sourmash.logging import debug_literal from sourmash.signature import load_signatures, save_signatures -from sourmash.minhash import (flatten_and_downsample_scaled, - flatten_and_downsample_num, - flatten_and_intersect_scaled) +from sourmash.minhash import ( + flatten_and_downsample_scaled, + flatten_and_downsample_num, + flatten_and_intersect_scaled, +) # generic return tuple for Index.search and Index.gather -IndexSearchResult = namedtuple('Result', 'score, signature, location') +IndexSearchResult = namedtuple("Result", "score, signature, location") + class Index(ABC): # this will be removed soon; see sourmash#1894. 
@@ -103,8 +108,7 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): @classmethod @abstractmethod - def load(cls, location, leaf_loader=None, storage=None, - print_version_warning=True): + def load(cls, location, leaf_loader=None, storage=None, print_version_warning=True): """ """ def find(self, search_fn, query, **kwargs): @@ -133,7 +137,7 @@ def prepare_subject(subj_mh): def prepare_query(query_mh, subj_mh): return flatten_and_downsample_scaled(query_mh, subj_mh.scaled) - else: # num + else: # num query_num = query_mh.num def prepare_subject(subj_mh): @@ -156,10 +160,7 @@ def prepare_query(query_mh, subj_mh): query_size = len(query_mh) subj_size = len(subj_mh) - score = search_fn.score_fn(query_size, - shared_size, - subj_size, - total_size) + score = search_fn.score_fn(query_size, shared_size, subj_size, total_size) if search_fn.passes(score): # note: here we yield the original signature, not the @@ -173,7 +174,9 @@ def search_abund(self, query, *, threshold=None, **kwargs): Results will be sorted by similarity, highest to lowest. """ if not query.minhash.track_abundance: - raise TypeError("'search_abund' requires query signature with abundance information") + raise TypeError( + "'search_abund' requires query signature with abundance information" + ) # check arguments if threshold is None: @@ -184,7 +187,9 @@ def search_abund(self, query, *, threshold=None, **kwargs): matches = [] for subj, loc in self.signatures_with_location(): if not subj.minhash.track_abundance: - raise TypeError("'search_abund' requires subject signatures with abundance information") + raise TypeError( + "'search_abund' requires subject signatures with abundance information" + ) score = query.similarity(subj, downsample=True) if score >= threshold: matches.append(IndexSearchResult(score, subj, loc)) @@ -193,9 +198,16 @@ def search_abund(self, query, *, threshold=None, **kwargs): matches.sort(key=lambda x: -x.score) return matches - def search(self, query, *, threshold=None, - do_containment=False, do_max_containment=False, - best_only=False, **kwargs): + def search( + self, + query, + *, + threshold=None, + do_containment=False, + do_max_containment=False, + best_only=False, + **kwargs, + ): """Return list of IndexSearchResult with similarity above 'threshold'. Results will be sorted by similarity, highest to lowest. @@ -211,10 +223,12 @@ def search(self, query, *, threshold=None, raise TypeError("'search' requires 'threshold'") threshold = float(threshold) - search_obj = make_jaccard_search_query(do_containment=do_containment, - do_max_containment=do_max_containment, - best_only=best_only, - threshold=threshold) + search_obj = make_jaccard_search_query( + do_containment=do_containment, + do_max_containment=do_max_containment, + best_only=best_only, + threshold=threshold, + ) # do the actual search: matches = list(self.find(search_obj, query, **kwargs)) @@ -228,17 +242,17 @@ def prefetch(self, query, threshold_bp, **kwargs): Generator. Returns 0 or more IndexSearchResult namedtuples. """ - if not self: # empty database? quit. + if not self: # empty database? quit. 
raise ValueError("no signatures to search") # default best_only to False - best_only = kwargs.get('best_only', False) + best_only = kwargs.get("best_only", False) - search_fn = make_containment_query(query.minhash, threshold_bp, - best_only=best_only) + search_fn = make_containment_query( + query.minhash, threshold_bp, best_only=best_only + ) - for sr in self.find(search_fn, query, **kwargs): - yield sr + yield from self.find(search_fn, query, **kwargs) def best_containment(self, query, threshold_bp=None, **kwargs): """Return the match with the best Jaccard containment in the Index. @@ -247,8 +261,7 @@ def best_containment(self, query, threshold_bp=None, **kwargs): """ results = self.prefetch(query, threshold_bp, best_only=True, **kwargs) - results = sorted(results, - key=lambda x: (-x.score, x.signature.md5sum())) + results = sorted(results, key=lambda x: (-x.score, x.signature.md5sum())) try: return next(iter(results)) @@ -277,8 +290,7 @@ def peek(self, query_mh, *, threshold_bp=0): return [] # if matches, calculate intersection & return. - intersect_mh = flatten_and_intersect_scaled(result.signature.minhash, - query_mh) + intersect_mh = flatten_and_intersect_scaled(result.signature.minhash, query_mh) return [result, intersect_mh] @@ -307,8 +319,15 @@ def counter_gather(self, query, threshold_bp, **kwargs): return counter @abstractmethod - def select(self, ksize=None, moltype=None, scaled=None, num=None, - abund=None, containment=None): + def select( + self, + ksize=None, + moltype=None, + scaled=None, + num=None, + abund=None, + containment=None, + ): """Return Index containing only signatures that match requirements. Current arguments can be any or all of: @@ -326,8 +345,17 @@ def select(self, ksize=None, moltype=None, scaled=None, num=None, """ -def select_signature(ss, *, ksize=None, moltype=None, scaled=0, num=0, - containment=False, abund=None, picklist=None): +def select_signature( + ss, + *, + ksize=None, + moltype=None, + scaled=0, + num=0, + containment=False, + abund=None, + picklist=None, +): "Check that the given signature matches the specified requirements." # ksize match? if ksize and ksize != ss.minhash.ksize: @@ -372,6 +400,7 @@ class LinearIndex(Index): Concrete class; signatures held in memory; does not use manifests. """ + def __init__(self, _signatures=None, filename=None): self._signatures = [] if _signatures: @@ -395,7 +424,7 @@ def insert(self, node): self._signatures.append(node) def save(self, path): - with open(path, 'wt') as fp: + with open(path, "w") as fp: save_signatures(self.signatures(), fp) @classmethod @@ -404,7 +433,7 @@ def load(cls, location, filename=None): si = load_signatures(location, do_raise=True) if filename is None: - filename=location + filename = location lidx = LinearIndex(si, filename=filename) return lidx @@ -449,14 +478,12 @@ def __init__(self, db, selection_dict={}): def signatures(self): "Return the selected signatures." db = self.db.select(**self.selection_dict) - for ss in db.signatures(): - yield ss + yield from db.signatures() def signatures_with_location(self): "Return the selected signatures, with a location." db = self.db.select(**self.selection_dict) - for tup in db.signatures_with_location(): - yield tup + yield from db.signatures_with_location() def __bool__(self): try: @@ -502,10 +529,18 @@ class ZipFileLinearIndex(Index): Concrete class; signatures dynamically loaded from disk; uses manifests. 
""" + is_database = True - def __init__(self, storage, *, selection_dict=None, - traverse_yield_all=False, manifest=None, use_manifest=True): + def __init__( + self, + storage, + *, + selection_dict=None, + traverse_yield_all=False, + manifest=None, + use_manifest=True, + ): self.storage = storage self.selection_dict = selection_dict self.traverse_yield_all = traverse_yield_all @@ -514,7 +549,7 @@ def __init__(self, storage, *, selection_dict=None, # do we have a manifest already? if not, try loading. if use_manifest: if manifest is not None: - debug_literal('ZipFileLinearIndex using passed-in manifest') + debug_literal("ZipFileLinearIndex using passed-in manifest") self.manifest = manifest else: self._load_manifest() @@ -529,15 +564,16 @@ def __init__(self, storage, *, selection_dict=None, def _load_manifest(self): "Load a manifest if one exists" try: - manifest_data = self.storage.load('SOURMASH-MANIFEST.csv') + manifest_data = self.storage.load("SOURMASH-MANIFEST.csv") except (KeyError, FileNotFoundError): self.manifest = None else: - debug_literal(f'found manifest on load for {self.storage.path}') + debug_literal(f"found manifest on load for {self.storage.path}") # load manifest! from io import StringIO - manifest_data = manifest_data.decode('utf-8') + + manifest_data = manifest_data.decode("utf-8") manifest_fp = StringIO(manifest_data) self.manifest = CollectionManifest.load_from_csv(manifest_fp) @@ -584,8 +620,9 @@ def load(cls, location, traverse_yield_all=False, use_manifest=True): raise FileNotFoundError(location) storage = ZipStorage(location) - return cls(storage, traverse_yield_all=traverse_yield_all, - use_manifest=use_manifest) + return cls( + storage, traverse_yield_all=traverse_yield_all, use_manifest=use_manifest + ) def _signatures_with_internal(self): """Return an iterator of tuples (ss, internal_location). @@ -596,9 +633,11 @@ def _signatures_with_internal(self): # 'Storage' does not provide a way to list all the files, so :shrug:. for filename in self.storage._filenames(): # should we load this file? if it ends in .sig OR we are forcing: - if filename.endswith('.sig') or \ - filename.endswith('.sig.gz') or \ - self.traverse_yield_all: + if ( + filename.endswith(".sig") + or filename.endswith(".sig.gz") + or self.traverse_yield_all + ): sig_data = self.storage.load(filename) for ss in load_signatures(sig_data): yield ss, filename @@ -628,14 +667,19 @@ def signatures(self): # ad-hoc zipfiles that have no manifests.) for filename in storage._filenames(): # should we load this file? if it ends in .sig OR force: - if filename.endswith('.sig') or \ - filename.endswith('.sig.gz') or \ - self.traverse_yield_all: + if ( + filename.endswith(".sig") + or filename.endswith(".sig.gz") + or self.traverse_yield_all + ): if selection_dict: - select = lambda x: select_signature(x, - **selection_dict) + + def select(x): + return select_signature(x, **selection_dict) else: - select = lambda x: True + + def select(x): + return True data = self.storage.load(filename) for ss in load_signatures(data): @@ -651,11 +695,13 @@ def select(self, **kwargs): if manifest is not None: manifest = manifest.select_to_manifest(**kwargs) - return ZipFileLinearIndex(self.storage, - selection_dict=None, - traverse_yield_all=traverse_yield_all, - manifest=manifest, - use_manifest=True) + return ZipFileLinearIndex( + self.storage, + selection_dict=None, + traverse_yield_all=traverse_yield_all, + manifest=manifest, + use_manifest=True, + ) else: # no manifest? 
just pass along all the selection kwargs to # the new ZipFileLinearIndex. @@ -671,11 +717,13 @@ def select(self, **kwargs): d[k] = v kwargs = d - return ZipFileLinearIndex(self.storage, - selection_dict=kwargs, - traverse_yield_all=traverse_yield_all, - manifest=None, - use_manifest=False) + return ZipFileLinearIndex( + self.storage, + selection_dict=kwargs, + traverse_yield_all=traverse_yield_all, + manifest=None, + use_manifest=False, + ) class CounterGather: @@ -699,11 +747,12 @@ class CounterGather: duplicate md5s are collapsed inside the class, because we use the md5sum as a key into the dictionary used to store matches. """ + def __init__(self, query): "Constructor - takes a query SourmashSignature." query_mh = query.minhash if not query_mh.scaled: - raise ValueError('gather requires scaled signatures') + raise ValueError("gather requires scaled signatures") # track query self.orig_query_mh = query_mh.copy().flatten() @@ -746,8 +795,7 @@ def downsample(self, scaled): def signatures(self): "Return all signatures." - for ss in self.siglist.values(): - yield ss + yield from self.siglist.values() @property def union_found(self): @@ -763,8 +811,7 @@ def union_found(self): # for each match, intersect match with query & then add to found_mh. for ss in self.siglist.values(): - intersect_mh = flatten_and_intersect_scaled(ss.minhash, - orig_query_mh) + intersect_mh = flatten_and_intersect_scaled(ss.minhash, orig_query_mh) found_mh.add_many(intersect_mh) return found_mh @@ -784,7 +831,7 @@ def peek(self, cur_query_mh, *, threshold_bp=0): scaled = self.downsample(cur_query_mh.scaled) cur_query_mh = cur_query_mh.downsample(scaled=scaled) - if not cur_query_mh: # empty query? quit. + if not cur_query_mh: # empty query? quit. return [] # CTB: could probably remove this check unless debug requested. @@ -841,7 +888,7 @@ def consume(self, intersect_mh): # Prepare counter for finding the next match by decrementing # all hashes found in the current match in other datasets; # remove empty datasets from counter, too. - for (dataset_id, _) in most_common: + for dataset_id, _ in most_common: # CTB: note, remaining_mh may not be at correct scaled here. # this means that counters that _should_ be empty might not # _be_ empty in some situations. This does not @@ -849,8 +896,7 @@ def consume(self, intersect_mh): # 'counter' objects. The tradeoffs to fixing this would # need to be examined! (This could be fixed in self.downsample().) remaining_mh = siglist[dataset_id].minhash - intersect_count = intersect_mh.count_common(remaining_mh, - downsample=True) + intersect_count = intersect_mh.count_common(remaining_mh, downsample=True) if intersect_count: counter[dataset_id] -= intersect_count if counter[dataset_id] == 0: @@ -881,6 +927,7 @@ class MultiIndex(Index): Concrete class; signatures held in memory; builds and uses manifests. """ + def __init__(self, manifest, parent, *, prepend_location=False): """Constructor; takes manifest containing signatures, together with the top-level location. @@ -898,16 +945,16 @@ def location(self): def signatures(self): for row in self.manifest.rows: - yield row['signature'] + yield row["signature"] def signatures_with_location(self): for row in self.manifest.rows: - loc = row['internal_location'] + loc = row["internal_location"] # here, 'parent' may have been removed from internal_location # for directories; if so, add it back in. 
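        # illustrative, with hypothetical values: parent="/data/sigs" and
        # internal_location="subdir/abc.sig" would yield the location
        # "/data/sigs/subdir/abc.sig" after the join below.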
if self.prepend_location: loc = os.path.join(self.parent, loc) - yield row['signature'], loc + yield row["signature"], loc def _signatures_with_internal(self): """Return an iterator of tuples (ss, location) @@ -916,8 +963,7 @@ def _signatures_with_internal(self): index. This is a special feature of this (in memory) class. """ for row in self.manifest.rows: - yield row['signature'], row['internal_location'] - + yield row["signature"], row["internal_location"] def __len__(self): if self.manifest is None: @@ -986,18 +1032,17 @@ def load_from_directory(cls, pathname, *, force=False): rel = os.path.relpath(thisfile, pathname) source_list.append(rel) - except (IOError, sourmash.exceptions.SourmashError) as exc: + except (OSError, sourmash.exceptions.SourmashError) as exc: if force: - continue # ignore error + continue # ignore error else: - raise ValueError(exc) # stop loading! + raise ValueError(exc) # stop loading! # did we load anything? if not, error if not index_list: raise ValueError(f"no signatures to load under directory '{pathname}'") - return cls.load(index_list, source_list, pathname, - prepend_location=True) + return cls.load(index_list, source_list, pathname, prepend_location=True) @classmethod def load_from_path(cls, pathname, force=False): @@ -1010,7 +1055,7 @@ def load_from_path(cls, pathname, force=False): if not os.path.exists(pathname): raise ValueError(f"'{pathname}' must exist.") - if os.path.isdir(pathname): # traverse + if os.path.isdir(pathname): # traverse return cls.load_from_directory(pathname, force=force) # load as a .sig/JSON file @@ -1020,7 +1065,7 @@ def load_from_path(cls, pathname, force=False): idx = LinearIndex.load(pathname) index_list = [idx] source_list = [pathname] - except (IOError, sourmash.exceptions.SourmashError): + except (OSError, sourmash.exceptions.SourmashError): if not force: raise ValueError(f"no signatures to load from '{pathname}'") return None @@ -1035,8 +1080,8 @@ def load_from_pathlist(cls, filename): including zip collections, etc; it uses 'load_file_as_index' underneath. """ - from ..sourmash_args import (load_pathlist_from_file, - load_file_as_index) + from ..sourmash_args import load_pathlist_from_file, load_file_as_index + idx_list = [] src_list = [] @@ -1056,8 +1101,9 @@ def save(self, *args): def select(self, **kwargs): "Run 'select' on the manifest." new_manifest = self.manifest.select_to_manifest(**kwargs) - return MultiIndex(new_manifest, self.parent, - prepend_location=self.prepend_location) + return MultiIndex( + new_manifest, self.parent, prepend_location=self.prepend_location + ) class StandaloneManifestIndex(Index): @@ -1085,6 +1131,7 @@ class StandaloneManifestIndex(Index): objects. However, this class does not store any signatures in memory, unlike MultiIndex. """ + is_database = True def __init__(self, manifest, location, *, prefix=None): @@ -1119,8 +1166,7 @@ def location(self): def signatures_with_location(self): "Return an iterator over all signatures and their locations." - for ss, loc in self._signatures_with_internal(): - yield ss, loc + yield from self._signatures_with_internal() def signatures(self): "Return an iterator over all signatures." @@ -1140,7 +1186,7 @@ def _signatures_with_internal(self): picklist = self.manifest.to_picklist() for iloc in self.manifest.locations(): # prepend location with prefix? 
- if not iloc.startswith('/') and self.prefix: + if not iloc.startswith("/") and self.prefix: iloc = os.path.join(self.prefix, iloc) idx = sourmash.load_file_as_index(iloc) @@ -1165,5 +1211,4 @@ def insert(self, *args): def select(self, **kwargs): "Run 'select' on the manifest." new_manifest = self.manifest.select_to_manifest(**kwargs) - return StandaloneManifestIndex(new_manifest, self._location, - prefix=self.prefix) + return StandaloneManifestIndex(new_manifest, self._location, prefix=self.prefix) diff --git a/src/sourmash/index/revindex.py b/src/sourmash/index/revindex.py index 2f7074b53f..01f808783d 100644 --- a/src/sourmash/index/revindex.py +++ b/src/sourmash/index/revindex.py @@ -123,9 +123,9 @@ def signatures(self): for sig in sigs: yield sig - #if self._signatures: + # if self._signatures: # yield from self._signatures - #else: + # else: # raise NotImplementedError("Call into Rust and retrieve sigs") def __len__(self): @@ -156,81 +156,81 @@ def select(self, ksize=None, moltype=None, **kwargs): # TODO: deal with None/default values self.template = MinHash(ksize=ksize, moltype=moltype) -# def search(self, query, *args, **kwargs): -# """Return set of matches with similarity above 'threshold'. -# -# Results will be sorted by similarity, highest to lowest. -# -# Optional arguments: -# * do_containment: default False. If True, use Jaccard containment. -# * ignore_abundance: default False. If True, and query signature -# and database support k-mer abundances, ignore those abundances. -# -# Note, the "best only" hint is ignored by LCA_Database -# """ -# if not query.minhash: -# return [] -# -# # check arguments -# if "threshold" not in kwargs: -# raise TypeError("'search' requires 'threshold'") -# threshold = kwargs["threshold"] -# do_containment = kwargs.get("do_containment", False) -# ignore_abundance = kwargs.get("ignore_abundance", False) -# -# self._init_inner() -# -# size = ffi.new("uintptr_t *") -# results_ptr = self._methodcall( -# lib.revindex_search, -# query._get_objptr(), -# threshold, -# do_containment, -# ignore_abundance, -# size, -# ) -# -# size = size[0] -# if size == 0: -# return [] -# -# results = [] -# for i in range(size): -# match = SearchResult._from_objptr(results_ptr[i]) -# if match.score >= threshold: -# results.append(IndexSearchResult(match.score, match.signature, match.filename)) -# -# return results -# -# def gather(self, query, *args, **kwargs): -# "Return the match with the best Jaccard containment in the database." -# if not query.minhash: -# return [] -# -# self._init_inner() -# -# threshold_bp = kwargs.get("threshold_bp", 0.0) -# threshold = threshold_bp / (len(query.minhash) * self.scaled) -# -# results = [] -# size = ffi.new("uintptr_t *") -# results_ptr = self._methodcall( -# lib.revindex_gather, query._get_objptr(), threshold, True, True, size -# ) -# size = size[0] -# if size == 0: -# return [] -# -# results = [] -# for i in range(size): -# match = SearchResult._from_objptr(results_ptr[i]) -# if match.score >= threshold: -# results.append(IndexSearchResult(match.score, match.signature, match.filename)) -# -# results.sort(reverse=True, -# key=lambda x: (x.score, x.signature.md5sum())) -# -# return results[:1] + # def search(self, query, *args, **kwargs): + # """Return set of matches with similarity above 'threshold'. + # + # Results will be sorted by similarity, highest to lowest. + # + # Optional arguments: + # * do_containment: default False. If True, use Jaccard containment. + # * ignore_abundance: default False. 
If True, and query signature + # and database support k-mer abundances, ignore those abundances. + # + # Note, the "best only" hint is ignored by LCA_Database + # """ + # if not query.minhash: + # return [] + # + # # check arguments + # if "threshold" not in kwargs: + # raise TypeError("'search' requires 'threshold'") + # threshold = kwargs["threshold"] + # do_containment = kwargs.get("do_containment", False) + # ignore_abundance = kwargs.get("ignore_abundance", False) + # + # self._init_inner() + # + # size = ffi.new("uintptr_t *") + # results_ptr = self._methodcall( + # lib.revindex_search, + # query._get_objptr(), + # threshold, + # do_containment, + # ignore_abundance, + # size, + # ) + # + # size = size[0] + # if size == 0: + # return [] + # + # results = [] + # for i in range(size): + # match = SearchResult._from_objptr(results_ptr[i]) + # if match.score >= threshold: + # results.append(IndexSearchResult(match.score, match.signature, match.filename)) + # + # return results + # + # def gather(self, query, *args, **kwargs): + # "Return the match with the best Jaccard containment in the database." + # if not query.minhash: + # return [] + # + # self._init_inner() + # + # threshold_bp = kwargs.get("threshold_bp", 0.0) + # threshold = threshold_bp / (len(query.minhash) * self.scaled) + # + # results = [] + # size = ffi.new("uintptr_t *") + # results_ptr = self._methodcall( + # lib.revindex_gather, query._get_objptr(), threshold, True, True, size + # ) + # size = size[0] + # if size == 0: + # return [] + # + # results = [] + # for i in range(size): + # match = SearchResult._from_objptr(results_ptr[i]) + # if match.score >= threshold: + # results.append(IndexSearchResult(match.score, match.signature, match.filename)) + # + # results.sort(reverse=True, + # key=lambda x: (x.score, x.signature.md5sum())) + # + # return results[:1] @property def scaled(self): diff --git a/src/sourmash/index/sqlite_index.py b/src/sourmash/index/sqlite_index.py index b16eb00b59..458d40919d 100644 --- a/src/sourmash/index/sqlite_index.py +++ b/src/sourmash/index/sqlite_index.py @@ -95,9 +95,15 @@ # converters for unsigned 64-bit ints: if over MAX_SQLITE_INT, # convert to signed int. 
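# A quick round-trip check of the converters defined just below
# (illustrative): SQLite's INTEGER column is signed 64-bit, so hash
# values above 2**63 - 1 are stored via a two's-complement
# reinterpretation and recovered unchanged on the way out:
#
#   h = 2**64 - 1                               # largest possible hashval
#   assert convert_hash_to(h) == -1             # reinterpreted as signed
#   assert convert_hash_from(convert_hash_to(h)) == h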
-MAX_SQLITE_INT = 2 ** 63 - 1 -convert_hash_to = lambda x: BitArray(uint=x, length=64).int if x > MAX_SQLITE_INT else x -convert_hash_from = lambda x: BitArray(int=x, length=64).uint if x < 0 else x +MAX_SQLITE_INT = 2**63 - 1 + + +def convert_hash_to(x): + return BitArray(uint=x, length=64).int if x > MAX_SQLITE_INT else x + + +def convert_hash_from(x): + return BitArray(int=x, length=64).uint if x < 0 else x def load_sqlite_index(filename, *, request_manifest=False): @@ -126,27 +132,29 @@ def load_sqlite_index(filename, *, request_manifest=False): is_manifest = False is_lca_db = False - if 'SqliteIndex' in internal_d: - v = internal_d['SqliteIndex'] - if v != '1.0': + if "SqliteIndex" in internal_d: + v = internal_d["SqliteIndex"] + if v != "1.0": raise IndexNotSupported is_index = True debug_literal("load_sqlite_index: it's an index!") - if is_index and 'SqliteLineage' in internal_d: - v = internal_d['SqliteLineage'] - if v != '1.0': + if is_index and "SqliteLineage" in internal_d: + v = internal_d["SqliteLineage"] + if v != "1.0": raise IndexNotSupported is_lca_db = True debug_literal("load_sqlite_index: it's got a lineage table!") - if 'SqliteManifest' in internal_d: - v = internal_d['SqliteManifest'] - if v != '1.0': + if "SqliteManifest" in internal_d: + v = internal_d["SqliteManifest"] + if v != "1.0": raise IndexNotSupported is_manifest = True - debug_literal(f"load_sqlite_index: it's a manifest! request_manifest: {request_manifest}") + debug_literal( + f"load_sqlite_index: it's a manifest! request_manifest: {request_manifest}" + ) # every Index is a Manifest! if is_index or is_lca_db: @@ -163,10 +171,10 @@ def load_sqlite_index(filename, *, request_manifest=False): debug_literal("load_sqlite_index: returning SqliteIndex") idx = SqliteIndex(filename) elif is_manifest: - managed_by_index=False + managed_by_index = False if is_index: assert request_manifest - managed_by_index=True + managed_by_index = True prefix = os.path.dirname(filename) mf = SqliteCollectionManifest(conn, managed_by_index=managed_by_index) @@ -178,7 +186,7 @@ def load_sqlite_index(filename, *, request_manifest=False): class SqliteIndex(Index): is_database = True - + # NOTE: we do not need _signatures_with_internal for this class # because it supplies a manifest directly :tada:. @@ -192,8 +200,7 @@ def __init__(self, dbfile, *, sqlite_manifest=None, conn=None): # build me a SQLite manifest class to use for selection. 
if sqlite_manifest is None: - sqlite_manifest = SqliteCollectionManifest(conn, - managed_by_index=True) + sqlite_manifest = SqliteCollectionManifest(conn, managed_by_index=True) self.manifest = sqlite_manifest self.conn = conn @@ -202,7 +209,9 @@ def __init__(self, dbfile, *, sqlite_manifest=None, conn=None): c.execute("SELECT DISTINCT scaled FROM sourmash_sketches") scaled_vals = c.fetchall() if len(scaled_vals) > 1: - raise ValueError("this database has multiple scaled values, which is not currently allowed") + raise ValueError( + "this database has multiple scaled values, which is not currently allowed" + ) if scaled_vals: self.scaled = scaled_vals[0][0] @@ -247,28 +256,35 @@ def create(cls, dbfile, *, append=False): def _create_tables(cls, c, *, ignore_exists=False): "Create sqlite tables for SqliteIndex" try: - sqlite_utils.add_sourmash_internal(c, 'SqliteIndex', '1.0') + sqlite_utils.add_sourmash_internal(c, "SqliteIndex", "1.0") SqliteCollectionManifest._create_tables(c) - c.execute(""" + c.execute( + """ CREATE TABLE IF NOT EXISTS sourmash_hashes ( hashval INTEGER NOT NULL, sketch_id INTEGER NOT NULL, FOREIGN KEY (sketch_id) REFERENCES sourmash_sketches (id) ) - """) - c.execute(""" + """ + ) + c.execute( + """ CREATE INDEX IF NOT EXISTS sourmash_hashval_idx ON sourmash_hashes ( hashval, sketch_id ) - """) - c.execute(""" + """ + ) + c.execute( + """ CREATE INDEX IF NOT EXISTS sourmash_hashval_idx2 ON sourmash_hashes ( hashval ) - """) - c.execute(""" + """ + ) + c.execute( + """ CREATE INDEX IF NOT EXISTS sourmash_sketch_idx ON sourmash_hashes ( sketch_id ) @@ -312,18 +328,21 @@ def insert(self, ss, *, cursor=None, commit=True): raise ValueError("cannot store signatures with abundance in SqliteIndex") if self.scaled is not None and self.scaled != ss.minhash.scaled: - raise ValueError(f"this database can only store scaled values={self.scaled}") + raise ValueError( + f"this database can only store scaled values={self.scaled}" + ) elif self.scaled is None: self.scaled = ss.minhash.scaled # ok, first create and insert a manifest row - row = BaseCollectionManifest.make_manifest_row(ss, None, - include_signature=False) + row = BaseCollectionManifest.make_manifest_row( + ss, None, include_signature=False + ) self.manifest._insert_row(c, row, call_is_from_index=True) # retrieve ID of row for retrieving hashes: c.execute("SELECT last_insert_rowid()") - sketch_id, = c.fetchone() + (sketch_id,) = c.fetchone() # insert all the hashes hashes_to_sketch = [] @@ -331,8 +350,10 @@ def insert(self, ss, *, cursor=None, commit=True): hh = convert_hash_to(h) hashes_to_sketch.append((hh, sketch_id)) - c.executemany("INSERT INTO sourmash_hashes (hashval, sketch_id) VALUES (?, ?)", - hashes_to_sketch) + c.executemany( + "INSERT INTO sourmash_hashes (hashval, sketch_id) VALUES (?, ?)", + hashes_to_sketch, + ) if commit: self.conn.commit() @@ -366,30 +387,31 @@ def find(self, search_fn, query, **kwargs): picklist = None if self.manifest.selection_dict: - picklist = self.manifest.selection_dict.get('picklist') + picklist = self.manifest.selection_dict.get("picklist") c1 = self.conn.cursor() c2 = self.conn.cursor() - debug_literal('running _get_matching_sketches...') + debug_literal("running _get_matching_sketches...") t0 = time.time() - xx = self._get_matching_sketches(c1, query_mh.hashes, - query_mh._max_hash) + xx = self._get_matching_sketches(c1, query_mh.hashes, query_mh._max_hash) for sketch_id, n_matching_hashes in xx: - debug_literal(f"...got sketch {sketch_id}, with {n_matching_hashes} matching 
hashes in {time.time() - t0:.2f}") + debug_literal( + f"...got sketch {sketch_id}, with {n_matching_hashes} matching hashes in {time.time() - t0:.2f}" + ) # # first, estimate sketch size using sql results. # query_size = len(query_mh) - subj_size = self._load_sketch_size(c2, sketch_id, - query_mh._max_hash) + subj_size = self._load_sketch_size(c2, sketch_id, query_mh._max_hash) total_size = query_size + subj_size - n_matching_hashes shared_size = n_matching_hashes - score = search_fn.score_fn(query_size, shared_size, subj_size, - total_size) + score = search_fn.score_fn(query_size, shared_size, subj_size, total_size) - debug_literal(f"APPROX RESULT: score={score} qsize={query_size}, ssize={subj_size} total={total_size} overlap={shared_size}") + debug_literal( + f"APPROX RESULT: score={score} qsize={query_size}, ssize={subj_size} total={total_size} overlap={shared_size}" + ) # do we pass? if not search_fn.passes(score): @@ -415,8 +437,7 @@ def _select(self, *, num=0, track_abundance=False, **kwargs): # create manifest if needed manifest = self.manifest if manifest is None: - manifest = SqliteCollectionManifest(self.conn, - managed_by_index=True) + manifest = SqliteCollectionManifest(self.conn, managed_by_index=True) # modify manifest manifest = manifest.select_to_manifest(**kwargs) @@ -427,9 +448,7 @@ def select(self, *args, **kwargs): sqlite_manifest = self._select(*args, **kwargs) # return a new SqliteIndex with a new manifest, but same old conn. - return SqliteIndex(self.dbfile, - sqlite_manifest=sqlite_manifest, - conn=self.conn) + return SqliteIndex(self.dbfile, sqlite_manifest=sqlite_manifest, conn=self.conn) # # Actual SQL queries, etc. @@ -438,53 +457,77 @@ def select(self, *args, **kwargs): def _load_sketch_size(self, c1, sketch_id, max_hash): "Get sketch size for given sketch, downsampled by max_hash." if max_hash <= MAX_SQLITE_INT: - c1.execute(""" + c1.execute( + """ SELECT COUNT(hashval) FROM sourmash_hashes WHERE sketch_id=? AND hashval >= 0 AND hashval <= ?""", - (sketch_id, max_hash)) + (sketch_id, max_hash), + ) else: - c1.execute('SELECT COUNT(hashval) FROM sourmash_hashes WHERE sketch_id=?', - (sketch_id,)) + c1.execute( + "SELECT COUNT(hashval) FROM sourmash_hashes WHERE sketch_id=?", + (sketch_id,), + ) - n_hashes, = c1.fetchone() + (n_hashes,) = c1.fetchone() return n_hashes def _load_sketch(self, c, sketch_id, *, match_scaled=None): "Load an individual sketch. If match_scaled is set, downsample." 
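        # Illustrative summary: the "hashval >= 0 AND hashval <= ?"
        # constraint assembled below is what implements scaled
        # downsampling here — a scaled sketch keeps exactly the hashes in
        # [0, _max_hash], where _max_hash is roughly 2**64 / scaled, so a
        # larger `scaled` retains proportionally fewer hashes.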
start = time.time() - c.execute(""" + c.execute( + """ SELECT id, name, scaled, ksize, filename, moltype, seed - FROM sourmash_sketches WHERE id=?""", (sketch_id,)) - debug_literal(f"load sketch {sketch_id}: got sketch info in {time.time() - start:.2f}") + FROM sourmash_sketches WHERE id=?""", + (sketch_id,), + ) + debug_literal( + f"load sketch {sketch_id}: got sketch info in {time.time() - start:.2f}" + ) sketch_id, name, scaled, ksize, filename, moltype, seed = c.fetchone() if match_scaled is not None: scaled = max(scaled, match_scaled) - is_protein = 1 if moltype=='protein' else 0 - is_dayhoff = 1 if moltype=='dayhoff' else 0 - is_hp = 1 if moltype=='hp' else 0 - - mh = MinHash(n=0, ksize=ksize, scaled=scaled, seed=seed, - is_protein=is_protein, dayhoff=is_dayhoff, hp=is_hp) - + is_protein = 1 if moltype == "protein" else 0 + is_dayhoff = 1 if moltype == "dayhoff" else 0 + is_hp = 1 if moltype == "hp" else 0 + + mh = MinHash( + n=0, + ksize=ksize, + scaled=scaled, + seed=seed, + is_protein=is_protein, + dayhoff=is_dayhoff, + hp=is_hp, + ) template_values = [sketch_id] hash_constraint_str = "" max_hash = mh._max_hash if max_hash <= MAX_SQLITE_INT: - hash_constraint_str = "sourmash_hashes.hashval >= 0 AND sourmash_hashes.hashval <= ? AND" + hash_constraint_str = ( + "sourmash_hashes.hashval >= 0 AND sourmash_hashes.hashval <= ? AND" + ) template_values.insert(0, max_hash) else: - debug_literal('NOT EMPLOYING hash_constraint_str') + debug_literal("NOT EMPLOYING hash_constraint_str") - debug_literal(f"finding hashes for sketch {sketch_id} in {time.time() - start:.2f}") - c.execute(f"SELECT hashval FROM sourmash_hashes WHERE {hash_constraint_str} sourmash_hashes.sketch_id=?", template_values) + debug_literal( + f"finding hashes for sketch {sketch_id} in {time.time() - start:.2f}" + ) + c.execute( + f"SELECT hashval FROM sourmash_hashes WHERE {hash_constraint_str} sourmash_hashes.sketch_id=?", + template_values, + ) - debug_literal(f"loading hashes for sketch {sketch_id} in {time.time() - start:.2f}") - for hashval, in c: + debug_literal( + f"loading hashes for sketch {sketch_id} in {time.time() - start:.2f}" + ) + for (hashval,) in c: hh = convert_hash_from(hashval) mh.add_hash(hh) @@ -495,29 +538,36 @@ def _load_sketch(self, c, sketch_id, *, match_scaled=None): def _load_sketches(self, c): "Load sketches based on manifest _id column." 
for row in self.manifest.rows: - sketch_id = row['_id'] - assert row['num'] == 0 - - moltype = row['moltype'] - is_protein = 1 if moltype=='protein' else 0 - is_dayhoff = 1 if moltype=='dayhoff' else 0 - is_hp = 1 if moltype=='hp' else 0 - - ksize = row['ksize'] - scaled = row['scaled'] - seed = row['seed'] - - mh = MinHash(n=0, ksize=ksize, scaled=scaled, seed=seed, - is_protein=is_protein, dayhoff=is_dayhoff, hp=is_hp) + sketch_id = row["_id"] + assert row["num"] == 0 + + moltype = row["moltype"] + is_protein = 1 if moltype == "protein" else 0 + is_dayhoff = 1 if moltype == "dayhoff" else 0 + is_hp = 1 if moltype == "hp" else 0 + + ksize = row["ksize"] + scaled = row["scaled"] + seed = row["seed"] + + mh = MinHash( + n=0, + ksize=ksize, + scaled=scaled, + seed=seed, + is_protein=is_protein, + dayhoff=is_dayhoff, + hp=is_hp, + ) - c.execute("SELECT hashval FROM sourmash_hashes WHERE sketch_id=?", - (sketch_id,)) + c.execute( + "SELECT hashval FROM sourmash_hashes WHERE sketch_id=?", (sketch_id,) + ) - for hashval, in c: + for (hashval,) in c: mh.add_hash(convert_hash_from(hashval)) - ss = SourmashSignature(mh, name=row['name'], - filename=row['filename']) + ss = SourmashSignature(mh, name=row["name"], filename=row["filename"]) yield ss, self.dbfile, sketch_id def _get_matching_sketches(self, c, hashes, max_hash): @@ -529,11 +579,14 @@ def _get_matching_sketches(self, c, hashes, max_hash): because it slows things down in practice. """ c.execute("DROP TABLE IF EXISTS sourmash_hash_query") - c.execute("CREATE TEMPORARY TABLE sourmash_hash_query (hashval INTEGER PRIMARY KEY)") + c.execute( + "CREATE TEMPORARY TABLE sourmash_hash_query (hashval INTEGER PRIMARY KEY)" + ) - hashvals = [ (convert_hash_to(h),) for h in hashes ] - c.executemany("INSERT OR IGNORE INTO sourmash_hash_query (hashval) VALUES (?)", - hashvals) + hashvals = [(convert_hash_to(h),) for h in hashes] + c.executemany( + "INSERT OR IGNORE INTO sourmash_hash_query (hashval) VALUES (?)", hashvals + ) # # set up SELECT conditions @@ -550,15 +603,18 @@ def _get_matching_sketches(self, c, hashes, max_hash): template_values.append(max_hash) # format conditions - conditions.append('sourmash_hashes.hashval=sourmash_hash_query.hashval') + conditions.append("sourmash_hashes.hashval=sourmash_hash_query.hashval") conditions = " AND ".join(conditions) - c.execute(f""" + c.execute( + f""" SELECT DISTINCT sourmash_hashes.sketch_id,COUNT(sourmash_hashes.hashval) as CNT FROM sourmash_hashes, sourmash_hash_query WHERE {conditions} GROUP BY sourmash_hashes.sketch_id ORDER BY CNT DESC - """, template_values) + """, + template_values, + ) return c @@ -578,6 +634,7 @@ class SqliteCollectionManifest(BaseCollectionManifest): In the latter case, the SqliteCollectionManifest is created with managed_by_index set to True. """ + def __init__(self, conn, *, selection_dict=None, managed_by_index=False): """ Here, 'conn' should already be connected and configured. @@ -617,8 +674,9 @@ def create_or_open(cls, filename): @classmethod def load_from_manifest(cls, manifest, *, dbfile=":memory:", append=False): "Create a new sqlite manifest from an existing manifest object." 
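    # Typical call (illustrative; `mf` is a hypothetical in-memory
    # CollectionManifest). The default dbfile=":memory:" is transient,
    # so pass a real path to persist:
    #
    #   sql_mf = SqliteCollectionManifest.load_from_manifest(mf, dbfile="mf.sqldb")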
- return cls._create_manifest_from_rows(manifest.rows, location=dbfile, - append=append) + return cls._create_manifest_from_rows( + manifest.rows, location=dbfile, append=append + ) @classmethod def create_manifest(cls, locations_iter, *, include_signature=False): @@ -629,10 +687,10 @@ def create_manifest(cls, locations_iter, *, include_signature=False): Note: do NOT catch exceptions here, so this passes through load excs. Note: this method ignores 'include_signature'. """ + def rows_iter(): for ss, location in locations_iter: - row = cls.make_manifest_row(ss, location, - include_signature=False) + row = cls.make_manifest_row(ss, location, include_signature=False) yield row return cls._create_manifest_from_rows(rows_iter()) @@ -643,8 +701,9 @@ def _create_tables(cls, cursor): # this is a class method so that it can be used by SqliteIndex to # create manifest-compatible tables. - sqlite_utils.add_sourmash_internal(cursor, 'SqliteManifest', '1.0') - cursor.execute(""" + sqlite_utils.add_sourmash_internal(cursor, "SqliteManifest", "1.0") + cursor.execute( + """ CREATE TABLE sourmash_sketches (id INTEGER PRIMARY KEY, name TEXT, @@ -660,7 +719,8 @@ def _create_tables(cls, cursor): internal_location TEXT, UNIQUE(internal_location, md5sum) ) - """) + """ + ) def add_row(self, row): c = self.conn.cursor() @@ -674,18 +734,21 @@ def _insert_row(self, cursor, row, *, call_is_from_index=False): raise Exception("must use SqliteIndex.insert to add to this manifest") row = dict(row) - if 'seed' not in row: - row['seed'] = 42 + if "seed" not in row: + row["seed"] = 42 - cursor.execute(""" + cursor.execute( + """ INSERT OR IGNORE INTO sourmash_sketches (name, num, scaled, ksize, filename, md5sum, moltype, seed, n_hashes, with_abundance, internal_location) VALUES (:name, :num, :scaled, :ksize, :filename, :md5, :moltype, :seed, :n_hashes, :with_abundance, - :internal_location)""", row) + :internal_location)""", + row, + ) - self._num_rows = None # reset cache + self._num_rows = None # reset cache def __bool__(self): "Is this manifest empty?" @@ -700,7 +763,7 @@ def __bool__(self): def __eq__(self, other): "Check equality on a row-by-row basis. May fail on out-of-order rows." - for (a, b) in itertools.zip_longest(self.rows, other.rows): + for a, b in itertools.zip_longest(self.rows, other.rows): # ignore non-required keys. 
for k in self.required_keys: if a[k] != b[k]: @@ -749,21 +812,21 @@ def _make_select(self): picklist = None if self.selection_dict: select_d = self.selection_dict - if 'ksize' in select_d and select_d['ksize']: + if "ksize" in select_d and select_d["ksize"]: conditions.append("sourmash_sketches.ksize = ?") - values.append(select_d['ksize']) - if 'num' in select_d and select_d['num'] > 0: + values.append(select_d["ksize"]) + if "num" in select_d and select_d["num"] > 0: conditions.append("sourmash_sketches.num > 0") - if 'scaled' in select_d and select_d['scaled'] > 0: + if "scaled" in select_d and select_d["scaled"] > 0: conditions.append("sourmash_sketches.scaled > 0") - if 'containment' in select_d and select_d['containment']: + if "containment" in select_d and select_d["containment"]: conditions.append("sourmash_sketches.scaled > 0") - if 'moltype' in select_d and select_d['moltype'] is not None: - moltype = select_d['moltype'] - assert moltype in ('DNA', 'protein', 'dayhoff', 'hp'), moltype + if "moltype" in select_d and select_d["moltype"] is not None: + moltype = select_d["moltype"] + assert moltype in ("DNA", "protein", "dayhoff", "hp"), moltype conditions.append(f"sourmash_sketches.moltype = '{moltype}'") - picklist = select_d.get('picklist') + picklist = select_d.get("picklist") return conditions, values, picklist @@ -784,10 +847,10 @@ def select_to_manifest(self, **kwargs): new_mf = SqliteCollectionManifest(self.conn, selection_dict=kwargs) # if picklist, make sure we fill in 'found'. - picklist = kwargs.get('picklist') + picklist = kwargs.get("picklist") if picklist is not None: debug_literal("sqlite manifest: iterating through picklist") - _ = len(self) # this forces iteration through rows. + _ = len(self) # this forces iteration through rows. return new_mf @@ -803,19 +866,43 @@ def rows(self): conditions = "" debug_literal(f"sqlite manifest rows: executing select with '{conditions}'") - c1.execute(f""" + c1.execute( + f""" SELECT id, name, md5sum, num, scaled, ksize, filename, moltype, seed, n_hashes, internal_location FROM sourmash_sketches {conditions} - """, values) + """, + values, + ) debug_literal("sqlite manifest: entering row yield loop") - for (_id, name, md5sum, num, scaled, ksize, filename, moltype, - seed, n_hashes, iloc) in c1: - row = dict(num=num, scaled=scaled, name=name, filename=filename, - n_hashes=n_hashes, with_abundance=0, ksize=ksize, - md5=md5sum, internal_location=iloc, - moltype=moltype, md5short=md5sum[:8], - seed=seed, _id=_id) + for ( + _id, + name, + md5sum, + num, + scaled, + ksize, + filename, + moltype, + seed, + n_hashes, + iloc, + ) in c1: + row = dict( + num=num, + scaled=scaled, + name=name, + filename=filename, + n_hashes=n_hashes, + with_abundance=0, + ksize=ksize, + md5=md5sum, + internal_location=iloc, + moltype=moltype, + md5short=md5sum[:8], + seed=seed, + _id=_id, + ) if picklist is None or picklist.matches_manifest_row(row): yield row @@ -824,6 +911,7 @@ def filter_rows(self, row_filter_fn): This is done in memory, inserting each row one at a time. """ + def rows_iter(): for row in self.rows: if row_filter_fn(row): @@ -833,9 +921,11 @@ def rows_iter(): def filter_on_columns(self, col_filter_fn, col_names): "Create a new manifest based on column matches." 
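        # Illustrative usage, with a hypothetical manifest `mf`: keep only
        # rows whose name mentions a given accession prefix. Note that
        # `col_filter_fn` receives a list of the non-None column values:
        #
        #   new_mf = mf.filter_on_columns(
        #       lambda vals: any("GCF_" in v for v in vals), ["name"]
        #   )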
+ def row_filter_fn(row): - x = [ row[col] for col in col_names if row[col] is not None ] + x = [row[col] for col in col_names if row[col] is not None] return col_filter_fn(x) + return self.filter_rows(row_filter_fn) def locations(self): @@ -856,20 +946,22 @@ def locations(self): else: conditions = "" - c1.execute(f""" + c1.execute( + f""" SELECT DISTINCT internal_location FROM sourmash_sketches {conditions} - """, values) + """, + values, + ) - return ( iloc for iloc, in c1 ) + return (iloc for (iloc,) in c1) def __contains__(self, ss): "Check to see if signature 'ss' is in this manifest." md5 = ss.md5sum() c = self.conn.cursor() - c.execute('SELECT COUNT(*) FROM sourmash_sketches WHERE md5sum=?', - (md5,)) - val, = c.fetchone() + c.execute("SELECT COUNT(*) FROM sourmash_sketches WHERE md5sum=?", (md5,)) + (val,) = c.fetchone() if bool(val): picklist = self.picklist @@ -880,18 +972,19 @@ def __contains__(self, ss): def picklist(self): "Return the picklist, if any." if self.selection_dict: - return self.selection_dict.get('picklist') + return self.selection_dict.get("picklist") return None def to_picklist(self): "Convert this manifest to a picklist." - pl = SignaturePicklist('manifest') - pl.pickset = { pl._get_value_for_manifest_row(row) for row in self.rows } + pl = SignaturePicklist("manifest") + pl.pickset = {pl._get_value_for_manifest_row(row) for row in self.rows} return pl @classmethod - def _create_manifest_from_rows(cls, rows_iter, *, location=":memory:", - append=False): + def _create_manifest_from_rows( + cls, rows_iter, *, location=":memory:", append=False + ): """Create a SqliteCollectionManifest from a rows iterator. Internal utility function. @@ -903,7 +996,9 @@ def _create_manifest_from_rows(cls, rows_iter, *, location=":memory:", mf = cls.create(location) except (sqlite3.OperationalError, sqlite3.DatabaseError) as exc: if not append: - raise Exception(f"cannot create sqlite3 db at '{location}'; exception: {str(exc)}") + raise Exception( + f"cannot create sqlite3 db at '{location}'; exception: {str(exc)}" + ) db = load_sqlite_index(location, request_manifest=True) mf = db.manifest @@ -920,6 +1015,7 @@ class LCA_SqliteDatabase(SqliteIndex): """ A wrapper class for SqliteIndex + lineage db => LCA_Database functionality. """ + is_database = True def __init__(self, dbfile, *, lineage_db=None, sqlite_manifest=None): @@ -929,10 +1025,12 @@ def __init__(self, dbfile, *, lineage_db=None, sqlite_manifest=None): c = self.conn.cursor() - c.execute('SELECT DISTINCT ksize, moltype FROM sourmash_sketches') + c.execute("SELECT DISTINCT ksize, moltype FROM sourmash_sketches") res = list(c) if len(res) > 1: - raise TypeError("can only have one ksize & moltype in an LCA_SqliteDatabase") + raise TypeError( + "can only have one ksize & moltype in an LCA_SqliteDatabase" + ) if len(res) == 0: raise ValueError("cannot load an LCA_SqliteDatabase") @@ -996,20 +1094,20 @@ def _build_index(self): lid_to_lineage = {} for row in mf.rows: - name = row['name'] + name = row["name"] if name: # this is a bit of a hack. we try identifiers _with_ and # _without_ versions, and take whichever works. There is # definitely a better way to do this, but I can't think # of one right now. 
- ident = name.split(' ')[0] + ident = name.split(" ")[0] - lineage = lineage_db.get(ident) # try with identifier version - if lineage is None: # nope - remove version.x - ident = name.split('.')[0] + lineage = lineage_db.get(ident) # try with identifier version + if lineage is None: # nope - remove version.x + ident = name.split(".")[0] lineage = lineage_db.get(ident) - idx = row['_id'] # this is only present in sqlite manifests. + idx = row["_id"] # this is only present in sqlite manifests. ident_to_idx[ident] = idx if lineage: @@ -1038,16 +1136,16 @@ def insert(self, *args, **kwargs): def select(self, *args, **kwargs): sqlite_manifest = self._select(*args, **kwargs) - return LCA_SqliteDatabase(self.dbfile, - sqlite_manifest=sqlite_manifest, - lineage_db=self.lineage_db) + return LCA_SqliteDatabase( + self.dbfile, sqlite_manifest=sqlite_manifest, lineage_db=self.lineage_db + ) ### LCA_Database API/protocol. def downsample_scaled(self, scaled): "Downsample the scaled for querying." if scaled < self.scaled: - raise ValueError("cannot decrease scaled from {} to {}".format(self.scaled, scaled)) + raise ValueError(f"cannot decrease scaled from {self.scaled} to {scaled}") # CTB: maybe return a new LCA_Database? Right now this isn't how # the lca_db protocol works tho. @@ -1097,17 +1195,18 @@ def get_identifiers_for_hashval(self, hashval): class _SqliteIndexHashvalToIndex: """ - Internal wrapper class to retrieve keys and key/value pairs for + Internal wrapper class to retrieve keys and key/value pairs for hashval -> [ list of idx ]. """ + def __init__(self, sqlidx): self.sqlidx = sqlidx def __iter__(self): "Get all hashvals." c = self.sqlidx.conn.cursor() - c.execute('SELECT DISTINCT hashval FROM sourmash_hashes') - for hashval, in c: + c.execute("SELECT DISTINCT hashval FROM sourmash_hashes") + for (hashval,) in c: yield hashval def get(self, key, dv=None): @@ -1117,10 +1216,9 @@ def get(self, key, dv=None): hh = convert_hash_to(key) - c.execute('SELECT sketch_id FROM sourmash_hashes WHERE hashval=?', - (hh,)) + c.execute("SELECT sketch_id FROM sourmash_hashes WHERE hashval=?", (hh,)) - x = [ convert_hash_from(h) for h, in c ] + x = [convert_hash_from(h) for (h,) in c] return x or dv def __getitem__(self, key): diff --git a/src/sourmash/lca/__init__.py b/src/sourmash/lca/__init__.py index b2a9af2589..82b468c424 100644 --- a/src/sourmash/lca/__init__.py +++ b/src/sourmash/lca/__init__.py @@ -1,13 +1,18 @@ "LCA and reverse index utilities." from .lca_db import LCA_Database -from .lca_utils import (taxlist, zip_lineage, build_tree, find_lca, - gather_assignments, display_lineage, - count_lca_for_assignments) +from .lca_utils import ( + taxlist, + zip_lineage, + build_tree, + find_lca, + gather_assignments, + display_lineage, + count_lca_for_assignments, +) from .command_index import index from .command_classify import classify from .command_summarize import summarize_main from .command_rankinfo import rankinfo_main from .__main__ import main - diff --git a/src/sourmash/lca/__main__.py b/src/sourmash/lca/__main__.py index b02b891771..73faa36019 100644 --- a/src/sourmash/lca/__main__.py +++ b/src/sourmash/lca/__main__.py @@ -9,7 +9,7 @@ from .command_compare_csv import compare_csv from ..logging import set_quiet, error -usage=''' +usage = """ sourmash lca [] - work with taxonomic information. ** Commands can be: @@ -23,14 +23,15 @@ ** Use '-h' to get subcommand-specific help, e.g. 
sourmash lca index -h -''' +""" + def main(arglist=None): args = sourmash.cli.get_parser().parse_args(arglist) submod = getattr(sourmash.cli.sig, args.subcmd) - mainmethod = getattr(submod, 'main') + mainmethod = getattr(submod, "main") return mainmethod(args) -if __name__ == '__main__': +if __name__ == "__main__": main(sys.argv) diff --git a/src/sourmash/lca/command_classify.py b/src/sourmash/lca/command_classify.py index cf5605be72..4ea5ae69ec 100644 --- a/src/sourmash/lca/command_classify.py +++ b/src/sourmash/lca/command_classify.py @@ -11,7 +11,7 @@ from . import lca_utils from .lca_utils import check_files_exist -DEFAULT_THRESHOLD=5 # how many counts of a taxid at min +DEFAULT_THRESHOLD = 5 # how many counts of a taxid at min def classify_signature(query_sig, dblist, threshold, majority): @@ -33,10 +33,9 @@ def classify_signature(query_sig, dblist, threshold, majority): shows up, and filter out low-abundance ones (under threshold). Then, determine the LCA of all of those. - """ + """ # gather assignments from across all the databases - assignments = lca_utils.gather_assignments(query_sig.minhash.hashes, - dblist) + assignments = lca_utils.gather_assignments(query_sig.minhash.hashes, dblist) # now convert to trees -> do LCA & counts counts = lca_utils.count_lca_for_assignments(assignments) @@ -59,20 +58,20 @@ def classify_signature(query_sig, dblist, threshold, majority): # update tree with this set of assignments lca_utils.build_tree([lca], tree) - status = 'nomatch' + status = "nomatch" if not tree: return [], status # now find lowest-common-ancestor of the resulting tree. lca, reason = lca_utils.find_lca(tree) - if reason == 0: # leaf node - debug('END', lca) - status = 'found' - else: # internal node => disagreement - debug('MULTI', lca) - status = 'disagree' + if reason == 0: # leaf node + debug("END", lca) + status = "found" + else: # internal node => disagreement + debug("MULTI", lca) + status = "disagree" - debug('lineage is:', lca) + debug("lineage is:", lca) return lca, status @@ -82,7 +81,7 @@ def classify(args): main single-genome classification function. """ if not args.db: - error('Error! must specify at least one LCA database with --db') + error("Error! must specify at least one LCA database with --db") sys.exit(-1) set_quiet(args.quiet, args.debug) @@ -98,7 +97,7 @@ def classify(args): dblist, ksize, scaled = lca_utils.load_databases(args.db, args.scaled) # find all the queries - notify('finding query signatures...') + notify("finding query signatures...") inp_files = list(args.query) if args.query_from_file: more_files = sourmash_args.load_pathlist_from_file(args.query_from_file) @@ -108,7 +107,9 @@ def classify(args): sys.exit(-1) if not inp_files: - error('Error! must specify at least one query signature with --query or --query-from-file') + error( + "Error! must specify at least one query signature with --query or --query-from-file" + ) sys.exit(-1) # set up output @@ -117,7 +118,7 @@ def classify(args): with sourmash_args.FileOutputCSV(args.output) as outfp: csvfp = csv.writer(outfp) - csvfp.writerow(['ID','status'] + list(lca_utils.taxlist())) + csvfp.writerow(["ID", "status"] + list(lca_utils.taxlist())) # for each query, gather all the matches across databases total_count = 0 @@ -125,11 +126,10 @@ def classify(args): total_n = len(inp_files) for query_filename in inp_files: n += 1 - for query_sig in load_file_as_signatures(query_filename, - ksize=ksize): - notify(u'\r\033[K', end=u'') - notify(f'... 
classifying {query_sig} (file {n} of {total_n})', end='\r') - debug('classifying', query_sig) + for query_sig in load_file_as_signatures(query_filename, ksize=ksize): + notify("\r\033[K", end="") + notify(f"... classifying {query_sig} (file {n} of {total_n})", end="\r") + debug("classifying", query_sig) total_count += 1 # make sure we're looking at the same scaled value as database @@ -139,8 +139,9 @@ def classify(args): query_sig.minhash = downsample_mh # do the classification - lineage, status = classify_signature(query_sig, dblist, - args.threshold, args.majority) + lineage, status = classify_signature( + query_sig, dblist, args.threshold, args.majority + ) debug(lineage) # output each classification to the spreadsheet @@ -149,12 +150,12 @@ def classify(args): # when outputting to stdout, make output intelligible if not args.output: - notify(u'\r\033[K', end=u'') + notify("\r\033[K", end="") csvfp.writerow(row) - notify(u'\r\033[K', end=u'') - notify(f'classified {total_count} signatures total') + notify("\r\033[K", end="") + notify(f"classified {total_count} signatures total") -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(classify(sys.argv[1:])) diff --git a/src/sourmash/lca/command_compare_csv.py b/src/sourmash/lca/command_compare_csv.py index 99b7f8211a..c8018256f0 100644 --- a/src/sourmash/lca/command_compare_csv.py +++ b/src/sourmash/lca/command_compare_csv.py @@ -13,44 +13,50 @@ def compare_csv(args): if args.start_column < 2: - error('error, --start-column cannot be less than 2') + error("error, --start-column cannot be less than 2") sys.exit(-1) set_quiet(args.quiet, args.debug) # first, load classify-style spreadsheet - notify(f'loading classify output from: {args.csv1}') - assignments0, num_rows0 = load_taxonomy_assignments(args.csv1, - start_column=3, - force=args.force) + notify(f"loading classify output from: {args.csv1}") + assignments0, num_rows0 = load_taxonomy_assignments( + args.csv1, start_column=3, force=args.force + ) - notify(f'loaded {len(set(assignments0.values()))} distinct lineages, {num_rows0} rows') - notify('----') + notify( + f"loaded {len(set(assignments0.values()))} distinct lineages, {num_rows0} rows" + ) + notify("----") # next, load custom taxonomy spreadsheet - delimiter = ',' + delimiter = "," if args.tabs: - delimiter = '\t' + delimiter = "\t" - notify(f'loading custom spreadsheet from: {args.csv2}') - assignments, num_rows = load_taxonomy_assignments(args.csv2, - delimiter=delimiter, - start_column=args.start_column, - use_headers=not args.no_headers, - force=args.force) - notify(f'loaded {len(set(assignments.values()))} distinct lineages, {num_rows} rows') + notify(f"loading custom spreadsheet from: {args.csv2}") + assignments, num_rows = load_taxonomy_assignments( + args.csv2, + delimiter=delimiter, + start_column=args.start_column, + use_headers=not args.no_headers, + force=args.force, + ) + notify( + f"loaded {len(set(assignments.values()))} distinct lineages, {num_rows} rows" + ) # now, compute basic differences: missing_1 = set(assignments0.keys()) - set(assignments.keys()) missing_2 = set(assignments.keys()) - set(assignments0.keys()) if missing_2: - notify(f'missing {len(missing_2)} assignments in classify spreadsheet.') + notify(f"missing {len(missing_2)} assignments in classify spreadsheet.") if missing_1: - notify(f'missing {len(missing_1)} assignments in custom spreadsheet.') + notify(f"missing {len(missing_1)} assignments in custom spreadsheet.") if missing_1 or missing_2: - notify('(these will not be evaluated any 
further)') + notify("(these will not be evaluated any further)") else: - notify('note: all IDs are in both spreadsheets!') + notify("note: all IDs are in both spreadsheets!") # next, look at differences in lineages common = set(assignments0.keys()) @@ -71,7 +77,7 @@ def compare_csv(args): lca_utils.build_tree([v1], tree) lca, reason = lca_utils.find_lca(tree) - if reason == 0: # compatible lineages + if reason == 0: # compatible lineages n_compat += 1 print_results("{},compatible,{}", k, ";".join(zip_lineage(lca))) else: @@ -88,8 +94,8 @@ def compare_csv(args): if n_incompat: for rank in lca_utils.taxlist(): - notify(f'{incompat_rank[rank]} incompatible at rank {rank}') - + notify(f"{incompat_rank[rank]} incompatible at rank {rank}") -if __name__ == '__main__': + +if __name__ == "__main__": sys.exit(compare_csv(sys.argv[1:])) diff --git a/src/sourmash/lca/command_index.py b/src/sourmash/lca/command_index.py index 3ee13164a8..f75a0ec8f2 100644 --- a/src/sourmash/lca/command_index.py +++ b/src/sourmash/lca/command_index.py @@ -15,10 +15,16 @@ from sourmash.sourmash_args import DEFAULT_LOAD_K -def load_taxonomy_assignments(filename, *, delimiter=',', start_column=2, - use_headers=True, force=False, - split_identifiers=False, - keep_identifier_versions=False): +def load_taxonomy_assignments( + filename, + *, + delimiter=",", + start_column=2, + use_headers=True, + force=False, + split_identifiers=False, + keep_identifier_versions=False, +): """ Load a taxonomy assignment spreadsheet into a dictionary. @@ -26,34 +32,35 @@ def load_taxonomy_assignments(filename, *, delimiter=',', start_column=2, lineage tuples. """ from sourmash.tax.tax_utils import LineagePair + # parse spreadsheet! # CTB note: can't easily switch to FileInputCSV, because of # janky way we do/don't handle headers here. See issue #2198. - fp = open(filename, newline='') + fp = open(filename, newline="") r = csv.reader(fp, delimiter=delimiter) - row_headers = ['identifiers'] - row_headers += ['_skip_']*(start_column - 2) + row_headers = ["identifiers"] + row_headers += ["_skip_"] * (start_column - 2) row_headers += list(lca_utils.taxlist()) # first check that headers are interpretable. if use_headers: - notify('examining spreadsheet headers...') + notify("examining spreadsheet headers...") first_row = next(iter(r)) n_disagree = 0 - for (column, value) in zip(row_headers, first_row): - if column == '_skip_': + for column, value in zip(row_headers, first_row): + if column == "_skip_": continue if column.lower() != value.lower(): notify(f"** assuming column '{value}' is {column} in spreadsheet") n_disagree += 1 if n_disagree > 2: - error('whoa, too many assumptions. are the headers right?') - error('expecting {}', ",".join(row_headers)) + error("whoa, too many assumptions. are the headers right?") + error("expecting {}", ",".join(row_headers)) if not force: sys.exit(-1) - notify('...continue, because --force was specified.') + notify("...continue, because --force was specified.") # convert into a lineage pair assignments = {} @@ -61,27 +68,27 @@ def load_taxonomy_assignments(filename, *, delimiter=',', start_column=2, n_species = 0 n_strains = 0 for row in r: - if row and row[0].strip(): # want non-empty row + if row and row[0].strip(): # want non-empty row num_rows += 1 lineage = list(zip(row_headers, row)) - lineage = [ x for x in lineage if x[0] != '_skip_' ] + lineage = [x for x in lineage if x[0] != "_skip_"] ident = lineage[0][1] lineage = lineage[1:] # fold, spindle, and mutilate ident? 
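            # A minimal sketch of the munging below, with a hypothetical
            # NCBI-style accession (values illustrative only):
            #   >>> ident = "GCF_000005845.2 Escherichia coli"
            #   >>> ident.split(" ")[0]
            #   'GCF_000005845.2'
            #   >>> ident.split(" ")[0].split(".")[0]
            #   'GCF_000005845'
            # i.e. split_identifiers keeps only the first token, and
            # keep_identifier_versions=False also drops the ".2" suffix.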
if split_identifiers: - ident = ident.split(' ')[0] + ident = ident.split(" ")[0] if not keep_identifier_versions: - ident = ident.split('.')[0] + ident = ident.split(".")[0] # clean lineage of null names, replace with 'unassigned' - lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ] - lineage = [ LineagePair(a, b) for (a, b) in lineage ] + lineage = [(a, lca_utils.filter_null(b)) for (a, b) in lineage] + lineage = [LineagePair(a, b) for (a, b) in lineage] # remove end nulls - while lineage and lineage[-1].name == 'unassigned': + while lineage and lineage[-1].name == "unassigned": lineage = lineage[:-1] # store lineage tuple @@ -90,13 +97,13 @@ def load_taxonomy_assignments(filename, *, delimiter=',', start_column=2, if ident in assignments: if assignments[ident] != tuple(lineage): if not force: - raise Exception("multiple lineages for identifier {}".format(ident)) + raise Exception(f"multiple lineages for identifier {ident}") else: assignments[ident] = tuple(lineage) - if lineage[-1].rank == 'species': + if lineage[-1].rank == "species": n_species += 1 - elif lineage[-1].rank == 'strain': + elif lineage[-1].rank == "strain": n_species += 1 n_strains += 1 @@ -106,35 +113,50 @@ def load_taxonomy_assignments(filename, *, delimiter=',', start_column=2, # any more, when building a large GTDB-based database :) --CTB if len(assignments) * 0.2 > n_species and len(assignments) > 50: if not force: - error('') + error("") error("ERROR: fewer than 20% of lineages have species-level resolution!?") - error("({} species assignments found, of {} assignments total)", - n_species, len(assignments)) + error( + "({} species assignments found, of {} assignments total)", + n_species, + len(assignments), + ) error("** If this is intentional, re-run the command with -f.") sys.exit(-1) return assignments, num_rows -def generate_report(record_duplicates, record_no_lineage, record_remnants, - unused_lineages, unused_identifiers, filename): +def generate_report( + record_duplicates, + record_no_lineage, + record_remnants, + unused_lineages, + unused_identifiers, + filename, +): """ Output a report of anomalies from building the index. """ - with open(filename, 'wt') as fp: - print(f'Duplicate signatures: {len(record_duplicates)}', file=fp) + with open(filename, "w") as fp: + print(f"Duplicate signatures: {len(record_duplicates)}", file=fp) fp.write("\n".join(record_duplicates)) fp.write("\n") - print(f'----\nUnused identifiers: {len(unused_identifiers)}', file=fp) + print(f"----\nUnused identifiers: {len(unused_identifiers)}", file=fp) fp.write("\n".join(unused_identifiers)) fp.write("\n") - print(f'----\nNo lineage provided for these identifiers: {len(record_no_lineage)}', file=fp) + print( + f"----\nNo lineage provided for these identifiers: {len(record_no_lineage)}", + file=fp, + ) fp.write("\n".join(record_no_lineage)) fp.write("\n") - print(f'----\nNo signatures found for these identifiers: {len(record_remnants)}', file=fp) - fp.write('\n'.join(record_remnants)) + print( + f"----\nNo signatures found for these identifiers: {len(record_remnants)}", + file=fp, + ) + fp.write("\n".join(record_remnants)) fp.write("\n") - print(f'----\nUnused lineages: {len(unused_lineages)}', file=fp) + print(f"----\nUnused lineages: {len(unused_lineages)}", file=fp) for lineage in unused_lineages: fp.write(";".join(lca_utils.zip_lineage(lineage))) fp.write("\n") @@ -145,7 +167,7 @@ def index(args): main function for building an LCA database. 
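    For example, from the command line (hypothetical file names):

        sourmash lca index taxonomy.csv out.lca.json sig1.sig sig2.sig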
""" if args.start_column < 2: - error('error, --start-column cannot be less than 2') + error("error, --start-column cannot be less than 2") sys.exit(-1) set_quiet(args.quiet, args.debug) @@ -155,42 +177,50 @@ def index(args): if args.ksize is None: args.ksize = DEFAULT_LOAD_K - moltype = sourmash_args.calculate_moltype(args, default='DNA') + moltype = sourmash_args.calculate_moltype(args, default="DNA") picklist = sourmash_args.load_picklist(args) db_outfile = args.lca_db_out - if args.database_format == 'json': - if not (db_outfile.endswith('.lca.json') or \ - db_outfile.endswith('.lca.json.gz')): # logic -> db.save - db_outfile += '.lca.json' + if args.database_format == "json": + if not ( + db_outfile.endswith(".lca.json") or db_outfile.endswith(".lca.json.gz") + ): # logic -> db.save + db_outfile += ".lca.json" else: - assert args.database_format == 'sql' - if not db_outfile.endswith('.lca.sql'): - db_outfile += '.lca.sql' + assert args.database_format == "sql" + if not db_outfile.endswith(".lca.sql"): + db_outfile += ".lca.sql" if os.path.exists(db_outfile): error(f"ERROR: output file {db_outfile} already exists. Not overwriting.") sys.exit(-1) - notify(f'saving to LCA DB: {format(db_outfile)}') + notify(f"saving to LCA DB: {format(db_outfile)}") - notify(f'Building LCA database with ksize={args.ksize} scaled={args.scaled} moltype={moltype}.') + notify( + f"Building LCA database with ksize={args.ksize} scaled={args.scaled} moltype={moltype}." + ) # first, load taxonomy spreadsheet - delimiter = ',' + delimiter = "," if args.tabs: - delimiter = '\t' - assignments, num_rows = load_taxonomy_assignments(args.csv, - delimiter=delimiter, - start_column=args.start_column, - use_headers=not args.no_headers, - force=args.force, - split_identifiers=args.split_identifiers, - keep_identifier_versions=args.keep_identifier_versions + delimiter = "\t" + assignments, num_rows = load_taxonomy_assignments( + args.csv, + delimiter=delimiter, + start_column=args.start_column, + use_headers=not args.no_headers, + force=args.force, + split_identifiers=args.split_identifiers, + keep_identifier_versions=args.keep_identifier_versions, ) - notify(f'{len(assignments)} distinct identities in spreadsheet out of {num_rows} rows.') - notify(f'{len(set(assignments.values()))} distinct lineages in spreadsheet out of {num_rows} rows.') + notify( + f"{len(assignments)} distinct identities in spreadsheet out of {num_rows} rows." + ) + notify( + f"{len(set(assignments.values()))} distinct lineages in spreadsheet out of {num_rows} rows." + ) db = LCA_Database(args.ksize, args.scaled, moltype) @@ -216,18 +246,28 @@ def index(args): n_skipped = 0 for filename in inp_files: n += 1 - it = load_file_as_signatures(filename, ksize=args.ksize, - select_moltype=moltype, - picklist=picklist, - yield_all_files=args.force) + it = load_file_as_signatures( + filename, + ksize=args.ksize, + select_moltype=moltype, + picklist=picklist, + yield_all_files=args.force, + ) for sig in it: - notify(u'\r\033[K', end=u'') - notify(f'\r... loading signature {str(sig)[:30]} ({n} of {total_n}); skipped {n_skipped} so far', end='') + notify("\r\033[K", end="") + notify( + f"\r... loading signature {str(sig)[:30]} ({n} of {total_n}); skipped {n_skipped} so far", + end="", + ) debug(filename, sig) # block off duplicates. 
if sig.md5sum() in md5_to_name: - debug('WARNING: in file {}, duplicate md5sum: {}; skipping', filename, sig.md5sum()) + debug( + "WARNING: in file {}, duplicate md5sum: {}; skipping", + filename, + sig.md5sum(), + ) record_duplicates.add(sig.name) continue @@ -240,13 +280,13 @@ def index(args): ident = sig.filename orig_ident = ident - if args.split_identifiers: # hack for NCBI-style names, etc. + if args.split_identifiers: # hack for NCBI-style names, etc. # split on space... - ident = ident.split(' ')[0] + ident = ident.split(" ")[0] if not args.keep_identifier_versions: # ...and on period. - ident = ident.split('.')[0] + ident = ident.split(".")[0] lineage = assignments.get(ident) @@ -257,7 +297,7 @@ def index(args): if args.split_identifiers: notify(f"(Identifier extracted from name: '{orig_ident})')") sys.exit(-1) - debug('(skipping, because --require-taxonomy was specified)') + debug("(skipping, because --require-taxonomy was specified)") n_skipped += 1 continue @@ -265,8 +305,12 @@ def index(args): try: db.insert(sig, ident=ident, lineage=lineage) except ValueError as e: - error("ERROR: cannot insert signature '{}' (md5 {}, loaded from '{}') into database.", - sig, sig.md5sum()[:8], filename) + error( + "ERROR: cannot insert signature '{}' (md5 {}, loaded from '{}') into database.", + sig, + sig.md5sum()[:8], + filename, + ) error("ERROR: {}", str(e)) sys.exit(-1) @@ -280,35 +324,43 @@ def index(args): # track lineage info - either no lineage, or this lineage used. else: - debug('WARNING: no lineage assignment for {}.', ident) + debug("WARNING: no lineage assignment for {}.", ident) record_no_lineage.append(ident) # end main add signatures loop if n_skipped: - notify(f'... loaded {total_n} signatures; skipped {n_skipped} because of --require-taxonomy.') + notify( + f"... loaded {total_n} signatures; skipped {n_skipped} because of --require-taxonomy." + ) else: - notify(f'... loaded {total_n} signatures.') + notify(f"... loaded {total_n} signatures.") # check -- did we find any signatures? if n == 0: - error('ERROR: no signatures found. ??') + error("ERROR: no signatures found. ??") sys.exit(1) # check -- did the signatures we found have any hashes? if not db.hashvals: - error('ERROR: no hash values found - are there any signatures?') + error("ERROR: no hash values found - are there any signatures?") sys.exit(1) - notify(f'loaded {len(db.hashvals)} hashes at ksize={args.ksize} scaled={args.scaled}') + notify( + f"loaded {len(db.hashvals)} hashes at ksize={args.ksize} scaled={args.scaled}" + ) if picklist: sourmash_args.report_picklist(args, picklist) # summarize: - notify(f'{len(record_used_lineages)} assigned lineages out of {len(set(assignments.values()))} distinct lineages in spreadsheet.') + notify( + f"{len(record_used_lineages)} assigned lineages out of {len(set(assignments.values()))} distinct lineages in spreadsheet." + ) unused_lineages = set(assignments.values()) - record_used_lineages - notify(f'{len(record_used_idents)} identifiers used out of {len(set(assignments))} distinct identifiers in spreadsheet.') + notify( + f"{len(record_used_idents)} identifiers used out of {len(set(assignments))} distinct identifiers in spreadsheet." 
+    )

     assert record_used_idents.issubset(set(assignments))
     unused_identifiers = set(assignments) - record_used_idents

@@ -321,25 +373,34 @@ def index(args):
     # output a record of stuff if requested/available:
     if record_duplicates or record_no_lineage or record_remnants or unused_lineages:
         if record_duplicates:
-            notify(f'WARNING: {len(record_duplicates)} duplicate signatures.')
+            notify(f"WARNING: {len(record_duplicates)} duplicate signatures.")
         if record_no_lineage:
-            notify(f'WARNING: no lineage provided for {len(record_no_lineage)} signatures.')
+            notify(
+                f"WARNING: no lineage provided for {len(record_no_lineage)} signatures."
+            )
         if record_remnants:
-            notify(f'WARNING: no signatures for {len(record_remnants)} spreadsheet rows.')
+            notify(
+                f"WARNING: no signatures for {len(record_remnants)} spreadsheet rows."
+            )
         if unused_lineages:
-            notify(f'WARNING: {len(unused_lineages)} unused lineages.')
+            notify(f"WARNING: {len(unused_lineages)} unused lineages.")
         if unused_identifiers:
-            notify(f'WARNING: {len(unused_identifiers)} unused identifiers.')
+            notify(f"WARNING: {len(unused_identifiers)} unused identifiers.")

         if args.report:
             notify(f"generating a report and saving in '{args.report}'")
-            generate_report(record_duplicates, record_no_lineage,
-                            record_remnants, unused_lineages,
-                            unused_identifiers, args.report)
+            generate_report(
+                record_duplicates,
+                record_no_lineage,
+                record_remnants,
+                unused_lineages,
+                unused_identifiers,
+                args.report,
+            )
         else:
-            notify('(You can use --report to generate a detailed report.)')
+            notify("(You can use --report to generate a detailed report.)")


-if __name__ == '__main__':
+if __name__ == "__main__":
     sys.exit(index(sys.argv[1:]))
diff --git a/src/sourmash/lca/command_rankinfo.py b/src/sourmash/lca/command_rankinfo.py
index 8cd4c95a71..af0dbfa9d9 100644
--- a/src/sourmash/lca/command_rankinfo.py
+++ b/src/sourmash/lca/command_rankinfo.py
@@ -27,7 +27,6 @@ def make_lca_counts(dblist, min_num=0):
     # now convert to trees -> do LCA & counts
     counts = defaultdict(int)
     for hashval, lineages in assignments.items():
-
         # for each list of tuple_info [(rank, name), ...] build
         # a tree that lets us discover lowest-common-ancestor.
         debug(lineages)
@@ -46,7 +45,7 @@ def rankinfo_main(args):
     rankinfo!
     """
     if not args.db:
-        error('Error! must specify at least one LCA database with --db')
+        error("Error! must specify at least one LCA database with --db")
         sys.exit(-1)

     set_quiet(args.quiet, args.debug)
@@ -74,8 +73,8 @@
     else:
         for rank in lca_utils.taxlist():
             count = counts_by_rank.get(rank, 0)
-            print('{}: {} ({:.1f}%)'.format(rank, count, count / total * 100.))
+            print(f"{rank}: {count} ({count / total * 100.0:.1f}%)")


-if __name__ == '__main__':
+if __name__ == "__main__":
     sys.exit(rankinfo_main(sys.argv[1:]))
diff --git a/src/sourmash/lca/command_summarize.py b/src/sourmash/lca/command_summarize.py
index c571d7e141..02b57e60e1 100644
--- a/src/sourmash/lca/command_summarize.py
+++ b/src/sourmash/lca/command_summarize.py
@@ -13,7 +13,7 @@
 from sourmash.index import MultiIndex


-DEFAULT_THRESHOLD=5
+DEFAULT_THRESHOLD = 5


 def summarize(hashvals, dblist, threshold, ignore_abundance):
@@ -32,7 +32,7 @@
     # now convert to trees -> do LCA & counts
     if not ignore_abundance:
         counts = lca_utils.count_lca_for_assignments(assignments, hashvals)
-    else: # flatten
+    else:  # flatten
         counts = lca_utils.count_lca_for_assignments(assignments, None)

     debug(counts.most_common())
@@ -69,9 +69,10 @@ def load_singletons_and_count(filenames, ksize, scaled, ignore_abundance):
         idx = idx.select(ksize=ksize)

         for query_sig, query_filename in idx.signatures_with_location():
-            notify(u'\r\033[K', end=u'')
-            notify(f'... loading {query_sig} (file {n} of {total_n})',
-                   total_n, end='\r')
+            notify("\r\033[K", end="")
+            notify(
+                f"... loading {query_sig} (file {n} of {total_n})", total_n, end="\r"
+            )
             total_count += 1

             if ignore_abundance and query_sig.minhash.track_abundance:
@@ -82,8 +83,8 @@
             count_signature(query_sig, scaled, hashvals)
             yield query_filename, query_sig, hashvals

-    notify(u'\r\033[K', end=u'')
-    notify(f'loaded {total_count} signatures from {n} files total.')
+    notify("\r\033[K", end="")
+    notify(f"loaded {total_count} signatures from {n} files total.")


 def count_signature(sig, scaled, hashvals):
@@ -104,32 +105,34 @@ def output_results(lineage_counts, total_counts, filename=None, sig=None):
     Output results in ~human-readable format.
     """
-    for (lineage, count) in lineage_counts.items():
+    for lineage, count in lineage_counts.items():
         if lineage:
             lineage = lca_utils.zip_lineage(lineage, truncate_empty=True)
-            lineage = ';'.join(lineage)
+            lineage = ";".join(lineage)
         else:
-            lineage = '(root)'
+            lineage = "(root)"

-        p = count / total_counts * 100.
-        p = '{:.1f}%'.format(p)
+        p = count / total_counts * 100.0
+        p = f"{p:.1f}%"

-        print_results('{:5} {:>5} {} {}:{} {}'.format(p, count, lineage, filename, sig.md5sum()[:8], sig))
+        print_results(
+            f"{p:5} {count:>5} {lineage} {filename}:{sig.md5sum()[:8]} {sig}"
+        )

-def output_csv(lineage_counts, total_counts, csv_fp, filename, sig,
-               write_header=True):
+
+def output_csv(lineage_counts, total_counts, csv_fp, filename, sig, write_header=True):
     """\
     Output results in CSV.
""" w = csv.writer(csv_fp) if write_header: - headers = ['count'] + list(lca_utils.taxlist()) - headers += ['filename', 'sig_name', 'sig_md5', 'total_counts'] + headers = ["count"] + list(lca_utils.taxlist()) + headers += ["filename", "sig_name", "sig_md5", "total_counts"] w.writerow(headers) - for (lineage, count) in lineage_counts.items(): - debug('lineage:', lineage) + for lineage, count in lineage_counts.items(): + debug("lineage:", lineage) row = [count] + lca_utils.zip_lineage(lineage, truncate_empty=False) row += [filename, sig.name, sig.md5sum(), total_counts] w.writerow(row) @@ -140,7 +143,7 @@ def summarize_main(args): main summarization function. """ if not args.db: - error('Error! must specify at least one LCA database with --db') + error("Error! must specify at least one LCA database with --db") sys.exit(-1) set_quiet(args.quiet, args.debug) @@ -160,10 +163,12 @@ def summarize_main(args): # load all the databases dblist, ksize, scaled = lca_utils.load_databases(args.db, args.scaled) if ignore_abundance: - notify("Ignoring any k-mer abundances in query, since --ignore-abundance given.") + notify( + "Ignoring any k-mer abundances in query, since --ignore-abundance given." + ) # find all the queries - notify('finding query signatures...') + notify("finding query signatures...") inp_files = args.query if args.query_from_file: @@ -171,7 +176,7 @@ def summarize_main(args): inp_files.extend(more_files) if not inp_files: - error('Error! must specify at least one query signature with --query') + error("Error! must specify at least one query signature with --query") sys.exit(-1) if not check_files_exist(*inp_files): @@ -181,31 +186,37 @@ def summarize_main(args): csv_fp = None write_header = True if args.output: - csv_fp = open(args.output, 'w', newline='') + csv_fp = open(args.output, "w", newline="") try: - for filename, sig, hashvals in \ - load_singletons_and_count(inp_files, ksize, scaled, ignore_abundance): - + for filename, sig, hashvals in load_singletons_and_count( + inp_files, ksize, scaled, ignore_abundance + ): # get the full counted list of lineage counts in this signature - lineage_counts = summarize(hashvals, dblist, args.threshold, - ignore_abundance) + lineage_counts = summarize( + hashvals, dblist, args.threshold, ignore_abundance + ) if not ignore_abundance: total = float(sum(hashvals.values())) else: total = float(len(hashvals)) - output_results(lineage_counts, total, - filename=filename, sig=sig) + output_results(lineage_counts, total, filename=filename, sig=sig) if csv_fp: - output_csv(lineage_counts, total, csv_fp, filename, sig, - write_header=write_header) + output_csv( + lineage_counts, + total, + csv_fp, + filename, + sig, + write_header=write_header, + ) write_header = False finally: if csv_fp: csv_fp.close() -if __name__ == '__main__': +if __name__ == "__main__": sys.exit(summarize_main(sys.argv[1:])) diff --git a/src/sourmash/lca/lca_db.py b/src/sourmash/lca/lca_db.py index daabe3cb70..78855c71b8 100644 --- a/src/sourmash/lca/lca_db.py +++ b/src/sourmash/lca/lca_db.py @@ -14,6 +14,7 @@ def cached_property(fun): """A memoize decorator for class properties.""" + @functools.wraps(fun) def get(self): try: @@ -24,6 +25,7 @@ def get(self): pass ret = self._cache[fun] = fun(self) return ret + return property(get) @@ -56,13 +58,14 @@ class LCA_Database(Index): `_hashval_to_idx` is a dictionary from individual hash values to sets of `idx`. """ + is_database = True # we set manifest to None to avoid implication of fast on-disk access to # sketches. 
This may be revisited later. manifest = None - def __init__(self, ksize, scaled, moltype='DNA'): + def __init__(self, ksize, scaled, moltype="DNA"): self.ksize = int(ksize) self.scaled = int(scaled) self.filename = None @@ -98,7 +101,7 @@ def _invalidate_cache(self): Internal method. """ - if hasattr(self, '_cache'): + if hasattr(self, "_cache"): del self._cache def _get_ident_index(self, ident, fail_on_duplicate=False): @@ -108,7 +111,7 @@ def _get_ident_index(self, ident, fail_on_duplicate=False): """ idx = self._ident_to_idx.get(ident) if fail_on_duplicate: - assert idx is None # should be no duplicate identities + assert idx is None # should be no duplicate identities if idx is None: idx = self._next_index @@ -153,10 +156,18 @@ def insert(self, sig, ident=None, lineage=None): minhash = sig.minhash if minhash.ksize != self.ksize: - raise ValueError("cannot insert signature with ksize {} into DB (ksize {})".format(minhash.ksize, self.ksize)) + raise ValueError( + "cannot insert signature with ksize {} into DB (ksize {})".format( + minhash.ksize, self.ksize + ) + ) if minhash.moltype != self.moltype: - raise ValueError("cannot insert signature with moltype {} into DB (moltype {})".format(minhash.moltype, self.moltype)) + raise ValueError( + "cannot insert signature with moltype {} into DB (moltype {})".format( + minhash.moltype, self.moltype + ) + ) # downsample to specified scaled; this has the side effect of # making sure they're all at the same scaled value! @@ -169,7 +180,7 @@ def insert(self, sig, ident=None, lineage=None): ident = str(sig) if ident in self._ident_to_name: - raise ValueError("signature '{}' is already in this LCA db.".format(ident)) + raise ValueError(f"signature '{ident}' is already in this LCA db.") # before adding, invalide any caching from @cached_property self._invalidate_cache() @@ -189,7 +200,7 @@ def insert(self, sig, ident=None, lineage=None): # map idx to lid as well. self._idx_to_lid[idx] = lid except TypeError: - raise ValueError('lineage cannot be used as a key?!') + raise ValueError("lineage cannot be used as a key?!") for hashval in minhash.hashes: self._hashval_to_idx[hashval].add(idx) @@ -197,7 +208,7 @@ def insert(self, sig, ident=None, lineage=None): return len(minhash) def __repr__(self): - return "LCA_Database('{}')".format(self.filename) + return f"LCA_Database('{self.filename}')" def signatures(self): """Return all of the signatures in this LCA database. @@ -224,8 +235,16 @@ def _signatures_with_internal(self): for idx, ss in self._signatures.items(): yield ss, idx - def select(self, ksize=None, moltype=None, num=0, scaled=0, abund=None, - containment=False, picklist=None): + def select( + self, + ksize=None, + moltype=None, + num=0, + scaled=0, + abund=None, + containment=False, + picklist=None, + ): """Select a subset of signatures to search. 
As with SBTs, queries with higher scaled values than the database @@ -239,12 +258,18 @@ def select(self, ksize=None, moltype=None, num=0, scaled=0, abund=None, raise ValueError("cannot use 'num' MinHashes to search LCA database") if scaled > self.scaled and not containment: - raise ValueError(f"cannot use scaled={scaled} on this database (scaled={self.scaled})") + raise ValueError( + f"cannot use scaled={scaled} on this database (scaled={self.scaled})" + ) if ksize is not None and self.ksize != ksize: - raise ValueError(f"ksize on this database is {self.ksize}; this is different from requested ksize of {ksize}") + raise ValueError( + f"ksize on this database is {self.ksize}; this is different from requested ksize of {ksize}" + ) if moltype is not None and moltype != self.moltype: - raise ValueError(f"moltype on this database is {self.moltype}; this is different from requested moltype of {moltype}") + raise ValueError( + f"moltype on this database is {self.moltype}; this is different from requested moltype of {moltype}" + ) if abund: raise ValueError("LCA databases do not support sketches with abund=True") @@ -252,7 +277,9 @@ def select(self, ksize=None, moltype=None, num=0, scaled=0, abund=None, if picklist is not None: self.picklists.append(picklist) if len(self.picklists) > 1: - raise ValueError("we do not (yet) support multiple picklists for LCA databases") + raise ValueError( + "we do not (yet) support multiple picklists for LCA databases" + ) return self @@ -266,24 +293,27 @@ def load(cls, db_name): from sourmash.tax.tax_utils import LineagePair if not os.path.isfile(db_name): - raise ValueError(f"'{db_name}' is not a file and cannot be loaded as an LCA database") + raise ValueError( + f"'{db_name}' is not a file and cannot be loaded as an LCA database" + ) try: from sourmash.index.sqlite_index import LCA_SqliteDatabase + return LCA_SqliteDatabase.load(db_name) except ValueError: pass xopen = open - if db_name.endswith('.gz'): + if db_name.endswith(".gz"): xopen = gzip.open - with xopen(db_name, 'rt') as fp: + with xopen(db_name, "rt") as fp: try: first_ch = fp.read(1) except ValueError: - first_ch = 'X' - if not first_ch or first_ch[0] != '{': + first_ch = "X" + if not first_ch or first_ch[0] != "{": raise ValueError(f"'{db_name}' is not an LCA database file.") fp.seek(0) @@ -295,41 +325,45 @@ def load(cls, db_name): pass if not load_d: - raise ValueError("cannot parse database file '{}' as JSON; invalid format.") + raise ValueError( + "cannot parse database file '{}' as JSON; invalid format." + ) version = None db_type = None try: - version = load_d.get('version') - db_type = load_d.get('type') + version = load_d.get("version") + db_type = load_d.get("type") except AttributeError: pass - if db_type != 'sourmash_lca': - raise ValueError("database file '{}' is not an LCA db.".format(db_name)) + if db_type != "sourmash_lca": + raise ValueError(f"database file '{db_name}' is not an LCA db.") version = float(version) - if version < 2.0 or 'lid_to_lineage' not in load_d: - raise ValueError("Error! This is an old-style LCA DB. You'll need to rebuild or download a newer one.") - - ksize = int(load_d['ksize']) - scaled = int(load_d['scaled']) - moltype = load_d.get('moltype', 'DNA') - if moltype != 'DNA': + if version < 2.0 or "lid_to_lineage" not in load_d: + raise ValueError( + "Error! This is an old-style LCA DB. You'll need to rebuild or download a newer one." 
+ ) + + ksize = int(load_d["ksize"]) + scaled = int(load_d["scaled"]) + moltype = load_d.get("moltype", "DNA") + if moltype != "DNA": assert ksize % 3 == 0 ksize = int(ksize / 3) db = cls(ksize, scaled, moltype) # convert lineage_dict to proper lineages (tuples of LineagePairs) - lid_to_lineage_2 = load_d['lid_to_lineage'] + lid_to_lineage_2 = load_d["lid_to_lineage"] lid_to_lineage = {} lineage_to_lid = {} for k, v in lid_to_lineage_2.items(): - v = dict( ((x[0], x[1]) for x in v) ) + v = dict((x[0], x[1]) for x in v) vv = [] for rank in taxlist(): - name = v.get(rank, '') + name = v.get(rank, "") vv.append(LineagePair(rank, name)) vv = tuple(vv) @@ -340,18 +374,18 @@ def load(cls, db_name): # convert hashval -> lineage index keys to integers (looks like # JSON doesn't have a 64 bit type so stores them as strings) - hashval_to_idx_2 = load_d['hashval_to_idx'] + hashval_to_idx_2 = load_d["hashval_to_idx"] hashval_to_idx = {} for k, v in hashval_to_idx_2.items(): hashval_to_idx[int(k)] = v db._hashval_to_idx = hashval_to_idx - db._ident_to_name = load_d['ident_to_name'] - db._ident_to_idx = load_d['ident_to_idx'] + db._ident_to_name = load_d["ident_to_name"] + db._ident_to_idx = load_d["ident_to_idx"] db._idx_to_lid = {} - for k, v in load_d['idx_to_lid'].items(): + for k, v in load_d["idx_to_lid"].items(): db._idx_to_lid[int(k)] = v if db._ident_to_idx: @@ -367,11 +401,11 @@ def load(cls, db_name): return db - def save(self, db_name, *, format='json'): - if format == 'sql': + def save(self, db_name, *, format="json"): + if format == "sql": self.save_to_sql(db_name) else: - assert format == 'json' + assert format == "json" self.save_to_json(db_name) def save_to_json(self, db_name): @@ -380,42 +414,45 @@ def save_to_json(self, db_name): Method specific to this class. 
""" if os.path.exists(db_name): - raise ValueError(f"LCA database {db_name} already exists; not overwriting or appending") + raise ValueError( + f"LCA database {db_name} already exists; not overwriting or appending" + ) xopen = open - if db_name.endswith('.gz'): + if db_name.endswith(".gz"): xopen = gzip.open - with xopen(db_name, 'wt') as fp: + with xopen(db_name, "wt") as fp: # use an OrderedDict to preserve output order save_d = OrderedDict() - save_d['version'] = '2.1' - save_d['type'] = 'sourmash_lca' - save_d['license'] = 'CC0' + save_d["version"] = "2.1" + save_d["type"] = "sourmash_lca" + save_d["license"] = "CC0" - if self.moltype != 'DNA': - ksize = self.ksize*3 + if self.moltype != "DNA": + ksize = self.ksize * 3 else: ksize = self.ksize - save_d['ksize'] = ksize - save_d['scaled'] = self.scaled - save_d['moltype'] = self.moltype + save_d["ksize"] = ksize + save_d["scaled"] = self.scaled + save_d["moltype"] = self.moltype # convert lineage internals from tuples to dictionaries d = OrderedDict() for k, v in self._lid_to_lineage.items(): - d[k] = dict([ (vv.rank, vv.name) for vv in v ]) - save_d['lid_to_lineage'] = d + d[k] = dict([(vv.rank, vv.name) for vv in v]) + save_d["lid_to_lineage"] = d # convert values from sets to lists, so that JSON knows how to save - save_d['hashval_to_idx'] = \ - dict((k, list(v)) for (k, v) in self._hashval_to_idx.items()) - - save_d['ident_to_name'] = self._ident_to_name - save_d['ident_to_idx'] = self._ident_to_idx - save_d['idx_to_lid'] = self._idx_to_lid - save_d['lid_to_lineage'] = self._lid_to_lineage - + save_d["hashval_to_idx"] = dict( + (k, list(v)) for (k, v) in self._hashval_to_idx.items() + ) + + save_d["ident_to_name"] = self._ident_to_name + save_d["ident_to_idx"] = self._ident_to_idx + save_d["idx_to_lid"] = self._idx_to_lid + save_d["lid_to_lineage"] = self._lid_to_lineage + json.dump(save_d, fp) def save_to_sql(self, dbname): @@ -424,11 +461,13 @@ def save_to_sql(self, dbname): from sourmash.tax.tax_utils import LineageDB if os.path.exists(dbname): - raise ValueError(f"LCA database {dbname} already exists; not overwriting or appending") + raise ValueError( + f"LCA database {dbname} already exists; not overwriting or appending" + ) # create a new in-memory lineage db... 
assignments = {} - available_ranks = set() # track ranks, too + available_ranks = set() # track ranks, too for ident, idx in self._ident_to_idx.items(): lid = self._idx_to_lid.get(idx) if lid is not None: @@ -454,7 +493,7 @@ def downsample_scaled(self, scaled): if scaled == self.scaled: return elif scaled < self.scaled: - raise ValueError("cannot decrease scaled from {} to {}".format(self.scaled, scaled)) + raise ValueError(f"cannot decrease scaled from {self.scaled} to {scaled}") self._invalidate_cache() @@ -513,22 +552,28 @@ def _signatures(self): is_protein = False is_hp = False is_dayhoff = False - if self.moltype == 'protein': + if self.moltype == "protein": is_protein = True - elif self.moltype == 'hp': + elif self.moltype == "hp": is_hp = True - elif self.moltype == 'dayhoff': + elif self.moltype == "dayhoff": is_dayhoff = True - minhash = MinHash(n=0, ksize=self.ksize, scaled=self.scaled, - is_protein=is_protein, hp=is_hp, dayhoff=is_dayhoff) + minhash = MinHash( + n=0, + ksize=self.ksize, + scaled=self.scaled, + is_protein=is_protein, + hp=is_hp, + dayhoff=is_dayhoff, + ) - debug('creating signatures for LCA DB...') + debug("creating signatures for LCA DB...") mhd = defaultdict(minhash.copy_and_clear) temp_vals = defaultdict(list) # invert the hashval_to_idx dictionary - for (hashval, idlist) in self._hashval_to_idx.items(): + for hashval, idlist in self._hashval_to_idx.items(): for idx in idlist: temp_hashes = temp_vals[idx] temp_hashes.append(hashval) @@ -559,7 +604,7 @@ def _signatures(self): if passes_all_picklists(ss, self.picklists): sigd[idx] = ss - debug('=> {} signatures!', len(sigd)) + debug("=> {} signatures!", len(sigd)) return sigd def find(self, search_fn, query, **kwargs): @@ -582,9 +627,13 @@ def find(self, search_fn, query, **kwargs): if self.scaled > query_scaled: query_mh = query_mh.downsample(scaled=self.scaled) query_scaled = query_mh.scaled - prepare_subject = lambda x: x # identity + + def prepare_subject(x): + return x # identity else: - prepare_subject = lambda subj: subj.downsample(scaled=query_scaled) + + def prepare_subject(subj): + return subj.downsample(scaled=query_scaled) # collect matching hashes for the query: c = Counter() @@ -594,7 +643,7 @@ def find(self, search_fn, query, **kwargs): for idx in idx_list: c[idx] += 1 - debug('number of matching signatures for hashes: {}', len(c)) + debug("number of matching signatures for hashes: {}", len(c)) # for each match, in order of largest overlap, for idx, count in c.most_common(): @@ -604,7 +653,7 @@ def find(self, search_fn, query, **kwargs): # this piecemeal by iterating across all the hashes, instead. subj = self._signatures.get(idx) - if subj is None: # must be because of a picklist exclusion + if subj is None: # must be because of a picklist exclusion assert self.picklists continue @@ -616,8 +665,7 @@ def find(self, search_fn, query, **kwargs): shared_size = query_mh.count_common(subj_mh) total_size = len(query_mh + subj_mh) - score = search_fn.score_fn(query_size, shared_size, subj_size, - total_size) + score = search_fn.score_fn(query_size, shared_size, subj_size, total_size) # CTB note to self: even with JaccardSearchBestOnly, this will # still iterate over & score all signatures. We should come @@ -671,14 +719,14 @@ def load_databases(filenames, scaled=None, verbose=True): # load all the databases for db_name in filenames: if verbose: - notify(u'\r\033[K', end=u'') - notify(f'... loading database {format(db_name)}', end='\r') + notify("\r\033[K", end="") + notify(f"... 
loading database {format(db_name)}", end="\r") lca_db = LCA_Database.load(db_name) ksize_vals.add(lca_db.ksize) if len(ksize_vals) > 1: - raise Exception('multiple ksizes, quitting') + raise Exception("multiple ksizes, quitting") if scaled and scaled > lca_db.scaled: lca_db.downsample_scaled(scaled) @@ -686,7 +734,7 @@ def load_databases(filenames, scaled=None, verbose=True): moltype_vals.add(lca_db.moltype) if len(moltype_vals) > 1: - raise Exception('multiple moltypes, quitting') + raise Exception("multiple moltypes, quitting") dblist.append(lca_db) @@ -695,7 +743,9 @@ def load_databases(filenames, scaled=None, verbose=True): moltype = moltype_vals.pop() if verbose: - notify(u'\r\033[K', end=u'') - notify(f'loaded {len(dblist)} LCA databases. ksize={ksize}, scaled={scaled} moltype={moltype}') + notify("\r\033[K", end="") + notify( + f"loaded {len(dblist)} LCA databases. ksize={ksize}, scaled={scaled} moltype={moltype}" + ) return dblist, ksize, scaled diff --git a/src/sourmash/lca/lca_utils.py b/src/sourmash/lca/lca_utils.py index 8ee9340ed7..70b883bb7d 100644 --- a/src/sourmash/lca/lca_utils.py +++ b/src/sourmash/lca/lca_utils.py @@ -7,12 +7,23 @@ from .lca_db import LCA_Database, load_single_database, load_databases -__all__ = ['taxlist', 'zip_lineage', 'build_tree', 'find_lca', - 'load_single_database', 'load_databases', 'gather_assignments', - 'count_lca_for_assignments', 'LineagePair', 'display_lineage', - 'make_lineage', 'pop_to_rank', 'is_lineage_match'] - -try: # py2/py3 compat +__all__ = [ + "taxlist", + "zip_lineage", + "build_tree", + "find_lca", + "load_single_database", + "load_databases", + "gather_assignments", + "count_lca_for_assignments", + "LineagePair", + "display_lineage", + "make_lineage", + "pop_to_rank", + "is_lineage_match", +] + +try: # py2/py3 compat from itertools import zip_longest except ImportError: from itertools import izip_longest as zip_longest @@ -20,7 +31,7 @@ from sourmash.logging import notify, error, debug # type to store an element in a taxonomic lineage -LineagePair = namedtuple('LineagePair', ['rank', 'name']) +LineagePair = namedtuple("LineagePair", ["rank", "name"]) def check_files_exist(*files): @@ -32,8 +43,12 @@ def check_files_exist(*files): ret = False if len(not_found): - error('Error! Could not find the following files.' - ' Make sure the file paths are specified correctly.\n{}'.format('\n'.join(not_found))) + error( + "Error! Could not find the following files." + " Make sure the file paths are specified correctly.\n{}".format( + "\n".join(not_found) + ) + ) return ret @@ -43,11 +58,17 @@ def taxlist(include_strain=True): """ Provide an ordered list of taxonomic ranks. 
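    For example (illustrative doctest):

    >>> list(taxlist(include_strain=False))[:3]
    ['superkingdom', 'phylum', 'class']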
""" - for k in ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', - 'species']: - yield k + yield from [ + "superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + ] if include_strain: - yield 'strain' + yield "strain" # produce an ordered list of tax names from lineage @@ -67,10 +88,11 @@ def zip_lineage(lineage, include_strain=True, truncate_empty=False): ['a', '', 'c', '', '', '', '', ''] """ - empty = LineagePair(None, '') + empty = LineagePair(None, "") - pairs = zip_longest(taxlist(include_strain=include_strain), - lineage, fillvalue=empty) + pairs = zip_longest( + taxlist(include_strain=include_strain), lineage, fillvalue=empty + ) pairs = list(pairs) # eliminate empty if so requested @@ -85,22 +107,30 @@ def zip_lineage(lineage, include_strain=True, truncate_empty=False): for taxrank, lineage_tup in pairs: # validate non-empty tax, e.g. superkingdom/phylum/class in order. if lineage_tup != empty and lineage_tup.rank != taxrank: - raise ValueError('incomplete lineage at {} - is {} instead'.format(taxrank, lineage_tup.rank)) + raise ValueError( + f"incomplete lineage at {taxrank} - is {lineage_tup.rank} instead" + ) row.append(lineage_tup.name) return row def display_lineage(lineage, include_strain=True, truncate_empty=True): - return ";".join(zip_lineage(lineage, - include_strain=include_strain, - truncate_empty=truncate_empty)) + return ";".join( + zip_lineage( + lineage, include_strain=include_strain, truncate_empty=truncate_empty + ) + ) # filter function toreplace blank/na/null with 'unassigned' -filter_null = lambda x: 'unassigned' if x is None or x.strip() in \ - ('[Blank]', 'na', 'null', '') else x -null_names = set(['[Blank]', 'na', 'null']) +def filter_null(x): + return ( + "unassigned" if x is None or x.strip() in ("[Blank]", "na", "null", "") else x + ) + + +null_names = set(["[Blank]", "na", "null"]) def build_tree(assignments, initial=None): @@ -142,13 +172,13 @@ def find_lca(tree): node = tree lineage = [] while 1: - if len(node) == 1: # descend to only child; track path + if len(node) == 1: # descend to only child; track path lineage_tup = next(iter(node.keys())) lineage.append(lineage_tup) node = node[lineage_tup] - elif len(node) == 0: # at leaf; end + elif len(node) == 0: # at leaf; end return tuple(lineage), 0 - else: # len(node) > 1 => confusion!! + else: # len(node) > 1 => confusion!! return tuple(lineage), len(node) @@ -231,14 +261,14 @@ def pop_to_rank(lin, rank): return tuple(lin) - def make_lineage(lineage): "Turn a ; or ,-separated set of lineages into a tuple of LineagePair objs." 
from sourmash.tax.tax_utils import LineagePair - lin = lineage.split(';') + + lin = lineage.split(";") if len(lin) == 1: - lin = lineage.split(',') - lin = [ LineagePair(rank, n) for (rank, n) in zip(taxlist(), lin) ] + lin = lineage.split(",") + lin = [LineagePair(rank, n) for (rank, n) in zip(taxlist(), lin)] lin = tuple(lin) return lin diff --git a/src/sourmash/logging.py b/src/sourmash/logging.py index 2915c43f78..ad885a7aee 100644 --- a/src/sourmash/logging.py +++ b/src/sourmash/logging.py @@ -3,6 +3,8 @@ _quiet = False _debug = False + + def set_quiet(val, print_debug=False): global _quiet, _debug _quiet = bool(val) @@ -22,10 +24,9 @@ def notify(s, *args, **kwargs): if _quiet: return - print(u'\r\033[K', end=u'', file=sys.stderr) - print(s.format(*args, **kwargs), file=sys.stderr, - end=kwargs.get('end', u'\n')) - if kwargs.get('flush'): + print("\r\033[K", end="", file=sys.stderr) + print(s.format(*args, **kwargs), file=sys.stderr, end=kwargs.get("end", "\n")) + if kwargs.get("flush"): sys.stderr.flush() @@ -34,10 +35,9 @@ def debug(s, *args, **kwargs): if _quiet or not _debug: return - print(u'\r\033[K', end=u'', file=sys.stderr) - print(s.format(*args, **kwargs), file=sys.stderr, - end=kwargs.get('end', u'\n')) - if kwargs.get('flush'): + print("\r\033[K", end="", file=sys.stderr) + print(s.format(*args, **kwargs), file=sys.stderr, end=kwargs.get("end", "\n")) + if kwargs.get("flush"): sys.stderr.flush() @@ -46,17 +46,17 @@ def debug_literal(s, *args, **kwargs): if _quiet or not _debug: return - print(u'\r\033[K', end=u'', file=sys.stderr) - print(s, file=sys.stderr, end=kwargs.get('end', u'\n')) - if kwargs.get('flush'): + print("\r\033[K", end="", file=sys.stderr) + print(s, file=sys.stderr, end=kwargs.get("end", "\n")) + if kwargs.get("flush"): sys.stderr.flush() def error(s, *args, **kwargs): "A simple error logging function => stderr." 
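    # Usage sketch: positional args are interpolated str.format-style,
    # e.g. error("missing {} of {}", 3, 10) writes "missing 3 of 10" to
    # stderr after clearing the current terminal line.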
- print(u'\r\033[K', end=u'', file=sys.stderr) + print("\r\033[K", end="", file=sys.stderr) print(s.format(*args, **kwargs), file=sys.stderr) - if kwargs.get('flush'): + if kwargs.get("flush"): sys.stderr.flush() @@ -67,13 +67,13 @@ def test_notify(): saveerr, sys.stderr = sys.stderr, StringIO() try: _quiet = False - notify(u'hello, world') + notify("hello, world") finally: _quiet = qsave saveerr, sys.stderr = sys.stderr, saveerr print(type(saveerr)) - assert 'hello, world\n' in saveerr.getvalue() + assert "hello, world\n" in saveerr.getvalue() def test_notify_flush(): @@ -83,13 +83,13 @@ def test_notify_flush(): saveerr, sys.stderr = sys.stderr, StringIO() try: _quiet = False - notify(u'hello, world', flush=True) + notify("hello, world", flush=True) finally: _quiet = qsave saveerr, sys.stderr = sys.stderr, saveerr print(type(saveerr)) - assert 'hello, world' in saveerr.getvalue() + assert "hello, world" in saveerr.getvalue() def test_notify_end(): @@ -99,13 +99,13 @@ def test_notify_end(): saveerr, sys.stderr = sys.stderr, StringIO() try: _quiet = False - notify(u'hello, world', end=u'FOO') + notify("hello, world", end="FOO") finally: _quiet = qsave saveerr, sys.stderr = sys.stderr, saveerr print(type(saveerr)) - assert 'hello, worldFOO' in saveerr.getvalue() + assert "hello, worldFOO" in saveerr.getvalue() def test_notify_quiet(): @@ -115,13 +115,13 @@ def test_notify_quiet(): saveerr, sys.stderr = sys.stderr, StringIO() try: _quiet = True - notify(u'hello, world') + notify("hello, world") finally: _quiet = qsave saveerr, sys.stderr = sys.stderr, saveerr print(type(saveerr)) - assert 'hello, world' not in saveerr.getvalue() + assert "hello, world" not in saveerr.getvalue() def test_error(): @@ -131,13 +131,13 @@ def test_error(): saveerr, sys.stderr = sys.stderr, StringIO() try: _quiet = False - error(u'hello, world') + error("hello, world") finally: _quiet = qsave saveerr, sys.stderr = sys.stderr, saveerr print(type(saveerr)) - assert 'hello, world\n' in saveerr.getvalue() + assert "hello, world\n" in saveerr.getvalue() def test_error_flush(): @@ -147,13 +147,13 @@ def test_error_flush(): saveerr, sys.stderr = sys.stderr, StringIO() try: _quiet = False - error(u'hello, world', flush=True) + error("hello, world", flush=True) finally: _quiet = qsave saveerr, sys.stderr = sys.stderr, saveerr print(type(saveerr)) - assert 'hello, world' in saveerr.getvalue() + assert "hello, world" in saveerr.getvalue() def test_error_quiet(): @@ -164,10 +164,10 @@ def test_error_quiet(): saveerr, sys.stderr = sys.stderr, StringIO() try: _quiet = True - error(u'hello, world') + error("hello, world") finally: _quiet = qsave saveerr, sys.stderr = sys.stderr, saveerr print(type(saveerr)) - assert 'hello, world' in saveerr.getvalue() + assert "hello, world" in saveerr.getvalue() diff --git a/src/sourmash/manifest.py b/src/sourmash/manifest.py index 466bfa8e7a..2f00f5c382 100644 --- a/src/sourmash/manifest.py +++ b/src/sourmash/manifest.py @@ -23,11 +23,21 @@ class BaseCollectionManifest: * 'locations()' returns all distinct locations for e.g. lazy loading * supports container protocol for signatures, e.g. 'if ss in manifest: ...' """ + # each manifest row must have the following, although they may be empty. 
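    # A hypothetical row, for orientation (values illustrative only):
    #   {"internal_location": "sigs.zip", "md5": "09a08691...",
    #    "md5short": "09a08691", "ksize": 31, "moltype": "DNA", "num": 0,
    #    "scaled": 1000, "n_hashes": 500, "with_abundance": 0,
    #    "name": "example genome", "filename": ""}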
- required_keys = ('internal_location', - 'md5', 'md5short', 'ksize', 'moltype', 'num', - 'scaled', 'n_hashes', 'with_abundance', - 'name', 'filename') + required_keys = ( + "internal_location", + "md5", + "md5short", + "ksize", + "moltype", + "num", + "scaled", + "n_hashes", + "with_abundance", + "name", + "filename", + ) @classmethod @abstractmethod @@ -42,12 +52,12 @@ def load_from_filename(cls, filename): return db # not a SQLite db? CTB: fix this to actually try loading this as .gz... - if filename.endswith('.gz'): + if filename.endswith(".gz"): xopen = gzip.open else: xopen = open - with xopen(filename, 'rt', newline="") as fp: + with xopen(filename, "rt", newline="") as fp: return cls.load_from_csv(fp) @classmethod @@ -55,10 +65,10 @@ def load_from_csv(cls, fp): "load a manifest from a CSV file." manifest_list = [] firstline = fp.readline().rstrip() - if not firstline.startswith('# SOURMASH-MANIFEST-VERSION: '): + if not firstline.startswith("# SOURMASH-MANIFEST-VERSION: "): raise ValueError("manifest is missing version header") - version = firstline[len('# SOURMASH-MANIFEST-VERSION: '):] + version = firstline[len("# SOURMASH-MANIFEST-VERSION: ") :] if float(version) != 1.0: raise ValueError(f"unknown manifest version number {version}") @@ -73,15 +83,15 @@ def load_from_csv(cls, fp): row = None # do row type conversion - introws = ('num', 'scaled', 'ksize', 'n_hashes') - boolrows = ('with_abundance',) + introws = ("num", "scaled", "ksize", "n_hashes") + boolrows = ("with_abundance",) for row in r: for k in introws: row[k] = int(row[k]) for k in boolrows: row[k] = bool(ast.literal_eval(str(row[k]))) - row['signature'] = None + row["signature"] = None manifest_list.append(row) return CollectionManifest(manifest_list) @@ -89,69 +99,71 @@ def load_from_csv(cls, fp): @classmethod def load_from_sql(cls, filename): from sourmash.index.sqlite_index import load_sqlite_index + db = load_sqlite_index(filename, request_manifest=True) if db is not None: return db.manifest return None - def write_to_filename(self, filename, *, database_format='csv', - ok_if_exists=False): - if database_format == 'csv': + def write_to_filename(self, filename, *, database_format="csv", ok_if_exists=False): + if database_format == "csv": from .sourmash_args import FileOutputCSV + if ok_if_exists or not os.path.exists(filename): with FileOutputCSV(filename) as fp: return self.write_to_csv(fp, write_header=True) elif os.path.exists(filename) and not ok_if_exists: raise Exception("output manifest already exists") - elif database_format == 'sql': + elif database_format == "sql": from sourmash.index.sqlite_index import SqliteCollectionManifest - SqliteCollectionManifest.load_from_manifest(self, dbfile=filename, - append=ok_if_exists) + + SqliteCollectionManifest.load_from_manifest( + self, dbfile=filename, append=ok_if_exists + ) @classmethod def write_csv_header(cls, fp): "write header for manifest CSV format" - fp.write('# SOURMASH-MANIFEST-VERSION: 1.0\n') + fp.write("# SOURMASH-MANIFEST-VERSION: 1.0\n") w = csv.DictWriter(fp, fieldnames=cls.required_keys) w.writeheader() def write_to_csv(self, fp, write_header=False): "write manifest CSV to specified file handle" - w = csv.DictWriter(fp, fieldnames=self.required_keys, - extrasaction='ignore') + w = csv.DictWriter(fp, fieldnames=self.required_keys, extrasaction="ignore") if write_header: self.write_csv_header(fp) for row in self.rows: # don't write signature! 
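            # ('signature' is an in-memory convenience key added by
            # make_manifest_row; dropping it here means only the declared
            # required_keys columns reach the CSV.)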
- if 'signature' in row: - del row['signature'] + if "signature" in row: + del row["signature"] w.writerow(row) @classmethod def make_manifest_row(cls, ss, location, *, include_signature=True): "make a manifest row dictionary." row = {} - row['md5'] = ss.md5sum() - row['md5short'] = row['md5'][:8] - row['ksize'] = ss.minhash.ksize - row['moltype'] = ss.minhash.moltype - row['num'] = ss.minhash.num - row['scaled'] = ss.minhash.scaled - row['n_hashes'] = len(ss.minhash) - row['with_abundance'] = 1 if ss.minhash.track_abundance else 0 - row['name'] = ss.name - row['filename'] = ss.filename - row['internal_location'] = location + row["md5"] = ss.md5sum() + row["md5short"] = row["md5"][:8] + row["ksize"] = ss.minhash.ksize + row["moltype"] = ss.minhash.moltype + row["num"] = ss.minhash.num + row["scaled"] = ss.minhash.scaled + row["n_hashes"] = len(ss.minhash) + row["with_abundance"] = 1 if ss.minhash.track_abundance else 0 + row["name"] = ss.name + row["filename"] = ss.filename + row["internal_location"] = location assert set(row.keys()) == set(cls.required_keys) # if requested, include the signature in the manifest. if include_signature: - row['signature'] = ss + row["signature"] = ss return row @classmethod @@ -164,8 +176,9 @@ def create_manifest(cls, locations_iter, *, include_signature=True): """ manifest_list = [] for ss, location in locations_iter: - row = cls.make_manifest_row(ss, location, - include_signature=include_signature) + row = cls.make_manifest_row( + ss, location, include_signature=include_signature + ) manifest_list.append(row) return cls(manifest_list) @@ -216,6 +229,7 @@ class CollectionManifest(BaseCollectionManifest): """ An in-memory manifest that simply stores the rows in a list. """ + def __init__(self, rows=[]): "Initialize from an iterable of metadata dictionaries." self.rows = [] @@ -237,7 +251,7 @@ def _add_rows(self, rows): # only iterate once, in case it's a generator for row in rows: self.rows.append(row) - md5set.add(row['md5']) + md5set.add(row["md5"]) def __iadd__(self, other): if self is other: @@ -258,7 +272,7 @@ def __len__(self): def __eq__(self, other): "Check equality on a row-by-row basis. May fail on out-of-order rows." - for (a, b) in itertools.zip_longest(self.rows, other.rows): + for a, b in itertools.zip_longest(self.rows, other.rows): if a is None or b is None: return False @@ -269,41 +283,49 @@ def __eq__(self, other): return True - def _select(self, *, ksize=None, moltype=None, scaled=0, num=0, - containment=False, abund=None, picklist=None): + def _select( + self, + *, + ksize=None, + moltype=None, + scaled=0, + num=0, + containment=False, + abund=None, + picklist=None, + ): """Yield manifest rows for sigs that match the specified requirements. Internal method; call `select_to_manifest` instead. 
""" matching_rows = self.rows if ksize: - matching_rows = ( row for row in matching_rows - if row['ksize'] == ksize ) + matching_rows = (row for row in matching_rows if row["ksize"] == ksize) if moltype: - matching_rows = ( row for row in matching_rows - if row['moltype'] == moltype ) + matching_rows = (row for row in matching_rows if row["moltype"] == moltype) if scaled or containment: if containment and not scaled: raise ValueError("'containment' requires 'scaled' in Index.select'") - matching_rows = ( row for row in matching_rows - if row['scaled'] and not row['num'] ) + matching_rows = ( + row for row in matching_rows if row["scaled"] and not row["num"] + ) if num: - matching_rows = ( row for row in matching_rows - if row['num'] and not row['scaled'] ) + matching_rows = ( + row for row in matching_rows if row["num"] and not row["scaled"] + ) if abund: # only need to concern ourselves if abundance is _required_ - matching_rows = ( row for row in matching_rows - if row['with_abundance'] ) + matching_rows = (row for row in matching_rows if row["with_abundance"]) if picklist: - matching_rows = ( row for row in matching_rows - if picklist.matches_manifest_row(row) ) + matching_rows = ( + row for row in matching_rows if picklist.matches_manifest_row(row) + ) # return only the internal filenames! - for row in matching_rows: - yield row + yield from matching_rows def select_to_manifest(self, **kwargs): "Do a 'select' and return a new CollectionManifest object." @@ -312,22 +334,24 @@ def select_to_manifest(self, **kwargs): def filter_rows(self, row_filter_fn): "Create a new manifest filtered through row_filter_fn." - new_rows = [ row for row in self.rows if row_filter_fn(row) ] + new_rows = [row for row in self.rows if row_filter_fn(row)] return CollectionManifest(new_rows) def filter_on_columns(self, col_filter_fn, col_names): "Create a new manifest based on column matches." + def row_filter_fn(row): - x = [ row[col] for col in col_names if row[col] is not None ] + x = [row[col] for col in col_names if row[col] is not None] return col_filter_fn(x) + return self.filter_rows(row_filter_fn) def locations(self): "Return all distinct locations." seen = set() for row in self.rows: - loc = row['internal_location'] + loc = row["internal_location"] # track/remove duplicates if loc not in seen: @@ -341,8 +365,8 @@ def __contains__(self, ss): def to_picklist(self): "Convert this manifest to a picklist." - pl = picklist.SignaturePicklist('manifest') + pl = picklist.SignaturePicklist("manifest") - pl.pickset = { pl._get_value_for_manifest_row(row) for row in self.rows } + pl.pickset = {pl._get_value_for_manifest_row(row) for row in self.rows} return pl diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py index 360ca6165b..ffa879b64d 100644 --- a/src/sourmash/minhash.py +++ b/src/sourmash/minhash.py @@ -1,22 +1,26 @@ -# -*- coding: utf-8 -*- """ sourmash submodule that provides MinHash class and utility functions. class MinHash - core MinHash class. class FrozenMinHash - read-only MinHash class. 
""" -from __future__ import unicode_literals, division -from .distance_utils import jaccard_to_distance, containment_to_distance, set_size_exact_prob +from .distance_utils import ( + jaccard_to_distance, + containment_to_distance, + set_size_exact_prob, +) from .logging import notify import numpy as np -__all__ = ['get_minhash_default_seed', - 'get_minhash_max_hash', - 'hash_murmur', - 'MinHash', - 'FrozenMinHash'] +__all__ = [ + "get_minhash_default_seed", + "get_minhash_max_hash", + "hash_murmur", + "MinHash", + "FrozenMinHash", +] from collections.abc import Mapping @@ -52,20 +56,14 @@ def _get_max_hash_for_scaled(scaled): elif scaled == 1: return get_minhash_max_hash() - return min( - int(round(get_minhash_max_hash() / scaled, 0)), - MINHASH_MAX_HASH - ) + return min(int(round(get_minhash_max_hash() / scaled, 0)), MINHASH_MAX_HASH) def _get_scaled_for_max_hash(max_hash): "Convert a 'max_hash' value into a 'scaled' value." if max_hash == 0: return 0 - return min( - int(round(get_minhash_max_hash() / max_hash, 0)), - MINHASH_MAX_HASH - ) + return min(int(round(get_minhash_max_hash() / max_hash, 0)), MINHASH_MAX_HASH) def to_bytes(s): @@ -75,7 +73,7 @@ def to_bytes(s): if isinstance(s, bytes): return s - if not isinstance(s, (str, bytes, int)): + if not isinstance(s, str | bytes | int): raise TypeError("Requires a string-like sequence") if isinstance(s, str): @@ -97,8 +95,7 @@ def hash_murmur(kmer, seed=MINHASH_DEFAULT_SEED): def translate_codon(codon): "Translate a codon into an amino acid." try: - return rustcall(lib.sourmash_translate_codon, - to_bytes(codon)).decode('utf-8') + return rustcall(lib.sourmash_translate_codon, to_bytes(codon)).decode("utf-8") except SourmashError as e: raise ValueError(e.message) @@ -106,7 +103,7 @@ def translate_codon(codon): def flatten_and_downsample_scaled(mh, *scaled_vals): "Flatten MinHash object and downsample to max of scaled values." assert mh.scaled - assert all( (x > 0 for x in scaled_vals) ) + assert all(x > 0 for x in scaled_vals) mh = mh.flatten() scaled = max(scaled_vals) @@ -118,7 +115,7 @@ def flatten_and_downsample_scaled(mh, *scaled_vals): def flatten_and_downsample_num(mh, *num_vals): "Flatten MinHash object and downsample to min of num values." assert mh.num - assert all( (x > 0 for x in num_vals) ) + assert all(x > 0 for x in num_vals) mh = mh.flatten() num = min(num_vals) @@ -138,6 +135,7 @@ def flatten_and_intersect_scaled(mh1, mh2): class _HashesWrapper(Mapping): "A read-only view of the hashes contained by a MinHash object." + def __init__(self, h): self._data = h @@ -186,6 +184,7 @@ class MinHash(RustObject): >>> round(mh1.similarity(mh2), 2) 0.85 """ + __dealloc_func__ = lib.kmerminhash_free def __init__( @@ -236,13 +235,13 @@ def __init__( if dayhoff: hash_function = lib.HASH_FUNCTIONS_MURMUR64_DAYHOFF - ksize = ksize*3 + ksize = ksize * 3 elif hp: hash_function = lib.HASH_FUNCTIONS_MURMUR64_HP - ksize = ksize*3 + ksize = ksize * 3 elif is_protein: hash_function = lib.HASH_FUNCTIONS_MURMUR64_PROTEIN - ksize = ksize*3 + ksize = ksize * 3 else: hash_function = lib.HASH_FUNCTIONS_MURMUR64_DNA @@ -281,7 +280,7 @@ def __getstate__(self): # get a ksize that makes sense to the Rust layer. See #2262. 
return ( self.num, - self.ksize if self.is_dna else self.ksize*3, + self.ksize if self.is_dna else self.ksize * 3, self.is_protein, self.dayhoff, self.hp, @@ -294,16 +293,29 @@ def __getstate__(self): def __setstate__(self, tup): "support pickling via __getstate__/__setstate__" - (n, ksize, is_protein, dayhoff, hp, mins, _, track_abundance, - max_hash, seed) = tup + ( + n, + ksize, + is_protein, + dayhoff, + hp, + mins, + _, + track_abundance, + max_hash, + seed, + ) = tup self.__del__() hash_function = ( - lib.HASH_FUNCTIONS_MURMUR64_DAYHOFF if dayhoff else - lib.HASH_FUNCTIONS_MURMUR64_HP if hp else - lib.HASH_FUNCTIONS_MURMUR64_PROTEIN if is_protein else - lib.HASH_FUNCTIONS_MURMUR64_DNA + lib.HASH_FUNCTIONS_MURMUR64_DAYHOFF + if dayhoff + else lib.HASH_FUNCTIONS_MURMUR64_HP + if hp + else lib.HASH_FUNCTIONS_MURMUR64_PROTEIN + if is_protein + else lib.HASH_FUNCTIONS_MURMUR64_DNA ) scaled = _get_scaled_for_max_hash(max_hash) @@ -335,10 +347,11 @@ def copy_and_clear(self): def add_sequence(self, sequence, force=False): "Add a sequence into the sketch." - self._methodcall(lib.kmerminhash_add_sequence, to_bytes(sequence), - force) + self._methodcall(lib.kmerminhash_add_sequence, to_bytes(sequence), force) - def seq_to_hashes(self, sequence, *, force=False, bad_kmers_as_zeroes=False, is_protein=False): + def seq_to_hashes( + self, sequence, *, force=False, bad_kmers_as_zeroes=False, is_protein=False + ): """Convert sequence to hashes without adding to the sketch. If input sequence is DNA and this is a protein, dayhoff, or hp @@ -354,10 +367,20 @@ def seq_to_hashes(self, sequence, *, force=False, bad_kmers_as_zeroes=False, is_ raise ValueError("cannot add protein sequence to DNA MinHash") if bad_kmers_as_zeroes and not force: - raise ValueError("cannot represent invalid kmers as 0 while force is not set to True") + raise ValueError( + "cannot represent invalid kmers as 0 while force is not set to True" + ) size = ffi.new("uintptr_t *") - hashes_ptr = self._methodcall(lib.kmerminhash_seq_to_hashes, to_bytes(sequence), len(sequence), force, bad_kmers_as_zeroes, is_protein, size) + hashes_ptr = self._methodcall( + lib.kmerminhash_seq_to_hashes, + to_bytes(sequence), + len(sequence), + force, + bad_kmers_as_zeroes, + is_protein, + size, + ) size = size[0] try: @@ -384,21 +407,24 @@ def kmers_and_hashes(self, sequence, *, force=False, is_protein=False): bad_kmers_as_zeroes = True sequence = sequence.upper() - hashvals = self.seq_to_hashes(sequence, - force=force, is_protein=is_protein, - bad_kmers_as_zeroes=bad_kmers_as_zeroes) + hashvals = self.seq_to_hashes( + sequence, + force=force, + is_protein=is_protein, + bad_kmers_as_zeroes=bad_kmers_as_zeroes, + ) if bad_kmers_as_zeroes: - hashvals = [ None if h == 0 else h for h in hashvals ] + hashvals = [None if h == 0 else h for h in hashvals] ksize = self.ksize translate = False - if self.moltype == 'DNA': + if self.moltype == "DNA": pass elif is_protein: pass - else: # translate input DNA sequence => aa - assert self.moltype in ('protein', 'dayhoff', 'hp') + else: # translate input DNA sequence => aa + assert self.moltype in ("protein", "dayhoff", "hp") translate = True ksize = self.ksize * 3 @@ -415,13 +441,13 @@ def kmers_and_hashes(self, sequence, *, force=False, is_protein=False): for frame in (0, 1, 2): # get forward k-mers for start in range(0, len(sequence) - ksize + 1 - frame, 3): - kmer = sequence[start + frame:start + frame + ksize] + kmer = sequence[start + frame : start + frame + ksize] yield kmer, hashvals[hash_i] hash_i += 1 # get rc 
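A hedged usage sketch for the `kmers_and_hashes()` path reflowed above (assumes sourmash is installed): the method pairs each k-mer of the input with its hash, without adding anything to the sketch.

```python
import sourmash

# 9 bp with ksize=5 yields 5 (kmer, hash) pairs for a DNA MinHash.
mh = sourmash.MinHash(n=0, ksize=5, scaled=1)
for kmer, hashval in mh.kmers_and_hashes("ATGCGATGC"):
    print(kmer, hashval)
```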
k-mers for start in range(0, len(seqrc) - ksize + 1 - frame, 3): - kmer = seqrc[start + frame:start + frame + ksize] + kmer = seqrc[start + frame : start + frame + ksize] yield kmer, hashvals[hash_i] hash_i += 1 else: @@ -429,17 +455,17 @@ def kmers_and_hashes(self, sequence, *, force=False, is_protein=False): n_kmers = len(sequence) - ksize + 1 assert n_kmers == len(hashvals) for i, hashval in zip(range(0, n_kmers), hashvals): - kmer = sequence[i:i+ksize] + kmer = sequence[i : i + ksize] yield kmer, hashval def add_kmer(self, kmer): "Add a kmer into the sketch." if self.is_dna: if len(kmer) != self.ksize: - raise ValueError("kmer to add is not {} in length".format(self.ksize)) + raise ValueError(f"kmer to add is not {self.ksize} in length") else: - if len(kmer) != self.ksize*3: - raise ValueError("kmer to add is not {} in length".format(self.ksize*3)) + if len(kmer) != self.ksize * 3: + raise ValueError(f"kmer to add is not {self.ksize * 3} in length") self.add_sequence(kmer) def add_many(self, hashes): @@ -468,9 +494,12 @@ def __len__(self): "Number of hashes." return self._methodcall(lib.kmerminhash_get_mins_size) - @deprecated(deprecated_in="3.5", removed_in="5.0", - current_version=VERSION, - details='Use .hashes property instead.') + @deprecated( + deprecated_in="3.5", + removed_in="5.0", + current_version=VERSION, + details="Use .hashes property instead.", + ) def get_mins(self, with_abundance=False): """Return list of hashes or if ``with_abundance`` a list of (hash, abund). @@ -480,10 +509,12 @@ def get_mins(self, with_abundance=False): return mins.keys() return mins - - @deprecated(deprecated_in="3.5", removed_in="5.0", - current_version=VERSION, - details='Use .hashes property instead.') + @deprecated( + deprecated_in="3.5", + removed_in="5.0", + current_version=VERSION, + details="Use .hashes property instead.", + ) def get_hashes(self): "Return the list of hashes." 
return self.hashes.keys() @@ -500,17 +531,18 @@ def hashes(self): abunds_ptr = self._methodcall(lib.kmerminhash_get_abunds, size_abunds) size_abunds = size_abunds[0] assert size == size_abunds - result = dict(zip(ffi.unpack(mins_ptr, size), ffi.unpack(abunds_ptr, size))) + result = dict( + zip(ffi.unpack(mins_ptr, size), ffi.unpack(abunds_ptr, size)) + ) lib.kmerminhash_slice_free(abunds_ptr, size) return _HashesWrapper(result) else: d = ffi.unpack(mins_ptr, size) - return _HashesWrapper({ k : 1 for k in d }) + return _HashesWrapper({k: 1 for k in d}) finally: lib.kmerminhash_slice_free(mins_ptr, size) - @property def seed(self): return self._methodcall(lib.kmerminhash_seed) @@ -551,9 +583,12 @@ def ksize(self): return k @property - @deprecated(deprecated_in="3.5", removed_in="5.0", - current_version=VERSION, - details='Use scaled instead.') + @deprecated( + deprecated_in="3.5", + removed_in="5.0", + current_version=VERSION, + details="Use scaled instead.", + ) def max_hash(self): return self._methodcall(lib.kmerminhash_max_hash) @@ -574,7 +609,9 @@ def track_abundance(self, b): if b is False: self._methodcall(lib.kmerminhash_disable_abundance) elif len(self) > 0: - raise RuntimeError("Can only set track_abundance=True if the MinHash is empty") + raise RuntimeError( + "Can only set track_abundance=True if the MinHash is empty" + ) else: self._methodcall(lib.kmerminhash_enable_abundance) @@ -604,7 +641,9 @@ def count_common(self, other, downsample=False): """ if not isinstance(other, MinHash): raise TypeError("Must be a MinHash!") - return self._methodcall(lib.kmerminhash_count_common, other._get_objptr(), downsample) + return self._methodcall( + lib.kmerminhash_count_common, other._get_objptr(), downsample + ) def intersection_and_union_size(self, other): "Calculate intersection and union sizes between `self` and `other`." @@ -614,8 +653,9 @@ def intersection_and_union_size(self, other): raise TypeError("incompatible MinHash objects") usize = ffi.new("uint64_t *") - common = self._methodcall(lib.kmerminhash_intersection_union_size, - other._get_objptr(), usize) + common = self._methodcall( + lib.kmerminhash_intersection_union_size, other._get_objptr(), usize + ) usize = ffi.unpack(usize, 1)[0] return common, usize @@ -628,11 +668,11 @@ def downsample(self, *, num=None, scaled=None): # at least one must be specified! if num is None and scaled is None: - raise ValueError('must specify either num or scaled to downsample') + raise ValueError("must specify either num or scaled to downsample") # both cannot be specified if num is not None and scaled is not None: - raise ValueError('cannot specify both num and scaled') + raise ValueError("cannot specify both num and scaled") if num is not None: # cannot downsample a scaled MinHash with num: @@ -644,13 +684,15 @@ def downsample(self, *, num=None, scaled=None): # acceptable num value? make sure to set max_hash to 0. max_hash = 0 - + elif scaled is not None: # cannot downsample a num MinHash with scaled if self.num: raise ValueError("cannot downsample a num MinHash using scaled") if self.scaled > scaled: - raise ValueError(f"new scaled {scaled} is lower than current sample scaled {self.scaled}") + raise ValueError( + f"new scaled {scaled} is lower than current sample scaled {self.scaled}" + ) # acceptable scaled value? reconfigure max_hash, keep num 0. max_hash = _get_max_hash_for_scaled(scaled) @@ -658,10 +700,14 @@ def downsample(self, *, num=None, scaled=None): # end checks! 
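A hedged sketch of the `downsample()` rules enforced in the checks above (assumes sourmash is installed): `num` and `scaled` are mutually exclusive, and a scaled sketch can only be downsampled to a coarser scaled value.

```python
import sourmash

mh = sourmash.MinHash(n=0, ksize=31, scaled=1000)
coarser = mh.downsample(scaled=2000)           # ok: 2000 is coarser than 1000

for bad_kwargs in (dict(scaled=500),           # finer than the original
                   dict(num=500),              # num on a scaled sketch
                   dict(num=500, scaled=2000)):  # both at once
    try:
        mh.downsample(**bad_kwargs)
    except ValueError as e:
        print(e)
```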
create new object: a = MinHash( - num, self.ksize, - is_protein=self.is_protein, dayhoff=self.dayhoff, hp=self.hp, - track_abundance=self.track_abundance, seed=self.seed, - max_hash=max_hash + num, + self.ksize, + is_protein=self.is_protein, + dayhoff=self.dayhoff, + hp=self.hp, + track_abundance=self.track_abundance, + seed=self.seed, + max_hash=max_hash, ) # copy over hashes: if self.track_abundance: @@ -676,9 +722,14 @@ def flatten(self): if self.track_abundance: # create new object: a = MinHash( - self.num, self.ksize, - is_protein=self.is_protein, dayhoff=self.dayhoff, hp=self.hp, - track_abundance=False, seed=self.seed, max_hash=self._max_hash + self.num, + self.ksize, + is_protein=self.is_protein, + dayhoff=self.dayhoff, + hp=self.hp, + track_abundance=False, + seed=self.seed, + max_hash=self._max_hash, ) a.add_many(self) @@ -688,11 +739,21 @@ def flatten(self): def jaccard(self, other, downsample=False): "Calculate Jaccard similarity of two MinHash objects." if self.num != other.num: - err = "must have same num: {} != {}".format(self.num, other.num) + err = f"must have same num: {self.num} != {other.num}" raise TypeError(err) - return self._methodcall(lib.kmerminhash_similarity, other._get_objptr(), True, downsample) + return self._methodcall( + lib.kmerminhash_similarity, other._get_objptr(), True, downsample + ) - def jaccard_ani(self, other, *, downsample=False, jaccard=None, prob_threshold=1e-3, err_threshold=1e-4): + def jaccard_ani( + self, + other, + *, + downsample=False, + jaccard=None, + prob_threshold=1e-3, + err_threshold=1e-4, + ): "Use jaccard to estimate ANI between two MinHash objects." if not (self.scaled and other.scaled): raise TypeError("Error: can only calculate ANI for scaled MinHashes") @@ -705,12 +766,18 @@ def jaccard_ani(self, other, *, downsample=False, jaccard=None, prob_threshold=1 other_mh = other.downsample(scaled=scaled) if jaccard is None: jaccard = self_mh.similarity(other_mh, ignore_abundance=True) - avg_sketch_kmers = (len(self_mh) + len(other_mh))/2 - avg_n_kmers = round(avg_sketch_kmers * scaled) # would be better if hll estimate - see #1798 - j_aniresult = jaccard_to_distance(jaccard, self_mh.ksize, scaled, - n_unique_kmers=avg_n_kmers, - prob_threshold = prob_threshold, - err_threshold = err_threshold) + avg_sketch_kmers = (len(self_mh) + len(other_mh)) / 2 + avg_n_kmers = round( + avg_sketch_kmers * scaled + ) # would be better if hll estimate - see #1798 + j_aniresult = jaccard_to_distance( + jaccard, + self_mh.ksize, + scaled, + n_unique_kmers=avg_n_kmers, + prob_threshold=prob_threshold, + err_threshold=err_threshold, + ) # null out ANI if either mh size estimation is inaccurate if not self.size_is_accurate() or not other.size_is_accurate(): j_aniresult.size_is_inaccurate = True @@ -730,16 +797,20 @@ def similarity(self, other, ignore_abundance=False, downsample=False): See https://en.wikipedia.org/wiki/Cosine_similarity """ - return self._methodcall(lib.kmerminhash_similarity, - other._get_objptr(), - ignore_abundance, downsample) + return self._methodcall( + lib.kmerminhash_similarity, + other._get_objptr(), + ignore_abundance, + downsample, + ) def angular_similarity(self, other): "Calculate the angular similarity." 
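The `avg_n_kmers` estimate fed into `jaccard_to_distance` above can be read standalone: with a scaled=S sketch, each retained hash stands in for roughly S distinct k-mers, so sketch size times S approximates the dataset's unique k-mer count (an HLL estimate would be better, per the #1798 comment). A self-contained arithmetic sketch:

```python
# Mirror of the jaccard_ani() k-mer estimate, with made-up sketch sizes.
scaled = 1000
len_a, len_b = 5000, 7000              # hashes in each downsampled sketch
avg_sketch_kmers = (len_a + len_b) / 2
avg_n_kmers = round(avg_sketch_kmers * scaled)
assert avg_n_kmers == 6_000_000
```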
if not (self.track_abundance and other.track_abundance): - raise TypeError("Error: Angular (cosine) similarity requires both sketches to track hash abundance.") - return self._methodcall(lib.kmerminhash_angular_similarity, - other._get_objptr()) + raise TypeError( + "Error: Angular (cosine) similarity requires both sketches to track hash abundance." + ) + return self._methodcall(lib.kmerminhash_angular_similarity, other._get_objptr()) def is_compatible(self, other): return self._methodcall(lib.kmerminhash_is_compatible, other._get_objptr()) @@ -749,12 +820,16 @@ def contained_by(self, other, downsample=False): Calculate how much of self is contained by other. """ if not (self.scaled and other.scaled): - raise TypeError("Error: can only calculate containment for scaled MinHashes") + raise TypeError( + "Error: can only calculate containment for scaled MinHashes" + ) denom = len(self) if not denom: return 0.0 - total_denom = float(denom * self.scaled) # would be better if hll estimate - see #1798 - bias_factor = 1.0 - (1.0 - 1.0/self.scaled) ** total_denom + total_denom = float( + denom * self.scaled + ) # would be better if hll estimate - see #1798 + bias_factor = 1.0 - (1.0 - 1.0 / self.scaled) ** total_denom containment = self.count_common(other, downsample) / (denom * bias_factor) # debiasing containment can lead to vals outside of 0-1 range. constrain. if containment >= 1: @@ -764,8 +839,16 @@ def contained_by(self, other, downsample=False): else: return containment - - def containment_ani(self, other, *, downsample=False, containment=None, confidence=0.95, estimate_ci = False, prob_threshold=1e-3): + def containment_ani( + self, + other, + *, + downsample=False, + containment=None, + confidence=0.95, + estimate_ci=False, + prob_threshold=1e-3, + ): "Use self contained by other to estimate ANI between two MinHash objects." if not (self.scaled and other.scaled): raise TypeError("Error: can only calculate ANI for scaled MinHashes") @@ -778,11 +861,17 @@ def containment_ani(self, other, *, downsample=False, containment=None, confiden other_mh = other.downsample(scaled=scaled) if containment is None: containment = self_mh.contained_by(other_mh) - n_kmers = len(self_mh) * scaled # would be better if hll estimate - see #1798 - - c_aniresult = containment_to_distance(containment, self_mh.ksize, self_mh.scaled, - n_unique_kmers=n_kmers, confidence=confidence, - estimate_ci = estimate_ci, prob_threshold=prob_threshold) + n_kmers = len(self_mh) * scaled # would be better if hll estimate - see #1798 + + c_aniresult = containment_to_distance( + containment, + self_mh.ksize, + self_mh.scaled, + n_unique_kmers=n_kmers, + confidence=confidence, + estimate_ci=estimate_ci, + prob_threshold=prob_threshold, + ) # null out ANI if either mh size estimation is inaccurate if not self.size_is_accurate() or not other.size_is_accurate(): c_aniresult.size_is_inaccurate = True @@ -793,13 +882,19 @@ def max_containment(self, other, downsample=False): Calculate maximum containment. 
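The debiasing in `contained_by` above is worth unpacking: on one reading, `1 - (1 - 1/scaled)**(len * scaled)` is the probability that a set of `len * scaled` k-mers contributes at least one hash under the scaled threshold, and dividing the raw containment by it corrects for small denominators. A self-contained numeric sketch with made-up sizes:

```python
# Mirror of the contained_by() debiasing, including the final clamp to 1.0.
scaled = 1000
denom = 50                                  # hashes in the query sketch
total_kmers = denom * scaled
bias_factor = 1.0 - (1.0 - 1.0 / scaled) ** total_kmers
raw = 30 / denom                            # count_common / denom
containment = min(raw / bias_factor, 1.0)   # debiasing can exceed 1; clamp
print(bias_factor, containment)
```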
""" if not (self.scaled and other.scaled): - raise TypeError("Error: can only calculate containment for scaled MinHashes") + raise TypeError( + "Error: can only calculate containment for scaled MinHashes" + ) min_denom = min((len(self), len(other))) if not min_denom: return 0.0 - total_denom = float(min_denom * self.scaled) # would be better if hll estimate - see #1798 - bias_factor = 1.0 - (1.0 - 1.0/self.scaled) ** total_denom - max_containment = self.count_common(other, downsample) / (min_denom * bias_factor) + total_denom = float( + min_denom * self.scaled + ) # would be better if hll estimate - see #1798 + bias_factor = 1.0 - (1.0 - 1.0 / self.scaled) ** total_denom + max_containment = self.count_common(other, downsample) / ( + min_denom * bias_factor + ) # debiasing containment can lead to vals outside of 0-1 range. constrain. if max_containment >= 1: return 1.0 @@ -808,7 +903,16 @@ def max_containment(self, other, downsample=False): else: return max_containment - def max_containment_ani(self, other, *, downsample=False, max_containment=None, confidence=0.95, estimate_ci=False, prob_threshold=1e-3): + def max_containment_ani( + self, + other, + *, + downsample=False, + max_containment=None, + confidence=0.95, + estimate_ci=False, + prob_threshold=1e-3, + ): "Use max_containment to estimate ANI between two MinHash objects." if not (self.scaled and other.scaled): raise TypeError("Error: can only calculate ANI for scaled MinHashes") @@ -824,9 +928,15 @@ def max_containment_ani(self, other, *, downsample=False, max_containment=None, min_n_kmers = min(len(self_mh), len(other_mh)) n_kmers = min_n_kmers * scaled # would be better if hll estimate - see #1798 - c_aniresult = containment_to_distance(max_containment, self_mh.ksize, scaled, - n_unique_kmers=n_kmers,confidence=confidence, - estimate_ci = estimate_ci, prob_threshold=prob_threshold) + c_aniresult = containment_to_distance( + max_containment, + self_mh.ksize, + scaled, + n_unique_kmers=n_kmers, + confidence=confidence, + estimate_ci=estimate_ci, + prob_threshold=prob_threshold, + ) # null out ANI if either mh size estimation is inaccurate if not self.size_is_accurate() or not other.size_is_accurate(): c_aniresult.size_is_inaccurate = True @@ -838,12 +948,14 @@ def avg_containment(self, other, *, downsample=False): Note: this is average of the containments, *not* count_common/ avg_denom """ if not (self.scaled and other.scaled): - raise TypeError("Error: can only calculate containment for scaled MinHashes") + raise TypeError( + "Error: can only calculate containment for scaled MinHashes" + ) c1 = self.contained_by(other, downsample) c2 = other.contained_by(self, downsample) - return (c1 + c2)/2 + return (c1 + c2) / 2 def avg_containment_ani(self, other, *, downsample=False, prob_threshold=1e-3): """ @@ -852,11 +964,15 @@ def avg_containment_ani(self, other, *, downsample=False, prob_threshold=1e-3): """ if not (self.scaled and other.scaled): raise TypeError("Error: can only calculate ANI for scaled MinHashes") - a1 = self.containment_ani(other, downsample=downsample, prob_threshold=prob_threshold).ani - a2 = other.containment_ani(self, downsample=downsample, prob_threshold=prob_threshold).ani + a1 = self.containment_ani( + other, downsample=downsample, prob_threshold=prob_threshold + ).ani + a2 = other.containment_ani( + self, downsample=downsample, prob_threshold=prob_threshold + ).ani if any([a1 is None, a2 is None]): return None - return (a1 + a2)/2 + return (a1 + a2) / 2 def __add__(self, other): if not isinstance(other, 
MinHash): @@ -864,11 +980,14 @@ def __add__(self, other): if self.num and other.num: if self.num != other.num: - raise TypeError(f"incompatible num values: self={self.num} other={other.num}") + raise TypeError( + f"incompatible num values: self={self.num} other={other.num}" + ) new_obj = self.to_mutable() new_obj += other return new_obj + __or__ = __add__ def __iadd__(self, other): @@ -890,6 +1009,7 @@ def intersection(self, other): ptr = self._methodcall(lib.kmerminhash_intersection, other._get_objptr()) return MinHash._from_objptr(ptr) + __and__ = intersection def set_abundances(self, values, clear=True): @@ -904,12 +1024,14 @@ def set_abundances(self, values, clear=True): abunds = [] for h, v in values.items(): - hashes.append(h) + hashes.append(h) if v < 0: raise ValueError("Abundance cannot be set to a negative value.") abunds.append(v) - self._methodcall(lib.kmerminhash_set_abundances, hashes, abunds, len(hashes), clear) + self._methodcall( + lib.kmerminhash_set_abundances, hashes, abunds, len(hashes), clear + ) else: raise RuntimeError( "Use track_abundance=True when constructing " @@ -921,15 +1043,15 @@ def add_protein(self, sequence): self._methodcall(lib.kmerminhash_add_protein, to_bytes(sequence)) @property - def moltype(self): # TODO: test in minhash tests + def moltype(self): # TODO: test in minhash tests if self.is_protein: - return 'protein' + return "protein" elif self.dayhoff: - return 'dayhoff' + return "dayhoff" elif self.hp: - return 'hp' + return "hp" else: - return 'DNA' + return "DNA" def to_mutable(self): "Return a copy of this MinHash that can be changed." @@ -954,7 +1076,7 @@ def inflate(self, from_mh): """ if not self.track_abundance and from_mh.track_abundance: orig_abunds = from_mh.hashes - abunds = { h: orig_abunds.get(h, 0) for h in self.hashes } + abunds = {h: orig_abunds.get(h, 0) for h in self.hashes} abund_mh = from_mh.copy_and_clear() @@ -963,7 +1085,9 @@ def inflate(self, from_mh): return abund_mh else: - raise ValueError("inflate operates on a flat MinHash and takes a MinHash object with track_abundance=True") + raise ValueError( + "inflate operates on a flat MinHash and takes a MinHash object with track_abundance=True" + ) @property def sum_abundances(self): @@ -995,9 +1119,11 @@ def unique_dataset_hashes(self): Approximate total number of hashes (num_hashes *scaled). """ if not self.scaled: - raise TypeError("can only approximate unique_dataset_hashes for scaled MinHashes") + raise TypeError( + "can only approximate unique_dataset_hashes for scaled MinHashes" + ) # TODO: replace set_size with HLL estimate when that gets implemented - return len(self) * self.scaled # + (self.ksize - 1) for bp estimation + return len(self) * self.scaled # + (self.ksize - 1) for bp estimation def size_is_accurate(self, relative_error=0.20, confidence=0.95): """ @@ -1008,41 +1134,47 @@ def size_is_accurate(self, relative_error=0.20, confidence=0.95): Returns True if probability is greater than or equal to the desired confidence. 
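A hedged usage sketch of the operator aliases wired up above, `__or__ = __add__` (merge) and `__and__ = intersection`; assumes sourmash is installed.

```python
import sourmash

a = sourmash.MinHash(n=0, ksize=21, scaled=1)
b = sourmash.MinHash(n=0, ksize=21, scaled=1)
a.add_sequence("ATGCGATGCATGCGATGCATGCGAT")   # 25 bp -> 5 k-mers
b.add_sequence("ATGCGATGCATGCGATGCATG")       # 21 bp -> 1 k-mer

union = a | b        # alias for a + b (merge into a mutable copy)
common = a & b       # alias for a.intersection(b)
assert len(common) <= min(len(a), len(b)) <= len(union)
```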
""" if not self.scaled: - raise TypeError("Error: can only estimate dataset size for scaled MinHashes") + raise TypeError( + "Error: can only estimate dataset size for scaled MinHashes" + ) if any([not (0 <= relative_error <= 1), not (0 <= confidence <= 1)]): - raise ValueError("Error: relative error and confidence values must be between 0 and 1.") - # to do: replace unique_dataset_hashes with HLL estimation when it gets implemented - probability = set_size_exact_prob(self.unique_dataset_hashes, self.scaled, relative_error=relative_error) + raise ValueError( + "Error: relative error and confidence values must be between 0 and 1." + ) + # to do: replace unique_dataset_hashes with HLL estimation when it gets implemented + probability = set_size_exact_prob( + self.unique_dataset_hashes, self.scaled, relative_error=relative_error + ) return probability >= confidence class FrozenMinHash(MinHash): def add_sequence(self, *args, **kwargs): - raise TypeError('FrozenMinHash does not support modification') + raise TypeError("FrozenMinHash does not support modification") def add_kmer(self, *args, **kwargs): - raise TypeError('FrozenMinHash does not support modification') + raise TypeError("FrozenMinHash does not support modification") def add_many(self, *args, **kwargs): - raise TypeError('FrozenMinHash does not support modification') + raise TypeError("FrozenMinHash does not support modification") def remove_many(self, *args, **kwargs): - raise TypeError('FrozenMinHash does not support modification') + raise TypeError("FrozenMinHash does not support modification") def add_hash(self, *args, **kwargs): - raise TypeError('FrozenMinHash does not support modification') + raise TypeError("FrozenMinHash does not support modification") def add_hash_with_abundance(self, *args, **kwargs): - raise TypeError('FrozenMinHash does not support modification') + raise TypeError("FrozenMinHash does not support modification") def clear(self, *args, **kwargs): - raise TypeError('FrozenMinHash does not support modification') + raise TypeError("FrozenMinHash does not support modification") def set_abundances(self, *args, **kwargs): - raise TypeError('FrozenMinHash does not support modification') + raise TypeError("FrozenMinHash does not support modification") def add_protein(self, *args, **kwargs): - raise TypeError('FrozenMinHash does not support modification') + raise TypeError("FrozenMinHash does not support modification") def downsample(self, *, num=None, scaled=None): if scaled and self.scaled == scaled: @@ -1062,10 +1194,10 @@ def flatten(self): return flat_mh def __iadd__(self, *args, **kwargs): - raise TypeError('FrozenMinHash does not support modification') + raise TypeError("FrozenMinHash does not support modification") def merge(self, *args, **kwargs): - raise TypeError('FrozenMinHash does not support modification') + raise TypeError("FrozenMinHash does not support modification") def to_mutable(self): "Return a copy of this MinHash that can be changed." 
@@ -1085,16 +1217,29 @@ def into_frozen(self): def __setstate__(self, tup): "support pickling via __getstate__/__setstate__" - (n, ksize, is_protein, dayhoff, hp, mins, _, track_abundance, - max_hash, seed) = tup + ( + n, + ksize, + is_protein, + dayhoff, + hp, + mins, + _, + track_abundance, + max_hash, + seed, + ) = tup self.__del__() hash_function = ( - lib.HASH_FUNCTIONS_MURMUR64_DAYHOFF if dayhoff else - lib.HASH_FUNCTIONS_MURMUR64_HP if hp else - lib.HASH_FUNCTIONS_MURMUR64_PROTEIN if is_protein else - lib.HASH_FUNCTIONS_MURMUR64_DNA + lib.HASH_FUNCTIONS_MURMUR64_DAYHOFF + if dayhoff + else lib.HASH_FUNCTIONS_MURMUR64_HP + if hp + else lib.HASH_FUNCTIONS_MURMUR64_PROTEIN + if is_protein + else lib.HASH_FUNCTIONS_MURMUR64_DNA ) scaled = _get_scaled_for_max_hash(max_hash) @@ -1108,4 +1253,5 @@ def __setstate__(self, tup): def __copy__(self): return self + copy = __copy__ diff --git a/src/sourmash/nodegraph.py b/src/sourmash/nodegraph.py index 8faa2eb874..3204e11b7e 100644 --- a/src/sourmash/nodegraph.py +++ b/src/sourmash/nodegraph.py @@ -88,6 +88,7 @@ def matches(self, mh): def to_khmer_nodegraph(self): import khmer + try: load_nodegraph = khmer.load_nodegraph except AttributeError: @@ -117,41 +118,44 @@ def extract_nodegraph_info(filename): ht_type = None occupied = None - uint_size = len(pack('I', 0)) - uchar_size = len(pack('B', 0)) - ulonglong_size = len(pack('Q', 0)) + uint_size = len(pack("I", 0)) + uchar_size = len(pack("B", 0)) + ulonglong_size = len(pack("Q", 0)) try: - with open(filename, 'rb') as nodegraph: - signature, = unpack('4s', nodegraph.read(4)) - version, = unpack('B', nodegraph.read(1)) - ht_type, = unpack('B', nodegraph.read(1)) - ksize, = unpack('I', nodegraph.read(uint_size)) - n_tables, = unpack('B', nodegraph.read(uchar_size)) - occupied, = unpack('Q', nodegraph.read(ulonglong_size)) - table_size, = unpack('Q', nodegraph.read(ulonglong_size)) + with open(filename, "rb") as nodegraph: + (signature,) = unpack("4s", nodegraph.read(4)) + (version,) = unpack("B", nodegraph.read(1)) + (ht_type,) = unpack("B", nodegraph.read(1)) + (ksize,) = unpack("I", nodegraph.read(uint_size)) + (n_tables,) = unpack("B", nodegraph.read(uchar_size)) + (occupied,) = unpack("Q", nodegraph.read(ulonglong_size)) + (table_size,) = unpack("Q", nodegraph.read(ulonglong_size)) if signature != b"OXLI": - raise ValueError("Node graph '{}' is missing file type " - "signature".format(filename) + str(signature)) + raise ValueError( + f"Node graph '{filename}' is missing file type " + "signature" + str(signature) + ) except: - raise ValueError("Node graph '{}' is corrupt ".format(filename)) + raise ValueError(f"Node graph '{filename}' is corrupt ") return ksize, round(table_size, -2), n_tables, version, ht_type, occupied -def calc_expected_collisions(graph, force=False, max_false_pos=.2): +def calc_expected_collisions(graph, force=False, max_false_pos=0.2): fp_all = graph.expected_collisions if fp_all > max_false_pos: print("**", file=sys.stderr) - print("** ERROR: the graph structure is too small for ", - file=sys.stderr) - print("** this data set. Increase data structure size.", - file=sys.stderr) + print("** ERROR: the graph structure is too small for ", file=sys.stderr) + print("** this data set. 
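The nodegraph header parsing below uses the one-element-tuple unpack idiom that ruff reformats from `x, = ...` to `(x,) = ...`; a standalone sketch of why the comma is needed at all:

```python
# struct.unpack always returns a tuple, even for a single field, so the
# target must be a one-element tuple to receive the scalar.
from struct import pack, unpack

buf = pack("I", 31)
(ksize,) = unpack("I", buf)   # the parenthesized form used in this diff
assert ksize == 31
```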
Increase data structure size.", file=sys.stderr) print("** Do not use these results!!", file=sys.stderr) print("**", file=sys.stderr) - print("** (estimated false positive rate of %.3f;" % fp_all, - file=sys.stderr, end=' ') + print( + "** (estimated false positive rate of %.3f;" % fp_all, + file=sys.stderr, + end=" ", + ) print("max recommended %.3f)" % max_false_pos, file=sys.stderr) print("**", file=sys.stderr) diff --git a/src/sourmash/np_utils.py b/src/sourmash/np_utils.py index 683f0be6f6..5c69a0bd5d 100644 --- a/src/sourmash/np_utils.py +++ b/src/sourmash/np_utils.py @@ -12,9 +12,11 @@ def to_memmap(array): """ import numpy as np - filename = tempfile.NamedTemporaryFile(prefix="array", suffix=".mmap", delete=False).name + filename = tempfile.NamedTemporaryFile( + prefix="array", suffix=".mmap", delete=False + ).name shape = array.shape - f = np.memmap(filename, mode='w+', shape=shape, dtype=array.dtype) + f = np.memmap(filename, mode="w+", shape=shape, dtype=array.dtype) f[:] = array[:] del f large_memmap = np.memmap(filename, dtype=array.dtype, shape=shape) diff --git a/src/sourmash/picklist.py b/src/sourmash/picklist.py index 8f43aca739..8a5652eb1a 100644 --- a/src/sourmash/picklist.py +++ b/src/sourmash/picklist.py @@ -17,29 +17,32 @@ preprocess = {} # exact matches -preprocess['name'] = lambda x: x -preprocess['md5'] = lambda x: x +preprocess["name"] = lambda x: x +preprocess["md5"] = lambda x: x # identifier matches/prefix foo - space delimited identifiers -preprocess['identprefix'] = lambda x: x.split(' ')[0].split('.')[0] -preprocess['ident'] = lambda x: x.split(' ')[0] +preprocess["identprefix"] = lambda x: x.split(" ")[0].split(".")[0] +preprocess["ident"] = lambda x: x.split(" ")[0] # match 8 characters -preprocess['md5prefix8'] = lambda x: x[:8] -preprocess['md5short'] = lambda x: x[:8] +preprocess["md5prefix8"] = lambda x: x[:8] +preprocess["md5short"] = lambda x: x[:8] + # all meta-coltypes use the same preprocessing of tuple => (ident, md5short) def combine_ident_md5(x): "preprocess (name, md5) tup into (ident, md5short) tup" name, md5 = x - ident = name.split(' ')[0] + ident = name.split(" ")[0] md5 = md5[:8] return (ident, md5) -preprocess['manifest'] = combine_ident_md5 -preprocess['prefetch'] = combine_ident_md5 -preprocess['gather'] = combine_ident_md5 -preprocess['search'] = combine_ident_md5 + + +preprocess["manifest"] = combine_ident_md5 +preprocess["prefetch"] = combine_ident_md5 +preprocess["gather"] = combine_ident_md5 +preprocess["search"] = combine_ident_md5 class PickStyle(Enum): @@ -74,12 +77,20 @@ class SignaturePicklist: blank in this case: e.g. use 'pickfile.csv::gather'. These "meta-coltypes" use composite selection on (ident, md5short) tuples. """ - meta_coltypes = ('manifest', 'gather', 'prefetch', 'search') - supported_coltypes = ('md5', 'md5prefix8', 'md5short', - 'name', 'ident', 'identprefix') - def __init__(self, coltype, *, pickfile=None, column_name=None, - pickstyle=PickStyle.INCLUDE): + meta_coltypes = ("manifest", "gather", "prefetch", "search") + supported_coltypes = ( + "md5", + "md5prefix8", + "md5short", + "name", + "ident", + "identprefix", + ) + + def __init__( + self, coltype, *, pickfile=None, column_name=None, pickstyle=PickStyle.INCLUDE + ): "create a picklist of column type 'coltype'." # first, check coltype... 
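The picklist `preprocess` table above maps column types to tiny normalizers; a standalone check of the `ident`/`identprefix` rules (first space-delimited token, with `identprefix` also dropping the version suffix after the first `.`):

```python
# Same transformations as preprocess['ident'] / preprocess['identprefix'].
name = "GCF_000005845.2 Escherichia coli str. K-12"
ident = name.split(" ")[0]                      # 'GCF_000005845.2'
identprefix = name.split(" ")[0].split(".")[0]  # 'GCF_000005845'
assert (ident, identprefix) == ("GCF_000005845.2", "GCF_000005845")
```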
@@ -96,10 +107,10 @@ def __init__(self, coltype, *, pickfile=None, column_name=None, if column_name: raise ValueError(f"no column name allowed for coltype '{coltype}'") - if coltype == 'prefetch': - column_name = '(match_name, match_md5)' + if coltype == "prefetch": + column_name = "(match_name, match_md5)" else: - column_name = '(name, md5)' + column_name = "(name, md5)" self.coltype = coltype self.pickfile = pickfile @@ -114,18 +125,20 @@ def __init__(self, coltype, *, pickfile=None, column_name=None, @classmethod def from_picklist_args(cls, argstr): "load a picklist from an argument string 'pickfile:col:coltype:style'" - picklist = argstr.split(':') + picklist = argstr.split(":") pickstyle = PickStyle.INCLUDE # pickstyle specified? if len(picklist) == 4: pickstyle_str = picklist.pop() - if pickstyle_str == 'include': + if pickstyle_str == "include": pickstyle = PickStyle.INCLUDE - elif pickstyle_str == 'exclude': + elif pickstyle_str == "exclude": pickstyle = PickStyle.EXCLUDE else: - raise ValueError(f"invalid picklist 'pickstyle' argument 4: '{pickstyle_str}' must be 'include' or 'exclude'") + raise ValueError( + f"invalid picklist 'pickstyle' argument 4: '{pickstyle_str}' must be 'include' or 'exclude'" + ) if len(picklist) != 3: raise ValueError(f"invalid picklist argument '{argstr}'") @@ -133,36 +146,39 @@ def from_picklist_args(cls, argstr): assert len(picklist) == 3 pickfile, column, coltype = picklist - return cls(coltype, pickfile=pickfile, column_name=column, - pickstyle=pickstyle) + return cls(coltype, pickfile=pickfile, column_name=column, pickstyle=pickstyle) def _get_sig_attribute(self, ss): "for a given SourmashSignature, return relevant picklist value." coltype = self.coltype - if coltype in self.meta_coltypes: # gather, prefetch, search, manifest + if coltype in self.meta_coltypes: # gather, prefetch, search, manifest q = (ss.name, ss.md5sum()) - elif coltype in ('md5', 'md5prefix8', 'md5short'): + elif coltype in ("md5", "md5prefix8", "md5short"): q = ss.md5sum() - elif coltype in ('name', 'ident', 'identprefix'): + elif coltype in ("name", "ident", "identprefix"): q = ss.name else: - raise ValueError(f"picklist get_sig_attribute {coltype} has unhandled branch") + raise ValueError( + f"picklist get_sig_attribute {coltype} has unhandled branch" + ) return q def _get_value_for_manifest_row(self, row): "return the picklist value from a manifest row" - if self.coltype in self.meta_coltypes: # gather, prefetch, search, manifest - q = (row['name'], row['md5']) + if self.coltype in self.meta_coltypes: # gather, prefetch, search, manifest + q = (row["name"], row["md5"]) else: - if self.coltype == 'md5': - colkey = 'md5' - elif self.coltype in ('md5prefix8', 'md5short'): - colkey = 'md5short' - elif self.coltype in ('name', 'ident', 'identprefix'): - colkey = 'name' + if self.coltype == "md5": + colkey = "md5" + elif self.coltype in ("md5prefix8", "md5short"): + colkey = "md5short" + elif self.coltype in ("name", "ident", "identprefix"): + colkey = "name" else: - raise ValueError(f"picklist get_value_for_row {colkey} has unhandled branch") + raise ValueError( + f"picklist get_value_for_row {colkey} has unhandled branch" + ) q = row.get(colkey) @@ -175,12 +191,12 @@ def _get_value_for_csv_row(self, row): "return the picklist value from a CSV pickfile row - supplied by user, typically" # customize for each type of meta_coltypes - if self.coltype == 'manifest': - q = (row['name'], row['md5']) - elif self.coltype == 'prefetch': - q = (row['match_name'], row['match_md5']) - elif 
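The `from_picklist_args` classmethod above parses `'pickfile:col:coltype[:style]'` strings; a standalone sketch of that parse, with the optional fourth field defaulting to include:

```python
# Mirror of the argument-string parsing in from_picklist_args.
argstr = "list.csv:md5:md5short:exclude"
parts = argstr.split(":")

style = "include"
if len(parts) == 4:
    style = parts.pop()      # must be 'include' or 'exclude'

pickfile, column, coltype = parts
assert (pickfile, column, coltype, style) == \
    ("list.csv", "md5", "md5short", "exclude")
```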
self.coltype in ('gather', 'search'): - q = (row['name'], row['md5']) + if self.coltype == "manifest": + q = (row["name"], row["md5"]) + elif self.coltype == "prefetch": + q = (row["match_name"], row["match_md5"]) + elif self.coltype in ("gather", "search"): + q = (row["name"], row["md5"]) else: q = row[self.column_name] @@ -218,7 +234,9 @@ def load(self, *, allow_empty=False): self.pickfile = pickfile if not r.fieldnames: if not allow_empty: - raise ValueError(f"empty or improperly formatted pickfile '{pickfile}'") + raise ValueError( + f"empty or improperly formatted pickfile '{pickfile}'" + ) else: return 0, 0 diff --git a/src/sourmash/plugins.py b/src/sourmash/plugins.py index 4c18f27533..0871154f2d 100644 --- a/src/sourmash/plugins.py +++ b/src/sourmash/plugins.py @@ -18,7 +18,7 @@ import itertools import argparse -from .logging import (debug_literal, error, notify, set_quiet) +from .logging import debug_literal, error, notify, set_quiet # cover for older versions of Python that don't support selection on load # (the 'group=' below). @@ -26,20 +26,22 @@ # load 'load_from' entry points. NOTE: this executes on import of this module. try: - _plugin_load_from = entry_points(group='sourmash.load_from') + _plugin_load_from = entry_points(group="sourmash.load_from") except TypeError: from importlib_metadata import entry_points - _plugin_load_from = entry_points(group='sourmash.load_from') + + _plugin_load_from = entry_points(group="sourmash.load_from") # load 'save_to' entry points as well. -_plugin_save_to = entry_points(group='sourmash.save_to') +_plugin_save_to = entry_points(group="sourmash.save_to") # aaaaand CLI entry points: -_plugin_cli = entry_points(group='sourmash.cli_script') +_plugin_cli = entry_points(group="sourmash.cli_script") _plugin_cli_once = False ### + def get_load_from_functions(): "Load the 'load_from' plugins and yield tuples (priority, name, fn)." debug_literal(f"load_from plugins: {_plugin_load_from}") @@ -49,11 +51,13 @@ def get_load_from_functions(): try: loader_fn = plugin.load() except (ModuleNotFoundError, AttributeError) as e: - debug_literal(f"plugins.load_from_functions: got error loading {plugin.name}: {str(e)}") + debug_literal( + f"plugins.load_from_functions: got error loading {plugin.name}: {str(e)}" + ) continue # get 'priority' if it is available - priority = getattr(loader_fn, 'priority', DEFAULT_LOAD_FROM_PRIORITY) + priority = getattr(loader_fn, "priority", DEFAULT_LOAD_FROM_PRIORITY) # retrieve name (which is specified by plugin?) name = plugin.name @@ -70,11 +74,13 @@ def get_save_to_functions(): try: save_cls = plugin.load() except (ModuleNotFoundError, AttributeError) as e: - debug_literal(f"plugins.load_from_functions: got error loading {plugin.name}: {str(e)}") + debug_literal( + f"plugins.load_from_functions: got error loading {plugin.name}: {str(e)}" + ) continue # get 'priority' if it is available - priority = getattr(save_cls, 'priority', DEFAULT_SAVE_TO_PRIORITY) + priority = getattr(save_cls, "priority", DEFAULT_SAVE_TO_PRIORITY) # retrieve name (which is specified by plugin?) name = plugin.name @@ -88,17 +94,16 @@ class CommandLinePlugin: Subclasses should call super().__init__(parser) and super().main(args). 
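Given the contract in the `CommandLinePlugin` docstring above, here is a hedged sketch of a `cli_script` plugin; the `xyz_example` command name and its behavior are hypothetical, and registration would happen through a `sourmash.cli_script` entry point in the plugin's packaging metadata.

```python
from sourmash.plugins import CommandLinePlugin

class Command_Example(CommandLinePlugin):
    command = "xyz_example"            # run as: sourmash scripts xyz_example
    description = "print a greeting"   # first line shows in the listing

    def __init__(self, parser):
        super().__init__(parser)       # adds -q/--quiet and -d/--debug
        parser.add_argument("name")

    def main(self, args):
        super().main(args)             # applies quiet/debug settings
        print(f"hello, {args.name}!")
```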
""" + command = None description = None def __init__(self, parser): parser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' + "-q", "--quiet", action="store_true", help="suppress non-error output" ) parser.add_argument( - '-d', '--debug', action='store_true', - help='provide debugging output' + "-d", "--debug", action="store_true", help="provide debugging output" ) def main(self, args): @@ -116,14 +121,18 @@ def get_cli_script_plugins(): script_cls = plugin.load() except (ModuleNotFoundError, AttributeError): if _plugin_cli_once is False: - error(f"ERROR: cannot find or load module for cli_script plugin '{name}'") + error( + f"ERROR: cannot find or load module for cli_script plugin '{name}'" + ) continue - command = getattr(script_cls, 'command', None) + command = getattr(script_cls, "command", None) if command is None: # print error message only once... if _plugin_cli_once is False: - error(f"ERROR: no command provided by cli_script plugin '{name}' from {mod}; skipping") + error( + f"ERROR: no command provided by cli_script plugin '{name}' from {mod}; skipping" + ) else: x.append(plugin) @@ -137,8 +146,8 @@ def get_cli_scripts_descriptions(): name = plugin.name script_cls = plugin.load() - command = getattr(script_cls, 'command') - description = getattr(script_cls, 'description', "") + command = getattr(script_cls, "command") + description = getattr(script_cls, "description", "") if description: description = description.splitlines()[0] if not description: @@ -155,18 +164,21 @@ def add_cli_scripts(parser): name = plugin.name script_cls = plugin.load() - usage = getattr(script_cls, 'usage', None) - description = getattr(script_cls, 'description', None) - epilog = getattr(script_cls, 'epilog', None) - formatter_class = getattr(script_cls, 'formatter_class', - argparse.HelpFormatter) - - subparser = parser.add_parser(script_cls.command, - usage=usage, - description=description, - epilog=epilog, - formatter_class=formatter_class) - debug_literal(f"cls_script plugin '{name}' adding command '{script_cls.command}'") + usage = getattr(script_cls, "usage", None) + description = getattr(script_cls, "description", None) + epilog = getattr(script_cls, "epilog", None) + formatter_class = getattr(script_cls, "formatter_class", argparse.HelpFormatter) + + subparser = parser.add_parser( + script_cls.command, + usage=usage, + description=description, + epilog=epilog, + formatter_class=formatter_class, + ) + debug_literal( + f"cls_script plugin '{name}' adding command '{script_cls.command}'" + ) obj = script_cls(subparser) d[script_cls.command] = obj @@ -174,9 +186,7 @@ def add_cli_scripts(parser): def list_all_plugins(): - plugins = itertools.chain(_plugin_load_from, - _plugin_save_to, - _plugin_cli) + plugins = itertools.chain(_plugin_load_from, _plugin_save_to, _plugin_cli) plugins = list(plugins) if not plugins: @@ -185,7 +195,9 @@ def list_all_plugins(): notify("") notify("the following plugins are installed:") notify("") - notify(f"{'plugin type':<20s} {'from python module':<30s} {'v':<5s} {'entry point name':<20s}") + notify( + f"{'plugin type':<20s} {'from python module':<30s} {'v':<5s} {'entry point name':<20s}" + ) notify(f"{'-'*20} {'-'*30} {'-'*5} {'-'*20}") for plugin in plugins: diff --git a/src/sourmash/save_load.py b/src/sourmash/save_load.py index f7109f0fb1..1f73c116c7 100644 --- a/src/sourmash/save_load.py +++ b/src/sourmash/save_load.py @@ -43,7 +43,7 @@ from .sbtmh import load_sbt_index from .lca.lca_db import load_single_database from . 
import signature as sigmod -from .index import (LinearIndex, ZipFileLinearIndex, MultiIndex) +from .index import LinearIndex, ZipFileLinearIndex, MultiIndex from .manifest import CollectionManifest @@ -74,16 +74,18 @@ def SaveSignaturesToLocation(location): with SaveSignaturesToLocation(filename_or_location) as save_sigs: save_sigs.add(sig_obj) """ - save_list = itertools.chain(_save_classes, - sourmash_plugins.get_save_to_functions()) - for priority, cls in sorted(save_list, key=lambda x:x[0]): + save_list = itertools.chain(_save_classes, sourmash_plugins.get_save_to_functions()) + for priority, cls in sorted(save_list, key=lambda x: x[0]): debug_literal(f"trying to match save function {cls}, priority={priority}") if cls.matches(location): debug_literal(f"{cls} is a match!") return cls(location) - raise Exception(f"cannot determine how to open location {location} for saving; this should never happen!?") + raise Exception( + f"cannot determine how to open location {location} for saving; this should never happen!?" + ) + ### Implementation machinery for _load_databases @@ -101,18 +103,19 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): plugin_fns = sourmash_plugins.get_load_from_functions() # aggregate with default load_from functions & sort by priority - load_from_functions = sorted(itertools.chain(_loader_functions, - plugin_fns)) - + load_from_functions = sorted(itertools.chain(_loader_functions, plugin_fns)) + # iterate through loader functions, sorted by priority; try them all. # Catch ValueError & IndexNotLoaded but nothing else. - for (priority, desc, load_fn) in load_from_functions: + for priority, desc, load_fn in load_from_functions: db = None try: - debug_literal(f"_load_databases: trying loader fn - priority {priority} - '{desc}'") - db = load_fn(filename, - traverse_yield_all=traverse_yield_all, - cache_size=cache_size) + debug_literal( + f"_load_databases: trying loader fn - priority {priority} - '{desc}'" + ) + db = load_fn( + filename, traverse_yield_all=traverse_yield_all, cache_size=cache_size + ) except (ValueError, IndexNotLoaded): debug_literal(f"_load_databases: FAIL with ValueError: on fn {desc}.") debug_literal(traceback.format_exc()) @@ -126,16 +129,20 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): if loaded: assert db is not None return db - + raise ValueError(f"Error while reading signatures from '{filename}'.") _loader_functions = [] + + def add_loader(name, priority): "decorator to add name/priority to _loader_functions" + def dec_priority(func): _loader_functions.append((priority, name, func)) return func + return dec_priority @@ -143,10 +150,10 @@ def dec_priority(func): def _load_stdin(filename, **kwargs): "Load collection from .sig file streamed in via stdin" db = None - if filename == '-': + if filename == "-": # load as LinearIndex, then pass into MultiIndex to generate a # manifest. - lidx = LinearIndex.load(sys.stdin, filename='-') + lidx = LinearIndex.load(sys.stdin, filename="-") db = MultiIndex.load((lidx,), (None,), parent="-") return db @@ -175,7 +182,7 @@ def _multiindex_load_from_pathlist(filename, **kwargs): @add_loader("load from path (file or directory)", 40) def _multiindex_load_from_path(filename, **kwargs): "Load collection from a directory." 
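The `add_loader` decorator above builds a priority-sorted registry that `_load_database` walks until one loader succeeds; a standalone sketch of the same registry-plus-dispatch pattern:

```python
# Toy version of the add_loader registry: decorators record
# (priority, name, fn) tuples, dispatch tries them in priority order.
_loader_functions = []

def add_loader(name, priority):
    def dec_priority(func):
        _loader_functions.append((priority, name, func))
        return func
    return dec_priority

@add_loader("load from stdin", 10)
def _load_stdin(filename, **kwargs):
    return "db" if filename == "-" else None

for priority, desc, fn in sorted(_loader_functions):
    if fn("-") is not None:
        print(f"loaded via '{desc}' (priority {priority})")
        break
```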
- traverse_yield_all = kwargs['traverse_yield_all'] + traverse_yield_all = kwargs["traverse_yield_all"] db = MultiIndex.load_from_path(filename, traverse_yield_all) return db @@ -184,7 +191,7 @@ def _multiindex_load_from_path(filename, **kwargs): @add_loader("load SBT", 60) def _load_sbt(filename, **kwargs): "Load collection from an SBT." - cache_size = kwargs.get('cache_size') + cache_size = kwargs.get("cache_size") try: db = load_sbt_index(filename, cache_size=cache_size) @@ -210,11 +217,12 @@ def _load_sqlite_db(filename, **kwargs): def _load_zipfile(filename, **kwargs): "Load collection from a .zip file." db = None - if filename.endswith('.zip'): - traverse_yield_all = kwargs['traverse_yield_all'] + if filename.endswith(".zip"): + traverse_yield_all = kwargs["traverse_yield_all"] try: - db = ZipFileLinearIndex.load(filename, - traverse_yield_all=traverse_yield_all) + db = ZipFileLinearIndex.load( + filename, traverse_yield_all=traverse_yield_all + ) except FileNotFoundError as exc: # turn this into an IndexNotLoaded => proper exception handling by # _load_database. @@ -236,13 +244,17 @@ def _error_on_fastaq(filename, **kwargs): pass if success: - raise Exception(f"Error while reading signatures from '{filename}' - got sequences instead! Is this a FASTA/FASTQ file?") + raise Exception( + f"Error while reading signatures from '{filename}' - got sequences instead! Is this a FASTA/FASTQ file?" + ) ### Implementation machinery for SaveSignaturesToLocation + class Base_SaveSignaturesToLocation: "Base signature saving class. Track location (if any) and count." + def __init__(self, location): self.location = location self.count = 0 @@ -288,14 +300,14 @@ def _get_signatures_from_rust(siglist): # Rust supports multiple. For now, go through serializing # and deserializing the signature! See issue #1167 for more. json_str = sourmash.save_signatures(siglist) - for ss in sourmash.signature.load_signatures(json_str): - yield ss + yield from sourmash.signature.load_signatures(json_str) class SaveSignatures_NoOutput(Base_SaveSignaturesToLocation): "Do not save signatures." + def __repr__(self): - return 'SaveSignatures_NoOutput()' + return "SaveSignatures_NoOutput()" @classmethod def matches(cls, location): @@ -310,6 +322,7 @@ def close(self): class SaveSignatures_Directory(Base_SaveSignaturesToLocation): "Save signatures within a directory, using md5sum names." + def __init__(self, location): super().__init__(location) @@ -320,7 +333,7 @@ def __repr__(self): def matches(cls, location): "anything ending in /" if location: - return location.endswith('/') + return location.endswith("/") def close(self): pass @@ -354,6 +367,7 @@ def add(self, ss): class SaveSignatures_SqliteIndex(Base_SaveSignaturesToLocation): "Save signatures within a directory, using md5sum names." + def __init__(self, location): super().__init__(location) self.location = location @@ -364,14 +378,14 @@ def __init__(self, location): def matches(cls, location): "anything ending in .sqldb" if location: - return location.endswith('.sqldb') + return location.endswith(".sqldb") def __repr__(self): return f"SaveSignatures_SqliteIndex('{self.location}')" def close(self): self.idx.commit() - self.cursor.execute('VACUUM') + self.cursor.execute("VACUUM") self.idx.close() def open(self): @@ -390,11 +404,12 @@ def add(self, add_sig): class SaveSignatures_SigFile(Base_SaveSignaturesToLocation): "Save signatures to a .sig JSON file." 
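A hedged end-to-end sketch of the saver dispatch refactored above: `SaveSignaturesToLocation` matches the location's shape (`-`, `.sig`, `.sig.gz`, trailing `/`, `.zip`, `.sqldb`) to a saver class and is used as a context manager, per its docstring. Assumes sourmash is installed.

```python
import sourmash
from sourmash.save_load import SaveSignaturesToLocation

mh = sourmash.MinHash(n=0, ksize=21, scaled=1)
mh.add_sequence("ATGCGATGCATGCGATGCATG")
ss = sourmash.SourmashSignature(mh, name="example")

# '.zip' should dispatch to the zipfile saver via the matches() checks above.
with SaveSignaturesToLocation("example.zip") as save_sigs:
    save_sigs.add(ss)
```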
+ def __init__(self, location): super().__init__(location) self.keep = [] self.compress = 0 - if self.location.endswith('.gz'): + if self.location.endswith(".gz"): self.compress = 1 @classmethod @@ -409,12 +424,12 @@ def open(self): pass def close(self): - if self.location == '-': + if self.location == "-": sourmash.save_signatures(self.keep, sys.stdout) else: # text mode? encode in utf-8 mode = "w" - encoding = 'utf-8' + encoding = "utf-8" # compressed? bytes & binary. if self.compress: @@ -422,8 +437,7 @@ def close(self): mode = "wb" with open(self.location, mode, encoding=encoding) as fp: - sourmash.save_signatures(self.keep, fp, - compression=self.compress) + sourmash.save_signatures(self.keep, fp, compression=self.compress) def add(self, ss): super().add(ss) @@ -432,6 +446,7 @@ def add(self, ss): class SaveSignatures_ZipFile(Base_SaveSignaturesToLocation): "Save compressed signatures in an uncompressed Zip file." + def __init__(self, location): super().__init__(location) self.storage = None @@ -440,7 +455,7 @@ def __init__(self, location): def matches(cls, location): "anything ending in .zip" if location: - return location.endswith('.zip') + return location.endswith(".zip") def __repr__(self): return f"SaveSignatures_ZipFile('{self.location}')" @@ -454,8 +469,7 @@ def close(self): manifest.write_to_csv(manifest_fp, write_header=True) manifest_data = manifest_fp.getvalue().encode("utf-8") - self.storage.save(manifest_name, manifest_data, overwrite=True, - compress=True) + self.storage.save(manifest_name, manifest_data, overwrite=True, compress=True) self.storage.flush() self.storage.close() @@ -476,19 +490,21 @@ def open(self): raise ValueError(f"File '{self.location}' cannot be opened as a zip file.") if not storage.subdir: - storage.subdir = 'signatures' + storage.subdir = "signatures" # now, try to load manifest try: - manifest_data = storage.load('SOURMASH-MANIFEST.csv') + manifest_data = storage.load("SOURMASH-MANIFEST.csv") except (FileNotFoundError, KeyError): # if file already exists must have manifest... if not do_create: - raise ValueError(f"Cannot add to existing zipfile '{self.location}' without a manifest") + raise ValueError( + f"Cannot add to existing zipfile '{self.location}' without a manifest" + ) self.manifest_rows = [] else: # success! decode manifest_data, create manifest rows => append. 
- manifest_data = manifest_data.decode('utf-8') + manifest_data = manifest_data.decode("utf-8") manifest_fp = StringIO(manifest_data) manifest = CollectionManifest.load_from_csv(manifest_fp) self.manifest_rows = list(manifest._select()) @@ -511,12 +527,13 @@ def add(self, add_sig): md5 = ss.md5sum() storage = self.storage - path = f'{storage.subdir}/{md5}.sig.gz' + path = f"{storage.subdir}/{md5}.sig.gz" location = storage.save(path, buf) # update manifest - row = CollectionManifest.make_manifest_row(ss, location, - include_signature=False) + row = CollectionManifest.make_manifest_row( + ss, location, include_signature=False + ) self.manifest_rows.append(row) super().add(ss) diff --git a/src/sourmash/sbt.py b/src/sourmash/sbt.py index 3ad36ebe1f..452ca29375 100644 --- a/src/sourmash/sbt.py +++ b/src/sourmash/sbt.py @@ -26,10 +26,10 @@ from .nodegraph import Nodegraph, extract_nodegraph_info, calc_expected_collisions STORAGES = { - 'FSStorage': FSStorage, - 'IPFSStorage': IPFSStorage, - 'RedisStorage': RedisStorage, - 'ZipStorage': ZipStorage, + "FSStorage": FSStorage, + "IPFSStorage": IPFSStorage, + "RedisStorage": RedisStorage, + "ZipStorage": ZipStorage, } @@ -103,7 +103,7 @@ def popitem(self): # we just need to select the maximum key/node id (key, _) = max(c for c in common if c[1] == count) except IndexError: - msg = '%s is empty' % self.__class__.__name__ + msg = "%s is empty" % self.__class__.__name__ raise KeyError(msg) from None else: value = self.pop(key) @@ -136,6 +136,7 @@ class SBT(Index): We use two dicts to store the tree structure: One for the internal nodes, and another for the leaves (datasets). """ + is_database = True def __init__(self, factory, *, d=2, storage=None, cache_size=None): @@ -162,6 +163,7 @@ def signatures(self): # if manifest, use it & load using direct path to storage. # this will be faster when using picklists. from .signature import load_one_signature + manifest = self.manifest # iteratively select picklists; no other selection criteria @@ -191,8 +193,16 @@ def _signatures_with_internal(self): ss = k.data yield ss, k._path - def select(self, ksize=None, moltype=None, num=0, scaled=0, - containment=False, abund=None, picklist=None): + def select( + self, + ksize=None, + moltype=None, + num=0, + scaled=0, + containment=False, + abund=None, + picklist=None, + ): """Make sure this database matches the requested requirements. Will always raise ValueError if a requirement cannot be met. @@ -216,33 +226,45 @@ def select(self, ksize=None, moltype=None, num=0, scaled=0, # check ksize. if ksize is not None and db_mh.ksize != ksize: - raise ValueError(f"search ksize {ksize} is different from database ksize {db_mh.ksize}") + raise ValueError( + f"search ksize {ksize} is different from database ksize {db_mh.ksize}" + ) # check moltype. if moltype is not None and db_mh.moltype != moltype: - raise ValueError(f"search moltype {moltype} is different from database moltype {db_mh.moltype}") + raise ValueError( + f"search moltype {moltype} is different from database moltype {db_mh.moltype}" + ) # containment requires 'scaled'. if containment: if not scaled: raise ValueError("'containment' requires 'scaled' in SBT.select'") if not db_mh.scaled: - raise ValueError("cannot search this SBT for containment; signatures are not calculated with scaled") + raise ValueError( + "cannot search this SBT for containment; signatures are not calculated with scaled" + ) # 'num' and 'scaled' do not mix. 
if num: if not db_mh.num: - raise ValueError(f"this database was created with 'scaled' MinHash sketches, not 'num'") + raise ValueError( + "this database was created with 'scaled' MinHash sketches, not 'num'" + ) if num != db_mh.num: raise ValueError(f"num mismatch for SBT: num={num}, {db_mh.num}") if scaled: if not db_mh.scaled: - raise ValueError(f"this database was created with 'num' MinHash sketches, not 'scaled'") + raise ValueError( + "this database was created with 'num' MinHash sketches, not 'scaled'" + ) # we can downsample SBTs for containment operations. if scaled > db_mh.scaled and not containment: - raise ValueError(f"search scaled value {scaled} is less than database scaled value of {db_mh.scaled}") + raise ValueError( + f"search scaled value {scaled} is less than database scaled value of {db_mh.scaled}" + ) if abund: raise ValueError("SBT indices do not support sketches with abund=True") @@ -269,9 +291,13 @@ def new_node_pos(self, node): next_internal_node = None if self.next_node <= min_leaf: for i in range(min_leaf): - if all((i not in self._nodes, + if all( + ( + i not in self._nodes, i not in self._leaves, - i not in self._missing_nodes)): + i not in self._missing_nodes, + ) + ): next_internal_node = i break @@ -285,7 +311,7 @@ def new_node_pos(self, node): def insert(self, signature): "Add a new SourmashSignature in to the SBT." from .sbtmh import SigLeaf - + leaf = SigLeaf(signature.md5sum(), signature) self.add_node(leaf) @@ -315,19 +341,19 @@ def add_node(self, node): c1, c2 = self.children(p.pos)[:2] self._leaves[c1.pos] = p.node - self._leaves[c2.pos] = node + self._leaves[c2.pos] = node del self._leaves[p.pos] for child in (p.node, node): child.update(n) elif isinstance(p.node, Node): - self._leaves[pos] = node + self._leaves[pos] = node node.update(p.node) elif p.node is None: n = Node(self.factory, name="internal." + str(p.pos)) self._nodes[p.pos] = n c1 = self.children(p.pos)[0] - self._leaves[c1.pos] = node + self._leaves[c1.pos] = node node.update(n) else: # this branch should never be reached; put guard in to make sure! @@ -375,16 +401,15 @@ def _find_nodes(self, search_fn, *args, **kwargs): # apply search fn. If return false, truncate search. if search_fn(node_g, *args): - # leaf node? it's a match! if isinstance(node_g, Leaf): matches.append(node_g) # internal node? descend. elif isinstance(node_g, Node): - if kwargs.get('dfs', True): # defaults search to dfs + if kwargs.get("dfs", True): # defaults search to dfs for c in self.children(node_p): queue.insert(0, c.pos) - else: # bfs + else: # bfs queue.extend(c.pos for c in self.children(node_p)) if unload_data: @@ -423,8 +448,11 @@ def find(self, search_fn, query, **kwargs): # provide function to downsample leaf_node as well if scaled == tree_scaled: - downsample_node = lambda x: x + + def downsample_node(x): + return x else: + def downsample_node(node_mh): return node_mh.downsample(scaled=scaled) else: @@ -439,8 +467,11 @@ def downsample_node(node_mh): # provide function to downsample leaf nodes. 
if min_num == a_leaf.data.minhash.num: - downsample_node = lambda x: x + + def downsample_node(x): + return x else: + def downsample_node(node_mh): return node_mh.downsample(num=min_num) @@ -469,23 +500,22 @@ def node_search(node, *args, **kwargs): else: # Node / Nodegraph by minhash comparison # no downsampling needed -- shared_size = node.data.matches(query_mh) - subj_size = node.metadata.get('min_n_below', -1) + subj_size = node.metadata.get("min_n_below", -1) if subj_size == -1: - raise ValueError("ERROR: no min_n_below on this tree, cannot search.") - total_size = subj_size # approximate; do not collect + raise ValueError( + "ERROR: no min_n_below on this tree, cannot search." + ) + total_size = subj_size # approximate; do not collect # calculate score (exact, if leaf; approximate, if not) - score = search_fn.score_fn(query_size, - shared_size, - subj_size, - total_size) + score = search_fn.score_fn(query_size, shared_size, subj_size, total_size) if search_fn.passes(score): - if is_leaf: # terminal node? keep. + if is_leaf: # terminal node? keep. if search_fn.collect(score, node.data): results[node.data] = score return True - else: # it's a good internal node, keep. + else: # it's a good internal node, keep. return True return False @@ -514,7 +544,7 @@ def _rebuild_node(self, pos=0): # this node was already build, skip return - node = Node(self.factory, name="internal.{}".format(pos)) + node = Node(self.factory, name=f"internal.{pos}") self._nodes[pos] = node for c in self.children(pos): if c.pos in self._missing_nodes or isinstance(c.node, Leaf): @@ -614,8 +644,8 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): full path to the new SBT description """ info = {} - info['d'] = self.d - info['version'] = 6 + info["d"] = self.d + info["version"] = 6 info["index_type"] = self.__class__.__name__ # TODO: check # choose between ZipStorage and FS (file system/directory) storage. @@ -623,22 +653,22 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): kind = None if not path.endswith(".sbt.json"): kind = "Zip" - if not path.endswith('.sbt.zip'): - path += '.sbt.zip' + if not path.endswith(".sbt.zip"): + path += ".sbt.zip" storage = ZipStorage(path, mode="w") backend = "FSStorage" - assert path[-8:] == '.sbt.zip' + assert path[-8:] == ".sbt.zip" name = os.path.basename(path[:-8]) # align the storage prefix with what we do for FSStorage, below. - subdir = '.sbt.{}'.format(name) + subdir = f".sbt.{name}" storage_args = FSStorage("", subdir, make_dirs=False).init_args() storage.save(subdir + "/", b"") storage.subdir = subdir index_filename = os.path.abspath(path) - else: # path.endswith('.sbt.json') - assert path.endswith('.sbt.json') + else: # path.endswith('.sbt.json') + assert path.endswith(".sbt.json") name = os.path.basename(path) name = name[:-9] index_filename = os.path.abspath(path) @@ -649,7 +679,7 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): location = os.path.dirname(index_filename) # align subdir names with what we do above for ZipStorage - subdir = '.sbt.{}'.format(name) + subdir = f".sbt.{name}" # when we go to default of FSStorage, use full location for # storage, e.g. 
location/.sbt.{name}/ @@ -659,13 +689,10 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): backend = [k for (k, v) in STORAGES.items() if v == type(storage)][0] storage_args = storage.init_args() - info['storage'] = { - 'backend': backend, - 'args': storage_args - } - info['factory'] = { - 'class': GraphFactory.__name__, - 'args': self.factory.init_args() + info["storage"] = {"backend": backend, "args": storage_args} + info["factory"] = { + "class": GraphFactory.__name__, + "args": self.factory.init_args(), } nodes = {} @@ -685,16 +712,16 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): data = { # TODO: start using md5sum instead? - 'filename': os.path.basename(node.name), - 'name': node.name + "filename": os.path.basename(node.name), + "name": node.name, } try: - node.metadata.pop('max_n_below') + node.metadata.pop("max_n_below") except (AttributeError, KeyError): pass - data['metadata'] = node.metadata + data["metadata"] = node.metadata if structure_only is False: # trigger data loading before saving to the new place @@ -703,27 +730,26 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): node.storage = storage if kind == "Zip": - new_name = node.save(os.path.join(subdir, data['filename'])) - assert new_name.startswith(subdir + '/') + new_name = node.save(os.path.join(subdir, data["filename"])) + assert new_name.startswith(subdir + "/") # strip off prefix - new_name = new_name[len(subdir) + 1:] - data['filename'] = new_name + new_name = new_name[len(subdir) + 1 :] + data["filename"] = new_name else: - data['filename'] = node.save(data['filename']) - + data["filename"] = node.save(data["filename"]) if isinstance(node, Node): nodes[i] = data else: leaves[i] = data - row = node.make_manifest_row(data['filename']) + row = node.make_manifest_row(data["filename"]) if row: manifest_rows.append(row) if n % 100 == 0: - notify(f"{format(n+1)} of {format(total_nodes)} nodes saved", end='\r') + notify(f"{format(n+1)} of {format(total_nodes)} nodes saved", end="\r") # now, save the index file and manifests. # @@ -736,8 +762,8 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): # (CTB: manifests are not yet supported for Redis and IPFS) # notify("Finished saving nodes, now saving SBT index file.") - info['nodes'] = nodes - info['signatures'] = leaves + info["nodes"] = nodes + info["signatures"] = leaves # finish constructing manifest object & save manifest = CollectionManifest(manifest_rows) @@ -749,39 +775,46 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): if kind == "Zip": manifest_name = os.path.join(storage.subdir, manifest_name) - manifest_path = storage.save(manifest_name, manifest_data, - overwrite=True, compress=True) + manifest_path = storage.save( + manifest_name, manifest_data, overwrite=True, compress=True + ) elif kind == "FS": manifest_name = manifest_name - manifest_path = storage.save(manifest_name, manifest_data, - overwrite=True) + manifest_path = storage.save(manifest_name, manifest_data, overwrite=True) else: manifest_path = None if manifest_path: - info['manifest_path'] = manifest_path + info["manifest_path"] = manifest_path # now, save index. tree_data = json.dumps(info).encode("utf-8") if kind == "Zip": - save_path = "{}.sbt.json".format(name) + save_path = f"{name}.sbt.json" storage.save(save_path, tree_data, overwrite=True) storage.flush() elif kind == "FS": storage.save(index_filename, tree_data, overwrite=True) else: # save tree locally. 
- with open(index_filename, 'wb') as tree_fp:
+ with open(index_filename, "wb") as tree_fp:
 tree_fp.write(tree_data)
 notify(f"Finished saving SBT index, available at {format(index_filename)}\n")
 return path
 @classmethod
- def load(cls, location, *, leaf_loader=None, storage=None, print_version_warning=True, cache_size=None):
+ def load(
+ cls,
+ location,
+ *,
+ leaf_loader=None,
+ storage=None,
+ print_version_warning=True,
+ cache_size=None,
+ ):
 """Load an SBT description from a file.
 Parameters
@@ -807,8 +840,8 @@ def load(cls, location, *, leaf_loader=None, storage=None, print_version_warning
 if ZipStorage.can_open(location):
 storage = ZipStorage(location)
 else:
- if not location.endswith('.sbt.zip'):
- location2 = location + '.sbt.zip'
+ if not location.endswith(".sbt.zip"):
+ location2 = location + ".sbt.zip"
 if ZipStorage.can_open(location2):
 storage = ZipStorage(location2)
@@ -828,12 +861,12 @@ def load(cls, location, *, leaf_loader=None, storage=None, print_version_warning
 if sbt_name is None:
 dirname = os.path.dirname(os.path.abspath(location))
 sbt_name = os.path.basename(location)
- if sbt_name.endswith('.sbt.json'):
+ if sbt_name.endswith(".sbt.json"):
 sbt_name = sbt_name[:-9]
 sbt_fn = os.path.join(dirname, sbt_name)
- if not sbt_fn.endswith('.sbt.json') and tempfile is None:
- sbt_fn += '.sbt.json'
+ if not sbt_fn.endswith(".sbt.json") and tempfile is None:
+ sbt_fn += ".sbt.json"
 try:
 with open(sbt_fn) as fp:
@@ -846,7 +879,7 @@ def load(cls, location, *, leaf_loader=None, storage=None, print_version_warning
 version = 1
 if isinstance(jnodes, Mapping):
- version = jnodes['version']
+ version = jnodes["version"]
 if leaf_loader is None:
 leaf_loader = Leaf.load
@@ -865,26 +898,33 @@ def load(cls, location, *, leaf_loader=None, storage=None, print_version_warning
 except KeyError:
 raise IndexNotSupported()
- #if version >= 6:
+ # if version >= 6:
 # if jnodes.get("index_type", "SBT") == "LocalizedSBT":
 # loaders[6] = LocalizedSBT._load_v6
 if version < 3 and storage is None:
- storage = FSStorage(dirname, '.sbt.{}'.format(sbt_name))
+ storage = FSStorage(dirname, f".sbt.{sbt_name}")
 elif storage is None:
- klass = STORAGES[jnodes['storage']['backend']]
- if jnodes['storage']['backend'] == "FSStorage":
- storage = FSStorage(dirname, jnodes['storage']['args']['path'])
+ klass = STORAGES[jnodes["storage"]["backend"]]
+ if jnodes["storage"]["backend"] == "FSStorage":
+ storage = FSStorage(dirname, jnodes["storage"]["args"]["path"])
 elif storage is None:
- storage = klass(**jnodes['storage']['args'])
-
- obj = loader(jnodes, leaf_loader, dirname, storage, print_version_warning=print_version_warning, cache_size=cache_size)
+ storage = klass(**jnodes["storage"]["args"])
+
+ obj = loader(
+ jnodes,
+ leaf_loader,
+ dirname,
+ storage,
+ print_version_warning=print_version_warning,
+ cache_size=cache_size,
+ )
 obj._location = location
- if 'manifest_path' in jnodes:
- manifest_path = jnodes['manifest_path']
+ if "manifest_path" in jnodes:
+ manifest_path = jnodes["manifest_path"]
 manifest_data = storage.load(manifest_path)
- manifest_data = manifest_data.decode('utf-8')
+ manifest_data = manifest_data.decode("utf-8")
 manifest_fp = StringIO(manifest_data)
 obj.manifest = CollectionManifest.load_from_csv(manifest_fp)
 else:
@@ -893,15 +933,22 @@ def load(cls, location, *, leaf_loader=None, storage=None, print_version_warning
 return obj
 @staticmethod
- def _load_v1(jnodes, leaf_loader, dirname, storage, *, print_version_warning=True, cache_size=None):
-
+ def _load_v1(
+ jnodes,
+ leaf_loader,
+ dirname, + storage, + *, + print_version_warning=True, + cache_size=None, + ): if jnodes[0] is None: raise ValueError("Empty tree!") sbt_nodes = {} sbt_leaves = {} - sample_bf = os.path.join(dirname, jnodes[0]['filename']) + sample_bf = os.path.join(dirname, jnodes[0]["filename"]) ksize, tablesize, ntables = extract_nodegraph_info(sample_bf)[:3] factory = GraphFactory(ksize, tablesize, ntables) @@ -909,10 +956,10 @@ def _load_v1(jnodes, leaf_loader, dirname, storage, *, print_version_warning=Tru if jnode is None: continue - jnode['filename'] = os.path.join(dirname, jnode['filename']) + jnode["filename"] = os.path.join(dirname, jnode["filename"]) - if 'internal' in jnode['name']: - jnode['factory'] = factory + if "internal" in jnode["name"]: + jnode["factory"] = factory sbt_node = Node.load(jnode, storage) sbt_nodes[i] = sbt_node else: @@ -926,8 +973,17 @@ def _load_v1(jnodes, leaf_loader, dirname, storage, *, print_version_warning=Tru return tree @classmethod - def _load_v2(cls, info, leaf_loader, dirname, storage, *, print_version_warning=True, cache_size=None): - nodes = {int(k): v for (k, v) in info['nodes'].items()} + def _load_v2( + cls, + info, + leaf_loader, + dirname, + storage, + *, + print_version_warning=True, + cache_size=None, + ): + nodes = {int(k): v for (k, v) in info["nodes"].items()} if nodes[0] is None: raise ValueError("Empty tree!") @@ -935,7 +991,7 @@ def _load_v2(cls, info, leaf_loader, dirname, storage, *, print_version_warning= sbt_nodes = {} sbt_leaves = {} - sample_bf = os.path.join(dirname, nodes[0]['filename']) + sample_bf = os.path.join(dirname, nodes[0]["filename"]) k, size, ntables = extract_nodegraph_info(sample_bf)[:3] factory = GraphFactory(k, size, ntables) @@ -943,25 +999,34 @@ def _load_v2(cls, info, leaf_loader, dirname, storage, *, print_version_warning= if node is None: continue - node['filename'] = os.path.join(dirname, node['filename']) + node["filename"] = os.path.join(dirname, node["filename"]) - if 'internal' in node['name']: - node['factory'] = factory + if "internal" in node["name"]: + node["factory"] = factory sbt_node = Node.load(node, storage) sbt_nodes[k] = sbt_node else: sbt_node = leaf_loader(node, storage) sbt_leaves[k] = sbt_node - tree = cls(factory, d=info['d'], cache_size=cache_size) + tree = cls(factory, d=info["d"], cache_size=cache_size) tree._nodes = sbt_nodes tree._leaves = sbt_leaves return tree @classmethod - def _load_v3(cls, info, leaf_loader, dirname, storage, *, print_version_warning=True, cache_size=None): - nodes = {int(k): v for (k, v) in info['nodes'].items()} + def _load_v3( + cls, + info, + leaf_loader, + dirname, + storage, + *, + print_version_warning=True, + cache_size=None, + ): + nodes = {int(k): v for (k, v) in info["nodes"].items()} if not nodes: raise ValueError("Empty tree!") @@ -969,15 +1034,15 @@ def _load_v3(cls, info, leaf_loader, dirname, storage, *, print_version_warning= sbt_nodes = {} sbt_leaves = {} - factory = GraphFactory(*info['factory']['args']) + factory = GraphFactory(*info["factory"]["args"]) max_node = 0 for k, node in nodes.items(): if node is None: continue - if 'internal' in node['name']: - node['factory'] = factory + if "internal" in node["name"]: + node["factory"] = factory sbt_node = Node.load(node, storage) sbt_nodes[k] = sbt_node else: @@ -986,23 +1051,37 @@ def _load_v3(cls, info, leaf_loader, dirname, storage, *, print_version_warning= max_node = max(max_node, k) - tree = cls(factory, d=info['d'], storage=storage, cache_size=cache_size) + tree = cls(factory, d=info["d"], 
storage=storage, cache_size=cache_size) tree._nodes = sbt_nodes tree._leaves = sbt_leaves - tree._missing_nodes = {i for i in range(max_node) - if i not in sbt_nodes and i not in sbt_leaves} + tree._missing_nodes = { + i for i in range(max_node) if i not in sbt_nodes and i not in sbt_leaves + } if print_version_warning: - error("WARNING: this is an old index version, please run `sourmash migrate` to update it.") - error("WARNING: proceeding with execution, but it will take longer to finish!") + error( + "WARNING: this is an old index version, please run `sourmash migrate` to update it." + ) + error( + "WARNING: proceeding with execution, but it will take longer to finish!" + ) tree._fill_min_n_below() return tree @classmethod - def _load_v4(cls, info, leaf_loader, dirname, storage, *, print_version_warning=True, cache_size=None): - nodes = {int(k): v for (k, v) in info['nodes'].items()} + def _load_v4( + cls, + info, + leaf_loader, + dirname, + storage, + *, + print_version_warning=True, + cache_size=None, + ): + nodes = {int(k): v for (k, v) in info["nodes"].items()} if not nodes: raise ValueError("Empty tree!") @@ -1010,12 +1089,12 @@ def _load_v4(cls, info, leaf_loader, dirname, storage, *, print_version_warning= sbt_nodes = {} sbt_leaves = {} - factory = GraphFactory(*info['factory']['args']) + factory = GraphFactory(*info["factory"]["args"]) max_node = 0 for k, node in nodes.items(): - if 'internal' in node['name']: - node['factory'] = factory + if "internal" in node["name"]: + node["factory"] = factory sbt_node = Node.load(node, storage) sbt_nodes[k] = sbt_node else: @@ -1024,20 +1103,30 @@ def _load_v4(cls, info, leaf_loader, dirname, storage, *, print_version_warning= max_node = max(max_node, k) - tree = cls(factory, d=info['d'], storage=storage, cache_size=cache_size) + tree = cls(factory, d=info["d"], storage=storage, cache_size=cache_size) tree._nodes = sbt_nodes tree._leaves = sbt_leaves - tree._missing_nodes = {i for i in range(max_node) - if i not in sbt_nodes and i not in sbt_leaves} + tree._missing_nodes = { + i for i in range(max_node) if i not in sbt_nodes and i not in sbt_leaves + } tree.next_node = max_node return tree @classmethod - def _load_v5(cls, info, leaf_loader, dirname, storage, *, print_version_warning=True, cache_size=None): - nodes = {int(k): v for (k, v) in info['nodes'].items()} - leaves = {int(k): v for (k, v) in info['leaves'].items()} + def _load_v5( + cls, + info, + leaf_loader, + dirname, + storage, + *, + print_version_warning=True, + cache_size=None, + ): + nodes = {int(k): v for (k, v) in info["nodes"].items()} + leaves = {int(k): v for (k, v) in info["leaves"].items()} if not leaves: raise ValueError("Empty tree!") @@ -1046,17 +1135,17 @@ def _load_v5(cls, info, leaf_loader, dirname, storage, *, print_version_warning= sbt_leaves = {} if storage is None: - klass = STORAGES[info['storage']['backend']] - if info['storage']['backend'] == "FSStorage": - storage = FSStorage(dirname, info['storage']['args']['path']) + klass = STORAGES[info["storage"]["backend"]] + if info["storage"]["backend"] == "FSStorage": + storage = FSStorage(dirname, info["storage"]["args"]["path"]) elif storage is None: - storage = klass(**info['storage']['args']) + storage = klass(**info["storage"]["args"]) - factory = GraphFactory(*info['factory']['args']) + factory = GraphFactory(*info["factory"]["args"]) max_node = 0 for k, node in nodes.items(): - node['factory'] = factory + node["factory"] = factory sbt_node = Node.load(node, storage) sbt_nodes[k] = sbt_node @@ -1067,18 
+1156,28 @@ def _load_v5(cls, info, leaf_loader, dirname, storage, *, print_version_warning= sbt_leaves[k] = sbt_leaf max_node = max(max_node, k) - tree = cls(factory, d=info['d'], storage=storage, cache_size=cache_size) + tree = cls(factory, d=info["d"], storage=storage, cache_size=cache_size) tree._nodes = sbt_nodes tree._leaves = sbt_leaves - tree._missing_nodes = {i for i in range(max_node) - if i not in sbt_nodes and i not in sbt_leaves} + tree._missing_nodes = { + i for i in range(max_node) if i not in sbt_nodes and i not in sbt_leaves + } return tree @classmethod - def _load_v6(cls, info, leaf_loader, dirname, storage, *, print_version_warning=True, cache_size=None): - nodes = {int(k): v for (k, v) in info['nodes'].items()} - leaves = {int(k): v for (k, v) in info['signatures'].items()} + def _load_v6( + cls, + info, + leaf_loader, + dirname, + storage, + *, + print_version_warning=True, + cache_size=None, + ): + nodes = {int(k): v for (k, v) in info["nodes"].items()} + leaves = {int(k): v for (k, v) in info["signatures"].items()} if not leaves: raise ValueError("Empty tree!") @@ -1087,17 +1186,17 @@ def _load_v6(cls, info, leaf_loader, dirname, storage, *, print_version_warning= sbt_leaves = {} if storage is None: - klass = STORAGES[info['storage']['backend']] - if info['storage']['backend'] == "FSStorage": - storage = FSStorage(dirname, info['storage']['args']['path']) + klass = STORAGES[info["storage"]["backend"]] + if info["storage"]["backend"] == "FSStorage": + storage = FSStorage(dirname, info["storage"]["args"]["path"]) elif storage is None: - storage = klass(**info['storage']['args']) + storage = klass(**info["storage"]["args"]) - factory = GraphFactory(*info['factory']['args']) + factory = GraphFactory(*info["factory"]["args"]) max_node = 0 for k, node in nodes.items(): - node['factory'] = factory + node["factory"] = factory sbt_node = Node.load(node, storage) sbt_nodes[k] = sbt_node @@ -1108,11 +1207,12 @@ def _load_v6(cls, info, leaf_loader, dirname, storage, *, print_version_warning= sbt_leaves[k] = sbt_leaf max_node = max(max_node, k) - tree = cls(factory, d=info['d'], storage=storage, cache_size=cache_size) + tree = cls(factory, d=info["d"], storage=storage, cache_size=cache_size) tree._nodes = sbt_nodes tree._leaves = sbt_leaves - tree._missing_nodes = {i for i in range(max_node) - if i not in sbt_nodes and i not in sbt_leaves} + tree._missing_nodes = { + i for i in range(max_node) if i not in sbt_nodes and i not in sbt_leaves + } return tree @@ -1121,31 +1221,31 @@ def _fill_min_n_below(self): Propagate the smallest hash size below each node up the tree from the leaves. 
""" + def fill_min_n_below(node, *args, **kwargs): - original_min_n_below = node.metadata.get('min_n_below', sys.maxsize) + original_min_n_below = node.metadata.get("min_n_below", sys.maxsize) min_n_below = original_min_n_below - children = kwargs['children'] + children = kwargs["children"] for child in children: if child.node is not None: if isinstance(child.node, Leaf): min_n_below = min(len(child.node.data.minhash), min_n_below) else: - child_n = child.node.metadata.get('min_n_below', sys.maxsize) + child_n = child.node.metadata.get("min_n_below", sys.maxsize) min_n_below = min(child_n, min_n_below) if min_n_below == 0: min_n_below = 1 - node.metadata['min_n_below'] = min_n_below + node.metadata["min_n_below"] = min_n_below return original_min_n_below != min_n_below self._fill_up(fill_min_n_below) def _fill_internal(self): - def fill_nodegraphs(node, *args, **kwargs): - children = kwargs['children'] + children = kwargs["children"] for child in children: if child.node is not None: child.node.update(node) @@ -1191,28 +1291,29 @@ def _fill_up(self, search_fn, *args, **kwargs): processed += 1 if processed % 100 == 0: - debug("processed {}, in queue {}", processed, len(queue), sep='\r') + debug("processed {}, in queue {}", processed, len(queue), sep="\r") def __len__(self): return len(self._leaves) def print_dot(self): - print(""" + print( + """ digraph G { nodesep=0.3; ranksep=0.2; margin=0.1; node [shape=ellipse]; edge [arrowsize=0.8]; - """) + """ + ) for i, node in self._nodes.items(): if isinstance(node, Node): - print('"{}" [shape=box fillcolor=gray style=filled]'.format( - node.name)) + print(f'"{node.name}" [shape=box fillcolor=gray style=filled]') for j, child in self.children(i): if child is not None: - print('"{}" -> "{}"'.format(node.name, child.name)) + print(f'"{node.name}" -> "{child.name}"') print("}") def print(self): @@ -1225,8 +1326,9 @@ def print(self): depth = int(math.floor(math.log(node_p + 1, self.d))) print(" " * 4 * depth, node_g) if isinstance(node_g, Node): - stack.extend(c.pos for c in self.children(node_p) - if c.pos not in visited) + stack.extend( + c.pos for c in self.children(node_p) if c.pos not in visited + ) def __iter__(self): for i, node in self._nodes.items(): @@ -1274,14 +1376,14 @@ def combine(self, other): for pos in range(n_previous, n_next): if tree._nodes.get(pos, None) is not None: new_node = copy(tree._nodes[pos]) - new_node.name = "internal.{}".format(current_pos) + new_node.name = f"internal.{current_pos}" new_nodes[current_pos] = new_node elif tree._leaves.get(pos, None) is not None: new_node = copy(tree._leaves[pos]) new_leaves[current_pos] = new_node current_pos += 1 n_previous = n_next - n_next = n_previous + int(self.d ** level) + n_next = n_previous + int(self.d**level) current_pos = n_next # TODO: do we want to return a new tree, or merge into this one? 
@@ -1299,12 +1401,14 @@ def __init__(self, factory, name=None, path=None, storage=None): self._factory = factory self._data = None self._path = path - self.metadata = dict() + self.metadata = {} def __str__(self): - return '*Node:{name} [occupied: {nb}, fpr: {fpr:.2}]'.format( - name=self.name, nb=self.data.n_occupied(), - fpr=calc_expected_collisions(self.data, True, 1.1)) + return "*Node:{name} [occupied: {nb}, fpr: {fpr:.2}]".format( + name=self.name, + nb=self.data.n_occupied(), + fpr=calc_expected_collisions(self.data, True, 1.1), + ) def save(self, path): buf = self.data.to_bytes(compression=1) @@ -1332,21 +1436,22 @@ def unload(self): @staticmethod def load(info, storage=None): - new_node = Node(info['factory'], - name=info['name'], - path=info['filename'], - storage=storage) - new_node.metadata = info.get('metadata', {}) + new_node = Node( + info["factory"], name=info["name"], path=info["filename"], storage=storage + ) + new_node.metadata = info.get("metadata", {}) return new_node def update(self, parent): parent.data.update(self.data) - if 'min_n_below' in self.metadata: - min_n_below = min(parent.metadata.get('min_n_below', sys.maxsize), - self.metadata.get('min_n_below')) + if "min_n_below" in self.metadata: + min_n_below = min( + parent.metadata.get("min_n_below", sys.maxsize), + self.metadata.get("min_n_below"), + ) if min_n_below == 0: min_n_below = 1 - parent.metadata['min_n_below'] = min_n_below + parent.metadata["min_n_below"] = min_n_below class Leaf: @@ -1363,10 +1468,12 @@ def __init__(self, metadata, data=None, name=None, storage=None, path=None): self._path = path def __str__(self): - return '**Leaf:{name} [occupied: {nb}, fpr: {fpr:.2}] -> {metadata}'.format( - name=self.name, metadata=self.metadata, - nb=self.data.n_occupied(), - fpr=calc_expected_collisions(self.data, True, 1.1)) + return "**Leaf:{name} [occupied: {nb}, fpr: {fpr:.2}] -> {metadata}".format( + name=self.name, + metadata=self.metadata, + nb=self.data.n_occupied(), + fpr=calc_expected_collisions(self.data, True, 1.1), + ) def make_manifest_row(self, location): return None @@ -1397,10 +1504,9 @@ def update(self, parent): @classmethod def load(cls, info, storage=None): - return cls(info['metadata'], - name=info['name'], - path=info['filename'], - storage=storage) + return cls( + info["metadata"], name=info["name"], path=info["filename"], storage=storage + ) def filter_distance(filter_a, filter_b, n=1000): @@ -1428,9 +1534,15 @@ def filter_distance(filter_a, filter_b, n=1000): a = array(q, copy=False) b = array(p, copy=False) for i in map(lambda x: randint(0, len(a)), range(n)): - distance += sum(map(int, - [not bool((a[i] >> j) & 1) ^ bool((b[i] >> j) & 1) - for j in range(8)])) + distance += sum( + map( + int, + [ + not bool((a[i] >> j) & 1) ^ bool((b[i] >> j) & 1) + for j in range(8) + ], + ) + ) return distance / (8.0 * len(A) * n) @@ -1438,41 +1550,41 @@ def convert_cmd(name, backend): "Convert an SBT to use a different back end." 
from .sbtmh import SigLeaf - options = backend.split('(') + options = backend.split("(") backend = options.pop(0) backend = backend.lower().strip("'") kwargs = {} if options: - print(options) - options = options[0].split(')') - options = [options.pop(0)] - #options = {} + print(options) + options = options[0].split(")") + options = [options.pop(0)] + # options = {} else: - options = [] + options = [] - if backend.lower() in ('ipfs', 'ipfsstorage'): + if backend.lower() in ("ipfs", "ipfsstorage"): backend = IPFSStorage - elif backend.lower() in ('redis', 'redisstorage'): + elif backend.lower() in ("redis", "redisstorage"): backend = RedisStorage - elif backend.lower() in ('zip', 'zipstorage'): + elif backend.lower() in ("zip", "zipstorage"): backend = ZipStorage - kwargs['mode'] = 'w' - elif backend.lower() in ('fs', 'fsstorage'): + kwargs["mode"] = "w" + elif backend.lower() in ("fs", "fsstorage"): backend = FSStorage if options: options = [os.path.dirname(options[0]), os.path.basename(options[0])] else: # this is the default for SBT v2 - tag = '.sbt.' + os.path.basename(name) - if tag.endswith('.sbt.json'): + tag = ".sbt." + os.path.basename(name) + if tag.endswith(".sbt.json"): tag = tag[:-9] path = os.path.dirname(name) options = [path, tag] else: - error('backend not recognized: {}'.format(backend)) + error(f"backend not recognized: {backend}") with backend(*options, **kwargs) as storage: sbt = SBT.load(name, leaf_loader=SigLeaf.load) diff --git a/src/sourmash/sbt_storage.py b/src/sourmash/sbt_storage.py index 42a4fceaa6..1b7a9e7d78 100644 --- a/src/sourmash/sbt_storage.py +++ b/src/sourmash/sbt_storage.py @@ -15,7 +15,6 @@ class Storage(ABC): - @abc.abstractmethod def save(self, path, content, *, overwrite=False): pass @@ -44,7 +43,6 @@ def can_open(self, location): class FSStorage(Storage): - def __init__(self, location, subdir, make_dirs=True): self.location = location self.subdir = subdir @@ -55,7 +53,7 @@ def __init__(self, location, subdir, make_dirs=True): os.makedirs(fullpath) def init_args(self): - return {'path': self.subdir} + return {"path": self.subdir} def save(self, path, content, overwrite=False): "Save a node/leaf." @@ -64,27 +62,27 @@ def save(self, path, content, overwrite=False): if os.path.exists(fullpath): # check for content, if same return path, - with open(fullpath, 'rb') as f: + with open(fullpath, "rb") as f: old_content = f.read() if old_content == content: return path if overwrite: - pass # fine to overwrite file! + pass # fine to overwrite file! 
else: # different content, need to find new path to save newpath = None n = 0 while newpath is None: - testpath = "{}_{}".format(fullpath, n) + testpath = f"{fullpath}_{n}" if os.path.exists(testpath): n += 1 else: # testpath is available, use it as newpath - newpath = "{}_{}".format(path, n) + newpath = f"{path}_{n}" fullpath = os.path.join(self.location, self.subdir, newpath) - with open(fullpath, 'wb') as f: + with open(fullpath, "wb") as f: f.write(content) return newpath @@ -95,7 +93,6 @@ def load(self, path): class ZipStorage(RustObject, Storage): - __dealloc_func__ = lib.zipstorage_free def __init__(self, path, *, mode="r"): @@ -146,7 +143,9 @@ def _filenames(self): def save(self, path, content, *, overwrite=False, compress=False): if self.__inner: - return self.__inner.save(path, content, overwrite=overwrite, compress=compress) + return self.__inner.save( + path, content, overwrite=overwrite, compress=compress + ) raise NotImplementedError() def load(self, path): @@ -155,7 +154,9 @@ def load(self, path): try: size = ffi.new("uintptr_t *") - rawbuf = self._methodcall(lib.zipstorage_load, to_bytes(path), len(path), size) + rawbuf = self._methodcall( + lib.zipstorage_load, to_bytes(path), len(path), size + ) size = size[0] rawbuf = ffi.gc(rawbuf, lambda o: lib.nodegraph_buffer_free(o, size), size) @@ -182,7 +183,7 @@ def list_sbts(self): return paths def init_args(self): - return {'path': self.path} + return {"path": self.path} def flush(self): if self.__inner: @@ -198,7 +199,6 @@ def can_open(location): class _RwZipStorage(Storage): - def __init__(self, path): self.path = os.path.abspath(path) @@ -212,14 +212,15 @@ def __init__(self, path): # so we need to check some things: if not os.path.exists(self.path): # If the file doesn't exist open it in write mode. - self.zipfile = zipfile.ZipFile(path, mode='w', - compression=zipfile.ZIP_STORED) + self.zipfile = zipfile.ZipFile( + path, mode="w", compression=zipfile.ZIP_STORED + ) else: # If it exists, open it in read mode and prepare a buffer for # new/duplicated items. During close() there are checks to see # how the original file needs to be updated (append new items, # deal with duplicates, and so on) - self.zipfile = zipfile.ZipFile(path, 'r') + self.zipfile = zipfile.ZipFile(path, "r") self.bufferzip = zipfile.ZipFile(BytesIO(), mode="w") self.subdir = "" @@ -250,7 +251,7 @@ def _generate_filename(self, zf, path, content): newpath = None n = 0 while newpath is None: - testpath = "{}_{}".format(path, n) + testpath = f"{path}_{n}" try: matches = self._content_matches(zf, testpath, content) if matches: @@ -260,7 +261,7 @@ def _generate_filename(self, zf, path, content): except KeyError: return testpath, True - assert 0 # should never get here! + assert 0 # should never get here! 
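Both `FSStorage.save` and `_RwZipStorage._generate_filename` above implement the same rename-on-collision scheme: identical content reuses the existing path, while different content under the same name gets shifted to a `_0`, `_1`, ... suffix, and the path actually used is returned to the caller. A small usage sketch, assuming the module layout in this diff (`.sbt.demo` and `node.bin` are arbitrary illustrative names):

```python
import tempfile

from sourmash.sbt_storage import FSStorage

with tempfile.TemporaryDirectory() as loc:
    storage = FSStorage(loc, ".sbt.demo")   # creates loc/.sbt.demo/
    p1 = storage.save("node.bin", b"AAAA")  # fresh path
    p2 = storage.save("node.bin", b"AAAA")  # same bytes: path reused
    p3 = storage.save("node.bin", b"BBBB")  # new bytes: renamed copy
    assert p1 == p2 == "node.bin"
    assert p3 == "node.bin_0"
```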
def _write_to_zf(self, zf, path, content, *, compress=False): compress_type = zipfile.ZIP_STORED @@ -272,9 +273,9 @@ def _write_to_zf(self, zf, path, content, *, compress=False): # set permissions zi = zf.getinfo(path) - perms = 0o444 << 16 # give a+r access - if path.endswith('/'): - perms = 0o755 << 16 # directories get u+rwx, a+rx + perms = 0o444 << 16 # give a+r access + if path.endswith("/"): + perms = 0o755 << 16 # directories get u+rwx, a+rx zi.external_attr = perms def save(self, path, content, *, overwrite=False, compress=False): @@ -287,15 +288,15 @@ def save(self, path, content, *, overwrite=False, compress=False): newpath, do_write = self._generate_filename(self.zipfile, path, content) if do_write: try: - self._write_to_zf(self.zipfile, newpath, content, - compress=compress) + self._write_to_zf(self.zipfile, newpath, content, compress=compress) except (ValueError, RuntimeError): # Can't write in the zipfile, write in buffer instead # CTB: do we need to generate a new filename wrt to the # bufferzip, too? Not sure this code is working as intended... if self.bufferzip: - self._write_to_zf(self.bufferzip, newpath, content, - compress=compress) + self._write_to_zf( + self.bufferzip, newpath, content, compress=compress + ) else: # Throw error, can't write the data raise ValueError("can't write data") @@ -326,7 +327,7 @@ def close(self): # might not have self.zipfile if was invalid zipfile and __init__ # failed. - if hasattr(self, 'zipfile'): + if hasattr(self, "zipfile"): if self.zipfile is not None or self.bufferzip is not None: self.flush(keep_closed=True) self.zipfile.close() @@ -341,8 +342,9 @@ def flush(self, *, keep_closed=False): if self.zipfile is not None: self.zipfile.close() if not keep_closed: - self.zipfile = zipfile.ZipFile(self.path, mode='a', - compression=zipfile.ZIP_STORED) + self.zipfile = zipfile.ZipFile( + self.path, mode="a", compression=zipfile.ZIP_STORED + ) else: # The complicated one. Need to consider: # - Is there data in the buffer? 
@@ -367,7 +369,9 @@ def flush(self, *, keep_closed=False): if item in duplicated or item in buffer_names: # we prioritize writing data from the buffer to the # final file - self._write_to_zf(final_file, item, self.bufferzip.read(item)) + self._write_to_zf( + final_file, item, self.bufferzip.read(item) + ) else: # it is only in the zipfile, so write from it self._write_to_zf(final_file, item, self.zipfile.read(item)) @@ -379,8 +383,9 @@ def flush(self, *, keep_closed=False): os.unlink(self.path) shutil.move(tempfile.name, self.path) if not keep_closed: - self.zipfile = zipfile.ZipFile(self.path, mode='a', - compression=zipfile.ZIP_STORED) + self.zipfile = zipfile.ZipFile( + self.path, mode="a", compression=zipfile.ZIP_STORED + ) elif new_data: # Since there is no duplicated data, we can # reopen self.zipfile in append mode and write the new data @@ -388,8 +393,9 @@ def flush(self, *, keep_closed=False): if keep_closed: raise Exception("unexpected error") else: - zf = zipfile.ZipFile(self.path, mode='a', - compression=zipfile.ZIP_STORED) + zf = zipfile.ZipFile( + self.path, mode="a", compression=zipfile.ZIP_STORED + ) for item in new_data: self._write_to_zf(zf, item, self.bufferzip.read(item)) self.zipfile = zf @@ -405,9 +411,9 @@ def __del__(self): class IPFSStorage(Storage): - def __init__(self, pin_on_add=True, **kwargs): import ipfshttpclient + self.ipfs_args = kwargs self.pin_on_add = pin_on_add self.api = ipfshttpclient.connect(**self.ipfs_args) @@ -444,9 +450,9 @@ def __exit__(self, type, value, traceback): class RedisStorage(Storage): - def __init__(self, **kwargs): import redis + self.redis_args = kwargs self.conn = redis.Redis(**self.redis_args) diff --git a/src/sourmash/sbtmh.py b/src/sourmash/sbtmh.py index 6cb9cc0135..3fa7aa23f2 100644 --- a/src/sourmash/sbtmh.py +++ b/src/sourmash/sbtmh.py @@ -7,9 +7,12 @@ def load_sbt_index(filename, *, print_version_warning=True, cache_size=None): "Load and return an SBT index." - return SBT.load(filename, leaf_loader=SigLeaf.load, - print_version_warning=print_version_warning, - cache_size=cache_size) + return SBT.load( + filename, + leaf_loader=SigLeaf.load, + print_version_warning=print_version_warning, + cache_size=cache_size, + ) def create_sbt_index(bloom_filter_size=1e5, n_children=2): @@ -29,21 +32,18 @@ def search_sbt_index(tree, query, threshold): for match_sig, similarity in search_sbt_index(tree, query, threshold): ... 
""" - for (score, match, _) in tree.search(query, threshold=threshold, - unload_data=True): + for score, match, _ in tree.search(query, threshold=threshold, unload_data=True): yield match, score class SigLeaf(Leaf): def __str__(self): - return '**Leaf:{name} -> {metadata}'.format( - name=self.name, metadata=self.metadata) + return f"**Leaf:{self.name} -> {self.metadata}" def make_manifest_row(self, loc): from .index import CollectionManifest - row = CollectionManifest.make_manifest_row(self.data, - loc, - include_signature=0) + + row = CollectionManifest.make_manifest_row(self.data, loc, include_signature=0) return row def save(self, path): @@ -58,13 +58,13 @@ def save(self, path): def update(self, parent): mh = self.data.minhash parent.data.update(mh) - min_n_below = parent.metadata.get('min_n_below', sys.maxsize) + min_n_below = parent.metadata.get("min_n_below", sys.maxsize) min_n_below = min(len(mh), min_n_below) if min_n_below == 0: min_n_below = 1 - parent.metadata['min_n_below'] = min_n_below + parent.metadata["min_n_below"] = min_n_below @property def data(self): diff --git a/src/sourmash/search.py b/src/sourmash/search.py index 7b2db8008f..f730d1daf5 100644 --- a/src/sourmash/search.py +++ b/src/sourmash/search.py @@ -4,10 +4,10 @@ import csv import numpy as np from enum import Enum -import numpy as np from dataclasses import dataclass -from .signature import SourmashSignature, MinHash +from .minhash import MinHash +from .signature import SourmashSignature from .sketchcomparison import FracMinHashComparison, NumMinHashComparison @@ -42,11 +42,9 @@ class SearchType(Enum): MAX_CONTAINMENT = 3 -def make_jaccard_search_query(*, - do_containment=False, - do_max_containment=False, - best_only=False, - threshold=None): +def make_jaccard_search_query( + *, do_containment=False, do_max_containment=False, best_only=False, threshold=None +): """\ Make a "flat" search object for Jaccard search & containment. """ @@ -81,11 +79,9 @@ def make_containment_query(query_mh, threshold_bp, *, best_only=True): threshold, _ = calc_threshold_from_bp(threshold_bp, scaled, len(query_mh)) if best_only: - search_obj = JaccardSearchBestOnly(SearchType.CONTAINMENT, - threshold=threshold) + search_obj = JaccardSearchBestOnly(SearchType.CONTAINMENT, threshold=threshold) else: - search_obj = JaccardSearch(SearchType.CONTAINMENT, - threshold=threshold) + search_obj = JaccardSearch(SearchType.CONTAINMENT, threshold=threshold) return search_obj @@ -94,6 +90,7 @@ class JaccardSearch: """ A class used by Index classes for searching/gathering. """ + def __init__(self, search_type, threshold=None): "Constructor. Takes type of search, and optional threshold." score_fn = None @@ -148,15 +145,13 @@ def score_jaccard(self, query_size, shared_size, subject_size, total_size): return 0 return shared_size / total_size - def score_containment(self, query_size, shared_size, subject_size, - total_size): + def score_containment(self, query_size, shared_size, subject_size, total_size): "Calculate Jaccard containment." if query_size == 0: return 0 return shared_size / query_size - def score_max_containment(self, query_size, shared_size, subject_size, - total_size): + def score_max_containment(self, query_size, shared_size, subject_size, total_size): "Calculate Jaccard max containment." 
min_denom = min(query_size, subject_size) if min_denom == 0: @@ -166,11 +161,13 @@ def score_max_containment(self, query_size, shared_size, subject_size, class JaccardSearchBestOnly(JaccardSearch): "A subclass of JaccardSearch that implements best-only." + def collect(self, score, match): "Raise the threshold to the best match found so far." self.threshold = max(self.threshold, score) return True + @dataclass class BaseResult: """ @@ -179,10 +176,11 @@ class BaseResult: properly initialize a SketchComparison, this class doesn't actually do anything other than define some functions needed by *Result classes. """ + query: SourmashSignature match: SourmashSignature filename: str = None - ignore_abundance: bool = False # optionally ignore abundances + ignore_abundance: bool = False # optionally ignore abundances # need these for scaled result comparisons estimate_ani_ci: bool = False ani_confidence: float = 0.95 @@ -196,18 +194,24 @@ def init_result(self): self.mh2 = self.match.minhash def build_fracminhashcomparison(self): - self.cmp = FracMinHashComparison(self.mh1, self.mh2, cmp_scaled=self.cmp_scaled, - threshold_bp=self.threshold_bp, - ignore_abundance=self.ignore_abundance, - estimate_ani_ci=self.estimate_ani_ci, - ani_confidence=self.ani_confidence) + self.cmp = FracMinHashComparison( + self.mh1, + self.mh2, + cmp_scaled=self.cmp_scaled, + threshold_bp=self.threshold_bp, + ignore_abundance=self.ignore_abundance, + estimate_ani_ci=self.estimate_ani_ci, + ani_confidence=self.ani_confidence, + ) self.cmp_scaled = self.cmp.cmp_scaled self.query_scaled = self.mh1.scaled self.match_scaled = self.mh2.scaled self.size_may_be_inaccurate = self.cmp.size_may_be_inaccurate def build_numminhashcomparison(self, cmp_num=None): - self.cmp = NumMinHashComparison(self.mh1, self.mh2, cmp_num=cmp_num, ignore_abundance=self.ignore_abundance) + self.cmp = NumMinHashComparison( + self.mh1, self.mh2, cmp_num=cmp_num, ignore_abundance=self.ignore_abundance + ) self.cmp_num = self.cmp.cmp_num self.query_num = self.mh1.num self.match_num = self.mh2.num @@ -230,7 +234,7 @@ def get_cmpinfo(self): self.filename = self.match_filename self.match_md5 = self.match.md5sum() # set these from self.match_* - self.md5= self.match_md5 + self.md5 = self.match_md5 self.name = self.match_name # could define in PrefetchResult instead, same reasoning as above self.query_abundance = self.mh1.track_abundance @@ -248,8 +252,9 @@ def shorten_md5(self, md5): def to_write(self, columns=[]): # convert comparison attrs into a dictionary # that can be used by csv dictwriter - info = {k: v for k, v in self.__dict__.items() - if k in columns and v is not None} + info = { + k: v for k, v in self.__dict__.items() if k in columns and v is not None + } return info def init_dictwriter(self, csv_handle): @@ -279,13 +284,22 @@ class SearchResult(BaseResult): """ SearchResult class supports 'sourmash search' operations. 
""" + similarity: float = None cmp_num: int = None searchtype: SearchType = None - #columns for standard SearchResult output - search_write_cols = ['similarity', 'md5', 'filename', 'name', # here we use 'filename' - 'query_filename', 'query_name', 'query_md5', 'ani'] + # columns for standard SearchResult output + search_write_cols = [ + "similarity", + "md5", + "filename", + "name", # here we use 'filename' + "query_filename", + "query_name", + "query_md5", + "ani", + ] ci_cols = ["ani_low", "ani_high"] @@ -297,10 +311,10 @@ def init_sigcomparison(self): self.build_fracminhashcomparison() elif any([self.mh1.num, self.mh2.num]): self.build_numminhashcomparison(cmp_num=self.cmp_num) - self.get_cmpinfo() # grab comparison metadata + self.get_cmpinfo() # grab comparison metadata def __post_init__(self): - self.init_sigcomparison() # build sketch comparison + self.init_sigcomparison() # build sketch comparison self.check_similarity() if self.cmp_scaled is not None and self.searchtype is not None: self.estimate_search_ani() @@ -317,11 +331,13 @@ def check_similarity(self): raise ValueError("Error: Must provide 'similarity' for SearchResult.") def estimate_search_ani(self): - #future: could estimate ANI from abund searches if we want (use query containment?) + # future: could estimate ANI from abund searches if we want (use query containment?) if self.cmp_scaled is None: raise TypeError("Error: ANI can only be estimated from scaled signatures.") if self.searchtype == SearchType.CONTAINMENT: - self.cmp.estimate_ani_from_mh1_containment_in_mh2(containment = self.similarity) + self.cmp.estimate_ani_from_mh1_containment_in_mh2( + containment=self.similarity + ) self.ani = self.cmp.ani_from_mh1_containment_in_mh2 if self.estimate_ani_ci: self.ani_low = self.cmp.ani_from_mh1_containment_in_mh2_low @@ -347,16 +363,38 @@ class PrefetchResult(BaseResult): """ # current prefetch columns - prefetch_write_cols = ['intersect_bp', 'jaccard', 'max_containment', 'f_query_match', - 'f_match_query', 'match_filename', 'match_name', # here we use 'match_filename' - 'match_md5', 'match_bp', 'query_filename', 'query_name', - 'query_md5', 'query_bp', 'ksize', 'moltype', 'scaled', - 'query_n_hashes', 'query_abundance', 'query_containment_ani', - 'match_containment_ani', 'average_containment_ani', 'max_containment_ani', - 'potential_false_negative'] #'match_abundance' - - ci_cols = ["query_containment_ani_low", "query_containment_ani_high", - "match_containment_ani_low", "match_containment_ani_high"] + prefetch_write_cols = [ + "intersect_bp", + "jaccard", + "max_containment", + "f_query_match", + "f_match_query", + "match_filename", + "match_name", # here we use 'match_filename' + "match_md5", + "match_bp", + "query_filename", + "query_name", + "query_md5", + "query_bp", + "ksize", + "moltype", + "scaled", + "query_n_hashes", + "query_abundance", + "query_containment_ani", + "match_containment_ani", + "average_containment_ani", + "max_containment_ani", + "potential_false_negative", + ] #'match_abundance' + + ci_cols = [ + "query_containment_ani_low", + "query_containment_ani_high", + "match_containment_ani_low", + "match_containment_ani_high", + ] prefetch_write_cols_ci = prefetch_write_cols + ci_cols @@ -366,8 +404,10 @@ def init_sigcomparison(self): if all([self.mh1.scaled, self.mh2.scaled]): self.build_fracminhashcomparison() else: - raise TypeError("Error: prefetch and gather results must be between scaled signatures.") - self.get_cmpinfo() # grab comparison metadata + raise TypeError( + "Error: prefetch and gather 
results must be between scaled signatures." + ) + self.get_cmpinfo() # grab comparison metadata self.intersect_bp = self.cmp.total_unique_intersect_hashes self.max_containment = self.cmp.max_containment self.query_bp = self.mh1.unique_dataset_hashes @@ -394,8 +434,12 @@ def handle_ani_ci(self): def build_prefetch_result(self): # unique prefetch values self.jaccard = self.cmp.jaccard - self.f_query_match = self.cmp.mh2_containment_in_mh1 #db_mh.contained_by(query_mh) - self.f_match_query = self.cmp.mh1_containment_in_mh2 #query_mh.contained_by(db_mh) + self.f_query_match = ( + self.cmp.mh2_containment_in_mh1 + ) # db_mh.contained_by(query_mh) + self.f_match_query = ( + self.cmp.mh1_containment_in_mh2 + ) # query_mh.contained_by(db_mh) # set write columns for prefetch result self.write_cols = self.prefetch_write_cols if self.estimate_ani_ci: @@ -433,50 +477,80 @@ class GatherResult(PrefetchResult): sum_weighted_found: int = None total_weighted_hashes: int = None - gather_write_cols = ['intersect_bp', 'f_orig_query', 'f_match', - 'f_unique_to_query', - 'f_unique_weighted','average_abund', - 'median_abund', 'std_abund', 'filename', - 'name', 'md5', - 'f_match_orig', 'unique_intersect_bp', - 'gather_result_rank', 'remaining_bp', - 'query_filename', 'query_name', 'query_md5', - 'query_bp', 'ksize', 'moltype', 'scaled', - 'query_n_hashes', 'query_abundance', - 'query_containment_ani', - 'match_containment_ani', - 'average_containment_ani', - 'max_containment_ani', - 'potential_false_negative', - 'n_unique_weighted_found', - 'sum_weighted_found', - 'total_weighted_hashes'] - - ci_cols = ["query_containment_ani_low", "query_containment_ani_high", - "match_containment_ani_low", "match_containment_ani_high"] + gather_write_cols = [ + "intersect_bp", + "f_orig_query", + "f_match", + "f_unique_to_query", + "f_unique_weighted", + "average_abund", + "median_abund", + "std_abund", + "filename", + "name", + "md5", + "f_match_orig", + "unique_intersect_bp", + "gather_result_rank", + "remaining_bp", + "query_filename", + "query_name", + "query_md5", + "query_bp", + "ksize", + "moltype", + "scaled", + "query_n_hashes", + "query_abundance", + "query_containment_ani", + "match_containment_ani", + "average_containment_ani", + "max_containment_ani", + "potential_false_negative", + "n_unique_weighted_found", + "sum_weighted_found", + "total_weighted_hashes", + ] + + ci_cols = [ + "query_containment_ani_low", + "query_containment_ani_high", + "match_containment_ani_low", + "match_containment_ani_high", + ] gather_write_cols_ci = gather_write_cols + ci_cols def init_gathersketchcomparison(self): # compare remaining gather hashes with match. Force at cmp_scaled. Force match flatten(), bc we don't need abunds. 
- self.gather_comparison = FracMinHashComparison(self.gather_querymh, self.match.minhash.flatten()) + self.gather_comparison = FracMinHashComparison( + self.gather_querymh, self.match.minhash.flatten() + ) def check_gatherresult_input(self): # check we have what we need: if self.cmp_scaled is None: - raise ValueError("Error: must provide comparison scaled value ('cmp_scaled') for GatherResult") + raise ValueError( + "Error: must provide comparison scaled value ('cmp_scaled') for GatherResult" + ) if self.gather_querymh is None: - raise ValueError("Error: must provide current gather sketch (remaining hashes) for GatherResult") + raise ValueError( + "Error: must provide current gather sketch (remaining hashes) for GatherResult" + ) if self.gather_result_rank is None: raise ValueError("Error: must provide 'gather_result_rank' to GatherResult") - if not self.total_weighted_hashes: # catch total_weighted_hashes = 0 as well - raise ValueError("Error: must provide sum of all abundances ('total_weighted_hashes') to GatherResult") + if not self.total_weighted_hashes: # catch total_weighted_hashes = 0 as well + raise ValueError( + "Error: must provide sum of all abundances ('total_weighted_hashes') to GatherResult" + ) if not self.orig_query_abunds: - raise ValueError("Error: must provide original query abundances ('orig_query_abunds') to GatherResult") + raise ValueError( + "Error: must provide original query abundances ('orig_query_abunds') to GatherResult" + ) def build_gather_result(self): # build gather-specific attributes - + # the 'query' that is passed into gather is all _matched_ hashes, after subtracting noident_mh # this affects estimation of original query information, and requires us to pass in orig_query_len and orig_query_abunds. # we also need to overwrite self.query_bp, self.query_n_hashes, and self.query_abundance @@ -486,43 +560,70 @@ def build_gather_result(self): # calculate intersection with query hashes: self.unique_intersect_bp = self.gather_comparison.total_unique_intersect_hashes - + # calculate fraction of subject match with orig query self.f_match_orig = self.cmp.mh2_containment_in_mh1 # calculate fractions wrt first denominator - genome size - self.f_match = self.gather_comparison.mh2_containment_in_mh1 # unique match containment + self.f_match = ( + self.gather_comparison.mh2_containment_in_mh1 + ) # unique match containment self.f_orig_query = len(self.cmp.intersect_mh) / self.orig_query_len - assert self.gather_comparison.intersect_mh.contained_by(self.gather_comparison.mh1_cmp) == 1.0 - + assert ( + self.gather_comparison.intersect_mh.contained_by( + self.gather_comparison.mh1_cmp + ) + == 1.0 + ) + # calculate fractions wrt second denominator - metagenome size - assert self.gather_comparison.intersect_mh.contained_by(self.gather_comparison.mh2_cmp) == 1.0 - self.f_unique_to_query = len(self.gather_comparison.intersect_mh)/self.orig_query_len + assert ( + self.gather_comparison.intersect_mh.contained_by( + self.gather_comparison.mh2_cmp + ) + == 1.0 + ) + self.f_unique_to_query = ( + len(self.gather_comparison.intersect_mh) / self.orig_query_len + ) # here, need to make sure to use the mh1_cmp (bc was downsampled to cmp_scaled) - self.remaining_bp = (self.gather_comparison.mh1_cmp.unique_dataset_hashes - self.gather_comparison.total_unique_intersect_hashes) + self.remaining_bp = ( + self.gather_comparison.mh1_cmp.unique_dataset_hashes + - self.gather_comparison.total_unique_intersect_hashes + ) # calculate stats on abundances, if desired. 
self.average_abund, self.median_abund, self.std_abund = None, None, None if not self.ignore_abundance: - self.query_weighted_unique_intersection = self.gather_comparison.weighted_intersection(from_abundD = self.orig_query_abunds) + self.query_weighted_unique_intersection = ( + self.gather_comparison.weighted_intersection( + from_abundD=self.orig_query_abunds + ) + ) self.average_abund = self.query_weighted_unique_intersection.mean_abundance self.median_abund = self.query_weighted_unique_intersection.median_abundance self.std_abund = self.query_weighted_unique_intersection.std_abundance # 'query' will be flattened by default. reset track abundance if we have abunds - self.query_abundance = self.query_weighted_unique_intersection.track_abundance + self.query_abundance = ( + self.query_weighted_unique_intersection.track_abundance + ) # calculate scores weighted by abundances - self.n_unique_weighted_found = self.query_weighted_unique_intersection.sum_abundances - self.f_unique_weighted = self.n_unique_weighted_found / self.total_weighted_hashes + self.n_unique_weighted_found = ( + self.query_weighted_unique_intersection.sum_abundances + ) + self.f_unique_weighted = ( + self.n_unique_weighted_found / self.total_weighted_hashes + ) else: self.f_unique_weighted = self.f_unique_to_query self.query_abundance = False def __post_init__(self): self.check_gatherresult_input() - self.init_sigcomparison() # initialize original sketch vs match sketch comparison (inherited from PrefetchResult) - self.init_gathersketchcomparison() # initialize remaining gather sketch vs match sketch comparison - self.build_gather_result() # build gather-specific attributes + self.init_sigcomparison() # initialize original sketch vs match sketch comparison (inherited from PrefetchResult) + self.init_gathersketchcomparison() # initialize remaining gather sketch vs match sketch comparison + self.build_gather_result() # build gather-specific attributes # set write columns for prefetch result self.write_cols = self.gather_write_cols if self.estimate_ani_ci: @@ -550,8 +651,12 @@ def prefetchresultdict(self): if self.estimate_ani_ci: prefetch_cols = self.prefetch_write_cols_ci self.jaccard = self.cmp.jaccard - self.f_query_match = self.cmp.mh2_containment_in_mh1 #db_mh.contained_by(query_mh) - self.f_match_query = self.cmp.mh1_containment_in_mh2 #query_mh.contained_by(db_mh) + self.f_query_match = ( + self.cmp.mh2_containment_in_mh1 + ) # db_mh.contained_by(query_mh) + self.f_match_query = ( + self.cmp.mh1_containment_in_mh2 + ) # query_mh.contained_by(db_mh) self.prep_prefetch_result() return self.to_write(columns=prefetch_cols) @@ -560,14 +665,14 @@ def format_bp(bp): "Pretty-print bp information." bp = float(bp) if bp < 500: - return '{:.0f} bp'.format(bp) + return f"{bp:.0f} bp" elif bp <= 500e3: - return '{:.1f} kbp'.format(round(bp / 1e3, 1)) + return f"{round(bp / 1e3, 1):.1f} kbp" elif bp < 500e6: - return '{:.1f} Mbp'.format(round(bp / 1e6, 1)) + return f"{round(bp / 1e6, 1):.1f} Mbp" elif bp < 500e9: - return '{:.1f} Gbp'.format(round(bp / 1e9, 1)) - return '???' + return f"{round(bp / 1e9, 1):.1f} Gbp" + return "???" 
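`format_bp` picks its unit at 500-of-the-previous-unit breakpoints and falls through to `"???"` for anything at or beyond 500 Gbp. A quick sketch of those thresholds:

```python
from sourmash.search import format_bp

assert format_bp(499) == "499 bp"
assert format_bp(1500) == "1.5 kbp"    # 500 bp .. 500 kbp
assert format_bp(2.5e6) == "2.5 Mbp"   # 500 kbp .. 500 Mbp
assert format_bp(3e9) == "3.0 Gbp"     # 500 Mbp .. 500 Gbp
assert format_bp(1e12) == "???"
```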
def search_databases_with_flat_query(query, databases, **kwargs): @@ -576,7 +681,7 @@ def search_databases_with_flat_query(query, databases, **kwargs): for db in databases: search_iter = db.search(query, **kwargs) - for (score, match, filename) in search_iter: + for score, match, filename in search_iter: md5 = match.md5sum() if md5 not in found_md5: results.append((score, match, filename)) @@ -589,22 +694,27 @@ def search_databases_with_flat_query(query, databases, **kwargs): # repetitive/not optimal - would it be better to produce SearchResult from db.search? estimate_ani_ci = False search_type = SearchType.JACCARD - if kwargs.get('do_containment'): + if kwargs.get("do_containment"): search_type = SearchType.CONTAINMENT - if kwargs.get('estimate_ani_ci'): + if kwargs.get("estimate_ani_ci"): estimate_ani_ci = True - elif kwargs.get('do_max_containment'): + elif kwargs.get("do_max_containment"): search_type = SearchType.MAX_CONTAINMENT - if kwargs.get('estimate_ani_ci'): + if kwargs.get("estimate_ani_ci"): estimate_ani_ci = True x = [] - for (score, match, filename) in results: - x.append(SearchResult(query, match, - similarity=score, - filename = filename, - searchtype=search_type, - estimate_ani_ci=estimate_ani_ci)) + for score, match, filename in results: + x.append( + SearchResult( + query, + match, + similarity=score, + filename=filename, + searchtype=search_type, + estimate_ani_ci=estimate_ani_ci, + ) + ) return x @@ -612,12 +722,14 @@ def search_databases_with_abund_query(query, databases, **kwargs): results = [] found_md5 = set() - if kwargs.get('do_containment') or kwargs.get('do_max_containment'): + if kwargs.get("do_containment") or kwargs.get("do_max_containment"): raise TypeError("containment searches cannot be done with abund sketches") for db in databases: - search_iter = db.search_abund(query, **kwargs) # could return SearchResult here instead of tuple? - for (score, match, filename) in search_iter: + search_iter = db.search_abund( + query, **kwargs + ) # could return SearchResult here instead of tuple? + for score, match, filename in search_iter: md5 = match.md5sum() if md5 not in found_md5: results.append((score, match, filename)) @@ -627,16 +739,16 @@ def search_databases_with_abund_query(query, databases, **kwargs): results.sort(key=lambda x: -x[0]) x = [] - for (score, match, filename) in results: - x.append(SearchResult(query, match, - similarity=score, - filename = filename)) + for score, match, filename in results: + x.append(SearchResult(query, match, similarity=score, filename=filename)) return x + ### ### gather code ### + def _find_best(counters, query, threshold_bp): """ Search for the best containment, return precisely one match. @@ -667,8 +779,17 @@ def _find_best(counters, query, threshold_bp): class GatherDatabases: "Iterator object for doing gather/min-set-cov." - def __init__(self, query, counters, *, - threshold_bp=0, ignore_abundance=False, noident_mh=None, ident_mh=None, estimate_ani_ci=False): + def __init__( + self, + query, + counters, + *, + threshold_bp=0, + ignore_abundance=False, + noident_mh=None, + ident_mh=None, + estimate_ani_ci=False, + ): # track original query information for later usage? track_abundance = query.minhash.track_abundance and not ignore_abundance self.orig_query = query @@ -683,7 +804,7 @@ def __init__(self, query, counters, *, if track_abundance: orig_query_abunds = query_hashes else: - orig_query_abunds = { k: 1 for k in query_hashes } + orig_query_abunds = {k: 1 for k in query_hashes} # adjust for not found... 
if noident_mh is None: # create empty @@ -702,7 +823,7 @@ def __init__(self, query, counters, *, query = query.to_mutable() query.minhash = orig_query_mh - cmp_scaled = query.minhash.scaled # initialize with resolution of query + cmp_scaled = query.minhash.scaled # initialize with resolution of query self.result_n = 0 self.query = query @@ -713,10 +834,12 @@ def __init__(self, query, counters, *, self.orig_query_mh = orig_query_mh self.orig_query_abunds = orig_query_abunds - self.cmp_scaled = 0 # initialize with something very low! + self.cmp_scaled = 0 # initialize with something very low! self._update_scaled(cmp_scaled) - self.estimate_ani_ci = estimate_ani_ci # by default, do not report ANI confidence intervals + self.estimate_ani_ci = ( + estimate_ani_ci # by default, do not report ANI confidence intervals + ) def _update_scaled(self, scaled): max_scaled = max(self.cmp_scaled, scaled) @@ -729,10 +852,12 @@ def _update_scaled(self, scaled): # NOTE: orig_query_abunds can be used w/o downsampling orig_query_abunds = self.orig_query_abunds - self.noident_query_sum_abunds = sum(( orig_query_abunds[k] \ - for k in self.noident_mh.hashes )) - self.total_weighted_hashes = sum(( orig_query_abunds[k] \ - for k in self.orig_query_mh.hashes )) + self.noident_query_sum_abunds = sum( + orig_query_abunds[k] for k in self.noident_mh.hashes + ) + self.total_weighted_hashes = sum( + orig_query_abunds[k] for k in self.orig_query_mh.hashes + ) self.total_weighted_hashes += self.noident_query_sum_abunds if max_scaled != scaled: @@ -753,7 +878,6 @@ def __next__(self): # may be changed: counters = self.counters - cmp_scaled = self.cmp_scaled # will not be changed:: threshold_bp = self.threshold_bp @@ -762,7 +886,7 @@ def __next__(self): # find the best match! best_result, intersect_mh = _find_best(counters, query, threshold_bp) - if not best_result: # no matches at all for this cutoff! + if not best_result: # no matches at all for this cutoff! 
            raise StopIteration

        best_match = best_result.signature
@@ -794,24 +918,26 @@ def __next__(self):

         # compute weighted information for remaining query hashes
         query_hashes = set(new_query_mh.hashes)
-        n_weighted_missed = sum((orig_query_abunds[k] for k in query_hashes))
+        n_weighted_missed = sum(orig_query_abunds[k] for k in query_hashes)
         n_weighted_missed += self.noident_query_sum_abunds
         sum_weighted_found = total_weighted_hashes - n_weighted_missed

         # build a GatherResult
-        result = GatherResult(self.orig_query, best_match,
-                              cmp_scaled=scaled,
-                              filename=filename,
-                              gather_result_rank=self.result_n,
-                              gather_querymh=query.minhash,
-                              ignore_abundance=not self.track_abundance,
-                              threshold_bp=threshold_bp,
-                              orig_query_len=orig_query_len,
-                              orig_query_abunds=self.orig_query_abunds,
-                              estimate_ani_ci=self.estimate_ani_ci,
-                              sum_weighted_found=sum_weighted_found,
-                              total_weighted_hashes=total_weighted_hashes,
-                              )
+        result = GatherResult(
+            self.orig_query,
+            best_match,
+            cmp_scaled=scaled,
+            filename=filename,
+            gather_result_rank=self.result_n,
+            gather_querymh=query.minhash,
+            ignore_abundance=not self.track_abundance,
+            threshold_bp=threshold_bp,
+            orig_query_len=orig_query_len,
+            orig_query_abunds=self.orig_query_abunds,
+            estimate_ani_ci=self.estimate_ani_ci,
+            sum_weighted_found=sum_weighted_found,
+            total_weighted_hashes=total_weighted_hashes,
+        )
         self.result_n += 1
         self.query = new_query
@@ -823,6 +949,7 @@
 ### prefetch code
 ###

+
 def prefetch_database(query, database, threshold_bp, *, estimate_ani_ci=False):
     """
     Find all matches to `query_mh` >= `threshold_bp` in `database`.
@@ -830,7 +957,14 @@ def prefetch_database(query, database, threshold_bp, *, estimate_ani_ci=False):
     scaled = query.minhash.scaled
     assert scaled
     # iterate over all signatures in database, find matches
-    for result in database.prefetch(query, threshold_bp): # future: could return PrefetchResult directly here
-        result = PrefetchResult(query, result.signature, threshold_bp=threshold_bp, estimate_ani_ci=estimate_ani_ci)
+    for result in database.prefetch(
+        query, threshold_bp
+    ):  # future: could return PrefetchResult directly here
+        result = PrefetchResult(
+            query,
+            result.signature,
+            threshold_bp=threshold_bp,
+            estimate_ani_ci=estimate_ani_ci,
+        )
         assert result.pass_threshold
         yield result
diff --git a/src/sourmash/sig/__init__.py b/src/sourmash/sig/__init__.py
index 0fafe39246..441c8fa37f 100644
--- a/src/sourmash/sig/__init__.py
+++ b/src/sourmash/sig/__init__.py
@@ -1,2 +1,2 @@
-from .__main__ import * # bring all functions into top-level
+from .__main__ import *  # bring all functions into top-level
 from . import grep
diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py
index d10e8745f9..1a89d6239f 100644
--- a/src/sourmash/sig/__main__.py
+++ b/src/sourmash/sig/__main__.py
@@ -1,47 +1,55 @@
 """
 Command-line entry point for 'python -m sourmash.sig'
 """
-__all__ = ["cat",
-           "split",
-           "describe",
-           "manifest",
-           "overlap",
-           "merge",
-           "intersect",
-           "inflate",
-           "subtract",
-           "rename",
-           "extract",
-           "filter",
-           "flatten",
-           "downsample",
-           "ingest",
-           "export",
-           "kmers",
-           "fileinfo",
-           "check",
-           "collect"]
+__all__ = [
+    "cat",
+    "split",
+    "describe",
+    "manifest",
+    "overlap",
+    "merge",
+    "intersect",
+    "inflate",
+    "subtract",
+    "rename",
+    "extract",
+    "filter",
+    "flatten",
+    "downsample",
+    "ingest",
+    "export",
+    "kmers",
+    "fileinfo",
+    "check",
+    "collect",
+]

 import sys
 import csv
 import json
 import os
 from collections import defaultdict, namedtuple, Counter
-import json
 import re

 import screed
 import sourmash
 from sourmash.sourmash_args import FileOutput
-from sourmash.logging import (set_quiet, error, notify, print_results, debug,
-                              debug_literal, _debug)
+from sourmash.logging import (
+    set_quiet,
+    error,
+    notify,
+    print_results,
+    debug,
+    debug_literal,
+    _debug,
+)
 from sourmash import sourmash_args
 from sourmash.minhash import _get_max_hash_for_scaled
 from sourmash.manifest import CollectionManifest

-usage='''
+usage = """
 sourmash signature <command> [<args>] - manipulate/work with signature files.

 ** Commands can be:
@@ -67,15 +75,19 @@
 ** Use '-h' to get subcommand-specific help, e.g.

    sourmash signature merge -h
-'''
+"""


 def _check_abundance_compatibility(sig1, sig2):
     if sig1.minhash.track_abundance != sig2.minhash.track_abundance:
-        raise ValueError("incompatible signatures: track_abundance is {} in first sig, {} in second".format(sig1.minhash.track_abundance, sig2.minhash.track_abundance))
+        raise ValueError(
+            "incompatible signatures: track_abundance is {} in first sig, {} in second".format(
+                sig1.minhash.track_abundance, sig2.minhash.track_abundance
+            )
+        )


-def _extend_signatures_with_from_file(args, *, target_attr='signatures'):
+def _extend_signatures_with_from_file(args, *, target_attr="signatures"):
     # extend input signatures with --from-file
     if args.from_file:
         more_files = sourmash_args.load_pathlist_from_file(args.from_file)
@@ -109,7 +121,7 @@ def cat(args):
     picklist = sourmash_args.load_picklist(args)
     pattern_search = sourmash_args.load_include_exclude_db_patterns(args)

-    encountered_md5sums = defaultdict(int) # used by --unique
+    encountered_md5sums = defaultdict(int)  # used by --unique

     # open output for saving sigs
     save_sigs = sourmash_args.SaveSignaturesToLocation(args.output)
@@ -119,14 +131,16 @@
     # start loading!
    progress = sourmash_args.SignatureLoadingProgress()

-    loader = sourmash_args.load_many_signatures(args.signatures,
-                                                ksize=args.ksize,
-                                                moltype=moltype,
-                                                picklist=picklist,
-                                                progress=progress,
-                                                yield_all_files=args.force,
-                                                force=args.force,
-                                                pattern=pattern_search)
+    loader = sourmash_args.load_many_signatures(
+        args.signatures,
+        ksize=args.ksize,
+        moltype=moltype,
+        picklist=picklist,
+        progress=progress,
+        yield_all_files=args.force,
+        force=args.force,
+        pattern=pattern_search,
+    )
     for ss, sigloc in loader:
         md5 = ss.md5sum()
         encountered_md5sums[md5] += 1
@@ -135,19 +149,19 @@

         save_sigs.add(ss)

-    notify(f'loaded {len(save_sigs)} signatures total.')
+    notify(f"loaded {len(save_sigs)} signatures total.")

     if picklist:
         sourmash_args.report_picklist(args, picklist)

     save_sigs.close()

-    notify(f'output {len(save_sigs)} signatures')
+    notify(f"output {len(save_sigs)} signatures")

-    multiple_md5 = [ 1 for cnt in encountered_md5sums.values() if cnt > 1 ]
+    multiple_md5 = [1 for cnt in encountered_md5sums.values() if cnt > 1]
     if multiple_md5:
-        notify(f'encountered {sum(multiple_md5)} MinHashes multiple times')
+        notify(f"encountered {sum(multiple_md5)} MinHashes multiple times")
         if args.unique:
-            notify('...and removed the duplicates, because --unique was specified.')
+            notify("...and removed the duplicates, because --unique was specified.")


 def split(args):
@@ -160,50 +174,59 @@ def split(args):
     _extend_signatures_with_from_file(args)

     output_names = set()
-    output_scaled_template = '{md5sum}.k={ksize}.scaled={scaled}.{moltype}.dup={dup}.{basename}' + args.extension
-    output_num_template = '{md5sum}.k={ksize}.num={num}.{moltype}.dup={dup}.{basename}' + args.extension
+    output_scaled_template = (
+        "{md5sum}.k={ksize}.scaled={scaled}.{moltype}.dup={dup}.{basename}"
+        + args.extension
+    )
+    output_num_template = (
+        "{md5sum}.k={ksize}.num={num}.{moltype}.dup={dup}.{basename}" + args.extension
+    )

     if args.output_dir:
         if not os.path.exists(args.output_dir):
-            notify(f'Creating --output-dir {args.output_dir}')
+            notify(f"Creating --output-dir {args.output_dir}")
             os.mkdir(args.output_dir)

     progress = sourmash_args.SignatureLoadingProgress()

-    loader = sourmash_args.load_many_signatures(args.signatures,
-                                                ksize=args.ksize,
-                                                moltype=moltype,
-                                                picklist=picklist,
-                                                progress=progress,
-                                                yield_all_files=args.force,
-                                                force=args.force)
+    loader = sourmash_args.load_many_signatures(
+        args.signatures,
+        ksize=args.ksize,
+        moltype=moltype,
+        picklist=picklist,
+        progress=progress,
+        yield_all_files=args.force,
+        force=args.force,
+    )

     for sig, sigloc in loader:
         # save each file individually --
         md5sum = sig.md5sum()[:8]
         minhash = sig.minhash
         basename = os.path.basename(sig.filename)
-        if not basename or basename == '-':
-            basename = 'none'
-
-        params = dict(basename=basename,
-                      md5sum=md5sum,
-                      scaled=minhash.scaled,
-                      ksize=minhash.ksize,
-                      num=minhash.num,
-                      moltype=minhash.moltype)
+        if not basename or basename == "-":
+            basename = "none"
+
+        params = dict(
+            basename=basename,
+            md5sum=md5sum,
+            scaled=minhash.scaled,
+            ksize=minhash.ksize,
+            num=minhash.num,
+            moltype=minhash.moltype,
+        )

         if minhash.scaled:
             output_template = output_scaled_template
-        else: # num
+        else:  # num
             assert minhash.num
             output_template = output_num_template

         # figure out if this is duplicate, build unique filename
         n = 0
-        params['dup'] = n
+        params["dup"] = n
         output_name = output_template.format(**params)
         while output_name in output_names:
-            params['dup'] = n
+            params["dup"] = n
             output_name = output_template.format(**params)
             n += 1
@@ -218,9 +241,9 @@ def split(args):

         # save!
         with sourmash_args.SaveSignaturesToLocation(output_name) as save_sigs:
             save_sigs.add(sig)
-            notify(f'writing sig to {output_name}')
+            notify(f"writing sig to {output_name}")

-    notify(f'loaded and split {len(progress)} signatures total.')
+    notify(f"loaded and split {len(progress)} signatures total.")

     if picklist:
         sourmash_args.report_picklist(args, picklist)
@@ -242,24 +265,39 @@
     csv_obj = sourmash_args.FileOutputCSV(args.csv)
     csv_fp = csv_obj.open()

-    w = csv.DictWriter(csv_fp,
-                       ['signature_file', 'md5', 'ksize', 'moltype',
-                        'num', 'scaled', 'n_hashes', 'seed',
-                        'with_abundance', 'name', 'filename', 'license',
-                        'sum_hashes'],
-                       extrasaction='ignore')
+    w = csv.DictWriter(
+        csv_fp,
+        [
+            "signature_file",
+            "md5",
+            "ksize",
+            "moltype",
+            "num",
+            "scaled",
+            "n_hashes",
+            "seed",
+            "with_abundance",
+            "name",
+            "filename",
+            "license",
+            "sum_hashes",
+        ],
+        extrasaction="ignore",
+    )
     w.writeheader()

     # start loading!
     progress = sourmash_args.SignatureLoadingProgress()

-    loader = sourmash_args.load_many_signatures(args.signatures,
-                                                ksize=args.ksize,
-                                                moltype=moltype,
-                                                picklist=picklist,
-                                                progress=progress,
-                                                yield_all_files=args.force,
-                                                force=args.force,
-                                                pattern=pattern_search)
+    loader = sourmash_args.load_many_signatures(
+        args.signatures,
+        ksize=args.ksize,
+        moltype=moltype,
+        picklist=picklist,
+        progress=progress,
+        yield_all_files=args.force,
+        force=args.force,
+        pattern=pattern_search,
+    )

     for sig, location in loader:
         # extract info, write as appropriate.
@@ -285,7 +323,8 @@ def describe(args):
         if w:
             w.writerow(locals())

-        print_results('''\
+        print_results(
+            """\
 ---
 signature filename: {location}
 signature: {p_name}
@@ -295,7 +334,9 @@
 size: {n_hashes}
 sum hashes: {sum_hashes}
 signature license: {license}
-''', **locals())
+""",
+            **locals(),
+        )

     if csv_obj:
         csv_obj.close()
@@ -311,9 +352,10 @@ def manifest(args):
     set_quiet(args.quiet, args.debug)

     try:
-        loader = sourmash_args.load_file_as_index(args.location,
-                                                  yield_all_files=args.force)
-    except ValueError as exc:
+        loader = sourmash_args.load_file_as_index(
+            args.location, yield_all_files=args.force
+        )
+    except ValueError:
         error(f"Cannot open '{args.location}' as a sourmash signature collection.")
         error("Use -d/--debug for details.")
         sys.exit(-1)
@@ -325,12 +367,11 @@
     else:
         debug("sig manifest: forcing rebuild.")

-    manifest = sourmash_args.get_manifest(loader, require=True,
-                                          rebuild=rebuild)
+    manifest = sourmash_args.get_manifest(loader, require=True, rebuild=rebuild)

-    manifest.write_to_filename(args.output,
-                               database_format=args.manifest_format,
-                               ok_if_exists=args.force)
+    manifest.write_to_filename(
+        args.output, database_format=args.manifest_format, ok_if_exists=args.force
+    )
     notify(f"manifest contains {len(manifest)} signatures total.")
     notify(f"wrote manifest to '{args.output}' ({args.manifest_format})")

@@ -343,12 +384,14 @@ def overlap(args):

     moltype = sourmash_args.calculate_moltype(args)

-    sig1 = sourmash.load_one_signature(args.signature1, ksize=args.ksize,
-                                       select_moltype=moltype)
-    sig2 = sourmash.load_one_signature(args.signature2, ksize=args.ksize,
-                                       select_moltype=moltype)
+    sig1 = sourmash.load_one_signature(
+        args.signature1, ksize=args.ksize, select_moltype=moltype
+    )
+    sig2 = sourmash.load_one_signature(
+        args.signature2, ksize=args.ksize, select_moltype=moltype
+    )

-    notify(f'loaded one signature each from {args.signature1} and {args.signature2}')
+    notify(f"loaded one signature each from {args.signature1} and {args.signature2}")

     try:
         similarity = sig1.similarity(sig2)
@@ -384,7 +427,8 @@ def overlap(args):
     disjoint_2 = len(hashes_2 - hashes_1)
     num_union = len(hashes_1.union(hashes_2))

-    print('''\
+    print(
+        """\
 first signature:
   signature filename: {sig1_file}
   signature: {name1}
@@ -408,7 +452,8 @@
   only in first: {disjoint_1}
   only in second: {disjoint_2}
   total (union): {num_union}
-'''.format(**locals()))
+""".format(**locals())
+    )


 def merge(args):
@@ -425,13 +470,15 @@ def merge(args):

     # start loading!
     progress = sourmash_args.SignatureLoadingProgress()

-    loader = sourmash_args.load_many_signatures(args.signatures,
-                                                ksize=args.ksize,
-                                                moltype=moltype,
-                                                picklist=picklist,
-                                                progress=progress,
-                                                yield_all_files=args.force,
-                                                force=args.force)
+    loader = sourmash_args.load_many_signatures(
+        args.signatures,
+        ksize=args.ksize,
+        moltype=moltype,
+        picklist=picklist,
+        progress=progress,
+        yield_all_files=args.force,
+        force=args.force,
+    )

     for sigobj, sigloc in loader:
         # first signature? initialize a bunch of stuff
@@ -452,8 +499,12 @@

                 mh.merge(sigobj_mh)
             except (TypeError, ValueError) as exc:
-                error("ERROR when merging signature '{}' ({}) from file {}",
-                      sigobj, sigobj.md5sum()[:8], sigloc)
+                error(
+                    "ERROR when merging signature '{}' ({}) from file {}",
+                    sigobj,
+                    sigobj.md5sum()[:8],
+                    sigloc,
+                )
                 error(str(exc))
                 sys.exit(-1)
@@ -466,7 +517,7 @@
     with sourmash_args.SaveSignaturesToLocation(args.output) as save_sigs:
         save_sigs.add(merged_sigobj)

-    notify(f'loaded and merged {len(progress)} signatures')
+    notify(f"loaded and merged {len(progress)} signatures")

     if picklist:
         sourmash_args.report_picklist(args, picklist)
@@ -488,13 +539,15 @@

     # start loading!
     progress = sourmash_args.SignatureLoadingProgress()

-    loader = sourmash_args.load_many_signatures(args.signatures,
-                                                ksize=args.ksize,
-                                                moltype=moltype,
-                                                picklist=picklist,
-                                                progress=progress,
-                                                yield_all_files=args.force,
-                                                force=args.force)
+    loader = sourmash_args.load_many_signatures(
+        args.signatures,
+        ksize=args.ksize,
+        moltype=moltype,
+        picklist=picklist,
+        progress=progress,
+        yield_all_files=args.force,
+        force=args.force,
+    )

     for sigobj, sigloc in loader:
         if first_sig is None:
@@ -519,10 +572,10 @@

     # borrow abundances from a signature?
     if args.abundances_from:
-        notify(f'loading signature from {args.abundances_from}, keeping abundances')
-        abund_sig = sourmash.load_one_signature(args.abundances_from,
-                                                ksize=args.ksize,
-                                                select_moltype=moltype)
+        notify(f"loading signature from {args.abundances_from}, keeping abundances")
+        abund_sig = sourmash.load_one_signature(
+            args.abundances_from, ksize=args.ksize, select_moltype=moltype
+        )
         if not abund_sig.minhash.track_abundance:
             error("--track-abundance not set on loaded signature?! exiting.")
             sys.exit(-1)
@@ -533,7 +586,7 @@
     with sourmash_args.SaveSignaturesToLocation(args.output) as save_sigs:
         save_sigs.add(intersect_sigobj)

-    notify(f'loaded and intersected {len(progress)} signatures')
+    notify(f"loaded and intersected {len(progress)} signatures")

     if picklist:
         sourmash_args.report_picklist(args, picklist)
@@ -546,9 +599,9 @@ def inflate(args):
     moltype = sourmash_args.calculate_moltype(args)
     picklist = sourmash_args.load_picklist(args)

-    inflate_sig = sourmash_args.load_query_signature(args.signature_from,
-                                                     ksize=args.ksize,
-                                                     select_moltype=moltype)
+    inflate_sig = sourmash_args.load_query_signature(
+        args.signature_from, ksize=args.ksize, select_moltype=moltype
+    )
     inflate_from_mh = inflate_sig.minhash
     ksize = inflate_from_mh.ksize
     moltype = inflate_from_mh.moltype
@@ -560,19 +613,20 @@
     # start loading!
     progress = sourmash_args.SignatureLoadingProgress()

-    loader = sourmash_args.load_many_signatures(args.other_sigs,
-                                                ksize=ksize,
-                                                moltype=moltype,
-                                                picklist=picklist,
-                                                progress=progress,
-                                                yield_all_files=args.force,
-                                                force=args.force)
+    loader = sourmash_args.load_many_signatures(
+        args.other_sigs,
+        ksize=ksize,
+        moltype=moltype,
+        picklist=picklist,
+        progress=progress,
+        yield_all_files=args.force,
+        force=args.force,
+    )

     with sourmash_args.SaveSignaturesToLocation(args.output) as save_sigs:
         for sigobj, sigloc in loader:
             inflated_mh = sigobj.minhash.inflate(inflate_from_mh)
-            inflated_sigobj = sourmash.SourmashSignature(inflated_mh,
-                                                         name=sigobj.name)
+            inflated_sigobj = sourmash.SourmashSignature(inflated_mh, name=sigobj.name)

             save_sigs.add(inflated_sigobj)

@@ -580,7 +634,7 @@
         error("no signatures to inflate!?")
         sys.exit(-1)

-    notify(f'loaded and intersected {len(save_sigs)} signatures')
+    notify(f"loaded and intersected {len(save_sigs)} signatures")

     if picklist:
         sourmash_args.report_picklist(args, picklist)
@@ -593,38 +647,41 @@
     moltype = sourmash_args.calculate_moltype(args)

     from_sigfile = args.signature_from
-    from_sigobj = sourmash.load_one_signature(from_sigfile, ksize=args.ksize, select_moltype=moltype)
+    from_sigobj = sourmash.load_one_signature(
+        from_sigfile, ksize=args.ksize, select_moltype=moltype
+    )

-    if args.abundances_from: # it's ok to work with abund signatures if -A.
+    if args.abundances_from:  # it's ok to work with abund signatures if -A.
         args.flatten = True

     from_mh = from_sigobj.minhash
     if from_mh.track_abundance and not args.flatten:
-        error('Cannot use subtract on signatures with abundance tracking, sorry!')
+        error("Cannot use subtract on signatures with abundance tracking, sorry!")
         sys.exit(1)

     subtract_mins = set(from_mh.hashes)

-    notify(f'loaded signature from {from_sigfile}...', end='\r')
+    notify(f"loaded signature from {from_sigfile}...", end="\r")

     progress = sourmash_args.SignatureLoadingProgress()

     for sigfile in args.subtraction_sigs:
-        for sigobj in sourmash_args.load_file_as_signatures(sigfile,
-                                                            ksize=args.ksize,
-                                                            select_moltype=moltype,
-                                                            progress=progress):
+        for sigobj in sourmash_args.load_file_as_signatures(
+            sigfile, ksize=args.ksize, select_moltype=moltype, progress=progress
+        ):
             if not sigobj.minhash.is_compatible(from_mh):
                 error("incompatible minhashes; specify -k and/or molecule type.")
                 sys.exit(-1)

             if sigobj.minhash.track_abundance and not args.flatten:
-                error('Cannot use subtract on signatures with abundance tracking, sorry!')
+                error(
+                    "Cannot use subtract on signatures with abundance tracking, sorry!"
+ ) sys.exit(1) subtract_mins -= set(sigobj.minhash.hashes) - notify(f'loaded and subtracted signatures from {sigfile}...', end='\r') + notify(f"loaded and subtracted signatures from {sigfile}...", end="\r") if not len(progress): error("no signatures to subtract!?") @@ -636,10 +693,10 @@ def subtract(args): # borrow abundances from somewhere? if args.abundances_from: - notify(f'loading signature from {args.abundances_from}, keeping abundances') - abund_sig = sourmash.load_one_signature(args.abundances_from, - ksize=args.ksize, - select_moltype=moltype) + notify(f"loading signature from {args.abundances_from}, keeping abundances") + abund_sig = sourmash.load_one_signature( + args.abundances_from, ksize=args.ksize, select_moltype=moltype + ) if not abund_sig.minhash.track_abundance: error("--track-abundance not set on loaded signature?! exiting.") sys.exit(-1) @@ -651,7 +708,7 @@ def subtract(args): with sourmash_args.SaveSignaturesToLocation(args.output) as save_sigs: save_sigs.add(subtract_sigobj) - notify(f'loaded and subtracted {len(progress)} signatures') + notify(f"loaded and subtracted {len(progress)} signatures") def rename(args): @@ -669,14 +726,16 @@ def rename(args): # start loading! progress = sourmash_args.SignatureLoadingProgress() - loader = sourmash_args.load_many_signatures(args.signatures, - ksize=args.ksize, - moltype=moltype, - picklist=picklist, - progress=progress, - yield_all_files=args.force, - force=args.force, - pattern=pattern_search) + loader = sourmash_args.load_many_signatures( + args.signatures, + ksize=args.ksize, + moltype=moltype, + picklist=picklist, + progress=progress, + yield_all_files=args.force, + force=args.force, + pattern=pattern_search, + ) for sigobj, sigloc in loader: sigobj = sigobj.to_mutable() @@ -703,14 +762,15 @@ def extract(args): # further filtering on md5 or name? filter_fn = None if args.md5 is not None or args.name is not None: + def filter_fn(row): # match? keep = False if args.name: - name = row['name'] or row['filename'] + name = row["name"] or row["filename"] if args.name in name: keep = True - if args.md5 and args.md5 in row['md5']: + if args.md5 and args.md5 in row["md5"]: keep = True return keep @@ -722,13 +782,11 @@ def filter_fn(row): # start loading! total_rows_examined = 0 for filename in args.signatures: - idx = sourmash_args.load_file_as_index(filename, - yield_all_files=args.force) + idx = sourmash_args.load_file_as_index(filename, yield_all_files=args.force) idx = idx.select(ksize=args.ksize, moltype=moltype) - idx = sourmash_args.apply_picklist_and_pattern(idx, picklist, - pattern_search) + idx = sourmash_args.apply_picklist_and_pattern(idx, picklist, pattern_search) manifest = sourmash_args.get_manifest(idx) total_rows_examined += len(manifest) @@ -743,7 +801,9 @@ def filter_fn(row): try: idx = idx.select(picklist=sub_picklist) except ValueError: - error("** This input collection doesn't support 'extract' with picklists or patterns.") + error( + "** This input collection doesn't support 'extract' with picklists or patterns." + ) error("** EXITING.") error("**") error("** You can use 'sourmash sig cat' with a picklist or pattern,") @@ -779,31 +839,29 @@ def filter(args): save_sigs.open() for filename in args.signatures: - siglist = sourmash_args.load_file_as_signatures(filename, - ksize=args.ksize, - select_moltype=moltype, - progress=progress) + siglist = sourmash_args.load_file_as_signatures( + filename, ksize=args.ksize, select_moltype=moltype, progress=progress + ) siglist = list(siglist) # select! 
if args.md5 is not None: - siglist = [ ss for ss in siglist if args.md5 in ss.md5sum() ] + siglist = [ss for ss in siglist if args.md5 in ss.md5sum()] if args.name is not None: - siglist = [ ss for ss in siglist if args.name in str(ss) ] + siglist = [ss for ss in siglist if args.name in str(ss)] for ss in siglist: mh = ss.minhash if not mh.track_abundance: - notify(f'ignoring signature {ss} - track_abundance not set.') + notify(f"ignoring signature {ss} - track_abundance not set.") continue abunds = mh.hashes abunds2 = {} for k, v in abunds.items(): if v >= args.min_abundance: - if args.max_abundance is None or \ - v <= args.max_abundance: - abunds2[k] = v + if args.max_abundance is None or v <= args.max_abundance: + abunds2[k] = v filtered_mh = mh.copy_and_clear() filtered_mh.set_abundances(abunds2) @@ -833,22 +891,24 @@ def flatten(args): # start loading! progress = sourmash_args.SignatureLoadingProgress() - loader = sourmash_args.load_many_signatures(args.signatures, - ksize=args.ksize, - moltype=moltype, - picklist=picklist, - progress=progress, - yield_all_files=args.force, - force=args.force) + loader = sourmash_args.load_many_signatures( + args.signatures, + ksize=args.ksize, + moltype=moltype, + picklist=picklist, + progress=progress, + yield_all_files=args.force, + force=args.force, + ) for ss, sigloc in loader: # select! if args.md5 is not None: if args.md5 not in ss.md5sum(): - continue # skip + continue # skip if args.name is not None: if args.name not in ss.name: - continue # skip + continue # skip ss = ss.to_mutable() ss.minhash = ss.minhash.flatten() @@ -872,11 +932,11 @@ def downsample(args): _extend_signatures_with_from_file(args) if not args.num_hashes and not args.scaled: - error('ERROR: must specify either --num or --scaled value') + error("ERROR: must specify either --num or --scaled value") sys.exit(-1) if args.num_hashes and args.scaled: - error('ERROR: cannot specify both --num and --scaled') + error("ERROR: cannot specify both --num and --scaled") sys.exit(-1) # open output for saving sigs @@ -885,13 +945,15 @@ def downsample(args): # start loading! progress = sourmash_args.SignatureLoadingProgress() - loader = sourmash_args.load_many_signatures(args.signatures, - ksize=args.ksize, - moltype=moltype, - picklist=picklist, - progress=progress, - yield_all_files=args.force, - force=args.force) + loader = sourmash_args.load_many_signatures( + args.signatures, + ksize=args.ksize, + moltype=moltype, + picklist=picklist, + progress=progress, + yield_all_files=args.force, + force=args.force, + ) for ss, sigloc in loader: sigobj = ss.to_mutable() mh = sigobj.minhash @@ -906,7 +968,9 @@ def downsample(args): max_hash = _get_max_hash_for_scaled(args.scaled) mins = mh.hashes if max(mins) < max_hash: - raise ValueError("this num MinHash does not have enough hashes to convert it into a scaled MinHash.") + raise ValueError( + "this num MinHash does not have enough hashes to convert it into a scaled MinHash." 
+                )

                 mh_new = mh.copy()
                 _set_num_scaled(mh_new, 0, args.scaled)
@@ -923,7 +987,6 @@ def downsample(args):
                 mh_new = mh.copy()
                 _set_num_scaled(mh_new, args.num_hashes, 0)

-
         sigobj.minhash = mh_new
         save_sigs.add(sigobj)
@@ -944,7 +1007,7 @@ def ingest(args):
     siglist = []
     if args.csv:
         for filename in args.filenames:
-            with open(filename, newline='') as csv_fp:
+            with open(filename, newline="") as csv_fp:
                 reader = csv.reader(csv_fp)
                 siglist = []
                 for row in reader:
@@ -952,34 +1015,34 @@
                     hashseed = int(row[1])

                     # only support a limited import type, for now ;)
-                    assert hashfn == 'murmur64'
+                    assert hashfn == "murmur64"
                     assert hashseed == 42

                     _, _, ksize, name, hashes = row
                     ksize = int(ksize)

                     hashes = hashes.strip()
-                    hashes = list(map(int, hashes.split(' ' )))
+                    hashes = list(map(int, hashes.split(" ")))

                     e = sourmash.MinHash(len(hashes), ksize)
                     e.add_many(hashes)

                     s = sourmash.SourmashSignature(e, filename=name)
                     siglist.append(s)
-                    notify(f'loaded signature: {name} {s.md5sum()[:8]}')
+                    notify(f"loaded signature: {name} {s.md5sum()[:8]}")
     else:
         for filename in args.filenames:
             with open(filename) as fp:
                 x = json.loads(fp.read())

-            ksize = x['kmer']
-            num = x['sketchSize']
+            ksize = x["kmer"]
+            num = x["sketchSize"]

-            assert x['hashType'] == "MurmurHash3_x64_128"
-            assert x['hashBits'] == 64
-            assert x['hashSeed'] == 42
+            assert x["hashType"] == "MurmurHash3_x64_128"
+            assert x["hashBits"] == 64
+            assert x["hashSeed"] == 42

-            xx = x['sketches'][0]
-            hashes = xx['hashes']
+            xx = x["sketches"][0]
+            hashes = xx["hashes"]

             mh = sourmash.MinHash(ksize=ksize, n=num, is_protein=False)
             mh.add_many(hashes)
@@ -987,7 +1050,7 @@
             s = sourmash.SourmashSignature(mh, filename=filename)
             siglist.append(s)

-    notify(f'saving {len(siglist)} signatures to JSON')
+    notify(f"saving {len(siglist)} signatures to JSON")
     with sourmash_args.SaveSignaturesToLocation(args.output) as save_sigs:
         save_sigs.add_many(siglist)
@@ -999,24 +1062,23 @@ def export(args):
     set_quiet(args.quiet)
     moltype = sourmash_args.calculate_moltype(args)

-    query = sourmash_args.load_query_signature(args.filename,
-                                               ksize=args.ksize,
-                                               select_moltype=moltype,
-                                               select_md5=args.md5)
+    query = sourmash_args.load_query_signature(
+        args.filename, ksize=args.ksize, select_moltype=moltype, select_md5=args.md5
+    )

     mh = query.minhash
     x = {}
-    x['kmer'] = mh.ksize
-    x['sketchSize'] = len(mh)
+    x["kmer"] = mh.ksize
+    x["sketchSize"] = len(mh)

-    x['hashType'] = "MurmurHash3_x64_128"
-    x['hashBits'] = 64
-    x['hashSeed'] = mh.seed
+    x["hashType"] = "MurmurHash3_x64_128"
+    x["hashBits"] = 64
+    x["hashSeed"] = mh.seed

     ll = list(mh.hashes)
-    x['sketches'] = [{ 'hashes': ll }]
+    x["sketches"] = [{"hashes": ll}]

-    with FileOutput(args.output, 'wt') as fp:
+    with FileOutput(args.output, "wt") as fp:
         print(json.dumps(x), file=fp)

     notify(f"exported signature {query} ({query.md5sum()[:8]})")
@@ -1035,16 +1097,17 @@ def kmers(args):
     first_sig = None
     query_mh = None

-
     # start loading!
     progress = sourmash_args.SignatureLoadingProgress()

-    loader = sourmash_args.load_many_signatures(args.signatures,
-                                                ksize=args.ksize,
-                                                moltype=moltype,
-                                                picklist=picklist,
-                                                progress=progress,
-                                                yield_all_files=args.force,
-                                                force=args.force)
+    loader = sourmash_args.load_many_signatures(
+        args.signatures,
+        ksize=args.ksize,
+        moltype=moltype,
+        picklist=picklist,
+        progress=progress,
+        yield_all_files=args.force,
+        force=args.force,
+    )

     for sigobj, sigloc in loader:
         # first signature? initialize a bunch of stuff
@@ -1061,8 +1124,12 @@ def kmers(args):

             query_mh.merge(sigobj_mh)
         except (TypeError, ValueError) as exc:
-            error("ERROR when merging signature '{}' ({}) from file {}",
-                  sigobj, sigobj.md5sum()[:8], sigloc)
+            error(
+                "ERROR when merging signature '{}' ({}) from file {}",
+                sigobj,
+                sigobj.md5sum()[:8],
+                sigloc,
+            )
             error(str(exc))
             sys.exit(-1)
@@ -1075,13 +1142,13 @@
         sourmash_args.report_picklist(args, picklist)

     is_protein = False
-    if query_mh.moltype == 'DNA':
+    if query_mh.moltype == "DNA":
         if args.translate:
             error("ERROR: cannot use --translate with DNA sketches.")
             sys.exit(-1)
     else:
         is_protein = True
-        if args.translate: # input sequence is DNA
+        if args.translate:  # input sequence is DNA
             is_protein = False

     if not query_mh:
@@ -1089,8 +1156,10 @@
         sys.exit(-1)

     notify("")
-    notify(f"merged signature has the following properties:")
-    notify(f"k={query_mh.ksize} molecule={query_mh.moltype} num={query_mh.num} scaled={query_mh.scaled} seed={query_mh.seed}")
+    notify("merged signature has the following properties:")
+    notify(
+        f"k={query_mh.ksize} molecule={query_mh.moltype} num={query_mh.num} scaled={query_mh.scaled} seed={query_mh.seed}"
+    )
     notify(f"total hashes in merged signature: {len(query_mh)}")
     notify("")
     notify("now processing sequence files for matches!")
@@ -1103,11 +1172,10 @@
     if args.save_kmers:
         save_kmers = sourmash_args.FileOutputCSV(args.save_kmers)
         save_kmers.open()
-        kmer_w = csv.DictWriter(save_kmers.fp,
-                                fieldnames=['sequence_file',
-                                            'sequence_name',
-                                            'kmer',
-                                            'hashval'])
+        kmer_w = csv.DictWriter(
+            save_kmers.fp,
+            fieldnames=["sequence_file", "sequence_name", "kmer", "hashval"],
+        )
         kmer_w.writeheader()

     save_seqs = None
@@ -1117,7 +1185,7 @@

     # figure out protein vs dna
     is_protein = False
-    if query_mh.moltype != 'DNA':
+    if query_mh.moltype != "DNA":
         if not args.translate:
             is_protein = True

@@ -1143,12 +1211,11 @@
                     seq_mh.add_protein(record.sequence)
                 else:
                     try:
-                        seq_mh.add_sequence(record.sequence,
-                                            not args.check_sequence)
+                        seq_mh.add_sequence(record.sequence, not args.check_sequence)
                     except ValueError as exc:
                         seqname = record.name
                         if len(seqname) > 40:
-                            seqname = seqname[:37] + '...'
+                            seqname = seqname[:37] + "..."
                         notify(f"ERROR in sequence '{seqname}', file '{filename}'")
                         notify(str(exc))
                         if args.force:
@@ -1169,15 +1236,19 @@
                 # output matching k-mers:
                 if kmer_w:
                     seq = record.sequence
-                    kh_iter = seq_mh.kmers_and_hashes(seq, force=False,
-                                                      is_protein=is_protein)
+                    kh_iter = seq_mh.kmers_and_hashes(
+                        seq, force=False, is_protein=is_protein
+                    )
                     for kmer, hashval in kh_iter:
                         if hashval in query_mh.hashes:
                             found_mh.add_hash(hashval)
                             n_kmers_found += 1
-                            d = dict(sequence_file=filename,
-                                     sequence_name=record.name,
-                                     kmer=kmer, hashval=hashval)
+                            d = dict(
+                                sequence_file=filename,
+                                sequence_name=record.name,
+                                kmer=kmer,
+                                hashval=hashval,
+                            )
                             kmer_w.writerow(d)

                 # add seq_mh to found_mh
@@ -1188,7 +1259,9 @@
             n_bp_searched += len(record.sequence)

             if n_bp_searched >= progress_threshold:
-                notify(f"... searched {n_bp_searched} from {n_files_searched} files so far")
+                notify(
+                    f"... searched {n_bp_searched} from {n_files_searched} files so far"
+                )
                 while n_bp_searched >= progress_threshold:
                     progress_threshold += progress_interval
@@ -1205,10 +1278,14 @@

     # ...and report!
notify("DONE.") - notify(f"searched {n_sequences_searched} sequences from {n_files_searched} files, containing a total of {format_bp(n_bp_searched)}.") + notify( + f"searched {n_sequences_searched} sequences from {n_files_searched} files, containing a total of {format_bp(n_bp_searched)}." + ) if save_seqs: - notify(f"matched and saved a total of {n_sequences_found} sequences with {format_bp(n_bp_saved)}.") + notify( + f"matched and saved a total of {n_sequences_found} sequences with {format_bp(n_bp_saved)}." + ) if kmer_w: notify(f"matched and saved a total of {n_kmers_found} k-mers.") @@ -1226,7 +1303,7 @@ def kmers(args): notify("NOTE: see --save-kmers or --save-sequences for output options.") -_SketchInfo = namedtuple('_SketchInfo', 'ksize, moltype, scaled, num, abund') +_SketchInfo = namedtuple("_SketchInfo", "ksize, moltype, scaled, num, abund") def _summarize_manifest(manifest): @@ -1237,22 +1314,26 @@ def _summarize_manifest(manifest): counter = Counter() hashcounts = Counter() for row in manifest.rows: - ski = _SketchInfo(ksize=row['ksize'], moltype=row['moltype'], - scaled=row['scaled'], num=row['num'], - abund=row['with_abundance']) + ski = _SketchInfo( + ksize=row["ksize"], + moltype=row["moltype"], + scaled=row["scaled"], + num=row["num"], + abund=row["with_abundance"], + ) counter[ski] += 1 - hashcounts[ski] += row['n_hashes'] - total_size += row['n_hashes'] + hashcounts[ski] += row["n_hashes"] + total_size += row["n_hashes"] # store in info_d - info_d['total_hashes'] = total_size + info_d["total_hashes"] = total_size sketch_info = [] for ski, count in counter.items(): sketch_d = dict(ski._asdict()) - sketch_d['count'] = count - sketch_d['n_hashes'] = hashcounts[ski] + sketch_d["count"] = count + sketch_d["n_hashes"] = hashcounts[ski] sketch_info.append(sketch_d) - info_d['sketch_info'] = sketch_info + info_d["sketch_info"] = sketch_info return info_d @@ -1271,22 +1352,21 @@ def fileinfo(args): # load as index! 
try: notify(f"** loading from '{args.path}'") - idx = sourmash_args.load_file_as_index(args.path, - yield_all_files=args.force) + idx = sourmash_args.load_file_as_index(args.path, yield_all_files=args.force) except ValueError: error(f"Cannot open '{args.path}' as a sourmash signature collection.") error("Use -d/--debug for details.") sys.exit(-1) - print_bool = lambda x: "yes" if x else "no" - print_none = lambda x: "n/a" if x is None else x + def print_bool(x): + return "yes" if x else "no" info_d = {} - info_d['path_filetype'] = type(idx).__name__ - info_d['location'] = "" if not idx.location else idx.location - info_d['is_database'] = bool(idx.is_database) - info_d['has_manifest'] = bool(idx.manifest) - info_d['num_sketches'] = len(idx) + info_d["path_filetype"] = type(idx).__name__ + info_d["location"] = "" if not idx.location else idx.location + info_d["is_database"] = bool(idx.is_database) + info_d["has_manifest"] = bool(idx.manifest) + info_d["num_sketches"] = len(idx) if text_out: print_results(f"path filetype: {info_d['path_filetype']}") @@ -1298,8 +1378,9 @@ def fileinfo(args): # also have arg to fileinfo to force recalculation notify("** examining manifest...") - manifest = sourmash_args.get_manifest(idx, rebuild=args.rebuild_manifest, - require=False) + manifest = sourmash_args.get_manifest( + idx, rebuild=args.rebuild_manifest, require=False + ) if manifest is None: # actually can't find any file type to trigger this, but leaving it @@ -1313,9 +1394,9 @@ def fileinfo(args): print_results(f"total hashes: {info_d['total_hashes']}") print_results("summary of sketches:") - for ski in info_d['sketch_info']: - mh_type = f"num={ski['num']}" if ski['num'] else f"scaled={ski['scaled']}" - mh_abund = ", abund" if ski['abund'] else "" + for ski in info_d["sketch_info"]: + mh_type = f"num={ski['num']}" if ski["num"] else f"scaled={ski['scaled']}" + mh_abund = ", abund" if ski["abund"] else "" sketch_str = f"{ski['count']} sketches with {ski['moltype']}, k={ski['ksize']}, {mh_type}{mh_abund}" @@ -1331,10 +1412,11 @@ def check(args): check signature db(s) against a picklist. """ from sourmash.picklist import PickStyle + set_quiet(args.quiet, args.debug) moltype = sourmash_args.calculate_moltype(args) picklist = sourmash_args.load_picklist(args) - pattern_search = sourmash_args.load_include_exclude_db_patterns(args) + sourmash_args.load_include_exclude_db_patterns(args) _extend_signatures_with_from_file(args) if not picklist: @@ -1358,8 +1440,7 @@ def check(args): # start loading! total_rows_examined = 0 for filename in args.signatures: - idx = sourmash_args.load_file_as_index(filename, - yield_all_files=args.force) + idx = sourmash_args.load_file_as_index(filename, yield_all_files=args.force) idx = idx.select(ksize=args.ksize, moltype=moltype) @@ -1376,14 +1457,15 @@ def check(args): # rewrite locations so that each signature can be found by filename # of its container; this follows `sig collect` logic. 
-        rows = []
         for row in sub_manifest.rows:
-            row['internal_location'] = filename
+            row["internal_location"] = filename
             total_manifest_rows.add_row(row)

         # the len(sub_manifest) here should only be run when needed :)
         if _debug:
-            debug_literal(f"examined {len(new_manifest)} new rows, found {len(sub_manifest)} matching rows")
+            debug_literal(
+                f"examined {len(new_manifest)} new rows, found {len(sub_manifest)} matching rows"
+            )

     notify(f"loaded {total_rows_examined} signatures.")
@@ -1399,7 +1481,7 @@

         n_output = 0
         with sourmash_args.FileInputCSV(pickfile) as r:
-            with open(args.output_missing, "w", newline='') as outfp:
+            with open(args.output_missing, "w", newline="") as outfp:
                 w = csv.DictWriter(outfp, fieldnames=r.fieldnames)
                 w.writeheader()
@@ -1408,18 +1490,27 @@
                     if not picklist.matched_csv_row(row):
                         n_output += 1
                         w.writerow(row)
-        notify(f"saved {n_output} non-matching rows of {n_input} picklist rows to '{args.output_missing}'")
+        notify(
+            f"saved {n_output} non-matching rows of {n_input} picklist rows to '{args.output_missing}'"
+        )
     elif args.output_missing:
-        notify(f"(no remaining picklist entries; not saving to '{args.output_missing}')")
+        notify(
+            f"(no remaining picklist entries; not saving to '{args.output_missing}')"
+        )

     # save manifest of matching!
     if args.save_manifest_matching and total_manifest_rows:
         mf = total_manifest_rows
-        mf.write_to_filename(args.save_manifest_matching,
-                             database_format=args.manifest_format)
-        notify(f"wrote {len(mf)} matching manifest rows to '{args.save_manifest_matching}'")
+        mf.write_to_filename(
+            args.save_manifest_matching, database_format=args.manifest_format
+        )
+        notify(
+            f"wrote {len(mf)} matching manifest rows to '{args.save_manifest_matching}'"
+        )
     elif args.save_manifest_matching:
-        notify(f"(not saving matching manifest to '{args.save_manifest_matching}' because no matches)")
+        notify(
+            f"(not saving matching manifest to '{args.save_manifest_matching}' because no matches)"
+        )

     if args.fail_if_missing and n_missing:
         error("** ERROR: missing values, and --fail-if-missing requested. Exiting.")
@@ -1437,15 +1528,17 @@ def collect(args):
             pass
         else:
             error(f"ERROR: '{args.output}' already exists!")
-            error(f"ERROR: please remove it, or use --merge-previous to merge")
+            error("ERROR: please remove it, or use --merge-previous to merge")
             sys.exit(-1)
     elif args.merge_previous:
-        notify(f"WARNING: --merge-previous specified, but output file '{args.output}' does not already exist?")
+        notify(
+            f"WARNING: --merge-previous specified, but output file '{args.output}' does not already exist?"
+        )

     # load previous manifest for --merge-previous. This gets tricky with
     # mismatched manifest types, which we forbid.
     try:
-        if args.manifest_format == 'sql':
+        if args.manifest_format == "sql":
             # create on-disk manifest
             from sourmash.index.sqlite_index import SqliteCollectionManifest

@@ -1455,7 +1548,7 @@
             collected_mf = SqliteCollectionManifest.create(args.output)
         else:
             # create in-memory manifest that will be saved as CSV
-            assert args.manifest_format == 'csv'
+            assert args.manifest_format == "csv"

             if args.merge_previous and os.path.exists(args.output):
                 collected_mf = CollectionManifest.load_from_filename(args.output)
@@ -1465,7 +1558,9 @@
             if not isinstance(collected_mf, CollectionManifest):
                 raise Exception
     except:
-        error(f"ERROR loading '{args.output}' with --merge-previous. Is it of type {args.manifest_format}?")
+        error(
+            f"ERROR loading '{args.output}' with --merge-previous. Is it of type {args.manifest_format}?"
+        )
         sys.exit(-1)

     if args.merge_previous:
@@ -1482,35 +1577,37 @@
     n_files = 0

     # load from_file
-    _extend_signatures_with_from_file(args, target_attr='locations')
+    _extend_signatures_with_from_file(args, target_attr="locations")

     # convert to abspath
     if args.abspath:
-        args.locations = [ os.path.abspath(iloc) for iloc in args.locations ]
+        args.locations = [os.path.abspath(iloc) for iloc in args.locations]

     # iterate through, loading all the manifests from all the locations.
     for n_files, loc in enumerate(args.locations):
         notify(f"Loading signature information from {loc}.")
         if n_files % 100 == 0:
-            notify(f'... loaded {len(collected_mf)} sigs from {n_files} files')
+            notify(f"... loaded {len(collected_mf)} sigs from {n_files} files")

         idx = sourmash.load_file_as_index(loc)

         if idx.manifest is None and require_manifest:
             error(f"ERROR on location '{loc}'")
-            error(f"sig collect requires a manifest by default, but no manifest present.")
+            error(
+                "sig collect requires a manifest by default, but no manifest present."
+            )
             error("specify --no-require-manifest to dynamically generate one.")
             sys.exit(-1)

         mf = sourmash_args.get_manifest(idx)

-        rows = []
         for row in mf.rows:
-            row['internal_location'] = loc
+            row["internal_location"] = loc
             collected_mf.add_row(row)

-    if args.manifest_format == 'csv':
-        collected_mf.write_to_filename(args.output, database_format='csv',
-                                       ok_if_exists=args.merge_previous)
+    if args.manifest_format == "csv":
+        collected_mf.write_to_filename(
+            args.output, database_format="csv", ok_if_exists=args.merge_previous
+        )
     else:
         collected_mf.close()

@@ -1522,9 +1619,9 @@
 def main(arglist=None):
     args = sourmash.cli.get_parser().parse_args(arglist)
     submod = getattr(sourmash.cli.sig, args.subcmd)
-    mainmethod = getattr(submod, 'main')
+    mainmethod = getattr(submod, "main")
     return mainmethod(args)


-if __name__ == '__main__':
+if __name__ == "__main__":
     main(sys.argv)
diff --git a/src/sourmash/sig/grep.py b/src/sourmash/sig/grep.py
index e131ca501e..cfdc857779 100644
--- a/src/sourmash/sig/grep.py
+++ b/src/sourmash/sig/grep.py
@@ -28,9 +28,13 @@ def main(args):
         pattern = re.compile(pattern)

     if args.invert_match:
-        search_pattern = lambda vals: all(not pattern.search(val) for val in vals)
+
+        def search_pattern(vals):
+            return all(not pattern.search(val) for val in vals)
     else:
-        search_pattern = lambda vals: any(pattern.search(val) for val in vals)
+
+        def search_pattern(vals):
+            return any(pattern.search(val) for val in vals)

     # require manifests?
     require_manifest = True
@@ -63,28 +67,27 @@ def main(args):
     # start loading!
     total_rows_examined = 0
     for filename in args.signatures:
-        idx = sourmash_args.load_file_as_index(filename,
-                                               yield_all_files=args.force)
+        idx = sourmash_args.load_file_as_index(filename, yield_all_files=args.force)

-        idx = idx.select(ksize=args.ksize,
-                         moltype=moltype,
-                         picklist=picklist)
+        idx = idx.select(ksize=args.ksize, moltype=moltype, picklist=picklist)

         # get (and maybe generate) the manifest.
         manifest = idx.manifest
         if manifest is None:
             if require_manifest:
                 error(f"ERROR on filename '{filename}'.")
-                error("sig grep requires a manifest by default, but no manifest present.")
+                error(
+                    "sig grep requires a manifest by default, but no manifest present."
+ ) error("specify --no-require-manifest to dynamically generate one.") sys.exit(-1) else: - manifest = sourmash_args.get_manifest(idx, - require=False) + manifest = sourmash_args.get_manifest(idx, require=False) # find all matching rows. - sub_manifest = manifest.filter_on_columns(search_pattern, - ["name", "filename", "md5"]) + sub_manifest = manifest.filter_on_columns( + search_pattern, ["name", "filename", "md5"] + ) total_rows_examined += len(manifest) # write out to CSV, if desired. @@ -119,7 +122,9 @@ def main(args): notify(f"loaded {total_rows_examined} total that matched ksize & molecule type") if save_sigs: - notify(f"extracted {len(save_sigs)} signatures from {len(args.signatures)} file(s)") + notify( + f"extracted {len(save_sigs)} signatures from {len(args.signatures)} file(s)" + ) save_sigs.close() else: error("no matching signatures found!") diff --git a/src/sourmash/signature.py b/src/sourmash/signature.py index 1fd34d35e6..3faa5e856b 100644 --- a/src/sourmash/signature.py +++ b/src/sourmash/signature.py @@ -40,12 +40,9 @@ def __init__(self, minhash, name="", filename=""): self.minhash = minhash - @property def minhash(self): - return FrozenMinHash._from_objptr( - self._methodcall(lib.signature_first_mh) - ) + return FrozenMinHash._from_objptr(self._methodcall(lib.signature_first_mh)) @minhash.setter def minhash(self, value): @@ -62,11 +59,11 @@ def __repr__(self): name = self.name md5pref = self.md5sum()[:8] if name == md5pref: - return "SourmashSignature({})".format(md5pref) - else: # name != md5pref: - return "SourmashSignature('{}', {})".format(name, md5pref) + return f"SourmashSignature({md5pref})" + else: # name != md5pref: + return f"SourmashSignature('{name}', {md5pref})" - #def minhashes(self): + # def minhashes(self): # size = ffi.new("uintptr_t *") # mhs_ptr = self._methodcall(lib.signature_get_mhs, size) # size = ffi.unpack(size, 1)[0] @@ -134,40 +131,77 @@ def _display_name(self, max_length=0): def similarity(self, other, ignore_abundance=False, downsample=False): "Compute similarity with the other signature." - return self.minhash.similarity(other.minhash, - ignore_abundance=ignore_abundance, - downsample=downsample) + return self.minhash.similarity( + other.minhash, ignore_abundance=ignore_abundance, downsample=downsample + ) def jaccard(self, other): "Compute Jaccard similarity with the other MinHash signature." - return self.minhash.similarity(other.minhash, ignore_abundance=True, - downsample=False) + return self.minhash.similarity( + other.minhash, ignore_abundance=True, downsample=False + ) - def jaccard_ani(self, other, *, downsample=False, jaccard=None, prob_threshold=1e-3, err_threshold=1e-4): + def jaccard_ani( + self, + other, + *, + downsample=False, + jaccard=None, + prob_threshold=1e-3, + err_threshold=1e-4, + ): "Use jaccard to estimate ANI between two FracMinHash signatures." - return self.minhash.jaccard_ani(other.minhash, downsample=downsample, - jaccard=jaccard, prob_threshold=prob_threshold, - err_threshold=err_threshold) + return self.minhash.jaccard_ani( + other.minhash, + downsample=downsample, + jaccard=jaccard, + prob_threshold=prob_threshold, + err_threshold=err_threshold, + ) def contained_by(self, other, downsample=False): "Compute containment by the other signature. Note: ignores abundance." 
        return self.minhash.contained_by(other.minhash, downsample=downsample)

-    def containment_ani(self, other, *, downsample=False, containment=None, confidence=0.95, estimate_ci=False):
+    def containment_ani(
+        self,
+        other,
+        *,
+        downsample=False,
+        containment=None,
+        confidence=0.95,
+        estimate_ci=False,
+    ):
         "Use containment to estimate ANI between two FracMinHash signatures."
-        return self.minhash.containment_ani(other.minhash, downsample=downsample,
-                                            containment=containment, confidence=confidence,
-                                            estimate_ci=estimate_ci)
+        return self.minhash.containment_ani(
+            other.minhash,
+            downsample=downsample,
+            containment=containment,
+            confidence=confidence,
+            estimate_ci=estimate_ci,
+        )

     def max_containment(self, other, downsample=False):
         "Compute max containment w/other signature. Note: ignores abundance."
         return self.minhash.max_containment(other.minhash, downsample=downsample)

-    def max_containment_ani(self, other, *, downsample=False, max_containment=None, confidence=0.95, estimate_ci=False):
+    def max_containment_ani(
+        self,
+        other,
+        *,
+        downsample=False,
+        max_containment=None,
+        confidence=0.95,
+        estimate_ci=False,
+    ):
         "Use max containment to estimate ANI between two FracMinHash signatures."
-        return self.minhash.max_containment_ani(other.minhash, downsample=downsample,
-                                                max_containment=max_containment, confidence=confidence,
-                                                estimate_ci=estimate_ci)
+        return self.minhash.max_containment_ani(
+            other.minhash,
+            downsample=downsample,
+            max_containment=max_containment,
+            confidence=confidence,
+            estimate_ci=estimate_ci,
+        )

     def avg_containment(self, other, downsample=False):
         """
@@ -218,11 +252,7 @@ def __setstate__(self, tup):
     def __reduce__(self):
         return (
             SourmashSignature,
-            (
-                self.minhash,
-                self.name,
-                self.filename
-            ),
+            (self.minhash, self.name, self.filename),
         )

     def __copy__(self):
@@ -279,6 +309,7 @@ def add_protein(self, sequence):

     def __copy__(self):
         return self
+
     copy = __copy__

     def to_frozen(self):
@@ -325,7 +356,9 @@ def _detect_input_type(data):
       - Compressed memory buffers
       - filename
     """
-    if hasattr(data, 'read') or hasattr(data, "fileno") or hasattr(data, "mode"): # file-like object
+    if (
+        hasattr(data, "read") or hasattr(data, "fileno") or hasattr(data, "mode")
+    ):  # file-like object
         return SigInput.FILE_LIKE
     elif hasattr(data, "find"):  # check if it is uncompressed sig
         try:
@@ -334,7 +367,7 @@ def _detect_input_type(data):
         except TypeError:
             if data.find(b"sourmash_signature") > 0:
                 return SigInput.BUFFER
-            elif data.startswith(b'\x1F\x8B'):  # gzip compressed
+            elif data.startswith(b"\x1F\x8B"):  # gzip compressed
                 return SigInput.BUFFER

     try:
@@ -347,7 +380,11 @@ def _detect_input_type(data):


 def load_signatures(
-    data, ksize=None, select_moltype=None, ignore_md5sum=False, do_raise=False,
+    data,
+    ksize=None,
+    select_moltype=None,
+    ignore_md5sum=False,
+    do_raise=False,
 ):
     """Load a JSON string with signatures into classes.

@@ -374,14 +411,18 @@ def load_signatures(
     input_type = _detect_input_type(data)
     if input_type == SigInput.UNKNOWN:
         if do_raise:
-            raise ValueError("Error in parsing signature; quitting. Cannot open file or invalid signature")
+            raise ValueError(
+                "Error in parsing signature; quitting. Cannot open file or invalid signature"
+            )
         return

     size = ffi.new("uintptr_t *")

     try:
         if input_type == SigInput.FILE_LIKE:
-            if hasattr(data, "mode") and "t" in data.mode: # need to reopen handler as binary
+            if (
+                hasattr(data, "mode") and "t" in data.mode
+            ):  # need to reopen handler as binary
                 data = data.buffer

             buf = data.read()
@@ -423,7 +464,7 @@ def load_signatures(

             for sig in sigs:
                 yield sig.to_frozen()
-    except Exception as e:
+    except Exception:
         if do_raise:
             raise

@@ -461,8 +502,9 @@ def save_signatures(siglist, fp=None, compression=0):
     size = ffi.new("uintptr_t *")

     # save signature into a string (potentially compressed)
-    rawbuf = rustcall(lib.signatures_save_buffer, siglist_c, len(collected),
-                      compression, size)
+    rawbuf = rustcall(
+        lib.signatures_save_buffer, siglist_c, len(collected), compression, size
+    )
     size = size[0]

     # associate a finalizer with rawbuf so that it gets freed
@@ -472,11 +514,11 @@ def save_signatures(siglist, fp=None, compression=0):
     else:
         result = ffi.string(buf, size)

-    if fp is None: # return string
+    if fp is None:  # return string
         return result
     else:
-        try: # write to file
+        try:  # write to file
             fp.write(result)
         except TypeError:
-            fp.write(result.decode('utf-8'))
+            fp.write(result.decode("utf-8"))

     return None
diff --git a/src/sourmash/sketchcomparison.py b/src/sourmash/sketchcomparison.py
index db36d20ac3..e80013edaa 100644
--- a/src/sourmash/sketchcomparison.py
+++ b/src/sourmash/sketchcomparison.py
@@ -4,14 +4,16 @@
 import numpy as np
 from dataclasses import dataclass

-from .signature import MinHash
+from .minhash import MinHash
+

 @dataclass
 class BaseMinHashComparison:
     """Class for standard comparison between two MinHashes"""
+
     mh1: MinHash
     mh2: MinHash
-    ignore_abundance: bool = False # optionally ignore abundances
+    ignore_abundance: bool = False  # optionally ignore abundances
     jaccard_ani_untrustworthy: bool = False

     def downsample_and_handle_ignore_abundance(self, cmp_num=None, cmp_scaled=None):
@@ -34,11 +36,15 @@ def downsample_and_handle_ignore_abundance(self, cmp_num=None, cmp_scaled=None):
             raise ValueError("Error: must pass in a comparison scaled or num value.")

     def check_compatibility_and_downsample(self, cmp_num=None, cmp_scaled=None):
-        if not any([(self.mh1.num and self.mh2.num), (self.mh1.scaled and self.mh2.scaled)]):
+        if not any(
+            [(self.mh1.num and self.mh2.num), (self.mh1.scaled and self.mh2.scaled)]
+        ):
             raise TypeError("Error: Both sketches must be 'num' or 'scaled'.")

-        #need to downsample first because is_compatible checks scaled (though does not check num)
-        self.downsample_and_handle_ignore_abundance(cmp_num=cmp_num, cmp_scaled=cmp_scaled)
+        # need to downsample first because is_compatible checks scaled (though does not check num)
+        self.downsample_and_handle_ignore_abundance(
+            cmp_num=cmp_num, cmp_scaled=cmp_scaled
+        )
         if not self.mh1_cmp.is_compatible(self.mh2_cmp):
             raise TypeError("Error: Cannot compare incompatible sketches.")
         self.ksize = self.mh1.ksize
@@ -69,30 +75,34 @@ def angular_similarity(self):
     @property
     def cosine_similarity(self):
         return self.angular_similarity
-    
+

 @dataclass
 class NumMinHashComparison(BaseMinHashComparison):
     """Class for standard comparison between two num minhashes"""
+
     cmp_num: int = None

     def __post_init__(self):
         "Initialize NumMinHashComparison using values from provided MinHashes"
-        if self.cmp_num is None: # record the num we're doing this comparison on
+        if self.cmp_num is None:  # record the num we're doing this comparison on
             self.cmp_num = min(self.mh1.num, self.mh2.num)
self.check_compatibility_and_downsample(cmp_num=self.cmp_num) @property def size_may_be_inaccurate(self): - return False # not using size estimation, can ignore + return False # not using size estimation, can ignore + @dataclass class FracMinHashComparison(BaseMinHashComparison): """Class for standard comparison between two scaled minhashes""" - cmp_scaled: int = None # optionally force scaled value for this comparison + + cmp_scaled: int = None # optionally force scaled value for this comparison threshold_bp: int = 0 estimate_ani_ci: bool = False ani_confidence: float = 0.95 -# pfn_threshold: float = 1e-3 + # pfn_threshold: float = 1e-3 def __post_init__(self): "Initialize ScaledComparison using values from provided FracMinHashes" @@ -121,19 +131,23 @@ def total_unique_intersect_hashes(self): To get true bp estimates, we would need to add `(k-1)`. However, this complicates the iterative gather algorithm, so let's stick with hashes. """ - return len(self.intersect_mh) * self.cmp_scaled # + (ksize-1) #for bp estimation + return ( + len(self.intersect_mh) * self.cmp_scaled + ) # + (ksize-1) #for bp estimation @property def mh1_containment_in_mh2(self): return self.mh1_cmp.contained_by(self.mh2_cmp) - def estimate_ani_from_mh1_containment_in_mh2(self, containment = None): + def estimate_ani_from_mh1_containment_in_mh2(self, containment=None): # build result once - m1_cani = self.mh1_cmp.containment_ani(self.mh2_cmp, - containment=containment, - confidence=self.ani_confidence, - estimate_ci=self.estimate_ani_ci) -# prob_threshold=self.pfn_threshold) + m1_cani = self.mh1_cmp.containment_ani( + self.mh2_cmp, + containment=containment, + confidence=self.ani_confidence, + estimate_ci=self.estimate_ani_ci, + ) + # prob_threshold=self.pfn_threshold) # propagate params self.ani_from_mh1_containment_in_mh2 = m1_cani.ani if m1_cani.p_exceeds_threshold: @@ -148,28 +162,32 @@ def mh2_containment_in_mh1(self): return self.mh2_cmp.contained_by(self.mh1_cmp) def estimate_ani_from_mh2_containment_in_mh1(self, containment=None): - m2_cani = self.mh2_cmp.containment_ani(self.mh1_cmp, - containment=containment, - confidence=self.ani_confidence, - estimate_ci=self.estimate_ani_ci) -# prob_threshold=self.pfn_threshold) + m2_cani = self.mh2_cmp.containment_ani( + self.mh1_cmp, + containment=containment, + confidence=self.ani_confidence, + estimate_ci=self.estimate_ani_ci, + ) + # prob_threshold=self.pfn_threshold) self.ani_from_mh2_containment_in_mh1 = m2_cani.ani if m2_cani.p_exceeds_threshold: self.potential_false_negative = True if self.estimate_ani_ci: self.ani_from_mh2_containment_in_mh1_low = m2_cani.ani_low self.ani_from_mh2_containment_in_mh1_high = m2_cani.ani_high - + @property def max_containment(self): return self.mh1_cmp.max_containment(self.mh2_cmp) def estimate_max_containment_ani(self, max_containment=None): - mc_ani_info = self.mh1_cmp.max_containment_ani(self.mh2_cmp, - max_containment=max_containment, - confidence=self.ani_confidence, - estimate_ci=self.estimate_ani_ci) -# prob_threshold=self.pfn_threshold) + mc_ani_info = self.mh1_cmp.max_containment_ani( + self.mh2_cmp, + max_containment=max_containment, + confidence=self.ani_confidence, + estimate_ci=self.estimate_ani_ci, + ) + # prob_threshold=self.pfn_threshold) # propagate params self.max_containment_ani = mc_ani_info.ani if mc_ani_info.p_exceeds_threshold: @@ -187,23 +205,41 @@ def avg_containment_ani(self): "Returns single average_containment_ani value. Sets self.potential_false_negative internally." 
self.estimate_ani_from_mh1_containment_in_mh2() self.estimate_ani_from_mh2_containment_in_mh1() - if any([self.ani_from_mh1_containment_in_mh2 is None, self.ani_from_mh2_containment_in_mh1 is None]): + if any( + [ + self.ani_from_mh1_containment_in_mh2 is None, + self.ani_from_mh2_containment_in_mh1 is None, + ] + ): return None else: - return (self.ani_from_mh1_containment_in_mh2 + self.ani_from_mh2_containment_in_mh1)/2 + return ( + self.ani_from_mh1_containment_in_mh2 + + self.ani_from_mh2_containment_in_mh1 + ) / 2 def estimate_all_containment_ani(self): "Estimate all containment ANI values." self.estimate_ani_from_mh1_containment_in_mh2() self.estimate_ani_from_mh2_containment_in_mh1() - if any([self.ani_from_mh1_containment_in_mh2 is None, self.ani_from_mh2_containment_in_mh1 is None]): -# self.estimate_max_containment_ani() + if any( + [ + self.ani_from_mh1_containment_in_mh2 is None, + self.ani_from_mh2_containment_in_mh1 is None, + ] + ): + # self.estimate_max_containment_ani() self.max_containment_ani = None else: - self.max_containment_ani = max([self.ani_from_mh1_containment_in_mh2, self.ani_from_mh2_containment_in_mh1]) + self.max_containment_ani = max( + [ + self.ani_from_mh1_containment_in_mh2, + self.ani_from_mh2_containment_in_mh1, + ] + ) def weighted_intersection(self, from_mh=None, from_abundD={}): - # map abundances to all intersection hashes. + # map abundances to all intersection hashes. abund_mh = self.intersect_mh.copy_and_clear() abund_mh.track_abundance = True # if from_mh is provided, it takes precedence over from_abund dict @@ -211,7 +247,7 @@ def weighted_intersection(self, from_mh=None, from_abundD={}): from_abundD = from_mh.hashes if from_abundD: # this sets any hash not present in abundD to 1. Is that desired? Or should we return 0? - abunds = {k: from_abundD.get(k, 1) for k in self.intersect_mh.hashes } + abunds = {k: from_abundD.get(k, 1) for k in self.intersect_mh.hashes} abund_mh.set_abundances(abunds) return abund_mh # if no abundances are passed in, return intersect_mh diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 8b149d7d1d..fdbc0e4cf6 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -51,8 +51,7 @@ from .index import LinearIndex from .picklist import SignaturePicklist, PickStyle from .manifest import CollectionManifest -from .save_load import (SaveSignaturesToLocation, load_file_as_index, - _load_database) +from .save_load import SaveSignaturesToLocation, load_file_as_index, _load_database DEFAULT_LOAD_K = 31 @@ -64,9 +63,9 @@ def check_scaled_bounds(arg): if f < 0: raise argparse.ArgumentTypeError("ERROR: scaled value must be positive") if f < 100: - notify('WARNING: scaled value should be >= 100. Continuing anyway.') + notify("WARNING: scaled value should be >= 100. Continuing anyway.") if f > 1e6: - notify('WARNING: scaled value should be <= 1e6. Continuing anyway.') + notify("WARNING: scaled value should be <= 1e6. Continuing anyway.") return f @@ -76,18 +75,18 @@ def check_num_bounds(arg): if f < 0: raise argparse.ArgumentTypeError("ERROR: num value must be positive") if f < 50: - notify('WARNING: num value should be >= 50. Continuing anyway.') + notify("WARNING: num value should be >= 50. Continuing anyway.") if f > 50000: - notify('WARNING: num value should be <= 50000. Continuing anyway.') + notify("WARNING: num value should be <= 50000. 
Continuing anyway.") return f def get_moltype(sig, require=False): mh = sig.minhash - if mh.moltype in ('DNA', 'dayhoff', 'hp', 'protein'): + if mh.moltype in ("DNA", "dayhoff", "hp", "protein"): moltype = mh.moltype else: - raise ValueError('unknown molecule type for sig {}'.format(sig)) + raise ValueError(f"unknown molecule type for sig {sig}") return moltype @@ -97,20 +96,22 @@ def calculate_moltype(args, default=None): n = 0 if args.dna: - moltype = 'DNA' + moltype = "DNA" n += 1 if args.dayhoff: - moltype = 'dayhoff' + moltype = "dayhoff" n += 1 if args.hp: - moltype = 'hp' + moltype = "hp" n += 1 if args.protein: - moltype = 'protein' + moltype = "protein" n += 1 if n > 1: - error("cannot specify more than one of --dna/--rna/--nucleotide/--protein/--hp/--dayhoff") + error( + "cannot specify more than one of --dna/--rna/--nucleotide/--protein/--hp/--dayhoff" + ) sys.exit(-1) return moltype @@ -123,7 +124,9 @@ def load_picklist(args): try: picklist = SignaturePicklist.from_picklist_args(args.picklist) - notify(f"picking column '{picklist.column_name}' of type '{picklist.coltype}' from '{picklist.pickfile}'") + notify( + f"picking column '{picklist.column_name}' of type '{picklist.coltype}' from '{picklist.pickfile}'" + ) n_empty_val, dup_vals = picklist.load() except ValueError as exc: @@ -133,19 +136,27 @@ def load_picklist(args): notify(f"loaded {len(picklist.pickset)} distinct values into picklist.") if n_empty_val: - notify(f"WARNING: {n_empty_val} empty values in column '{picklist.column_name}' in picklist file") + notify( + f"WARNING: {n_empty_val} empty values in column '{picklist.column_name}' in picklist file" + ) if dup_vals: - notify(f"WARNING: {len(dup_vals)} values in picklist column '{picklist.column_name}' were not distinct") + notify( + f"WARNING: {len(dup_vals)} values in picklist column '{picklist.column_name}' were not distinct" + ) return picklist def report_picklist(args, picklist): if picklist.pickstyle == PickStyle.INCLUDE: - notify(f"for given picklist, found {len(picklist.found)} matches to {len(picklist.pickset)} distinct values") + notify( + f"for given picklist, found {len(picklist.found)} matches to {len(picklist.pickset)} distinct values" + ) n_missing = len(picklist.pickset - picklist.found) elif picklist.pickstyle == PickStyle.EXCLUDE: - notify(f"for given picklist, found {len(picklist.found)} matches by excluding {len(picklist.pickset)} distinct values") + notify( + f"for given picklist, found {len(picklist.found)} matches by excluding {len(picklist.pickset)} distinct values" + ) n_missing = 0 if n_missing: notify(f"WARNING: {n_missing} missing picklist values.") @@ -157,19 +168,27 @@ def report_picklist(args, picklist): def load_include_exclude_db_patterns(args): if args.picklist and (args.include_db_pattern or args.exclude_db_pattern): - error("ERROR: --picklist and --include-db-pattern/--exclude cannot be used together.") + error( + "ERROR: --picklist and --include-db-pattern/--exclude cannot be used together." + ) sys.exit(-1) if args.include_db_pattern and args.exclude_db_pattern: - error("ERROR: --include-db-pattern and --exclude-db-pattern cannot be used together.") + error( + "ERROR: --include-db-pattern and --exclude-db-pattern cannot be used together." 
+ ) sys.exit(-1) if args.include_db_pattern: pattern = re.compile(args.include_db_pattern, re.IGNORECASE) - search_pattern = lambda vals: any(pattern.search(val) for val in vals) + + def search_pattern(vals): + return any(pattern.search(val) for val in vals) elif args.exclude_db_pattern: pattern = re.compile(args.exclude_db_pattern, re.IGNORECASE) - search_pattern = lambda vals: all(not pattern.search(val) for val in vals) + + def search_pattern(vals): + return all(not pattern.search(val) for val in vals) else: search_pattern = None @@ -187,8 +206,7 @@ def apply_picklist_and_pattern(db, picklist, pattern): error("--include-db-pattern/--exclude-db-pattern require a manifest.") sys.exit(-1) - manifest = manifest.filter_on_columns(pattern, - ["name", "filename", "md5"]) + manifest = manifest.filter_on_columns(pattern, ["name", "filename", "md5"]) pattern_picklist = manifest.to_picklist() db = db.select(picklist=pattern_picklist) @@ -202,8 +220,9 @@ def load_query_signature(filename, ksize, select_moltype, select_md5=None): and indexed databases. """ try: - sl = load_file_as_signatures(filename, ksize=ksize, - select_moltype=select_moltype) + sl = load_file_as_signatures( + filename, ksize=ksize, select_moltype=select_moltype + ) sl = list(sl) except (OSError, ValueError): error(f"Cannot open query file '{filename}'") @@ -225,21 +244,21 @@ def load_query_signature(filename, ksize, select_moltype, select_md5=None): sl = [found_sig] if len(sl) and ksize is None: - ksizes = set([ ss.minhash.ksize for ss in sl ]) + ksizes = set([ss.minhash.ksize for ss in sl]) if len(ksizes) == 1: ksize = ksizes.pop() - sl = [ ss for ss in sl if ss.minhash.ksize == ksize ] - notify(f'select query k={ksize} automatically.') + sl = [ss for ss in sl if ss.minhash.ksize == ksize] + notify(f"select query k={ksize} automatically.") elif DEFAULT_LOAD_K in ksizes: - sl = [ ss for ss in sl if ss.minhash.ksize == DEFAULT_LOAD_K ] - notify(f'selecting default query k={DEFAULT_LOAD_K}.') + sl = [ss for ss in sl if ss.minhash.ksize == DEFAULT_LOAD_K] + notify(f"selecting default query k={DEFAULT_LOAD_K}.") elif ksize: - notify(f'selecting specified query k={ksize}') + notify(f"selecting specified query k={ksize}") if len(sl) != 1: error(f"When loading query from '{filename}'", filename) - error(f'{len(sl)} signatures matching ksize and molecule type;') - error('need exactly one. Specify --ksize or --dna, --rna, or --protein.') + error(f"{len(sl)} signatures matching ksize and molecule type;") + error("need exactly one. Specify --ksize or --dna, --rna, or --protein.") sys.exit(-1) return sl[0] @@ -259,7 +278,7 @@ def traverse_find_sigs(filenames, yield_all_files=False): If 'yield_all_files' is True, this will return _all_ files (but not directories). """ - endings = ('.sig', '.sig.gz') + endings = (".sig", ".sig.gz") for filename in filenames: # check for files in filenames: if os.path.isfile(filename): @@ -275,9 +294,16 @@ def traverse_find_sigs(filenames, yield_all_files=False): yield fullname -def load_dbs_and_sigs(filenames, query, is_similarity_query, *, - cache_size=None, picklist=None, pattern=None, - fail_on_empty_database=False): +def load_dbs_and_sigs( + filenames, + query, + is_similarity_query, + *, + cache_size=None, + picklist=None, + pattern=None, + fail_on_empty_database=False, +): """ Load one or more Index objects to search - databases, etc. 
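
For orientation, a sketch of how load_dbs_and_sigs is typically invoked, using the load_query_signature helper defined above (file and directory names are hypothetical):

    from sourmash.sourmash_args import load_query_signature, load_dbs_and_sigs

    query = load_query_signature("query.sig", ksize=31, select_moltype="DNA")
    databases = load_dbs_and_sigs(
        ["gtdb.sbt.zip", "extra-sigs/"],  # hypothetical search targets
        query,
        is_similarity_query=False,  # containment-style search
        fail_on_empty_database=True,
    )
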
@@ -294,7 +320,7 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, total_signatures_loaded = 0 sum_signatures_after_select = 0 for filename in filenames: - notify(f"loading from '{filename}'...", end='\r') + notify(f"loading from '{filename}'...", end="\r") try: db = _load_database(filename, False, cache_size=cache_size) @@ -308,11 +334,13 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, # get compatible signatures - moltype/ksize/num/scaled try: - db = db.select(moltype=query_mh.moltype, - ksize=query_mh.ksize, - num=query_mh.num, - scaled=query_mh.scaled, - containment=containment) + db = db.select( + moltype=query_mh.moltype, + ksize=query_mh.ksize, + num=query_mh.num, + scaled=query_mh.scaled, + containment=containment, + ) except ValueError as exc: # incompatible collection specified! notify(f"ERROR: cannot use '{filename}' for this query.") @@ -337,9 +365,13 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, # display num loaded/num selected notify("--") - notify(f"loaded {total_signatures_loaded} total signatures from {len(databases)} locations.") - notify(f"after selecting signatures compatible with search, {sum_signatures_after_select} remain.") - print('') + notify( + f"loaded {total_signatures_loaded} total signatures from {len(databases)} locations." + ) + notify( + f"after selecting signatures compatible with search, {sum_signatures_after_select} remain." + ) + print("") return databases @@ -347,15 +379,17 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, def load_pathlist_from_file(filename): "Load a list-of-files text file." try: - with open(filename, 'rt') as fp: - file_list = [ x.rstrip('\r\n') for x in fp ] + with open(filename) as fp: + file_list = [x.rstrip("\r\n") for x in fp] file_list = set(file_list) if not file_list: raise ValueError("pathlist is empty") for checkfile in file_list: if not os.path.exists(checkfile): - raise ValueError(f"file '{checkfile}' inside the pathlist does not exist") - except IOError: + raise ValueError( + f"file '{checkfile}' inside the pathlist does not exist" + ) + except OSError: raise ValueError(f"pathlist file '{filename}' does not exist") except OSError: raise ValueError(f"cannot open file '{filename}'") @@ -385,7 +419,8 @@ class FileOutput: will properly handle no argument or '-' as sys.stdout. """ - def __init__(self, filename, mode='wt', *, newline=None, encoding='utf-8'): + + def __init__(self, filename, mode="wt", *, newline=None, encoding="utf-8"): self.filename = filename self.mode = mode self.fp = None @@ -393,14 +428,15 @@ def __init__(self, filename, mode='wt', *, newline=None, encoding='utf-8'): self.encoding = encoding def open(self): - if self.filename == '-' or self.filename is None: + if self.filename == "-" or self.filename is None: return sys.stdout - self.fp = open(self.filename, self.mode, newline=self.newline, - encoding=self.encoding) + self.fp = open( + self.filename, self.mode, newline=self.newline, encoding=self.encoding + ) return self.fp def close(self): - if self.fp is not None: # in case of stdout + if self.fp is not None: # in case of stdout self.fp.close() def __enter__(self): @@ -435,17 +471,18 @@ class FileOutputCSV(FileOutput): will properly handle no argument or '-' as sys.stdout. 
""" + def __init__(self, filename): self.filename = filename self.fp = None def open(self): - if self.filename == '-' or self.filename is None: + if self.filename == "-" or self.filename is None: return sys.stdout - if self.filename.endswith('.gz'): - self.fp = gzip.open(self.filename, 'wt', newline='') + if self.filename.endswith(".gz"): + self.fp = gzip.open(self.filename, "wt", newline="") else: - self.fp = open(self.filename, 'w', newline='') + self.fp = open(self.filename, "w", newline="") return self.fp @@ -457,38 +494,44 @@ class _DictReader_with_version: The version is stored as a 2-tuple in the 'version_info' attribute. """ - def __init__(self, textfp, *, delimiter=','): + + def __init__(self, textfp, *, delimiter=","): self.version_info = [] # is there a '#' in the raw buffer pos 0? ch = textfp.buffer.peek(1) try: - ch = ch.decode('utf-8') + ch = ch.decode("utf-8") except UnicodeDecodeError: raise csv.Error("unable to read CSV file") # yes - read a line from the text buffer => parse - if ch.startswith('#'): + if ch.startswith("#"): line = textfp.readline() - assert line.startswith('# '), line + assert line.startswith("# "), line # note, this can set version_info to lots of different things. # revisit later, I guess. CTB. - self.version_info = line[2:].strip().split(': ', 2) + self.version_info = line[2:].strip().split(": ", 2) # build a DictReader from the remaining stream self.reader = csv.DictReader(textfp, delimiter=delimiter) self.fieldnames = self.reader.fieldnames def __iter__(self): - for row in self.reader: - yield row + yield from self.reader @contextlib.contextmanager -def FileInputCSV(filename, *, encoding='utf-8', default_csv_name=None, - zipfile_obj=None, delimiter=','): +def FileInputCSV( + filename, + *, + encoding="utf-8", + default_csv_name=None, + zipfile_obj=None, + delimiter=",", +): """A context manager for reading in CSV files in gzip, zip or text format. Assumes comma delimiter, and uses csv.DictReader. @@ -513,24 +556,20 @@ def FileInputCSV(filename, *, encoding='utf-8', default_csv_name=None, try: zi = zipfile_obj.getinfo(default_csv_name) with zipfile_obj.open(zi) as fp: - textfp = TextIOWrapper(fp, - encoding=encoding, - newline="") + textfp = TextIOWrapper(fp, encoding=encoding, newline="") r = _DictReader_with_version(textfp, delimiter=delimiter) yield r except (zipfile.BadZipFile, KeyError): - pass # uh oh, we were given a zipfile_obj and it FAILED. + pass # uh oh, we were given a zipfile_obj and it FAILED. # no matter what, if given zipfile_obj don't try .gz or regular csv return else: try: - with zipfile.ZipFile(filename, 'r') as zip_fp: + with zipfile.ZipFile(filename, "r") as zip_fp: zi = zip_fp.getinfo(default_csv_name) with zip_fp.open(zi) as fp: - textfp = TextIOWrapper(fp, - encoding=encoding, - newline="") + textfp = TextIOWrapper(fp, encoding=encoding, newline="") r = _DictReader_with_version(textfp, delimiter=delimiter) yield r @@ -545,7 +584,7 @@ def FileInputCSV(filename, *, encoding='utf-8', default_csv_name=None, # ok, not a zip file - try .gz: try: with gzip.open(filename, "rt", newline="", encoding=encoding) as fp: - fp.buffer.peek(1) # force exception if not a gzip file + fp.buffer.peek(1) # force exception if not a gzip file r = _DictReader_with_version(fp, delimiter=delimiter) yield r return @@ -553,7 +592,7 @@ def FileInputCSV(filename, *, encoding='utf-8', default_csv_name=None, pass # neither zip nor gz; regular file! 
- with open(filename, 'rt', newline="", encoding=encoding) as fp: + with open(filename, newline="", encoding=encoding) as fp: r = _DictReader_with_version(fp, delimiter=delimiter) yield r @@ -569,6 +608,7 @@ class SignatureLoadingProgress: You can optionally notify of reading a file with `.notify(location)`. """ + def __init__(self, reporting_interval=10): self.n_sig = 0 self.interval = reporting_interval @@ -584,17 +624,19 @@ def short_notify(self, msg_template, *args, **kwargs): """ msg = msg_template.format(*args, **kwargs) - end = kwargs.get('end', '\n') + end = kwargs.get("end", "\n") w = self.screen_width if len(msg) > w: truncate_len = len(msg) - w + 3 - msg = '<<<' + msg[truncate_len:] + msg = "<<<" + msg[truncate_len:] notify(msg, end=end) def notify(self, location): - self.short_notify(f"...{self.n_sig} sigs so far. Now reading from file '{location}'", end='\r') + self.short_notify( + f"...{self.n_sig} sigs so far. Now reading from file '{location}'", end="\r" + ) def start_file(self, location, loader): n_this = 0 @@ -606,24 +648,35 @@ def start_file(self, location, loader): n_this += 1 n_total = n_before + n_this if n_this and n_total % self.interval == 0: - self.short_notify("...loading from '{}' / {} sigs total", - location, n_total, end='\r') + self.short_notify( + "...loading from '{}' / {} sigs total", + location, + n_total, + end="\r", + ) yield result except KeyboardInterrupt: # might as well nicely handle CTRL-C while we're at it! - notify('\n(CTRL-C received! quitting.)') + notify("\n(CTRL-C received! quitting.)") sys.exit(-1) finally: self.n_sig += n_this - self.short_notify(f"Loaded {n_this} sigs from '{location}'", - end='\r') + self.short_notify(f"Loaded {n_this} sigs from '{location}'", end="\r") -def load_many_signatures(locations, progress, *, yield_all_files=False, - ksize=None, moltype=None, picklist=None, force=False, - pattern=None): +def load_many_signatures( + locations, + progress, + *, + yield_all_files=False, + ksize=None, + moltype=None, + picklist=None, + force=False, + pattern=None, +): """ Load many signatures from multiple files, with progress indicators. @@ -648,11 +701,11 @@ def load_many_signatures(locations, progress, *, yield_all_files=False, loader = idx.signatures_with_location() # go! - n = 0 # count signatures loaded + n = 0 # count signatures loaded for sig, sigloc in progress.start_file(loc, loader): yield sig, sigloc n += 1 - notify(f"loaded {n} signatures from '{loc}'", end='\r') + notify(f"loaded {n} signatures from '{loc}'", end="\r") except ValueError as exc: # trap expected errors, and either power through or display + exit. if force: @@ -693,8 +746,9 @@ def get_manifest(idx, *, require=True, rebuild=False): # need to build one... 
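
A short sketch of pairing SignatureLoadingProgress with load_many_signatures, as defined above (locations are hypothetical):

    from sourmash.sourmash_args import (
        SignatureLoadingProgress,
        load_many_signatures,
    )

    progress = SignatureLoadingProgress(reporting_interval=10)
    for sig, location in load_many_signatures(
        ["sigs/", "more.sig.gz"],  # hypothetical locations
        progress,
        ksize=31,
        moltype="DNA",
    ):
        print(sig.name, location)
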
try: notify("Generating a manifest...") - m = CollectionManifest.create_manifest(idx._signatures_with_internal(), - include_signature=False) + m = CollectionManifest.create_manifest( + idx._signatures_with_internal(), include_signature=False + ) debug_literal("get_manifest: rebuilt manifest.") except NotImplementedError: if require: @@ -707,12 +761,17 @@ return m -def load_file_as_signatures(filename, *, select_moltype=None, ksize=None, - picklist=None, - yield_all_files=False, - progress=None, - pattern=None, - _use_manifest=True): +def load_file_as_signatures( + filename, + *, + select_moltype=None, + ksize=None, + picklist=None, + yield_all_files=False, + progress=None, + pattern=None, + _use_manifest=True, +): """Load 'filename' as a collection of signatures. Return an iterable. If 'filename' contains an SBT or LCA indexed database, or a regular diff --git a/src/sourmash/sqlite_utils.py b/src/sourmash/sqlite_utils.py index 2b7503a2d8..8efb754a23 100644 --- a/src/sourmash/sqlite_utils.py +++ b/src/sourmash/sqlite_utils.py @@ -31,13 +31,13 @@ def open_sqlite_db(filename): # check for the 'sourmash_internal' table. cursor = conn.cursor() try: - cursor.execute('SELECT DISTINCT key, value FROM sourmash_internal') + cursor.execute("SELECT DISTINCT key, value FROM sourmash_internal") except (sqlite3.OperationalError, sqlite3.DatabaseError): debug_literal("open_sqlite_db: cannot read sourmash_internal.") # is this a taxonomy DB? try: - cursor.execute('SELECT * FROM taxonomy LIMIT 1') + cursor.execute("SELECT * FROM taxonomy LIMIT 1") except (sqlite3.OperationalError, sqlite3.DatabaseError): debug_literal("open_sqlite_db: cannot read 'taxonomy', either.") return None @@ -49,12 +49,14 @@ def add_sourmash_internal(cursor, use_type, version): """ Add use_type/version to sourmash_internal table. """ - cursor.execute(""" + cursor.execute( + """ CREATE TABLE IF NOT EXISTS sourmash_internal ( key TEXT UNIQUE, value TEXT ) - """) + """ + ) d = get_sourmash_internal(cursor) @@ -62,18 +64,23 @@ if val is not None: # do version compatibility foo here? if version != val: - raise Exception(f"sqlite problem: for {use_type}, want version {version}, got version {val}") + raise Exception( + f"sqlite problem: for {use_type}, want version {version}, got version {val}" + ) else: - cursor.execute(""" + cursor.execute( + """ INSERT INTO sourmash_internal (key, value) VALUES (?, ?) - """, (use_type, version)) + """, + (use_type, version), + ) def get_sourmash_internal(cursor): """ Retrieve a key/value dictionary from sourmash_internal. """ - cursor.execute('SELECT DISTINCT key, value FROM sourmash_internal') + cursor.execute("SELECT DISTINCT key, value FROM sourmash_internal") d = dict(cursor) return d diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index b6ff3d9dd2..8e490ae545 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -13,9 +13,14 @@ from sourmash.logging import set_quiet, error, notify, print_results from . import tax_utils -from .tax_utils import MultiLineageDB, RankLineageInfo, LINLineageInfo, AnnotateTaxResult - -usage=''' +from .tax_utils import ( + MultiLineageDB, + RankLineageInfo, + LINLineageInfo, + AnnotateTaxResult, +) + +usage = """ sourmash taxonomy <command> [<args>] - manipulate/work with taxonomy information. or sourmash tax <command> [<args>] @@ -30,31 +35,32 @@ ** Use '-h' to get subcommand-specific help, e.g.
sourmash taxonomy metagenome -h -''' +""" # outfile utils _output_type_to_ext = { - 'csv_summary': '.summarized.csv', - 'classification': '.classifications.csv', - 'krona': '.krona.tsv', - 'lineage_summary': '.lineage_summary.tsv', - 'annotate': '.with-lineages.csv', - 'human': '.human.txt', - 'lineage_csv': '.lineage.csv', - 'kreport': ".kreport.txt", - 'lingroup': ".lingroup.tsv", - 'bioboxes': '.bioboxes.profile' - } - -def make_outfile(base, output_type, *, output_dir = ""): - limit_float_decimals=False + "csv_summary": ".summarized.csv", + "classification": ".classifications.csv", + "krona": ".krona.tsv", + "lineage_summary": ".lineage_summary.tsv", + "annotate": ".with-lineages.csv", + "human": ".human.txt", + "lineage_csv": ".lineage.csv", + "kreport": ".kreport.txt", + "lingroup": ".lingroup.tsv", + "bioboxes": ".bioboxes.profile", +} + + +def make_outfile(base, output_type, *, output_dir=""): + limit_float_decimals = False if base == "-": - limit_float_decimals=True + limit_float_decimals = True return base, limit_float_decimals ext = _output_type_to_ext[output_type] - fname = base+ext + fname = base + ext if output_dir: fname = os.path.join(output_dir, fname) notify(f"saving '{output_type}' output to '{fname}'.") @@ -70,50 +76,70 @@ def metagenome(args): # first, load taxonomic_assignments try: - tax_assign = MultiLineageDB.load(args.taxonomy_csv, - keep_full_identifiers=args.keep_full_identifiers, - keep_identifier_versions=args.keep_identifier_versions, - force=args.force, lins=args.lins) + tax_assign = MultiLineageDB.load( + args.taxonomy_csv, + keep_full_identifiers=args.keep_full_identifiers, + keep_identifier_versions=args.keep_identifier_versions, + force=args.force, + lins=args.lins, + ) available_ranks = tax_assign.available_ranks except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) if not tax_assign: - error(f'ERROR: No taxonomic assignments loaded from {",".join(args.taxonomy_csv)}. Exiting.') + error( + f'ERROR: No taxonomic assignments loaded from {",".join(args.taxonomy_csv)}. Exiting.' + ) sys.exit(-1) if args.rank and args.rank not in available_ranks: - error(f"ERROR: No taxonomic information provided for rank {args.rank}: cannot summarize at this rank") + error( + f"ERROR: No taxonomic information provided for rank {args.rank}: cannot summarize at this rank" + ) sys.exit(-1) # next, collect and load gather results - gather_csvs = tax_utils.collect_gather_csvs(args.gather_csv, from_file= args.from_file) + gather_csvs = tax_utils.collect_gather_csvs( + args.gather_csv, from_file=args.from_file + ) try: - query_gather_results = tax_utils.check_and_load_gather_csvs(gather_csvs, tax_assign, force=args.force, - fail_on_missing_taxonomy=args.fail_on_missing_taxonomy, - keep_full_identifiers=args.keep_full_identifiers, - keep_identifier_versions = args.keep_identifier_versions, - lins=args.lins, - ) + query_gather_results = tax_utils.check_and_load_gather_csvs( + gather_csvs, + tax_assign, + force=args.force, + fail_on_missing_taxonomy=args.fail_on_missing_taxonomy, + keep_full_identifiers=args.keep_full_identifiers, + keep_identifier_versions=args.keep_identifier_versions, + lins=args.lins, + ) except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) if not query_gather_results: - notify('No gather results loaded. Exiting.') + notify("No gather results loaded. 
Exiting.") sys.exit(-1) - single_query_output_formats = ['csv_summary', 'kreport'] + single_query_output_formats = ["csv_summary", "kreport"] desired_single_outputs = [] - if len(query_gather_results) > 1: # working with multiple queries - desired_single_outputs = [x for x in args.output_format if x in single_query_output_formats] + if len(query_gather_results) > 1: # working with multiple queries + desired_single_outputs = [ + x for x in args.output_format if x in single_query_output_formats + ] if desired_single_outputs: - notify(f"WARNING: found results for multiple gather queries. Can only output multi-query result formats: skipping {', '.join(desired_single_outputs)}") + notify( + f"WARNING: found results for multiple gather queries. Can only output multi-query result formats: skipping {', '.join(desired_single_outputs)}" + ) # remove single query outputs from output format - args.output_format = [x for x in args.output_format if x not in single_query_output_formats] - if not args.output_format: # or do we want to insert `human` here so we always report something? - error(f"ERROR: No output formats remaining.") + args.output_format = [ + x for x in args.output_format if x not in single_query_output_formats + ] + if ( + not args.output_format + ): # or do we want to insert `human` here so we always report something? + error("ERROR: No output formats remaining.") sys.exit(-1) # for each queryResult, actually summarize at rank, reporting any errors that occur. @@ -126,47 +152,66 @@ def metagenome(args): # write summarized output in human-readable format if "lineage_summary" in args.output_format: - lineage_outfile, limit_float = make_outfile(args.output_base, "lineage_summary", output_dir=args.output_dir) + lineage_outfile, limit_float = make_outfile( + args.output_base, "lineage_summary", output_dir=args.output_dir + ) ## aggregate by lineage by query - lineageD, query_names= tax_utils.aggregate_by_lineage_at_rank(query_gather_results=query_gather_results, - rank=args.rank, by_query=True) + lineageD, query_names = tax_utils.aggregate_by_lineage_at_rank( + query_gather_results=query_gather_results, rank=args.rank, by_query=True + ) with FileOutputCSV(lineage_outfile) as out_fp: - tax_utils.write_lineage_sample_frac(query_names, lineageD, out_fp, sep='\t') + tax_utils.write_lineage_sample_frac(query_names, lineageD, out_fp, sep="\t") # write summarized --> krona output tsv if "krona" in args.output_format: - krona_results, header = tax_utils.format_for_krona(query_gather_results, rank=args.rank) + krona_results, header = tax_utils.format_for_krona( + query_gather_results, rank=args.rank + ) - krona_outfile, limit_float = make_outfile(args.output_base, "krona", output_dir=args.output_dir) + krona_outfile, limit_float = make_outfile( + args.output_base, "krona", output_dir=args.output_dir + ) with FileOutputCSV(krona_outfile) as out_fp: tax_utils.write_krona(header, krona_results, out_fp) if "human" in args.output_format: - summary_outfile, limit_float = make_outfile(args.output_base, "human", output_dir=args.output_dir) + summary_outfile, limit_float = make_outfile( + args.output_base, "human", output_dir=args.output_dir + ) with FileOutput(summary_outfile) as out_fp: human_display_rank = args.rank or "species" if args.lins and not args.rank: - human_display_rank = query_gather_results[0].ranks[-1] # lowest rank + human_display_rank = query_gather_results[0].ranks[-1] # lowest rank - tax_utils.write_human_summary(query_gather_results, out_fp, human_display_rank) + 
tax_utils.write_human_summary( + query_gather_results, out_fp, human_display_rank + ) # write summarized output csv single_query_results = query_gather_results[0] if "csv_summary" in args.output_format: - summary_outfile, limit_float = make_outfile(args.output_base, "csv_summary", output_dir=args.output_dir) + summary_outfile, limit_float = make_outfile( + args.output_base, "csv_summary", output_dir=args.output_dir + ) with FileOutputCSV(summary_outfile) as out_fp: - tax_utils.write_summary(query_gather_results, out_fp, limit_float_decimals=limit_float) + tax_utils.write_summary( + query_gather_results, out_fp, limit_float_decimals=limit_float + ) # write summarized --> kreport output tsv if "kreport" in args.output_format: - kreport_outfile, limit_float = make_outfile(args.output_base, "kreport", output_dir=args.output_dir) + kreport_outfile, limit_float = make_outfile( + args.output_base, "kreport", output_dir=args.output_dir + ) with FileOutputCSV(kreport_outfile) as out_fp: header, kreport_results = single_query_results.make_kreport_results() - tax_utils.write_output(header, kreport_results, out_fp, sep="\t", write_header=False) + tax_utils.write_output( + header, kreport_results, out_fp, sep="\t", write_header=False + ) # write summarized --> LINgroup output tsv if "lingroup" in args.output_format: @@ -176,15 +221,23 @@ def metagenome(args): error(f"ERROR: {str(exc)}") sys.exit(-1) - lingroupfile, limit_float = make_outfile(args.output_base, "lingroup", output_dir=args.output_dir) + lingroupfile, limit_float = make_outfile( + args.output_base, "lingroup", output_dir=args.output_dir + ) with FileOutputCSV(lingroupfile) as out_fp: - header, lgreport_results = single_query_results.make_lingroup_results(LINgroupsD = lingroups) - tax_utils.write_output(header, lgreport_results, out_fp, sep="\t", write_header=True) + header, lgreport_results = single_query_results.make_lingroup_results( + LINgroupsD=lingroups + ) + tax_utils.write_output( + header, lgreport_results, out_fp, sep="\t", write_header=True + ) # write cami bioboxes format if "bioboxes" in args.output_format: - bbfile, limit_float = make_outfile(args.output_base, "bioboxes", output_dir=args.output_dir) + bbfile, limit_float = make_outfile( + args.output_base, "bioboxes", output_dir=args.output_dir + ) with FileOutputCSV(bbfile) as out_fp: header_lines, bb_results = single_query_results.make_cami_bioboxes() @@ -199,14 +252,17 @@ def genome(args): # first, load taxonomic_assignments try: - tax_assign = MultiLineageDB.load(args.taxonomy_csv, - keep_full_identifiers=args.keep_full_identifiers, - keep_identifier_versions=args.keep_identifier_versions, - force=args.force, lins=args.lins) + tax_assign = MultiLineageDB.load( + args.taxonomy_csv, + keep_full_identifiers=args.keep_full_identifiers, + keep_identifier_versions=args.keep_identifier_versions, + force=args.force, + lins=args.lins, + ) available_ranks = tax_assign.available_ranks - lg_ranks=None - all_lgs=None + lg_ranks = None + all_lgs = None if args.lingroup: lingroups = tax_utils.read_lingroups(args.lingroup) lg_ranks, all_lgs = tax_utils.parse_lingroups(lingroups) @@ -216,38 +272,51 @@ def genome(args): sys.exit(-1) if not tax_assign: - error(f'ERROR: No taxonomic assignments loaded from {",".join(args.taxonomy_csv)}. Exiting.') + error( + f'ERROR: No taxonomic assignments loaded from {",".join(args.taxonomy_csv)}. Exiting.' 
+ ) sys.exit(-1) if args.rank and args.rank not in available_ranks: - error(f"ERROR: No taxonomic information provided for rank {args.rank}: cannot classify at this rank") + error( + f"ERROR: No taxonomic information provided for rank {args.rank}: cannot classify at this rank" + ) sys.exit(-1) # get gather_csvs from args - gather_csvs = tax_utils.collect_gather_csvs(args.gather_csv, from_file=args.from_file) + gather_csvs = tax_utils.collect_gather_csvs( + args.gather_csv, from_file=args.from_file + ) try: - query_gather_results = tax_utils.check_and_load_gather_csvs(gather_csvs, tax_assign, force=args.force, - fail_on_missing_taxonomy=args.fail_on_missing_taxonomy, - keep_full_identifiers=args.keep_full_identifiers, - keep_identifier_versions = args.keep_identifier_versions, - lins=args.lins) + query_gather_results = tax_utils.check_and_load_gather_csvs( + gather_csvs, + tax_assign, + force=args.force, + fail_on_missing_taxonomy=args.fail_on_missing_taxonomy, + keep_full_identifiers=args.keep_full_identifiers, + keep_identifier_versions=args.keep_identifier_versions, + lins=args.lins, + ) except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) if not query_gather_results: - notify('No results for classification. Exiting.') + notify("No results for classification. Exiting.") sys.exit(-1) # for each queryResult, summarize at rank and classify according to thresholds, reporting any errors that occur. for queryResult in query_gather_results: try: - queryResult.build_classification_result(rank=args.rank, - ani_threshold=args.ani_threshold, - containment_threshold=args.containment_threshold, - lingroup_ranks=lg_ranks, lingroups=all_lgs) + queryResult.build_classification_result( + rank=args.rank, + ani_threshold=args.ani_threshold, + containment_threshold=args.containment_threshold, + lingroup_ranks=lg_ranks, + lingroups=all_lgs, + ) except ValueError as exc: error(f"ERROR: {str(exc)}") @@ -255,42 +324,65 @@ def genome(args): # write outputs if "csv_summary" in args.output_format: - summary_outfile, limit_float = make_outfile(args.output_base, "classification", output_dir=args.output_dir) + summary_outfile, limit_float = make_outfile( + args.output_base, "classification", output_dir=args.output_dir + ) with FileOutputCSV(summary_outfile) as out_fp: - tax_utils.write_summary(query_gather_results, out_fp, limit_float_decimals=limit_float, classification=True) + tax_utils.write_summary( + query_gather_results, + out_fp, + limit_float_decimals=limit_float, + classification=True, + ) # write summarized output in human-readable format if "human" in args.output_format: - summary_outfile, limit_float = make_outfile(args.output_base, "human", output_dir=args.output_dir) + summary_outfile, limit_float = make_outfile( + args.output_base, "human", output_dir=args.output_dir + ) with FileOutput(summary_outfile) as out_fp: - tax_utils.write_human_summary(query_gather_results, out_fp, args.rank or "species", classification=True) + tax_utils.write_human_summary( + query_gather_results, + out_fp, + args.rank or "species", + classification=True, + ) # The following require a single rank: # note: interactive krona can handle mult ranks, do we want to enable? 
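
make_outfile (defined near the top of this file) drives all of the output paths below; as a worked example of its two behaviors, with illustrative arguments:

    # '-' keeps output on stdout and enables float limiting; otherwise the
    # extension registered for the output type is appended (plus output_dir).
    make_outfile("out", "krona", output_dir="results")
    # -> ("results/out.krona.tsv", False)
    make_outfile("-", "csv_summary")
    # -> ("-", True)
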
if "krona" in args.output_format: - krona_results, header = tax_utils.format_for_krona(query_gather_results=query_gather_results, rank=args.rank, classification=True) - krona_outfile, limit_float = make_outfile(args.output_base, "krona", output_dir=args.output_dir) + krona_results, header = tax_utils.format_for_krona( + query_gather_results=query_gather_results, + rank=args.rank, + classification=True, + ) + krona_outfile, limit_float = make_outfile( + args.output_base, "krona", output_dir=args.output_dir + ) with FileOutputCSV(krona_outfile) as out_fp: tax_utils.write_krona(header, krona_results, out_fp) if "lineage_csv" in args.output_format: - lineage_outfile, _ = make_outfile(args.output_base, "lineage_csv", - output_dir=args.output_dir) + lineage_outfile, _ = make_outfile( + args.output_base, "lineage_csv", output_dir=args.output_dir + ) lineage_results = [] header = None for q_res in query_gather_results: if not header: ranks = list(q_res.ranks) - if 'strain' in ranks: # maintains prior functionality.. but we could keep strain now, i think? - ranks.remove('strain') + if ( + "strain" in ranks + ): # maintains prior functionality.. but we could keep strain now, i think? + ranks.remove("strain") header = ["ident", *ranks] - lineageD = q_res.classification_result.as_lineage_dict(q_res.query_info, ranks) + lineageD = q_res.classification_result.as_lineage_dict( + q_res.query_info, ranks + ) lineage_results.append(lineageD) with FileOutputCSV(lineage_outfile) as out_fp: tax_utils.write_output(header, lineage_results, out_fp) - - def annotate(args): @@ -304,21 +396,28 @@ def annotate(args): try: # first, load taxonomic_assignments - tax_assign = MultiLineageDB.load(args.taxonomy_csv, - keep_full_identifiers=args.keep_full_identifiers, - keep_identifier_versions=args.keep_identifier_versions, - force=args.force, lins=args.lins) + tax_assign = MultiLineageDB.load( + args.taxonomy_csv, + keep_full_identifiers=args.keep_full_identifiers, + keep_identifier_versions=args.keep_identifier_versions, + force=args.force, + lins=args.lins, + ) except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) if not tax_assign: - error(f'ERROR: No taxonomic assignments loaded from {",".join(args.taxonomy_csv)}. Exiting.') + error( + f'ERROR: No taxonomic assignments loaded from {",".join(args.taxonomy_csv)}. Exiting.' + ) sys.exit(-1) # get csv from args - input_csvs = tax_utils.collect_gather_csvs(args.gather_csv, from_file=args.from_file) + input_csvs = tax_utils.collect_gather_csvs( + args.gather_csv, from_file=args.from_file + ) # handle each gather csv separately for n, in_csv in enumerate(input_csvs): @@ -332,22 +431,28 @@ def annotate(args): # look for the column to match with taxonomic identifier id_col = None - col_options = ['name', 'match_name', 'ident', 'accession'] + col_options = ["name", "match_name", "ident", "accession"] for colname in col_options: if colname in header: id_col = colname break if not id_col: - raise ValueError(f"Cannot find taxonomic identifier column in '{in_csv}'. Tried: {', '.join(col_options)}") + raise ValueError( + f"Cannot find taxonomic identifier column in '{in_csv}'. Tried: {', '.join(col_options)}" + ) - notify(f"Starting annotation on '{in_csv}'. Using ID column: '{id_col}'") + notify( + f"Starting annotation on '{in_csv}'. 
Using ID column: '{id_col}'" + ) # make output file for this input - out_base = os.path.basename(in_csv.rsplit('.csv')[0]) - this_outfile, _ = make_outfile(out_base, "annotate", output_dir=args.output_dir) + out_base = os.path.basename(in_csv.rsplit(".csv")[0]) + this_outfile, _ = make_outfile( + out_base, "annotate", output_dir=args.output_dir + ) - out_header = header + ['lineage'] + out_header = header + ["lineage"] with FileOutputCSV(this_outfile) as out_fp: w = csv.DictWriter(out_fp, out_header) @@ -357,25 +462,36 @@ def annotate(args): n_missed = 0 for n, row in enumerate(r): # find lineage and write annotated row - taxres = AnnotateTaxResult(raw=row, id_col=id_col, lins=args.lins, - keep_full_identifiers=args.keep_full_identifiers, - keep_identifier_versions=args.keep_identifier_versions) - taxres.get_match_lineage(tax_assignments=tax_assign, fail_on_missing_taxonomy=args.fail_on_missing_taxonomy) - - if taxres.missed_ident: # could not assign taxonomy - n_missed+=1 + taxres = AnnotateTaxResult( + raw=row, + id_col=id_col, + lins=args.lins, + keep_full_identifiers=args.keep_full_identifiers, + keep_identifier_versions=args.keep_identifier_versions, + ) + taxres.get_match_lineage( + tax_assignments=tax_assign, + fail_on_missing_taxonomy=args.fail_on_missing_taxonomy, + ) + + if taxres.missed_ident: # could not assign taxonomy + n_missed += 1 w.writerow(taxres.row_with_lineages()) - rows_annotated = (n+1) - n_missed + rows_annotated = (n + 1) - n_missed if not rows_annotated: - raise ValueError(f"Could not annotate any rows from '{in_csv}'.") + raise ValueError( + f"Could not annotate any rows from '{in_csv}'." + ) else: - notify(f"Annotated {rows_annotated} of {n+1} total rows from '{in_csv}'.") + notify( + f"Annotated {rows_annotated} of {n+1} total rows from '{in_csv}'." + ) except ValueError as exc: if args.force: notify(str(exc)) - notify('--force is set. Attempting to continue to next file.') + notify("--force is set. Attempting to continue to next file.") else: error(f"ERROR: {str(exc)}") sys.exit(-1) @@ -385,10 +501,12 @@ def prepare(args): "Combine multiple taxonomy databases into one and/or translate formats." notify("loading taxonomies...") try: - tax_assign = MultiLineageDB.load(args.taxonomy_csv, - force=args.force, - keep_full_identifiers=args.keep_full_identifiers, - keep_identifier_versions=args.keep_identifier_versions) + tax_assign = MultiLineageDB.load( + args.taxonomy_csv, + force=args.force, + keep_full_identifiers=args.keep_full_identifiers, + keep_identifier_versions=args.keep_identifier_versions, + ) except ValueError as exc: error("ERROR while loading taxonomies!") error(str(exc)) @@ -409,14 +527,15 @@ def prepare(args): def grep(args): term = args.pattern - tax_assign = MultiLineageDB.load(args.taxonomy_csv, - force=args.force) + tax_assign = MultiLineageDB.load(args.taxonomy_csv, force=args.force) silent = args.silent or args.count notify(f"searching {len(args.taxonomy_csv)} taxonomy files for '{term}'") if args.invert_match: - notify("-v/--invert-match specified; returning only lineages that do not match.") + notify( + "-v/--invert-match specified; returning only lineages that do not match." 
+ ) if args.rank: notify(f"limiting matches to {args.rank} level") @@ -436,6 +555,7 @@ def find_pattern(lineage, select_rank): return False if args.invert_match: + def search_pattern(l, r): return not find_pattern(l, r) else: @@ -452,22 +572,26 @@ def search_pattern(l, r): else: with FileOutputCSV(args.output) as fp: w = csv.writer(fp) - w.writerow(['ident'] + list(RankLineageInfo().taxlist[:-1])) + w.writerow(["ident"] + list(RankLineageInfo().taxlist[:-1])) for ident, lineage in sorted(match_ident): - w.writerow([ident] + [ x.name for x in lineage ]) + w.writerow([ident] + [x.name for x in lineage]) - notify(f"found {len(match_ident)} matches; saved identifiers to picklist file '{args.output}'") + notify( + f"found {len(match_ident)} matches; saved identifiers to picklist file '{args.output}'" + ) def summarize(args): "Summarize multiple taxonomy databases." notify("loading taxonomies...") try: - tax_assign = MultiLineageDB.load(args.taxonomy_files, - force=args.force, - keep_full_identifiers=args.keep_full_identifiers, - keep_identifier_versions=args.keep_identifier_versions, - lins=args.lins) + tax_assign = MultiLineageDB.load( + args.taxonomy_files, + force=args.force, + keep_full_identifiers=args.keep_full_identifiers, + keep_identifier_versions=args.keep_identifier_versions, + lins=args.lins, + ) except ValueError as exc: error("ERROR while loading taxonomies!") error(str(exc)) @@ -481,7 +605,6 @@ def summarize(args): rank_counts = defaultdict(int) name_seen = set() for v in tax_assign.values(): - sofar = [] for vv in v: name = vv.name rank = vv.rank @@ -507,7 +630,7 @@ def summarize(args): with FileOutputCSV(args.output_lineage_information) as fp: w = csv.writer(fp) - w.writerow(['rank', 'lineage_count', 'lineage']) + w.writerow(["rank", "lineage_count", "lineage"]) # output in order of most common for lineage, count in lineage_counts.most_common(): @@ -526,9 +649,9 @@ def summarize(args): def main(arglist=None): args = sourmash.cli.get_parser().parse_args(arglist) submod = getattr(sourmash.cli.sig, args.subcmd) - mainmethod = getattr(submod, 'main') + mainmethod = getattr(submod, "main") return mainmethod(args) -if __name__ == '__main__': +if __name__ == "__main__": main(sys.argv) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index df69f0ee6a..55b30a540e 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -16,24 +16,45 @@ import sqlite3 -__all__ = ['get_ident', 'ascending_taxlist', 'collect_gather_csvs', - 'load_gather_results', 'check_and_load_gather_csvs' - 'report_missing_and_skipped_identities', 'aggregate_by_lineage_at_rank' - 'format_for_krona', 'write_output', 'write_bioboxes', 'parse_lingroups', - 'combine_sumgather_csvs_by_lineage', 'write_lineage_sample_frac', - 'MultiLineageDB', 'RankLineageInfo', 'LINLineageInfo'] +__all__ = [ + "get_ident", + "ascending_taxlist", + "collect_gather_csvs", + "load_gather_results", + "check_and_load_gather_csvs", "report_missing_and_skipped_identities", + "aggregate_by_lineage_at_rank", "format_for_krona", + "write_output", + "write_bioboxes", + "parse_lingroups", + "combine_sumgather_csvs_by_lineage", + "write_lineage_sample_frac", + "MultiLineageDB", + "RankLineageInfo", + "LINLineageInfo", +] from sourmash.logging import notify from sourmash.sourmash_args import load_pathlist_from_file -RANKCODE = { "superkingdom": "D", "kingdom": "K", "phylum": "P", "class": "C", - "order": "O", "family":"F", "genus": "G", "species": "S", "unclassified": "U"} +RANKCODE = { + "superkingdom":
"D", + "kingdom": "K", + "phylum": "P", + "class": "C", + "order": "O", + "family": "F", + "genus": "G", + "species": "S", + "unclassified": "U", +} + class LineagePair(NamedTuple): rank: str name: str = None taxid: int = None + @dataclass(frozen=True, order=True) class BaseLineageInfo: """ @@ -53,10 +74,13 @@ class BaseLineageInfo: Input lineage information is only used for initialization of the final `lineage` and will not be used or compared in any other class methods. """ + # need to set compare=False for any mutable type to keep this class hashable - ranks: tuple() # require ranks - lineage: tuple = None # tuple of LineagePairs - lineage_str: str = field(default=None, compare=False) # ';'- or ','-separated str of lineage names + ranks: () # require ranks + lineage: tuple = None # tuple of LineagePairs + lineage_str: str = field( + default=None, compare=False + ) # ';'- or ','-separated str of lineage names def __post_init__(self): "Initialize according to passed values" @@ -71,9 +95,11 @@ def __post_init__(self): self._init_empty() def __eq__(self, other): - if other == (): # just handy: if comparing to a null tuple, don't try to find its lineage before returning False + if ( + other == () + ): # just handy: if comparing to a null tuple, don't try to find its lineage before returning False return False - return all([self.ranks == other.ranks and self.lineage==other.lineage]) + return all([self.ranks == other.ranks and self.lineage == other.lineage]) @property def taxlist(self): @@ -108,7 +134,7 @@ def filled_lineage(self): if not self.filled_ranks: return () lowest_filled_rank_idx = self.rank_index(self.filled_ranks[-1]) - return self.lineage[:lowest_filled_rank_idx+1] + return self.lineage[: lowest_filled_rank_idx + 1] @property def lowest_lineage_name(self): @@ -125,7 +151,7 @@ def lowest_lineage_taxid(self): return self.filled_lineage[-1].taxid def _init_empty(self): - 'initialize empty genome lineage' + "initialize empty genome lineage" new_lineage = [] for rank in self.ranks: new_lineage.append(LineagePair(rank=rank)) @@ -134,7 +160,7 @@ def _init_empty(self): object.__setattr__(self, "filled_ranks", ()) def _init_from_lineage_tuples(self): - 'initialize from tuple/list of LineagePairs, allowing empty ranks and reordering if necessary' + "initialize from tuple/list of LineagePairs, allowing empty ranks and reordering if necessary" new_lineage = [] # check this is a list or tuple of lineage tuples: for rank in self.ranks: @@ -143,12 +169,14 @@ def _init_from_lineage_tuples(self): # now add input tuples in correct spots. This corrects for order and allows empty values. if not isinstance(lin_tup, LineagePair): raise ValueError(f"{lin_tup} is not tax_utils LineagePair.") - if lin_tup.rank: # skip this tuple if rank is None or "" (empty lineage tuple. is this needed?) + if lin_tup.rank: # skip this tuple if rank is None or "" (empty lineage tuple. is this needed?) try: # find index for this rank rank_idx = self.rank_index(lin_tup.rank) except ValueError as e: - raise ValueError(f"Rank '{lin_tup.rank}' not present in {', '.join(self.ranks)}") from e + raise ValueError( + f"Rank '{lin_tup.rank}' not present in {', '.join(self.ranks)}" + ) from e new_lineage[rank_idx] = lin_tup # build list of filled ranks @@ -161,10 +189,13 @@ def _init_from_lineage_str(self): """ Turn a ; or ,-separated set of lineages into a list of LineagePair objs. 
""" - new_lineage = self.lineage_str.split(';') + new_lineage = self.lineage_str.split(";") if len(new_lineage) == 1: - new_lineage = self.lineage_str.split(',') - new_lineage = [ LineagePair(rank=rank, name=n) for (rank, n) in zip_longest(self.ranks, new_lineage) ] + new_lineage = self.lineage_str.split(",") + new_lineage = [ + LineagePair(rank=rank, name=n) + for (rank, n) in zip_longest(self.ranks, new_lineage) + ] # build list of filled ranks filled_ranks = [a.rank for a in new_lineage if a.name is not None] object.__setattr__(self, "lineage", tuple(new_lineage)) @@ -180,7 +211,7 @@ def zip_lineage(self, truncate_empty=False): zipped = [a.name for a in self.lineage] # replace None with empty string ("") if None in zipped: - zipped = ['' if x is None else x for x in zipped] + zipped = ["" if x is None else x for x in zipped] return zipped @@ -193,11 +224,11 @@ def zip_taxid(self, truncate_empty=False): else: zipped = [a.taxid for a in self.lineage] # replace None with empty string (""); cast taxids to str - zipped = ['' if x is None else str(x) for x in zipped] + zipped = ["" if x is None else str(x) for x in zipped] return zipped - def display_lineage(self, truncate_empty=True, null_as_unclassified=False, sep = ';'): + def display_lineage(self, truncate_empty=True, null_as_unclassified=False, sep=";"): "Return lineage names as ';'-separated list" lin = sep.join(self.zip_lineage(truncate_empty=truncate_empty)) if null_as_unclassified and lin == "" or lin is None: @@ -205,12 +236,12 @@ def display_lineage(self, truncate_empty=True, null_as_unclassified=False, sep = else: return lin - def display_taxid(self, truncate_empty=True, sep = ";"): + def display_taxid(self, truncate_empty=True, sep=";"): "Return lineage taxids as ';'-separated list" return sep.join(self.zip_taxid(truncate_empty=truncate_empty)) def check_rank_availability(self, rank): - if rank in self.ranks: # rank is available + if rank in self.ranks: # rank is available return True raise ValueError(f"Desired Rank '{rank}' not available for this lineage.") @@ -234,12 +265,14 @@ def is_lineage_match(self, other, rank): """ self.check_rank_availability(rank) if not self.is_compatible(other): - raise ValueError("Cannot compare lineages from taxonomies with different ranks.") + raise ValueError( + "Cannot compare lineages from taxonomies with different ranks." + ) # always return false if rank is not filled in either of the two lineages if self.rank_is_filled(rank, other=other): rank_idx = self.rank_index(rank) - a_lin = self.lineage[:rank_idx+1] - b_lin = other.lineage[:rank_idx+1] + a_lin = self.lineage[: rank_idx + 1] + b_lin = other.lineage[: rank_idx + 1] if a_lin == b_lin: return 1 return 0 @@ -252,7 +285,7 @@ def pop_to_rank(self, rank): return replace(self) # if not, make filled_lineage at this rank + use to generate new LineageInfo new_lineage = self.lineage_at_rank(rank) - new = replace(self, lineage = new_lineage) + new = replace(self, lineage=new_lineage) # replace doesn't run the __post_init__ properly. reinitialize. 
new._init_from_lineage_tuples() return new @@ -265,7 +298,7 @@ def lineage_at_rank(self, rank): return self.filled_lineage # if not, return lineage tuples down to desired rank rank_idx = self.rank_index(rank) - return self.filled_lineage[:rank_idx+1] + return self.filled_lineage[: rank_idx + 1] def find_lca(self, other): """ @@ -298,8 +331,18 @@ class RankLineageInfo(BaseLineageInfo): Input lineage information is only used for initialization of the final `lineage` and will not be used or compared in any other class methods. """ - ranks: tuple = ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain') - lineage_dict: dict = field(default=None, compare=False) # dict of rank: name + + ranks: tuple = ( + "superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + "strain", + ) + lineage_dict: dict = field(default=None, compare=False) # dict of rank: name def __post_init__(self): "Initialize according to passed values" @@ -321,21 +364,23 @@ def _init_from_lineage_dict(self): Use NCBI taxids if available as '|'-separated 'taxpath' column. Allows empty ranks/extra columns and reordering if necessary """ - null_names = set(['[Blank]', 'na', 'null', 'NA', '']) + null_names = set(["[Blank]", "na", "null", "NA", ""]) if not isinstance(self.lineage_dict, (dict)): raise ValueError(f"{self.lineage_dict} is not dictionary") new_lineage = [] - taxpath=[] + taxpath = [] # build empty lineage and taxpath for rank in self.ranks: new_lineage.append(LineagePair(rank=rank)) # check for NCBI taxpath information - taxpath_str = self.lineage_dict.get('taxpath', []) + taxpath_str = self.lineage_dict.get("taxpath", []) if taxpath_str: - taxpath = taxpath_str.split('|') + taxpath = taxpath_str.split("|") if len(taxpath) > len(self.ranks): - raise ValueError(f"Number of NCBI taxids ({len(taxpath)}) exceeds number of ranks ({len(self.ranks)})") + raise ValueError( + f"Number of NCBI taxids ({len(taxpath)}) exceeds number of ranks ({len(self.ranks)})" + ) # now add rank information in correct spots. This corrects for order and allows empty ranks and extra dict keys for key, val in self.lineage_dict.items(): @@ -344,7 +389,7 @@ def _init_from_lineage_dict(self): rank, name = key, val rank_idx = self.rank_index(rank) except ValueError: - continue # ignore dictionary entries (columns) that don't match a rank + continue # ignore dictionary entries (columns) that don't match a rank if taxpath: try: @@ -353,8 +398,8 @@ def _init_from_lineage_dict(self): taxid = None # filter null if name is not None and name.strip() in null_names: - name = None - new_lineage[rank_idx] = LineagePair(rank=rank, name=name, taxid=taxid) + name = None + new_lineage[rank_idx] = LineagePair(rank=rank, name=name, taxid=taxid) # build list of filled ranks filled_ranks = [a.rank for a in new_lineage if a.name] @@ -382,7 +427,10 @@ class LINLineageInfo(BaseLineageInfo): Input lineage information is only used for initialization of the final `lineage` and will not be used or compared in any other class methods. 
""" - ranks: tuple = field(default=None, init=False, compare=False)# we will set this within class instead + + ranks: tuple = field( + default=None, init=False, compare=False + ) # we will set this within class instead lineage: tuple = None # init with n_positions if you want to set a specific number of positions n_lin_positions: int = field(default=None, compare=False) @@ -403,9 +451,11 @@ def __eq__(self, other): total ranks, with full LINs, we only check for the filled_lineage to match and don't check that the number of lin_positions match. """ - if other == (): # if comparing to a null tuple, don't try to find its lineage before returning False + if ( + other == () + ): # if comparing to a null tuple, don't try to find its lineage before returning False return False - return self.filled_lineage==other.filled_lineage + return self.filled_lineage == other.filled_lineage def _init_ranks_from_n_lin_positions(self): new_ranks = [str(x) for x in range(0, self.n_lin_positions)] @@ -418,7 +468,7 @@ def _init_empty(self): # set n_lin_positions to 0 for completely empty LINLineageInfo object.__setattr__(self, "n_lin_positions", 0) self._init_ranks_from_n_lin_positions() - new_lineage=[] + new_lineage = [] for rank in self.ranks: new_lineage.append(LineagePair(rank=rank)) # set lineage and filled_ranks (because frozen, need to do it this way) @@ -430,12 +480,16 @@ def _init_from_lineage_str(self): """ Turn a ; or ,-separated set of lineages into a list of LineagePair objs. """ - new_lineage = self.lineage_str.split(';') + new_lineage = self.lineage_str.split(";") if len(new_lineage) == 1: - new_lineage = self.lineage_str.split(',') + new_lineage = self.lineage_str.split(",") if self.n_lin_positions is not None: if self.n_lin_positions < len(new_lineage): - raise(ValueError("Provided 'n_lin_positions' has fewer positions than provided 'lineage_str'.")) + raise ( + ValueError( + "Provided 'n_lin_positions' has fewer positions than provided 'lineage_str'." + ) + ) self._init_ranks_from_n_lin_positions() else: n_lin_positions = len(new_lineage) @@ -443,14 +497,17 @@ def _init_from_lineage_str(self): self._init_ranks_from_n_lin_positions() # build lineage and n_filled_pos, filled_ranks - new_lineage = [ LineagePair(rank=rank, name=n) for (rank, n) in zip_longest(self.ranks, new_lineage) ] + new_lineage = [ + LineagePair(rank=rank, name=n) + for (rank, n) in zip_longest(self.ranks, new_lineage) + ] filled_ranks = [a.rank for a in new_lineage if a.name is not None] object.__setattr__(self, "lineage", tuple(new_lineage)) object.__setattr__(self, "filled_ranks", tuple(filled_ranks)) object.__setattr__(self, "n_filled_pos", len(filled_ranks)) - def _init_from_lineage_tuples(self): - 'initialize from tuple/list of LineagePairs, building ranks as you go' + def _init_from_lineage_tuples(self): + "initialize from tuple/list of LineagePairs, building ranks as you go" new_lineage = [] ranks = [] # check this is a list or tuple of lineage tuples: @@ -469,7 +526,6 @@ def _init_from_lineage_tuples(self): object.__setattr__(self, "filled_ranks", tuple(filled_ranks)) object.__setattr__(self, "n_filled_pos", len(filled_ranks)) - def is_compatible(self, other): """ Since we sometimes want to match LINprefixes with full LINs, @@ -486,7 +542,6 @@ def is_compatible(self, other): return False - @dataclass class LineageTree: """ @@ -494,6 +549,7 @@ class LineageTree: LineageInfo objects in 'assignments'. This tree can then be used to find lowest common ancestor agreements/confusion. 
""" + assignments: list = field(compare=False) def __post_init__(self): @@ -501,7 +557,7 @@ def __post_init__(self): self.add_lineages(self.assignments) def add_lineage(self, lineage): - if isinstance(lineage, (BaseLineageInfo, RankLineageInfo, LINLineageInfo)): + if isinstance(lineage, BaseLineageInfo | RankLineageInfo | LINLineageInfo): lineage = lineage.filled_lineage node = self.tree for lineage_tup in lineage: @@ -515,7 +571,9 @@ def add_lineages(self, lineages): if not lineages: raise ValueError("empty assignment passed to build_tree") if not isinstance(lineages, abc.Iterable): - raise ValueError("Must pass in an iterable containing LineagePair or LineageInfo objects.") + raise ValueError( + "Must pass in an iterable containing LineagePair or LineageInfo objects." + ) for lineageInf in lineages: self.add_lineage(lineageInf) @@ -529,13 +587,13 @@ def find_lca(self): node = self.tree lca = [] while 1: - if len(node) == 1: # descend to only child; track path + if len(node) == 1: # descend to only child; track path lineage_tup = next(iter(node.keys())) lca.append(lineage_tup) node = node[lineage_tup] - elif len(node) == 0: # at leaf; end + elif len(node) == 0: # at leaf; end return tuple(lca), 0 - else: # len(node) > 1 => confusion!! + else: # len(node) > 1 => confusion!! return tuple(lca), len(node) def ordered_paths(self, include_internal=False): @@ -550,7 +608,7 @@ def ordered_paths(self, include_internal=False): while stack: path, node = stack.pop() for key, val in node.items(): - if len(val) == 0: # leaf node + if len(val) == 0: # leaf node # if want internal paths, build up from leaf if include_internal: internal_path = path @@ -561,20 +619,19 @@ def ordered_paths(self, include_internal=False): internal_path = internal_path[:-1] # now add leaf path paths.append(path + (key,)) - else: # not leaf, add to stack + else: # not leaf, add to stack stack.append((path + (key,), val)) return paths -def get_ident(ident, *, - keep_full_identifiers=False, keep_identifier_versions=False): +def get_ident(ident, *, keep_full_identifiers=False, keep_identifier_versions=False): # split identifiers = split on whitespace # keep identifiers = don't split .[12] from assembly accessions "Hack and slash identifiers." 
if not keep_full_identifiers: - ident = ident.split(' ')[0] + ident = ident.split(" ")[0] if not keep_identifier_versions: - ident = ident.split('.')[0] + ident = ident.split(".")[0] return ident @@ -582,12 +639,18 @@ def ascending_taxlist(include_strain=True): """ Provide an ordered list of taxonomic ranks: strain --> superkingdom """ - ascending_taxlist = ['species', 'genus', 'family', 'order', - 'class', 'phylum', 'superkingdom'] + ascending_taxlist = [ + "species", + "genus", + "family", + "order", + "class", + "phylum", + "superkingdom", + ] if include_strain: - ascending_taxlist = ['strain'] + ascending_taxlist - for k in ascending_taxlist: - yield k + ascending_taxlist = ["strain"] + ascending_taxlist + yield from ascending_taxlist def collect_gather_csvs(cmdline_gather_input, *, from_file=None): @@ -600,7 +663,7 @@ def collect_gather_csvs(cmdline_gather_input, *, from_file=None): if gf not in gather_csvs: gather_csvs.append(gf) else: - notify(f'ignoring duplicated reference to file: {gf}') + notify(f"ignoring duplicated reference to file: {gf}") # ignore pathlist duplicates if from_file: more_files = load_pathlist_from_file(from_file) @@ -608,25 +671,29 @@ def collect_gather_csvs(cmdline_gather_input, *, from_file=None): if gf not in gather_csvs: gather_csvs.append(gf) else: - notify(f'ignoring duplicated reference to file: {gf}') + notify(f"ignoring duplicated reference to file: {gf}") return gather_csvs def read_lingroups(lingroup_csv): lingroupD = {} - n=None + n = None with sourmash_args.FileInputCSV(lingroup_csv) as r: header = r.fieldnames # check for empty file if not header: - raise ValueError(f"Cannot read lingroups from '{lingroup_csv}'. Is file empty?") + raise ValueError( + f"Cannot read lingroups from '{lingroup_csv}'. Is file empty?" + ) if "lin" not in header or "name" not in header: - raise ValueError(f"'{lingroup_csv}' must contain the following columns: 'name', 'lin'.") + raise ValueError( + f"'{lingroup_csv}' must contain the following columns: 'name', 'lin'." + ) for n, row in enumerate(r): - lingroupD[row['lin']] = row['name'] + lingroupD[row["lin"]] = row["name"] if n is None: - raise ValueError(f'No lingroups loaded from {lingroup_csv}.') + raise ValueError(f"No lingroups loaded from {lingroup_csv}.") n_lg = len(lingroupD.keys()) notify(f"Read {n+1} lingroup rows and found {n_lg} distinct lingroup prefixes.") return lingroupD @@ -646,20 +713,30 @@ def parse_lingroups(lingroupD): return lg_ranks, all_lgs -def load_gather_results(gather_csv, tax_assignments, *, seen_queries=None, force=False, - skip_idents = None, fail_on_missing_taxonomy=False, - keep_full_identifiers=False, keep_identifier_versions=False, - lins=False): +def load_gather_results( + gather_csv, + tax_assignments, + *, + seen_queries=None, + force=False, + skip_idents=None, + fail_on_missing_taxonomy=False, + keep_full_identifiers=False, + keep_identifier_versions=False, + lins=False, +): "Load a single gather csv" if not seen_queries: - seen_queries=set() + seen_queries = set() header = [] gather_results = {} with sourmash_args.FileInputCSV(gather_csv) as r: header = r.fieldnames # check for empty file if not header: - raise ValueError(f"Cannot read gather results from '{gather_csv}'. Is file empty?") + raise ValueError( + f"Cannot read gather results from '{gather_csv}'. Is file empty?" 
+ ) this_querytaxres = None for n, row in enumerate(r): @@ -667,72 +744,101 @@ def load_gather_results(gather_csv, tax_assignments, *, seen_queries=None, force try: gatherRow = GatherRow(**row) except TypeError as exc: - raise ValueError(f"'{gather_csv}' is missing columns needed for taxonomic summarization. Please run gather with sourmash >= 4.4.") from exc + raise ValueError( + f"'{gather_csv}' is missing columns needed for taxonomic summarization. Please run gather with sourmash >= 4.4." + ) from exc # check if we've seen this query already in a different gather CSV if gatherRow.query_name in seen_queries: # do not allow loading of same query from a second CSV. - raise ValueError(f"Gather query {gatherRow.query_name} was found in more than one CSV. Cannot load from '{gather_csv}'.") - taxres = TaxResult(raw=gatherRow, keep_full_identifiers=keep_full_identifiers, - keep_identifier_versions=keep_identifier_versions, - lins=lins) - taxres.get_match_lineage(tax_assignments=tax_assignments, skip_idents=skip_idents, - fail_on_missing_taxonomy=fail_on_missing_taxonomy) + raise ValueError( + f"Gather query {gatherRow.query_name} was found in more than one CSV. Cannot load from '{gather_csv}'." + ) + taxres = TaxResult( + raw=gatherRow, + keep_full_identifiers=keep_full_identifiers, + keep_identifier_versions=keep_identifier_versions, + lins=lins, + ) + taxres.get_match_lineage( + tax_assignments=tax_assignments, + skip_idents=skip_idents, + fail_on_missing_taxonomy=fail_on_missing_taxonomy, + ) # add to matching QueryTaxResult or create new one if not this_querytaxres or not this_querytaxres.is_compatible(taxres): # get existing or initialize new - this_querytaxres = gather_results.get(gatherRow.query_name, QueryTaxResult(taxres.query_info, lins=lins)) + this_querytaxres = gather_results.get( + gatherRow.query_name, QueryTaxResult(taxres.query_info, lins=lins) + ) this_querytaxres.add_taxresult(taxres) gather_results[gatherRow.query_name] = this_querytaxres if not gather_results: - raise ValueError(f'No gather results loaded from {gather_csv}.') + raise ValueError(f"No gather results loaded from {gather_csv}.") else: notify(f"loaded {len(gather_results)} gather results from '{gather_csv}'.") - return gather_results, header #, gather_queries # can use the gather_results keys instead - - -def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxonomy=False, force=False, - keep_full_identifiers=False,keep_identifier_versions=False, lins=False): - ''' + return ( + gather_results, + header, + ) # , gather_queries # can use the gather_results keys instead + + +def check_and_load_gather_csvs( + gather_csvs, + tax_assign, + *, + fail_on_missing_taxonomy=False, + force=False, + keep_full_identifiers=False, + keep_identifier_versions=False, + lins=False, +): + """ Load gather csvs, checking for empties and ids missing from taxonomic assignments. 
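+
+    Example (an illustrative sketch; filenames are assumptions):
+
+        tax_assign = LineageDB.load("taxonomy.csv")
+        query_results = check_and_load_gather_csvs(
+            ["gather1.csv", "gather2.csv"], tax_assign, force=True
+        )
+        # with force=True, unreadable files are skipped where possible
+        # instead of failing outright (duplicated queries still raise)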
- ''' + """ if not isinstance(gather_csvs, list): gather_csvs = [gather_csvs] gather_results = {} - total_missed = 0 - all_ident_missed = set() header = [] n_ignored = 0 for n, gather_csv in enumerate(gather_csvs): these_results = {} try: - these_results, header = load_gather_results(gather_csv, tax_assign, - seen_queries=gather_results.keys(), - force=force, keep_full_identifiers=keep_full_identifiers, - keep_identifier_versions = keep_identifier_versions, - fail_on_missing_taxonomy=fail_on_missing_taxonomy, - lins=lins) + these_results, header = load_gather_results( + gather_csv, + tax_assign, + seen_queries=gather_results.keys(), + force=force, + keep_full_identifiers=keep_full_identifiers, + keep_identifier_versions=keep_identifier_versions, + fail_on_missing_taxonomy=fail_on_missing_taxonomy, + lins=lins, + ) except ValueError as exc: if force: if "found in more than one CSV" in str(exc): - notify('Cannot force past duplicated gather query. Exiting.') + notify("Cannot force past duplicated gather query. Exiting.") raise if "Failing, as requested via --fail-on-missing-taxonomy" in str(exc): raise notify(str(exc)) - notify('--force is set. Attempting to continue to next set of gather results.') - n_ignored+=1 + notify( + "--force is set. Attempting to continue to next set of gather results." + ) + n_ignored += 1 continue else: - notify('Exiting.') + notify("Exiting.") raise # add these results to gather_results gather_results.update(these_results) - + # some reporting - num_gather_csvs_loaded = n+1 - n_ignored - notify(f'loaded results for {len(gather_results)} queries from {str(num_gather_csvs_loaded)} gather CSVs') + num_gather_csvs_loaded = n + 1 - n_ignored + notify( + f"loaded results for {len(gather_results)} queries from {str(num_gather_csvs_loaded)} gather CSVs" + ) # count and report missing and skipped idents report_missing_and_skipped_identities(gather_results) @@ -748,8 +854,8 @@ def report_missing_and_skipped_identities(gather_results): that are not present in taxonomic assignments, either by accident (missed) or request (skipped). """ - ident_missed= set() - ident_skipped= set() + ident_missed = set() + ident_skipped = set() total_n_missed = 0 total_n_skipped = 0 total_taxresults = 0 @@ -757,20 +863,24 @@ def report_missing_and_skipped_identities(gather_results): ident_missed.update(querytaxres.missed_idents) ident_skipped.update(querytaxres.skipped_idents) # totals are total rows in gather that were missed - do we want to report these at all? - total_n_missed+= querytaxres.n_missed - total_n_skipped+= querytaxres.n_skipped + total_n_missed += querytaxres.n_missed + total_n_skipped += querytaxres.n_skipped total_taxresults += len(querytaxres.raw_taxresults) if ident_missed: - notify(f'of {total_taxresults} gather results, lineage assignments for {total_n_missed} results were missed.') - notify(f'The following are missing from the taxonomy information: {", ".join(ident_missed)}') + notify( + f"of {total_taxresults} gather results, lineage assignments for {total_n_missed} results were missed." + ) + notify( + f'The following are missing from the taxonomy information: {", ".join(ident_missed)}' + ) def aggregate_by_lineage_at_rank(query_gather_results, rank, *, by_query=False): - ''' - Aggregate list of summarized_lineage_results at rank, keeping + """ + Aggregate list of summarized_lineage_results at rank, keeping query names or not (but this aggregates across queries if multiple). 
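+
+    Example (an illustrative sketch; values are made up):
+
+        lineage_summary, all_queries = aggregate_by_lineage_at_rank(
+            query_gather_results, "phylum", by_query=False
+        )
+        # lineage_summary, e.g. {"d__Bacteria;p__Bacteroidota": 0.28, ...},
+        # with fractions averaged across the queries in all_queries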
- ''' + """ lineage_summary = defaultdict(float) if by_query: lineage_summary = defaultdict(dict) @@ -784,9 +894,11 @@ def aggregate_by_lineage_at_rank(query_gather_results, rank, *, by_query=False): raise ValueError(f"Error: rank '{rank}' not available for aggregation.") for res in queryResult.summarized_lineage_results[rank]: - lineage = res.lineage.display_lineage(null_as_unclassified = True) + lineage = res.lineage.display_lineage(null_as_unclassified=True) if by_query: - lineage_summary[lineage][query_name] = res.fraction # v5?: res.f_weighted_at_rank + lineage_summary[lineage][ + query_name + ] = res.fraction # v5?: res.f_weighted_at_rank else: lineage_summary[lineage] += res.fraction @@ -794,21 +906,23 @@ def aggregate_by_lineage_at_rank(query_gather_results, rank, *, by_query=False): if not by_query: n_queries = len(all_queries) for lin, fraction in lineage_summary.items(): - lineage_summary[lin] = fraction/n_queries + lineage_summary[lin] = fraction / n_queries return lineage_summary, all_queries def format_for_krona(query_gather_results, rank, *, classification=False): - ''' + """ Aggregate and format for krona output. Single query recommended, but we don't want query headers. - ''' + """ # make header header = query_gather_results[0].make_krona_header(min_rank=rank) krona_results = [] # do we want to block more than one query for summarization? if len(query_gather_results) > 1: - notify('WARNING: results from more than one query found. Krona summarization not recommended.\n' \ - 'Percentage assignment will be normalized by the number of queries to maintain range 0-100%.') + notify( + "WARNING: results from more than one query found. Krona summarization not recommended.\n" + "Percentage assignment will be normalized by the number of queries to maintain range 0-100%." + ) if classification: # for classification, just write the results @@ -820,13 +934,17 @@ def format_for_krona(query_gather_results, rank, *, classification=False): # but also misleading, since we're using best_only and there may # be more matches that are not included here, making % unclassified seem higher than it would # be with summarization. We previously excluded it -- is that the behavior we want to keep? 
- krona_results.extend([q_res.krona_classified])#, q_res.krona_unclassified]) + krona_results.extend( + [q_res.krona_classified] + ) # , q_res.krona_unclassified]) else: - lineage_summary, _ = aggregate_by_lineage_at_rank(query_gather_results, rank, by_query=False) + lineage_summary, _ = aggregate_by_lineage_at_rank( + query_gather_results, rank, by_query=False + ) # sort by fraction lin_items = list(lineage_summary.items()) - lin_items.sort(key = lambda x: -x[1]) + lin_items.sort(key=lambda x: -x[1]) # reformat lineage for krona_results printing unclassified_fraction = 0 @@ -836,20 +954,20 @@ def format_for_krona(query_gather_results, rank, *, classification=False): unclassified_fraction = fraction continue else: - lin_list = lin.split(';') + lin_list = lin.split(";") krona_results.append((fraction, *lin_list)) # handle unclassified if unclassified_fraction: - len_unclassified_lin = len(header) -1 - unclassifed_lin = ["unclassified"]*len_unclassified_lin + len_unclassified_lin = len(header) - 1 + unclassifed_lin = ["unclassified"] * len_unclassified_lin krona_results.append((unclassified_fraction, *unclassifed_lin)) return krona_results, header -def write_krona(header, krona_results, out_fp, *, sep='\t'): - 'write krona output' +def write_krona(header, krona_results, out_fp, *, sep="\t"): + "write krona output" # CTB: do we want to optionally allow restriction to a specific rank # & above? NTP: think we originally kept krona to a specific rank, but # that may have been how we were plotting, since krona plots can be @@ -861,7 +979,7 @@ def write_krona(header, krona_results, out_fp, *, sep='\t'): tsv_output.writerow(res) -def write_output(header, results, out_fp, *, sep=',', write_header=True): +def write_output(header, results, out_fp, *, sep=",", write_header=True): """ write pre-generated results list of rows, with each row being a dictionary @@ -873,25 +991,34 @@ def write_output(header, results, out_fp, *, sep=',', write_header=True): output.writerow(res) -def write_bioboxes(header_lines, results, out_fp, *, sep='\t'): +def write_bioboxes(header_lines, results, out_fp, *, sep="\t"): """ write pre-generated results list of rows, with each row being list. """ for inf in header_lines: - out_fp.write(inf + '\n') + out_fp.write(inf + "\n") for res in results: - res = sep.join(res) + '\n' + res = sep.join(res) + "\n" out_fp.write(res) -def write_summary(query_gather_results, csv_fp, *, sep=',', limit_float_decimals=False, classification=False): - ''' +def write_summary( + query_gather_results, + csv_fp, + *, + sep=",", + limit_float_decimals=False, + classification=False, +): + """ Write taxonomy-summarized gather results for each rank. - ''' - w= None + """ + w = None for q_res in query_gather_results: - header, summary = q_res.make_full_summary(limit_float=limit_float_decimals, classification=classification) + header, summary = q_res.make_full_summary( + limit_float=limit_float_decimals, classification=classification + ) if w is None: w = csv.DictWriter(csv_fp, header, delimiter=sep) w.writeheader() @@ -899,29 +1026,41 @@ def write_summary(query_gather_results, csv_fp, *, sep=',', limit_float_decimals w.writerow(res) -def write_human_summary(query_gather_results, out_fp, display_rank, classification=False): - ''' +def write_human_summary( + query_gather_results, out_fp, display_rank, classification=False +): + """ Write human-readable taxonomy-summarized gather results for a specific rank. 
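+
+    Example output (illustrative values only):
+
+        sample name    proportion   cANI   lineage
+        -----------    ----------   ----   -------
+        HSMA33MX            5.7%    91.1%  d__Bacteria;p__Bacteroidota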
- ''' + """ for queryResult in query_gather_results: - results = queryResult.make_human_summary(display_rank=display_rank, classification=classification) + results = queryResult.make_human_summary( + display_rank=display_rank, classification=classification + ) if classification: out_fp.write("sample name status proportion cANI lineage\n") out_fp.write("----------- ------ ---------- ---- -------\n") for rD in results: - out_fp.write("{query_name:<15s} {status} {f_weighted_at_rank} {query_ani_at_rank} {lineage}\n".format(**rD)) + out_fp.write( + "{query_name:<15s} {status} {f_weighted_at_rank} {query_ani_at_rank} {lineage}\n".format( + **rD + ) + ) else: out_fp.write("sample name proportion cANI lineage\n") out_fp.write("----------- ---------- ---- -------\n") for rD in results: - out_fp.write("{query_name:<15s} {f_weighted_at_rank} {query_ani_at_rank} {lineage}\n".format(**rD)) + out_fp.write( + "{query_name:<15s} {f_weighted_at_rank} {query_ani_at_rank} {lineage}\n".format( + **rD + ) + ) -def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, sep='\t'): - ''' +def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, sep="\t"): + """ takes in a lineage dictionary with sample counts (output of aggregate_by_lineage_at_rank) and produces a tab-separated file with fractions for each sample. @@ -935,7 +1074,7 @@ def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, sep='\t'): lin_a 0.4 0.17 0.6 lin_b 0.0 0.0 0.1 lin_c 0.3 0.4 0.2 - ''' + """ header = ["lineage"] + sample_names w = csv.DictWriter(out_fp, header, delimiter=sep) @@ -943,14 +1082,14 @@ def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, sep='\t'): blank_row = {query_name: 0 for query_name in sample_names} unclassified_row = None for lin, sampleinfo in sorted(lineage_dict.items()): - #add lineage and 0 placeholders - row = {'lineage': lin} + # add lineage and 0 placeholders + row = {"lineage": lin} row.update(blank_row) # add info for query_names that exist for this lineage row.update(sampleinfo) # if unclassified, save this row for the end - if lin== "unclassified": - row.update({'lineage': 'unclassified'}) + if lin == "unclassified": + row.update({"lineage": "unclassified"}) unclassified_row = row continue # write row @@ -961,6 +1100,7 @@ def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, sep='\t'): class LineageDB(abc.Mapping): "Base LineageDB class built around an assignments dictionary." + def __init__(self, assign_d, avail_ranks): self.assignments = assign_d self.available_ranks = set(avail_ranks) @@ -982,8 +1122,16 @@ def __bool__(self): return bool(self.assignments) @classmethod - def load(cls, filename, *, delimiter=',', force=False, - keep_full_identifiers=False, keep_identifier_versions=True, lins=False): + def load( + cls, + filename, + *, + delimiter=",", + force=False, + keep_full_identifiers=False, + keep_identifier_versions=True, + lins=False, + ): """ Load a taxonomy assignment CSV file into a LineageDB. @@ -993,9 +1141,11 @@ def load(cls, filename, *, delimiter=',', force=False, 'keep_identifier_versions=False' will remove trailing versions, e.g. 'IDENT.1' => 'IDENT'. 
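+
+        Example (an illustrative sketch; the filename is an assumption):
+
+            db = LineageDB.load("taxonomy.csv")
+            db["GCF_001881345.1"]   # -> the filled lineage, a tuple of LineagePairs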
""" - include_strain=False + include_strain = False if not keep_identifier_versions and keep_full_identifiers: - raise ValueError("keep_identifer_versions=False doesn't make sense with keep_full_identifiers=True") + raise ValueError( + "keep_identifer_versions=False doesn't make sense with keep_full_identifiers=True" + ) if not os.path.exists(filename): raise ValueError(f"'{filename}' does not exist") @@ -1006,42 +1156,46 @@ def load(cls, filename, *, delimiter=',', force=False, with sourmash_args.FileInputCSV(filename) as r: header = r.fieldnames if not header: - raise ValueError(f'cannot read taxonomy assignments from {filename}') + raise ValueError(f"cannot read taxonomy assignments from {filename}") identifier = "ident" # check for ident/identifier, handle some common alternatives if "ident" not in header: # check for ident/identifier, handle some common alternatives - if 'identifiers' in header: - identifier = 'identifiers' + if "identifiers" in header: + identifier = "identifiers" header = ["ident" if "identifiers" == x else x for x in header] - elif 'accession' in header: - identifier = 'accession' + elif "accession" in header: + identifier = "accession" header = ["ident" if "accession" == x else x for x in header] - elif 'name' in header and 'lineage' in header: - return cls.load_from_gather_with_lineages(filename, - force=force, - lins=lins) + elif "name" in header and "lineage" in header: + return cls.load_from_gather_with_lineages( + filename, force=force, lins=lins + ) else: header_str = ",".join([repr(x) for x in header]) - raise ValueError(f'No taxonomic identifiers found; headers are {header_str}') + raise ValueError( + f"No taxonomic identifiers found; headers are {header_str}" + ) if lins and "lin" not in header: - raise ValueError(f"'lin' column not found: cannot read LIN taxonomy assignments from {filename}.") + raise ValueError( + f"'lin' column not found: cannot read LIN taxonomy assignments from {filename}." + ) if not lins: # is "strain" an available rank? if "strain" in header: - include_strain=True + include_strain = True # check that all ranks are in header ranks = list(RankLineageInfo().taxlist) if not include_strain: - ranks.remove('strain') + ranks.remove("strain") if not set(ranks).issubset(header): # for now, just raise err if not all ranks are present. # in future, we can define `ranks` differently if desired # return them from this function so we can check the `available` ranks - raise ValueError('Not all taxonomy ranks present') + raise ValueError("Not all taxonomy ranks present") assignments = {} num_rows = 0 @@ -1053,13 +1207,17 @@ def load(cls, filename, *, delimiter=',', force=False, for n, row in enumerate(r): num_rows += 1 if lins: - lineageInfo = LINLineageInfo(lineage_str=row['lin']) + lineageInfo = LINLineageInfo(lineage_str=row["lin"]) if n_pos is not None: if lineageInfo.n_lin_positions != n_pos: - raise ValueError(f"For taxonomic summarization, all LIN assignments must use the same number of LIN positions.") + raise ValueError( + "For taxonomic summarization, all LIN assignments must use the same number of LIN positions." 
+ ) else: - n_pos = lineageInfo.n_lin_positions # set n_pos with first entry - ranks=lineageInfo.ranks + n_pos = ( + lineageInfo.n_lin_positions + ) # set n_pos with first entry + ranks = lineageInfo.ranks else: # read lineage from row dictionary lineageInfo = RankLineageInfo(lineage_dict=row) @@ -1067,9 +1225,11 @@ def load(cls, filename, *, delimiter=',', force=False, ident = row[identifier] # fold, spindle, and mutilate ident? - ident = get_ident(ident, - keep_full_identifiers=keep_full_identifiers, - keep_identifier_versions=keep_identifier_versions) + ident = get_ident( + ident, + keep_full_identifiers=keep_full_identifiers, + keep_identifier_versions=keep_identifier_versions, + ) # store lineage tuple lineage = lineageInfo.filled_lineage @@ -1078,27 +1238,27 @@ def load(cls, filename, *, delimiter=',', force=False, if ident in assignments: if assignments[ident] != lineage: if not force: - raise ValueError(f"multiple lineages for identifier {ident}") + raise ValueError( + f"multiple lineages for identifier {ident}" + ) else: assignments[ident] = lineage if not lins: - if lineage[-1].rank == 'species': + if lineage[-1].rank == "species": n_species += 1 - elif lineage[-1].rank == 'strain': + elif lineage[-1].rank == "strain": n_species += 1 n_strains += 1 return LineageDB(assignments, ranks) - @classmethod def load_from_gather_with_lineages(cls, filename, *, force=False, lins=False): """ Load an annotated gather-with-lineages CSV file produced by 'tax annotate' into a LineageDB. """ - include_strain = False if not os.path.exists(filename): raise ValueError(f"'{filename}' does not exist") @@ -1109,12 +1269,14 @@ def load_from_gather_with_lineages(cls, filename, *, force=False, lins=False): with sourmash_args.FileInputCSV(filename) as r: header = r.fieldnames if not header: - raise ValueError(f'cannot read taxonomy assignments from {filename}') + raise ValueError(f"cannot read taxonomy assignments from {filename}") if "name" not in header or "lineage" not in header: - raise ValueError(f"Expected headers 'name' and 'lineage' not found. Is this a with-lineages file?") + raise ValueError( + "Expected headers 'name' and 'lineage' not found. Is this a with-lineages file?" + ) - ranks=None + ranks = None assignments = {} num_rows = 0 n_species = 0 @@ -1124,13 +1286,13 @@ def load_from_gather_with_lineages(cls, filename, *, force=False, lins=False): for n, row in enumerate(r): num_rows += 1 - name = row['name'] + name = row["name"] ident = get_ident(name) if lins: - lineageInfo = LINLineageInfo(lineage_str=row['lineage']) + lineageInfo = LINLineageInfo(lineage_str=row["lineage"]) else: - lineageInfo = RankLineageInfo(lineage_str= row['lineage']) + lineageInfo = RankLineageInfo(lineage_str=row["lineage"]) if ranks is None: ranks = lineageInfo.taxlist @@ -1142,14 +1304,16 @@ def load_from_gather_with_lineages(cls, filename, *, force=False, lins=False): # this should not happen with valid # sourmash tax annotate output, but check anyway. if not force: - raise ValueError(f"multiple lineages for identifier {ident}") + raise ValueError( + f"multiple lineages for identifier {ident}" + ) else: assignments[ident] = lineage if isinstance(lineageInfo, RankLineageInfo): - if lineage[-1].rank == 'species': + if lineage[-1].rank == "species": n_species += 1 - elif lineage[-1].rank == 'strain': + elif lineage[-1].rank == "strain": n_species += 1 n_strains += 1 @@ -1160,10 +1324,19 @@ class LineageDB_Sqlite(abc.Mapping): """ A LineageDB based on a sqlite3 database with a 'sourmash_taxonomy' table. 
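+
+    Example (an illustrative sketch; the database path is an assumption):
+
+        db = LineageDB_Sqlite.load("taxonomy.sqlite")
+        db["GCF_001881345.1"]   # -> tuple of LineagePairs (superkingdom..strain)
+        len(db)                 # number of distinct identifiers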
""" + # NOTE: 'order' is a reserved name in sql, so we have to use 'order_'. - columns = ('superkingdom', 'phylum', 'order_', 'class', 'family', - 'genus', 'species', 'strain') - table_name = 'sourmash_taxonomy' + columns = ( + "superkingdom", + "phylum", + "order_", + "class", + "family", + "genus", + "species", + "strain", + ) + table_name = "sourmash_taxonomy" def __init__(self, conn, *, table_name=None): self.conn = conn @@ -1175,10 +1348,10 @@ def __init__(self, conn, *, table_name=None): # check that the right table is there. c = conn.cursor() try: - c.execute(f'SELECT * FROM {self.table_name} LIMIT 1') + c.execute(f"SELECT * FROM {self.table_name} LIMIT 1") except (sqlite3.DatabaseError, sqlite3.OperationalError): raise ValueError("not a taxonomy database") - + # check: can we do a 'select' on the right table? self.__len__() c = conn.cursor() @@ -1188,7 +1361,7 @@ def __init__(self, conn, *, table_name=None): for column, rank in zip(self.columns, RankLineageInfo().taxlist): query = f'SELECT COUNT({column}) FROM {self.table_name} WHERE {column} IS NOT NULL AND {column} != ""' c.execute(query) - cnt, = c.fetchone() + (cnt,) = c.fetchone() if cnt: ranks.add(rank) @@ -1209,16 +1382,16 @@ def load(cls, location): except sqlite3.OperationalError: info = {} - if 'SqliteLineage' in info: - if info['SqliteLineage'] != '1.0': + if "SqliteLineage" in info: + if info["SqliteLineage"] != "1.0": raise IndexNotSupported - table_name = 'sourmash_taxonomy' + table_name = "sourmash_taxonomy" else: # legacy support for old taxonomy DB, pre sourmash_internal. try: - c.execute('SELECT * FROM taxonomy LIMIT 1') - table_name = 'taxonomy' + c.execute("SELECT * FROM taxonomy LIMIT 1") + table_name = "taxonomy" except sqlite3.OperationalError: pass @@ -1229,13 +1402,16 @@ def load(cls, location): def _make_tup(self, row): "build a tuple of LineagePairs for this sqlite row" - tup = [ LineagePair(n, r) for (n, r) in zip(RankLineageInfo().taxlist, row) ] + tup = [LineagePair(n, r) for (n, r) in zip(RankLineageInfo().taxlist, row)] return tuple(tup) def __getitem__(self, ident): "Retrieve lineage for identifer" c = self.cursor - c.execute(f'SELECT superkingdom, phylum, class, order_, family, genus, species, strain FROM {self.table_name} WHERE ident=?', (ident,)) + c.execute( + f"SELECT superkingdom, phylum, class, order_, family, genus, species, strain FROM {self.table_name} WHERE ident=?", + (ident,), + ) # retrieve names list... 
names = c.fetchone() @@ -1256,24 +1432,26 @@ def __bool__(self): def __len__(self): "Return number of rows" c = self.conn.cursor() - c.execute(f'SELECT COUNT(DISTINCT ident) FROM {self.table_name}') - nrows, = c.fetchone() + c.execute(f"SELECT COUNT(DISTINCT ident) FROM {self.table_name}") + (nrows,) = c.fetchone() return nrows def __iter__(self): "Return all identifiers" # create new cursor so as to allow other operations c = self.conn.cursor() - c.execute(f'SELECT DISTINCT ident FROM {self.table_name}') + c.execute(f"SELECT DISTINCT ident FROM {self.table_name}") - for ident, in c: + for (ident,) in c: yield ident def items(self): "return all items in the sqlite database" c = self.conn.cursor() - c.execute(f'SELECT DISTINCT ident, superkingdom, phylum, class, order_, family, genus, species, strain FROM {self.table_name}') + c.execute( + f"SELECT DISTINCT ident, superkingdom, phylum, class, order_, family, genus, species, strain FROM {self.table_name}" + ) for ident, *names in c: yield ident, self._make_tup(names) @@ -1347,10 +1525,10 @@ def __len__(self): def __bool__(self): "True if any contained database has content." - return any( bool(db) for db in self.lineage_dbs ) + return any(bool(db) for db in self.lineage_dbs) def save(self, filename_or_fp, file_format): - assert file_format in ('sql', 'csv') + assert file_format in ("sql", "csv") is_filename = False try: @@ -1358,18 +1536,20 @@ def save(self, filename_or_fp, file_format): except AttributeError: is_filename = True - if file_format == 'sql': + if file_format == "sql": if not is_filename: - raise ValueError("file format '{file_format}' requires a filename, not a file handle") + raise ValueError( + "file format '{file_format}' requires a filename, not a file handle" + ) self._save_sqlite(filename_or_fp) - elif file_format == 'csv': + elif file_format == "csv": # we need a file handle; open file. fp = filename_or_fp if is_filename: - if filename_or_fp.endswith('.gz'): - fp = gzip.open(filename_or_fp, 'wt', newline="") + if filename_or_fp.endswith(".gz"): + fp = gzip.open(filename_or_fp, "wt", newline="") else: - fp = open(filename_or_fp, 'w', newline="") + fp = open(filename_or_fp, "w", newline="") try: self._save_csv(fp) @@ -1389,13 +1569,14 @@ def _save_sqlite(self, filename, *, conn=None): cursor = db.cursor() try: - sqlite_utils.add_sourmash_internal(cursor, 'SqliteLineage', '1.0') + sqlite_utils.add_sourmash_internal(cursor, "SqliteLineage", "1.0") except sqlite3.OperationalError: raise ValueError("attempt to write a readonly database") try: # CTB: could add 'IF NOT EXIST' here; would need tests, too. - cursor.execute(""" + cursor.execute( + """ CREATE TABLE sourmash_taxonomy ( ident TEXT NOT NULL, @@ -1408,49 +1589,54 @@ class TEXT, species TEXT, strain TEXT ) - """) - did_create = True + """ + ) except sqlite3.OperationalError: # already exists? 
raise ValueError(f"taxonomy table already exists in '{filename}'") # follow up and create index - cursor.execute("CREATE UNIQUE INDEX sourmash_taxonomy_ident ON sourmash_taxonomy(ident);") + cursor.execute( + "CREATE UNIQUE INDEX sourmash_taxonomy_ident ON sourmash_taxonomy(ident);" + ) for ident, tax in self.items(): - x = [ident, *[ t.name for t in tax ]] + x = [ident, *[t.name for t in tax]] # fill the taxonomy tuple with empty values until it's the # right length for the SQL statement - while len(x) < 9: - x.append('') + x.append("") - cursor.execute('INSERT INTO sourmash_taxonomy (ident, superkingdom, phylum, class, order_, family, genus, species, strain) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', x) + cursor.execute( + "INSERT INTO sourmash_taxonomy (ident, superkingdom, phylum, class, order_, family, genus, species, strain) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", + x, + ) db.commit() def _save_csv(self, fp): - headers = ['identifiers'] + list(RankLineageInfo().taxlist) + headers = ["identifiers"] + list(RankLineageInfo().taxlist) w = csv.DictWriter(fp, fieldnames=headers) w.writeheader() for n, (ident, tax) in enumerate(self.items()): row = {} - row['identifiers'] = ident + row["identifiers"] = ident # convert tax LineagePairs into dictionary for t in tax: row[t.rank] = t.name # add strain if needed - if 'strain' not in row: - row['strain'] = '' + if "strain" not in row: + row["strain"] = "" w.writerow(row) @classmethod def load(cls, locations, **kwargs): "Load one or more taxonomies from the given location(s)" - force = kwargs.get('force', False) + force = kwargs.get("force", False) if isinstance(locations, str): raise TypeError("'locations' should be a list, not a string") @@ -1475,7 +1661,9 @@ def load(cls, locations, **kwargs): except (ValueError, csv.Error) as exc: # for the last loader, just pass along ValueError... if not force: - raise ValueError(f"cannot read taxonomy assignments from '{location}': {str(exc)}") + raise ValueError( + f"cannot read taxonomy assignments from '{location}': {str(exc)}" + ) # nothing loaded, goodbye! if not loaded and not force: @@ -1506,7 +1694,7 @@ class GatherRow: # essential columns query_name: str - name: str # match_name + name: str # match_name f_unique_weighted: float f_unique_to_query: float unique_intersect_bp: int @@ -1549,6 +1737,7 @@ class GatherRow: @dataclass class QueryInfo: "Class for storing query information" + query_name: str query_md5: str query_filename: str @@ -1564,7 +1753,9 @@ def __post_init__(self): self.ksize = int(self.ksize) self.scaled = int(self.scaled) self.query_n_hashes = int(self.query_n_hashes) if self.query_n_hashes else 0 - self.total_weighted_hashes = int(self.total_weighted_hashes) if self.total_weighted_hashes else 0 + self.total_weighted_hashes = ( + int(self.total_weighted_hashes) if self.total_weighted_hashes else 0 + ) @property def total_weighted_bp(self): @@ -1576,7 +1767,8 @@ class BaseTaxResult: """ Base class for sourmash taxonomic annotation. 
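+
+    Subclasses (TaxResult, AnnotateTaxResult below) parse a csv row; lineage
+    is then attached via, e.g. (illustrative):
+
+        taxres.get_match_lineage(tax_assignments=tax_assign)
+        taxres.lineageInfo   # RankLineageInfo, or LINLineageInfo when lins=True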
""" - raw: dict # csv row + + raw: dict # csv row keep_full_identifiers: bool = False keep_identifier_versions: bool = False match_ident: str = field(init=False) @@ -1594,29 +1786,32 @@ def get_ident(self, id_col=None): else: self.match_ident = self.raw.name if not self.keep_full_identifiers: - self.match_ident = self.match_ident.split(' ')[0] + self.match_ident = self.match_ident.split(" ")[0] else: - #overrides version bc can't keep full without keeping version + # overrides version bc can't keep full without keeping version self.keep_identifier_versions = True if not self.keep_identifier_versions: - self.match_ident = self.match_ident.split('.')[0] + self.match_ident = self.match_ident.split(".")[0] - - def get_match_lineage(self, tax_assignments, skip_idents=None, fail_on_missing_taxonomy=False): + def get_match_lineage( + self, tax_assignments, skip_idents=None, fail_on_missing_taxonomy=False + ): if skip_idents and self.match_ident in skip_idents: self.skipped_ident = True else: lin = tax_assignments.get(self.match_ident) if lin: if self.lins: - self.lineageInfo = LINLineageInfo(lineage = lin) + self.lineageInfo = LINLineageInfo(lineage=lin) else: - self.lineageInfo = RankLineageInfo(lineage = lin) + self.lineageInfo = RankLineageInfo(lineage=lin) else: - self.missed_ident=True + self.missed_ident = True self.match_lineage_attempted = True if self.missed_ident and fail_on_missing_taxonomy: - raise ValueError(f"Error: ident '{self.match_ident}' is not in the taxonomy database. Failing, as requested via --fail-on-missing-taxonomy") + raise ValueError( + f"Error: ident '{self.match_ident}' is not in the taxonomy database. Failing, as requested via --fail-on-missing-taxonomy" + ) @dataclass @@ -1624,7 +1819,8 @@ class AnnotateTaxResult(BaseTaxResult): """ Class to enable taxonomic annotation of any sourmash CSV. """ - id_col: str = 'name' + + id_col: str = "name" def __post_init__(self): if self.id_col not in self.raw.keys(): @@ -1667,22 +1863,24 @@ class TaxResult(BaseTaxResult): Use RankLineageInfo or LINLineageInfo to store lineage information. """ + raw: GatherRow query_name: str = field(init=False) query_info: QueryInfo = field(init=False) def __post_init__(self): self.get_ident() - self.query_name = self.raw.query_name # convenience - self.query_info = QueryInfo(query_name = self.raw.query_name, - query_md5=self.raw.query_md5, - query_filename = self.raw.query_filename, - query_bp = self.raw.query_bp, - query_n_hashes = self.raw.query_n_hashes, - total_weighted_hashes = self.raw.total_weighted_hashes, - ksize = self.raw.ksize, - scaled = self.raw.scaled - ) + self.query_name = self.raw.query_name # convenience + self.query_info = QueryInfo( + query_name=self.raw.query_name, + query_md5=self.raw.query_md5, + query_filename=self.raw.query_filename, + query_bp=self.raw.query_bp, + query_n_hashes=self.raw.query_n_hashes, + total_weighted_hashes=self.raw.total_weighted_hashes, + ksize=self.raw.ksize, + scaled=self.raw.scaled, + ) # cast and store the imp bits self.f_unique_to_query = float(self.raw.f_unique_to_query) self.f_unique_weighted = float(self.raw.f_unique_weighted) @@ -1701,6 +1899,7 @@ class SummarizedGatherResult: Methods included for returning formatted results for different outputs. """ + rank: str fraction: float lineage: RankLineageInfo @@ -1713,23 +1912,32 @@ def __post_init__(self): def check_values(self): if any([self.fraction > 1, self.f_weighted_at_rank > 1]): - raise ValueError(f"Summarized fraction is > 100% of the query! This should not be possible. 
Please check that your input files come directly from a single gather run per query.") + raise ValueError( + "Summarized fraction is > 100% of the query! This should not be possible. Please check that your input files come directly from a single gather run per query." + ) # is this true for weighted too, or is that set to 0 when --ignore-abundance is used? - if any([self.fraction <=0, self.f_weighted_at_rank <= 0]): # this shouldn't actually happen, but it breaks ANI estimation, so let's check for it. - raise ValueError(f"Summarized fraction is <=0% of the query! This should not occur.") + if any( + [self.fraction <= 0, self.f_weighted_at_rank <= 0] + ): # this shouldn't actually happen, but it breaks ANI estimation, so let's check for it. + raise ValueError( + "Summarized fraction is <=0% of the query! This should not occur." + ) def set_query_ani(self, query_info): - self.query_ani_at_rank = containment_to_distance(self.fraction, query_info.ksize, query_info.scaled, - n_unique_kmers=query_info.query_n_hashes, - sequence_len_bp=query_info.query_bp).ani - + self.query_ani_at_rank = containment_to_distance( + self.fraction, + query_info.ksize, + query_info.scaled, + n_unique_kmers=query_info.query_n_hashes, + sequence_len_bp=query_info.query_bp, + ).ani def as_lineage_dict(self, query_info, ranks): - ''' + """ Format to dict for writing lineage-CSV file suitable for use with sourmash tax ... -t. - ''' + """ lD = {} - lD['ident'] = query_info.query_name + lD["ident"] = query_info.query_name for rank in ranks: lin_name = self.lineage.name_at_rank(rank) if lin_name is None: @@ -1739,52 +1947,54 @@ def as_lineage_dict(self, query_info, ranks): def as_summary_dict(self, query_info, limit_float=False): sD = asdict(self) - sD['lineage'] = self.lineage.display_lineage(null_as_unclassified=True) - sD['query_name'] = query_info.query_name - sD['query_md5'] = query_info.query_md5 - sD['query_filename'] = query_info.query_filename - sD['total_weighted_hashes'] = str(query_info.total_weighted_hashes) - sD['bp_match_at_rank'] = str(self.bp_match_at_rank) + sD["lineage"] = self.lineage.display_lineage(null_as_unclassified=True) + sD["query_name"] = query_info.query_name + sD["query_md5"] = query_info.query_md5 + sD["query_filename"] = query_info.query_filename + sD["total_weighted_hashes"] = str(query_info.total_weighted_hashes) + sD["bp_match_at_rank"] = str(self.bp_match_at_rank) if limit_float: - sD['fraction'] = f'{self.fraction:.3f}' - sD['f_weighted_at_rank'] = f'{self.f_weighted_at_rank:.3f}' + sD["fraction"] = f"{self.fraction:.3f}" + sD["f_weighted_at_rank"] = f"{self.f_weighted_at_rank:.3f}" if self.query_ani_at_rank: - sD['query_ani_at_rank'] = f'{self.query_ani_at_rank:.3f}' + sD["query_ani_at_rank"] = f"{self.query_ani_at_rank:.3f}" else: - sD['fraction'] = str(self.fraction) - sD['f_weighted_at_rank'] = str(self.f_weighted_at_rank) + sD["fraction"] = str(self.fraction) + sD["f_weighted_at_rank"] = str(self.f_weighted_at_rank) - return(sD) + return sD def as_human_friendly_dict(self, query_info): sD = self.as_summary_dict(query_info=query_info, limit_float=True) - sD['f_weighted_at_rank'] = f"{self.f_weighted_at_rank*100:>4.1f}%" + sD["f_weighted_at_rank"] = f"{self.f_weighted_at_rank*100:>4.1f}%" if self.query_ani_at_rank is not None: - sD['query_ani_at_rank'] = f"{self.query_ani_at_rank*100:>3.1f}%" + sD["query_ani_at_rank"] = f"{self.query_ani_at_rank*100:>3.1f}%" else: - sD['query_ani_at_rank'] = '- ' + sD["query_ani_at_rank"] = "- " return sD def as_kreport_dict(self, query_info): """ 
Produce kreport dict for named taxonomic groups. """ - lowest_assignment_rank = 'species' + lowest_assignment_rank = "species" sD = {} - sD['num_bp_assigned'] = str(0) - sD['ncbi_taxid'] = None + sD["num_bp_assigned"] = str(0) + sD["ncbi_taxid"] = None # total percent containment, weighted to include abundance info - sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}' - sD["num_bp_contained"] = str(int(self.f_weighted_at_rank * query_info.total_weighted_bp)) + sD["percent_containment"] = f"{self.f_weighted_at_rank * 100:.2f}" + sD["num_bp_contained"] = str( + int(self.f_weighted_at_rank * query_info.total_weighted_bp) + ) if isinstance(self.lineage, LINLineageInfo): raise ValueError("Cannot produce 'kreport' with LIN taxonomy.") if self.lineage != RankLineageInfo(): this_rank = self.lineage.lowest_rank - sD['rank_code'] = RANKCODE[this_rank] - sD['sci_name'] = self.lineage.lowest_lineage_name + sD["rank_code"] = RANKCODE[this_rank] + sD["sci_name"] = self.lineage.lowest_lineage_name taxid = self.lineage.lowest_lineage_taxid if taxid: - sD['ncbi_taxid'] = str(taxid) + sD["ncbi_taxid"] = str(taxid) # the number of bp actually 'assigned' at this rank. Sourmash assigns everything # at genome level, but since kreport traditionally doesn't include 'strain' or genome, # it is reasonable to state that sourmash assigns at 'species' level for this. @@ -1792,19 +2002,21 @@ def as_kreport_dict(self, query_info): if this_rank == lowest_assignment_rank: sD["num_bp_assigned"] = sD["num_bp_contained"] else: - sD['sci_name'] = 'unclassified' - sD['rank_code'] = RANKCODE['unclassified'] + sD["sci_name"] = "unclassified" + sD["rank_code"] = RANKCODE["unclassified"] sD["num_bp_assigned"] = sD["num_bp_contained"] return sD - + def as_lingroup_dict(self, query_info, lg_name): """ Produce lingroup report dict for lingroups. """ sD = {} # total percent containment, weighted to include abundance info - sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}' - sD["num_bp_contained"] = str(int(self.f_weighted_at_rank * query_info.total_weighted_bp)) + sD["percent_containment"] = f"{self.f_weighted_at_rank * 100:.2f}" + sD["num_bp_contained"] = str( + int(self.f_weighted_at_rank * query_info.total_weighted_bp) + ) sD["lin"] = self.lineage.display_lineage() sD["name"] = lg_name return sD @@ -1814,11 +2026,11 @@ def as_cami_bioboxes(self): Format taxonomy-summarized gather results as CAMI profiling Bioboxes format. - Columns are: TAXID RANK TAXPATH TAXPATHSN PERCENTAGE + Columns are: TAXID RANK TAXPATH TAXPATHSN PERCENTAGE """ if isinstance(self.lineage, LINLineageInfo): raise ValueError("Cannot produce 'bioboxes' with LIN taxonomy.") - if self.lineage != RankLineageInfo(): # if not unassigned + if self.lineage != RankLineageInfo(): # if not unassigned taxid = self.lineage.lowest_lineage_taxid if taxid: taxpath = self.lineage.display_taxid(sep="|") @@ -1826,7 +2038,9 @@ def as_cami_bioboxes(self): else: taxpath = None taxpathsn = self.lineage.display_lineage(sep="|") - percentage = f"{(self.f_weighted_at_rank * 100):.2f}" # fix at 2 decimal points + percentage = ( + f"{(self.f_weighted_at_rank * 100):.2f}" # fix at 2 decimal points + ) return [taxid, self.rank, taxpath, taxpathsn, percentage] return [] @@ -1842,39 +2056,47 @@ class ClassificationResult(SummarizedGatherResult): Methods included for returning formatted results for different outputs. 
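+
+    Example (an illustrative sketch; query_info and lin are built elsewhere):
+
+        classif = ClassificationResult(rank="species", fraction=0.23, lineage=lin,
+                                       f_weighted_at_rank=0.31, bp_match_at_rank=2000000)
+        classif.set_status(query_info, containment_threshold=0.1)
+        classif.status   # "match", since fraction >= containment_threshold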
""" + "Class for storing query classification information" status: str = field(init=False) def __post_init__(self): # check for out of bounds values, default "nomatch" if no match at all self.check_values() - self.status = 'nomatch' #None? + self.status = "nomatch" # None? def set_status(self, query_info, containment_threshold=None, ani_threshold=None): # if any matches, use 'below_threshold' as default; set 'match' if meets threshold if any([containment_threshold is not None, ani_threshold is not None]): - self.status="below_threshold" + self.status = "below_threshold" self.set_query_ani(query_info=query_info) - if ani_threshold is not None: # if provided, just use ani thresh, don't use containment threshold + if ( + ani_threshold is not None + ): # if provided, just use ani thresh, don't use containment threshold if self.query_ani_at_rank >= ani_threshold: - self.status = 'match' + self.status = "match" # v5?: switch to using self.f_weighted_at_rank here - elif containment_threshold is not None and self.fraction >= containment_threshold: - self.status = 'match' + elif ( + containment_threshold is not None and self.fraction >= containment_threshold + ): + self.status = "match" def build_krona_result(self, rank=None): krona_classified, krona_unclassified = None, None if rank is not None and rank == self.rank: - lin_as_list = self.lineage.display_lineage().split(';') - krona_classification = (self.fraction, *lin_as_list) # v5?: f_weighted_at_rank - krona_classified = (krona_classification) + lin_as_list = self.lineage.display_lineage().split(";") + krona_classification = ( + self.fraction, + *lin_as_list, + ) # v5?: f_weighted_at_rank + krona_classified = krona_classification # handle unclassified - do we want/need this? - unclassified_fraction= 1.0-self.fraction #v5?: f_weighted_at_rank + unclassified_fraction = 1.0 - self.fraction # v5?: f_weighted_at_rank len_unclassified_lin = len(lin_as_list) - unclassifed_lin = ["unclassified"]*(len_unclassified_lin) + unclassifed_lin = ["unclassified"] * (len_unclassified_lin) krona_unclassified = (unclassified_fraction, *unclassifed_lin) return krona_classified, krona_unclassified - + @dataclass class QueryTaxResult: @@ -1887,11 +2109,12 @@ class QueryTaxResult: Contains methods for formatting results for different outputs. 
""" - query_info: QueryInfo # initialize with QueryInfo dataclass + + query_info: QueryInfo # initialize with QueryInfo dataclass lins: bool = False def __post_init__(self): - self.query_name = self.query_info.query_name # for convenience + self.query_name = self.query_info.query_name # for convenience self._init_taxresult_vars() self._init_summarization_vars() self._init_classification_results() @@ -1899,7 +2122,7 @@ def __post_init__(self): def _init_taxresult_vars(self): self.ranks = [] self.raw_taxresults = [] - self.skipped_idents= set() + self.skipped_idents = set() self.missed_idents = set() self.n_missed = 0 self.n_skipped = 0 @@ -1913,13 +2136,13 @@ def _init_summarization_vars(self): self._init_summarization_results() def _init_summarization_results(self): - self.total_f_weighted = defaultdict(float) #0.0 - self.total_f_classified = defaultdict(float)#0.0 - self.total_bp_classified = defaultdict(int) #0 + self.total_f_weighted = defaultdict(float) # 0.0 + self.total_f_classified = defaultdict(float) # 0.0 + self.total_bp_classified = defaultdict(int) # 0 self.summarized_lineage_results = defaultdict(list) def _init_classification_results(self): - self.status = 'nomatch' + self.status = "nomatch" self.classified_ranks = [] self.classification_result = None self.krona_classified = None @@ -1940,76 +2163,114 @@ def add_taxresult(self, taxresult): # check that all query parameters match if self.is_compatible(taxresult=taxresult): if not taxresult.match_lineage_attempted: - raise ValueError("Error: Cannot add TaxResult. Please use get_match_lineage() to add taxonomic lineage information first.") + raise ValueError( + "Error: Cannot add TaxResult. Please use get_match_lineage() to add taxonomic lineage information first." + ) if not self.ranks: self.ranks = taxresult.lineageInfo.ranks if taxresult.skipped_ident: - self.n_skipped +=1 + self.n_skipped += 1 self.skipped_idents.add(taxresult.match_ident) elif taxresult.missed_ident: - self.n_missed +=1 + self.n_missed += 1 self.missed_idents.add(taxresult.match_ident) self.raw_taxresults.append(taxresult) else: - raise ValueError("Error: Cannot add TaxResult: query information does not match.") + raise ValueError( + "Error: Cannot add TaxResult: query information does not match." + ) def summarize_up_ranks(self, single_rank=None, force_resummarize=False): - if self.summarized_ranks: # has already been summarized + if self.summarized_ranks: # has already been summarized if force_resummarize: self._init_summarization_vars() else: - raise ValueError("Error: already summarized using rank(s): '{', '.join(self.summarized_ranks)}'. Use 'force_resummarize=True' to reset and resummarize") + raise ValueError( + "Error: already summarized using rank(s): '{', '.join(self.summarized_ranks)}'. 
Use 'force_resummarize=True' to reset and resummarize" + ) # set ranks levels to summarize self.summarized_ranks = self.ascending_ranks if single_rank: if single_rank not in self.summarized_ranks: - raise ValueError(f"Error: rank '{single_rank}' not in available ranks ({', '.join(self.summarized_ranks)})") + raise ValueError( + f"Error: rank '{single_rank}' not in available ranks ({', '.join(self.summarized_ranks)})" + ) self.summarized_ranks = [single_rank] - notify(f"Starting summarization up rank(s): {', '.join(self.summarized_ranks)} ") + notify( + f"Starting summarization up rank(s): {', '.join(self.summarized_ranks)} " + ) for taxres in self.raw_taxresults: lininfo = taxres.lineageInfo - if lininfo and lininfo.filled_lineage: # won't always have lineage to summarize (skipped idents, missed idents) + if ( + lininfo and lininfo.filled_lineage + ): # won't always have lineage to summarize (skipped idents, missed idents) # notify + track perfect matches if taxres.f_unique_to_query >= 1.0: if taxres.match_ident not in self.perfect_match: - notify(f"WARNING: 100% match! Is query '{self.query_name}' identical to its database match, '{taxres.match_ident}'?") + notify( + f"WARNING: 100% match! Is query '{self.query_name}' identical to its database match, '{taxres.match_ident}'?" + ) self.perfect_match.add(taxres.match_ident) # add this taxresult to summary for rank in self.summarized_ranks: - if rank in lininfo.filled_ranks: # only store if this rank is filled. + if ( + rank in lininfo.filled_ranks + ): # only store if this rank is filled. lin_at_rank = lininfo.pop_to_rank(rank) - self.sum_uniq_weighted[rank][lin_at_rank] += taxres.f_unique_weighted - self.sum_uniq_to_query[rank][lin_at_rank] += taxres.f_unique_to_query - self.sum_uniq_bp[rank][lin_at_rank] += taxres.unique_intersect_bp + self.sum_uniq_weighted[rank][ + lin_at_rank + ] += taxres.f_unique_weighted + self.sum_uniq_to_query[rank][ + lin_at_rank + ] += taxres.f_unique_to_query + self.sum_uniq_bp[rank][ + lin_at_rank + ] += taxres.unique_intersect_bp # reset ranks levels to the ones that were actually summarized + that we can access for summarized result - self.summarized_ranks = [x for x in self.summarized_ranks if x in self.sum_uniq_bp.keys()] + self.summarized_ranks = [ + x for x in self.summarized_ranks if x in self.sum_uniq_bp.keys() + ] if single_rank and single_rank not in self.summarized_ranks: - raise ValueError(f"Error: rank '{single_rank}' was not available for any matching lineages.") + raise ValueError( + f"Error: rank '{single_rank}' was not available for any matching lineages." + ) def build_summarized_result(self, single_rank=None, force_resummarize=False): # just reset if we've already built summarized result (avoid adding to existing)? Or write in an error/force option? 
self._init_summarization_results() # if taxresults haven't been summarized, do that first if not self.summarized_ranks or force_resummarize: - self.summarize_up_ranks(single_rank=single_rank, force_resummarize=force_resummarize) + self.summarize_up_ranks( + single_rank=single_rank, force_resummarize=force_resummarize + ) # catch potential error from running summarize_up_ranks separately and passing in different single_rank if single_rank and single_rank not in self.summarized_ranks: - raise ValueError(f"Error: rank '{single_rank}' not in summarized rank(s), {','.join(self.summarized_ranks)}") + raise ValueError( + f"Error: rank '{single_rank}' not in summarized rank(s), {','.join(self.summarized_ranks)}" + ) # rank loop is currently done in __main__ - for rank in self.summarized_ranks[::-1]: # reverse so that results are in descending order - sum_uniq_to_query = self.sum_uniq_to_query[rank] #should be lineage: value + for rank in self.summarized_ranks[ + ::-1 + ]: # reverse so that results are in descending order + sum_uniq_to_query = self.sum_uniq_to_query[rank] # should be lineage: value # first, sort sorted_sum_uniq_to_query = list(sum_uniq_to_query.items()) - sorted_sum_uniq_to_query.sort(key = lambda x: -x[1]) + sorted_sum_uniq_to_query.sort(key=lambda x: -x[1]) for lineage, f_unique in sorted_sum_uniq_to_query: # does this ever happen? do we need it? - if f_unique == 0: #no annotated results for this query. do we need to handle this differently now? + if ( + f_unique == 0 + ): # no annotated results for this query. do we need to handle this differently now? continue f_weighted_at_rank = self.sum_uniq_weighted[rank][lineage] bp_intersect_at_rank = self.sum_uniq_bp[rank][lineage] - sres = SummarizedGatherResult(lineage=lineage, rank=rank, - f_weighted_at_rank=f_weighted_at_rank, fraction=f_unique, - bp_match_at_rank=bp_intersect_at_rank) + sres = SummarizedGatherResult( + lineage=lineage, + rank=rank, + f_weighted_at_rank=f_weighted_at_rank, + fraction=f_unique, + bp_match_at_rank=bp_intersect_at_rank, + ) sres.set_query_ani(query_info=self.query_info) self.summarized_lineage_results[rank].append(sres) @@ -2028,43 +2289,69 @@ def build_summarized_result(self, single_rank=None, force_resummarize=False): f_unique = 1.0 - self.total_f_classified[rank] if f_unique > 0: f_weighted_at_rank = 1.0 - self.total_f_weighted[rank] - bp_intersect_at_rank = self.query_info.query_bp - self.total_bp_classified[rank] - sres = SummarizedGatherResult(lineage=lineage, rank=rank, f_weighted_at_rank=f_weighted_at_rank, - fraction=f_unique, bp_match_at_rank=bp_intersect_at_rank, query_ani_at_rank=query_ani) + bp_intersect_at_rank = ( + self.query_info.query_bp - self.total_bp_classified[rank] + ) + sres = SummarizedGatherResult( + lineage=lineage, + rank=rank, + f_weighted_at_rank=f_weighted_at_rank, + fraction=f_unique, + bp_match_at_rank=bp_intersect_at_rank, + query_ani_at_rank=query_ani, + ) self.summarized_lineage_results[rank].append(sres) - def build_classification_result(self, rank=None, ani_threshold=None, containment_threshold=0.1, force_resummarize=False, lingroup_ranks=None, lingroups=None): + def build_classification_result( + self, + rank=None, + ani_threshold=None, + containment_threshold=0.1, + force_resummarize=False, + lingroup_ranks=None, + lingroups=None, + ): if containment_threshold is not None and not 0 <= containment_threshold <= 1: - raise ValueError(f"Containment threshold must be between 0 and 1 (input value: {containment_threshold}).") + raise ValueError( + f"Containment threshold 
must be between 0 and 1 (input value: {containment_threshold})." + ) if ani_threshold is not None and not 0 <= ani_threshold <= 1: - raise ValueError(f"ANI threshold must be between 0 and 1 (input value: {ani_threshold}).") - self._init_classification_results() # init some fields + raise ValueError( + f"ANI threshold must be between 0 and 1 (input value: {ani_threshold})." + ) + self._init_classification_results() # init some fields if not self.summarized_ranks or force_resummarize: - self.summarize_up_ranks(single_rank=rank, force_resummarize=force_resummarize) + self.summarize_up_ranks( + single_rank=rank, force_resummarize=force_resummarize + ) # catch potential error from running summarize_up_ranks separately and passing in different single_rank self.classified_ranks = self.summarized_ranks # if a rank is provided, we need to classify ONLY using that rank if rank: if rank not in self.summarized_ranks: - raise ValueError(f"Error: rank '{rank}' not in summarized rank(s), {','.join(self.summarized_ranks)}") + raise ValueError( + f"Error: rank '{rank}' not in summarized rank(s), {','.join(self.summarized_ranks)}" + ) else: self.classified_ranks = [rank] if lingroup_ranks: notify("Restricting classification to lingroups.") - self.classified_ranks = [x for x in self.classified_ranks if x in lingroup_ranks] + self.classified_ranks = [ + x for x in self.classified_ranks if x in lingroup_ranks + ] if not self.classified_ranks: - raise ValueError(f"Error: no ranks remain for classification.") + raise ValueError("Error: no ranks remain for classification.") # CLASSIFY using summarization--> best only result. Best way = use ANI or containment threshold classif = None - for this_rank in self.classified_ranks: # ascending order or just single rank + for this_rank in self.classified_ranks: # ascending order or just single rank # reset for this rank - f_weighted=0.0 - f_unique_at_rank=0.0 - bp_intersect_at_rank=0 + f_weighted = 0.0 + f_unique_at_rank = 0.0 + bp_intersect_at_rank = 0 sum_uniq_to_query = self.sum_uniq_to_query[this_rank] # sort the results and grab best sorted_sum_uniq_to_query = list(sum_uniq_to_query.items()) - sorted_sum_uniq_to_query.sort(key = lambda x: -x[1]) + sorted_sum_uniq_to_query.sort(key=lambda x: -x[1]) # select best-at-rank only this_lineage, f_unique_at_rank = sorted_sum_uniq_to_query[0] # if in desired lineage groups, continue (or??) @@ -2074,19 +2361,33 @@ def build_classification_result(self, rank=None, ani_threshold=None, containment bp_intersect_at_rank = self.sum_uniq_bp[this_rank][this_lineage] f_weighted = self.sum_uniq_weighted[this_rank][this_lineage] - classif = ClassificationResult(rank=this_rank, fraction=f_unique_at_rank, lineage=this_lineage, - f_weighted_at_rank=f_weighted, bp_match_at_rank=bp_intersect_at_rank) - - classif.set_status(self.query_info, containment_threshold=containment_threshold, ani_threshold=ani_threshold) + classif = ClassificationResult( + rank=this_rank, + fraction=f_unique_at_rank, + lineage=this_lineage, + f_weighted_at_rank=f_weighted, + bp_match_at_rank=bp_intersect_at_rank, + ) + + classif.set_status( + self.query_info, + containment_threshold=containment_threshold, + ani_threshold=ani_threshold, + ) # determine whether to move on to a higher tax rank (if avail) - if classif.status == 'match' or classif.status == "nomatch": # not sure we want/need the `nomatch` part... + if ( + classif.status == "match" or classif.status == "nomatch" + ): # not sure we want/need the `nomatch` part... 
break

        # store the final classification result
        self.classification_result = classif

        # could do this later, in __main__.py, for example
-        self.krona_classified, self.krona_unclassified = self.classification_result.build_krona_result(rank=rank)
-        self.krona_header = self.make_krona_header(min_rank = rank)
+        (
+            self.krona_classified,
+            self.krona_unclassified,
+        ) = self.classification_result.build_krona_result(rank=rank)
+        self.krona_header = self.make_krona_header(min_rank=rank)

     def make_krona_header(self, min_rank):
         "make header for krona output"
@@ -2096,7 +2397,7 @@ def make_krona_header(self, min_rank):
             raise ValueError(f"Rank '{min_rank}' not present in summarized ranks.")
         else:
             rank_index = self.ranks.index(min_rank)
-        return ["fraction"] + list(self.ranks[:rank_index+1])
+        return ["fraction"] + list(self.ranks[: rank_index + 1])

     def check_classification(self):
         if not self.classification_result:
@@ -2125,41 +2426,65 @@ def make_full_summary(self, classification=False, limit_float=False):
         rD = {}
         if classification:
             self.check_classification()
-            header= ["query_name", "status", "rank", "fraction", "lineage",
-                    "query_md5", "query_filename", "f_weighted_at_rank",
-                    "bp_match_at_rank", "query_ani_at_rank"]
-            rD = self.classification_result.as_summary_dict(query_info = self.query_info, limit_float=limit_float)
-            del rD['total_weighted_hashes']
+            header = [
+                "query_name",
+                "status",
+                "rank",
+                "fraction",
+                "lineage",
+                "query_md5",
+                "query_filename",
+                "f_weighted_at_rank",
+                "bp_match_at_rank",
+                "query_ani_at_rank",
+            ]
+            rD = self.classification_result.as_summary_dict(
+                query_info=self.query_info, limit_float=limit_float
+            )
+            del rD["total_weighted_hashes"]
             results.append(rD)
         else:
             self.check_summarization()
-            header= ["query_name", "rank", "fraction", "lineage", "query_md5",
-                    "query_filename", "f_weighted_at_rank", "bp_match_at_rank",
-                    "query_ani_at_rank", "total_weighted_hashes"]
-
-            for rank in self.summarized_ranks[::-1]: #descending
-                unclassified=[]
+            header = [
+                "query_name",
+                "rank",
+                "fraction",
+                "lineage",
+                "query_md5",
+                "query_filename",
+                "f_weighted_at_rank",
+                "bp_match_at_rank",
+                "query_ani_at_rank",
+                "total_weighted_hashes",
+            ]
+
+            for rank in self.summarized_ranks[::-1]:  # descending
+                unclassified = []
                 rank_results = self.summarized_lineage_results[rank]
-                rank_results.sort(key=lambda res: -res.fraction) #v5?: f_weighted_at_rank)
+                rank_results.sort(
+                    key=lambda res: -res.fraction
+                )  # v5?: f_weighted_at_rank
                 for res in rank_results:
-                    rD = res.as_summary_dict(query_info=self.query_info, limit_float=limit_float)
+                    rD = res.as_summary_dict(
+                        query_info=self.query_info, limit_float=limit_float
                    )
                     # save unclassified for the end
-                    if rD['lineage'] == "unclassified":
+                    if rD["lineage"] == "unclassified":
                         unclassified.append(rD)
                     else:
                         results.append(rD)
-            results +=unclassified
+            results += unclassified
         return header, results

     def make_kreport_results(self):
-        '''
+        """
         Format taxonomy-summarized gather results as kraken-style kreport.

         STANDARD KREPORT FORMAT:
         - `Percent Reads Contained in Taxon`: The cumulative percentage of reads for this taxon and all descendants.
         - `Number of Reads Contained in Taxon`: The cumulative number of reads for this taxon and all descendants.
         - `Number of Reads Assigned to Taxon`: The number of reads assigned directly to this taxon (not a cumulative count of all descendants).
-        - `Rank Code`: (U)nclassified, (R)oot, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies. 
+        - `Rank Code`: (U)nclassified, (R)oot, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies.
         - `NCBI Taxon ID`: Numerical ID from the NCBI taxonomy database.
         - `Scientific Name`: The scientific name of the taxon.
@@ -2191,30 +2516,43 @@ def make_kreport_results(self):
         - `Percent Contained in Taxon`: Percent of all base pairs contained by this taxon (weighted by abundance if tracked)
         - `Estimated base pairs Contained in Taxon`: Number of base pairs contained by this taxon (weighted by abundance if tracked)
         - `Estimated base pairs Assigned to Taxon`: Number of base pairs at species-level (weighted by abundance if tracked)
-        - `Rank Code`: (U)nclassified, (R)oot, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies. 
+        - `Rank Code`: (U)nclassified, (R)oot, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies.
         - `NCBI Taxon ID` will not be reported (blank entries).
         - `Scientific Name`: The scientific name of the taxon.

         In the future, we may wish to report the NCBI taxid when we can (NCBI taxonomy only).
-        '''
+        """
         self.check_summarization()
-        header = ["percent_containment", "num_bp_contained", "num_bp_assigned", "rank_code", "ncbi_taxid", "sci_name"]
+        header = [
+            "percent_containment",
+            "num_bp_contained",
+            "num_bp_assigned",
+            "rank_code",
+            "ncbi_taxid",
+            "sci_name",
+        ]
         if self.query_info.total_weighted_hashes == 0:
-            raise ValueError("ERROR: cannot produce 'kreport' format from gather results before sourmash v4.5.0")
+            raise ValueError(
+                "ERROR: cannot produce 'kreport' format from gather results before sourmash v4.5.0"
+            )
         required_ranks = set(RANKCODE.keys())
-        acceptable_ranks = list(self.ranks) + ['unclassified', 'kingdom']
+        acceptable_ranks = list(self.ranks) + ["unclassified", "kingdom"]
         if not required_ranks.issubset(set(acceptable_ranks)):
-            raise ValueError("ERROR: cannot produce 'kreport' format from ranks {', '.join(self.ranks)}")
+            raise ValueError(
+                f"ERROR: cannot produce 'kreport' format from ranks {', '.join(self.ranks)}"
+            )
         kreport_results = []
-        unclassified_recorded=False
+        unclassified_recorded = False
         # want to order results descending by rank
         for rank in self.ranks:
-            if rank == 'strain': # no code for strain, can't include in this output afaik
+            if (
+                rank == "strain"
+            ):  # no code for strain, can't include in this output afaik
                 continue
             rank_results = self.summarized_lineage_results[rank]
             for res in rank_results:
                 kresD = res.as_kreport_dict(self.query_info)
-                if kresD['sci_name'] == "unclassified":
+                if kresD["sci_name"] == "unclassified":
                     # SummarizedGatherResults have an unclassified lineage at every rank, to facilitate reporting at a specific rank.
                     # Here, we only need to report it once, since it will be the same fraction for all ranks
                     if unclassified_recorded:
@@ -2224,7 +2562,9 @@ def make_kreport_results(self):
             kreport_results.append(kresD)
         return header, kreport_results

-    def make_lingroup_results(self, LINgroupsD): # LingroupsD is dictionary {lg_prefix: lg_name}
+    def make_lingroup_results(
+        self, LINgroupsD
+    ):  # LingroupsD is dictionary {lg_prefix: lg_name}
         """
         Report results for the specified LINGroups.
         Keep LCA paths in order as much as possible.
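(Here `LINgroupsD` maps a LIN prefix, as produced by `display_lineage(truncate_empty=True)`, to its display name. A minimal sketch of the expected shape, with hypothetical prefixes and names:

    # hypothetical {lg_prefix: lg_name} mapping; real prefixes come from a
    # lingroup CSV and are semicolon-separated LIN positions
    LINgroupsD = {
        "0;0;0": "lg_A",
        "0;0;0;1": "lg_A_sub1",
        "1;0;0": "lg_B",
    }
)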
@@ -2233,7 +2573,9 @@ def make_lingroup_results(self, LINgroupsD): # LingroupsD is dictionary {lg_pref header = ["name", "lin", "percent_containment", "num_bp_contained"] if self.query_info.total_weighted_hashes == 0: - raise ValueError("ERROR: cannot produce 'lingroup' format from gather results before sourmash v4.5.0") + raise ValueError( + "ERROR: cannot produce 'lingroup' format from gather results before sourmash v4.5.0" + ) # find the ranks we need to consider lg_ranks, all_lgs = parse_lingroups(LINgroupsD) @@ -2243,17 +2585,19 @@ def make_lingroup_results(self, LINgroupsD): # LingroupsD is dictionary {lg_pref for rank in lg_ranks: rank_results = self.summarized_lineage_results[rank] for res in rank_results: - if res.lineage in all_lgs:# is this lineage in the list of LINgroups? - this_lingroup_name = LINgroupsD[res.lineage.display_lineage(truncate_empty=True)] + if res.lineage in all_lgs: # is this lineage in the list of LINgroups? + this_lingroup_name = LINgroupsD[ + res.lineage.display_lineage(truncate_empty=True) + ] lg_resD = res.as_lingroup_dict(self.query_info, this_lingroup_name) lg_results[res.lineage] = lg_resD # We want to return in ~ depth order: descending each specific path in order # use LineageTree to find ordered paths lg_tree = LineageTree(all_lgs) - ordered_paths = lg_tree.ordered_paths(include_internal = True) + ordered_paths = lg_tree.ordered_paths(include_internal=True) # store results in order: - lingroup_results=[] + lingroup_results = [] for lg in ordered_paths: # get LINInfo object lg_LINInfo = LINLineageInfo(lineage=lg) @@ -2261,9 +2605,9 @@ def make_lingroup_results(self, LINgroupsD): # LingroupsD is dictionary {lg_pref lg_res = lg_results.get(lg_LINInfo) if lg_res: lingroup_results.append(lg_res) - + return header, lingroup_results - + def make_cami_bioboxes(self): """ info: https://github.com/CAMI-challenge/contest_information/blob/master/file_formats/CAMI_TP_specification.mkd @@ -2271,17 +2615,17 @@ def make_cami_bioboxes(self): columns: TAXID - specifies a unique alphanumeric ID for a node in a reference tree such as the NCBI taxonomy RANK - superkingdom --> strain - TAXPATH - the path from the root of the reference taxonomy to the respective taxon + TAXPATH - the path from the root of the reference taxonomy to the respective taxon TAXPATHSN - scientific names of taxpath PERCENTAGE (0-100) - field specifies what percentage of the sample was assigned to the respective TAXID example: - + #CAMI Submission for Taxonomic Profiling @Version:0.9.1 @SampleID:SAMPLEID @Ranks:superkingdom|phylum|class|order|family|genus|species|strain - + @@TAXID RANK TAXPATH TAXPATHSN PERCENTAGE 2 superkingdom 2 Bacteria 98.81211 2157 superkingdom 2157 Archaea 1.18789 @@ -2296,7 +2640,7 @@ def make_cami_bioboxes(self): 204455 order 2|1224|28211|204455 Bacteria|Proteobacteria|Alphaproteobacteria|Rhodobacterales 8.42263 2158 order 2157|28890|183925|2158 Archaea|Euryarchaeotes|Methanobacteria|Methanobacteriales 1.18789 """ - # build CAMI header info + # build CAMI header info header_title = "# Taxonomic Profiling Output" version_info = "@Version:0.10.0" program = "@__program__:sourmash" @@ -2308,9 +2652,9 @@ def make_cami_bioboxes(self): rank_info = f"@Ranks:{'|'.join(ranks)}" header_lines = [header_title, sample_info, version_info, rank_info, program] - colnames = ["@@TAXID","RANK","TAXPATH","TAXPATHSN","PERCENTAGE"] - header_lines.append('\t'.join(colnames)) - + colnames = ["@@TAXID", "RANK", "TAXPATH", "TAXPATHSN", "PERCENTAGE"] + header_lines.append("\t".join(colnames)) + # 
now build results in CAMI format bioboxes_results = [] # order results by rank (descending), then percentage @@ -2322,4 +2666,3 @@ def make_cami_bioboxes(self): bioboxes_results.append(bb_info) return header_lines, bioboxes_results - diff --git a/src/sourmash/utils.py b/src/sourmash/utils.py index 71afc20261..1910504e05 100644 --- a/src/sourmash/utils.py +++ b/src/sourmash/utils.py @@ -42,7 +42,7 @@ def decode_str(s): """Decodes a SourmashStr""" try: if s.len == 0: - return u"" + return "" return ffi.unpack(s.data, s.len).decode("utf-8", "replace") finally: if s.owned: diff --git a/tests/conftest.py b/tests/conftest.py index 3281133cd5..9cc035bb4a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,11 +5,14 @@ import pytest import matplotlib.pyplot as plt -plt.rcParams.update({'figure.max_open_warning': 0}) + +plt.rcParams.update({"figure.max_open_warning": 0}) from sourmash_tst_utils import TempDirectory, RunnerContext + sys.stdout = sys.stderr + @pytest.fixture def runtmp(): with TempDirectory() as location: @@ -66,16 +69,17 @@ def use_manifest(request): return request.param -@pytest.fixture(params=['json', 'sql']) +@pytest.fixture(params=["json", "sql"]) def lca_db_format(request): return request.param -@pytest.fixture(params=['csv', 'sql']) +@pytest.fixture(params=["csv", "sql"]) def manifest_db_format(request): return request.param -@pytest.fixture(params=['sig', 'sig.gz', 'zip', '.d/', '.sqldb']) + +@pytest.fixture(params=["sig", "sig.gz", "zip", ".d/", ".sqldb"]) def sig_save_extension(request): return request.param @@ -89,29 +93,37 @@ def pytest_collection_modifyitems(items, config): deselected_items = [] for item in items: - if fixture_name in getattr(item, 'fixturenames', ()): + if fixture_name in getattr(item, "fixturenames", ()): selected_items.append(item) else: deselected_items.append(item) config.hook.pytest_deselected(items=deselected_items) items[:] = selected_items + + # --- END - Only run tests using a particular fixture --- # + def pytest_addoption(parser): - parser.addoption("--usesfixture", - action="store", - default=None, - help="just run tests that use a particular fixture") + parser.addoption( + "--usesfixture", + action="store", + default=None, + help="just run tests that use a particular fixture", + ) + + parser.addoption( + "--run-hypothesis", action="store_true", help="run hypothesis tests" + ) - parser.addoption("--run-hypothesis", action="store_true", - help="run hypothesis tests") def pytest_runtest_setup(item): if item.config.getoption("--run-hypothesis"): if not any(mark for mark in item.iter_markers(name="hypothesis")): pytest.skip("--run-hypothesis option set, running only hypothesis tests") + settings.register_profile("ci", max_examples=1000) settings.register_profile("dev", max_examples=10) settings.register_profile("debug", max_examples=10, verbosity=Verbosity.verbose) -settings.load_profile(os.getenv(u'HYPOTHESIS_PROFILE', 'default')) +settings.load_profile(os.getenv("HYPOTHESIS_PROFILE", "default")) diff --git a/tests/sourmash_tst_utils.py b/tests/sourmash_tst_utils.py index 7425934d2a..a2a35cb2e6 100644 --- a/tests/sourmash_tst_utils.py +++ b/tests/sourmash_tst_utils.py @@ -7,7 +7,6 @@ import collections import pprint import traceback -from io import open # pylint: disable=redefined-builtin from io import StringIO from pathlib import Path @@ -15,13 +14,21 @@ from importlib.metadata import entry_points -SIG_FILES = [os.path.join('demo', f) for f in ( - "SRR2060939_1.sig", "SRR2060939_2.sig", "SRR2241509_1.sig", - "SRR2255622_1.sig", 
"SRR453566_1.sig", "SRR453569_1.sig", "SRR453570_1.sig") +SIG_FILES = [ + os.path.join("demo", f) + for f in ( + "SRR2060939_1.sig", + "SRR2060939_2.sig", + "SRR2241509_1.sig", + "SRR2255622_1.sig", + "SRR453566_1.sig", + "SRR453569_1.sig", + "SRR453570_1.sig", + ) ] -def scriptpath(scriptname='sourmash'): +def scriptpath(scriptname="sourmash"): """Return the path to the scripts, in both dev and install situations.""" # note - it doesn't matter what the scriptname is here, as long as # it's some script present in this version of sourmash. @@ -34,7 +41,7 @@ def scriptpath(scriptname='sourmash'): if os.path.exists(os.path.join(path, scriptname)): return path - for path in os.environ['PATH'].split(':'): + for path in os.environ["PATH"].split(":"): if os.path.exists(os.path.join(path, scriptname)): return path @@ -42,7 +49,7 @@ def scriptpath(scriptname='sourmash'): def _runscript(scriptname): """Find & run a script with exec (i.e. not via os.system or subprocess).""" namespace = {"__name__": "__main__"} - namespace['sys'] = globals()['sys'] + namespace["sys"] = globals()["sys"] try: (script,) = entry_points(name=scriptname, group="console_scripts") @@ -57,15 +64,15 @@ def _runscript(scriptname): if os.path.isfile(scriptfile): if os.path.isfile(scriptfile): exec( # pylint: disable=exec-used - compile(Path(scriptfile).read_text(), scriptfile, 'exec'), - namespace) + compile(Path(scriptfile).read_text(), scriptfile, "exec"), namespace + ) return 0 return -1 -ScriptResults = collections.namedtuple('ScriptResults', - ['status', 'out', 'err']) +ScriptResults = collections.namedtuple("ScriptResults", ["status", "out", "err"]) + def runscript(scriptname, args, **kwargs): """Run a Python script using exec(). @@ -81,8 +88,8 @@ def runscript(scriptname, args, **kwargs): sysargs.extend(args) cwd = os.getcwd() - in_directory = kwargs.get('in_directory', cwd) - fail_ok = kwargs.get('fail_ok', False) + in_directory = kwargs.get("in_directory", cwd) + fail_ok = kwargs.get("fail_ok", False) try: status = -1 @@ -90,8 +97,8 @@ def runscript(scriptname, args, **kwargs): sys.argv = sysargs oldin = None - if 'stdin_data' in kwargs: - oldin, sys.stdin = sys.stdin, StringIO(kwargs['stdin_data']) + if "stdin_data" in kwargs: + oldin, sys.stdin = sys.stdin, StringIO(kwargs["stdin_data"]) oldout, olderr = sys.stdout, sys.stderr sys.stdout = StringIO() @@ -101,13 +108,13 @@ def runscript(scriptname, args, **kwargs): os.chdir(in_directory) try: - print('running:', scriptname, 'in:', in_directory, file=oldout) - print('arguments', sysargs, file=oldout) + print("running:", scriptname, "in:", in_directory, file=oldout) + print("arguments", sysargs, file=oldout) status = _runscript(scriptname) except SystemExit as err: status = err.code - if status == None: + if status is None: status = 0 except: # pylint: disable=bare-except traceback.print_exc(file=sys.stderr) @@ -133,14 +140,13 @@ def runscript(scriptname, args, **kwargs): def get_test_data(filename): filepath = resources.files("sourmash") / "tests" / "test-data" / filename if not filepath.exists() or not os.path.isfile(filepath): - filepath = os.path.join(os.path.dirname(__file__), 'test-data', - filename) + filepath = os.path.join(os.path.dirname(__file__), "test-data", filename) return filepath -class TempDirectory(object): +class TempDirectory: def __init__(self): - self.tempdir = tempfile.mkdtemp(prefix='sourmashtest_') + self.tempdir = tempfile.mkdtemp(prefix="sourmashtest_") def __enter__(self): return self.tempdir @@ -158,10 +164,10 @@ def __exit__(self, exc_type, 
exc_value, traceback): class SourmashCommandFailed(Exception): def __init__(self, msg): Exception.__init__(self, msg) - self.message = msg + self.message = msg -class RunnerContext(object): +class RunnerContext: """ I am a RunnerContext object from sourmash_tst_utils. @@ -171,6 +177,7 @@ class RunnerContext(object): You can use the 'output' method to build filenames in my temp directory. """ + def __init__(self, location): self.location = location self.last_command = None @@ -178,25 +185,26 @@ def __init__(self, location): def run_sourmash(self, *args, **kwargs): "Run the sourmash script with the given arguments." - kwargs['fail_ok'] = True - if 'in_directory' not in kwargs: - kwargs['in_directory'] = self.location + kwargs["fail_ok"] = True + if "in_directory" not in kwargs: + kwargs["in_directory"] = self.location - cmdlist = ['sourmash'] - cmdlist.extend(( str(x) for x in args)) + cmdlist = ["sourmash"] + cmdlist.extend(str(x) for x in args) self.last_command = " ".join(cmdlist) - self.last_result = runscript('sourmash', args, **kwargs) + self.last_result = runscript("sourmash", args, **kwargs) if self.last_result.status: raise SourmashCommandFailed(self.last_result.err) return self.last_result + sourmash = run_sourmash def run(self, scriptname, *args, **kwargs): "Run a script with the given arguments." - if 'in_directory' not in kwargs: - kwargs['in_directory'] = self.location + if "in_directory" not in kwargs: + kwargs["in_directory"] = self.location self.last_command = " ".join(args) self.last_result = runscript(scriptname, args, **kwargs) return self.last_result @@ -207,18 +215,18 @@ def output(self, path): def __str__(self): s = "" if self.last_command: - s += "Last command run:\n{}\n".format(repr(self.last_command)) + s += f"Last command run:\n{repr(self.last_command)}\n" if self.last_result: s += "\nLAST RESULT:\n" - s += "- exit code: {}\n\n".format(self.last_result.status) + s += f"- exit code: {self.last_result.status}\n\n" if self.last_result.out: - s += "- stdout:\n---\n{}---\n".format(self.last_result.out) + s += f"- stdout:\n---\n{self.last_result.out}---\n" else: - s += '(no stdout)\n\n' + s += "(no stdout)\n\n" if self.last_result.err: - s += "- stderr:\n---\n{}---\n".format(self.last_result.err) + s += f"- stderr:\n---\n{self.last_result.err}---\n" else: - s += '(no stderr)\n' + s += "(no stderr)\n" return s diff --git a/tests/test__minhash_hypothesis.py b/tests/test__minhash_hypothesis.py index 7f1b421dbd..2778358caa 100644 --- a/tests/test__minhash_hypothesis.py +++ b/tests/test__minhash_hypothesis.py @@ -7,9 +7,11 @@ from sourmash.minhash import _get_max_hash_for_scaled -@given(st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000), - st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000), - st.integers(min_value=10, max_value=1000)) +@given( + st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000), + st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000), + st.integers(min_value=10, max_value=1000), +) @example([1, 2], [3, 4], 2) def test_set_abundance_num_hypothesis(hashes, abundances, sketch_size): a = MinHash(sketch_size, 10, track_abundance=True) @@ -25,9 +27,11 @@ def test_set_abundance_num_hypothesis(hashes, abundances, sketch_size): assert oracle[k] == v -@given(st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000), - st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000), - 
st.integers(min_value=1000, max_value=10000)) +@given( + st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000), + st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000), + st.integers(min_value=1000, max_value=10000), +) @example([0], [0], 1000) def test_set_abundance_scaled_hypothesis(hashes, abundances, scaled): a = MinHash(0, 10, track_abundance=True, scaled=scaled) diff --git a/tests/test_api.py b/tests/test_api.py index ccaf321df6..a06a610c83 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -9,10 +9,10 @@ def test_sourmash_signature_api(c): e = sourmash.MinHash(n=1, ksize=20) sig = sourmash.SourmashSignature(e) - with open(c.output('xxx.sig'), 'wt') as fp: + with open(c.output("xxx.sig"), "w") as fp: sourmash.save_signatures([sig], fp) - sig_x1 = sourmash.load_one_signature(c.output('xxx.sig')) - sig_x2 = list(sourmash.load_file_as_signatures(c.output('xxx.sig')))[0] + sig_x1 = sourmash.load_one_signature(c.output("xxx.sig")) + sig_x2 = list(sourmash.load_file_as_signatures(c.output("xxx.sig")))[0] assert sig_x1 == sig assert sig_x2 == sig @@ -21,12 +21,12 @@ def test_sourmash_signature_api(c): @utils.in_tempdir def test_load_index_0_no_file(c): with pytest.raises(ValueError) as exc: - idx = sourmash.load_file_as_index(c.output('does-not-exist')) - assert 'Error while reading signatures from ' in str(exc.value) + sourmash.load_file_as_index(c.output("does-not-exist")) + assert "Error while reading signatures from " in str(exc.value) def test_load_index_1(): - testfile = utils.get_test_data('prot/protein.sbt.zip') + testfile = utils.get_test_data("prot/protein.sbt.zip") idx = sourmash.load_file_as_index(testfile) sigs = list(idx.signatures()) @@ -34,7 +34,7 @@ def test_load_index_1(): def test_load_index_2(): - testfile = utils.get_test_data('prot/protein.lca.json.gz') + testfile = utils.get_test_data("prot/protein.lca.json.gz") idx = sourmash.load_file_as_index(testfile) sigs = list(idx.signatures()) @@ -42,7 +42,7 @@ def test_load_index_2(): def test_load_index_3(): - testfile = utils.get_test_data('prot/protein/') + testfile = utils.get_test_data("prot/protein/") idx = sourmash.load_file_as_index(testfile) sigs = list(idx.signatures()) @@ -50,7 +50,7 @@ def test_load_index_3(): def test_load_index_4(): - testfile = utils.get_test_data('prot/all.zip') + testfile = utils.get_test_data("prot/all.zip") idx = sourmash.load_file_as_index(testfile) sigs = list(idx.signatures()) @@ -58,7 +58,7 @@ def test_load_index_4(): def test_load_index_4_b(): - testfile = utils.get_test_data('prot/protein.zip') + testfile = utils.get_test_data("prot/protein.zip") idx = sourmash.load_file_as_index(testfile) sigs = list(idx.signatures()) @@ -67,19 +67,24 @@ def test_load_index_4_b(): def test_load_fasta_as_signature(): # try loading a fasta file - should fail with informative exception - testfile = utils.get_test_data('short.fa') + testfile = utils.get_test_data("short.fa") with pytest.raises(Exception) as exc: - idx = sourmash.load_file_as_index(testfile) + sourmash.load_file_as_index(testfile) print(exc.value) - assert f"Error while reading signatures from '{testfile}' - got sequences instead! Is this a FASTA/FASTQ file?" in str(exc.value) + assert ( + f"Error while reading signatures from '{testfile}' - got sequences instead! Is this a FASTA/FASTQ file?" 
+ in str(exc.value) + ) def test_load_and_search_sbt_api(): - treefile = utils.get_test_data('prot/protein.sbt.zip') - queryfile = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') + treefile = utils.get_test_data("prot/protein.sbt.zip") + queryfile = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) tree = sourmash.load_sbt_index(treefile) query = sourmash.load_one_signature(queryfile) diff --git a/tests/test_bugs.py b/tests/test_bugs.py index e0f3c5daf0..2b8f677279 100644 --- a/tests/test_bugs.py +++ b/tests/test_bugs.py @@ -1,11 +1,12 @@ import sourmash_tst_utils as utils + @utils.in_tempdir def test_bug_803(c): # can we do a 'sourmash search' on an LCA database and a query with abundance? - query = utils.get_test_data('track_abund/47.fa.sig') - lca_db = utils.get_test_data('lca/47+63.lca.json') + query = utils.get_test_data("track_abund/47.fa.sig") + lca_db = utils.get_test_data("lca/47+63.lca.json") - c.run_sourmash('search', query, lca_db, '--ignore-abundance') + c.run_sourmash("search", query, lca_db, "--ignore-abundance") print(c) - assert 'NC_009665.1 Shewanella baltica OS185, complete genome' in str(c) + assert "NC_009665.1 Shewanella baltica OS185, complete genome" in str(c) diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index 232ba6a218..7f8365118f 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -26,31 +26,33 @@ def _write_file(runtmp, basename, lines, *, gz=False): else: xopen = open - with xopen(loc, 'wt') as fp: + with xopen(loc, "wt") as fp: fp.write("\n".join(lines)) return loc def test_run_sourmash_signature_cmd(): - status, out, err = utils.runscript('sourmash', ['signature'], fail_ok=True) - assert not 'sourmash: error: argument cmd: invalid choice:' in err - assert 'Manipulate signature files:' in out - assert status != 0 # no args provided, ok ;) + status, out, err = utils.runscript("sourmash", ["signature"], fail_ok=True) + assert "sourmash: error: argument cmd: invalid choice:" not in err + assert "Manipulate signature files:" in out + assert status != 0 # no args provided, ok ;) def test_run_sourmash_sig_cmd(): - status, out, err = utils.runscript('sourmash', ['sig'], fail_ok=True) - assert not 'sourmash: error: argument cmd: invalid choice:' in err - assert 'Manipulate signature files:' in out - assert status != 0 # no args provided, ok ;) + status, out, err = utils.runscript("sourmash", ["sig"], fail_ok=True) + assert "sourmash: error: argument cmd: invalid choice:" not in err + assert "Manipulate signature files:" in out + assert status != 0 # no args provided, ok ;) def test_run_cat_via_parse_args(): # run a command ('sourmash.sig.cat') with args constructed via parse_args - import sourmash.sig, sourmash.cli - sig47 = utils.get_test_data('47.fa.sig') + import sourmash.sig + import sourmash.cli - args = sourmash.cli.parse_args(['sig', 'cat', sig47]) + sig47 = utils.get_test_data("47.fa.sig") + + args = sourmash.cli.parse_args(["sig", "cat", sig47]) sourmash.sig.cat(args) @@ -58,10 +60,10 @@ def test_sig_merge_1_use_full_signature_in_cmd(runtmp): c = runtmp # merge of 47 & 63 should be union of mins - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - sig47and63 = utils.get_test_data('47+63.fa.sig') - c.run_sourmash('signature', 'merge', sig47, sig63) + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + sig47and63 = utils.get_test_data("47+63.fa.sig") + 
c.run_sourmash("signature", "merge", sig47, sig63) # stdout should be new signature out = c.last_result.out @@ -80,16 +82,21 @@ def test_sig_merge_1_fromfile_picklist(runtmp): c = runtmp # merge of 47 & 63 should be union of mins - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - sig47and63 = utils.get_test_data('47+63.fa.sig') - - from_file = _write_file(runtmp, 'list.txt', [sig47, sig63]) - picklist = _write_file(runtmp, 'pl.csv', - ['md5short', '09a08691', '38729c63']) - - c.run_sourmash('signature', 'merge', '--from-file', from_file, - '--picklist', f'{picklist}:md5short:md5short') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + sig47and63 = utils.get_test_data("47+63.fa.sig") + + from_file = _write_file(runtmp, "list.txt", [sig47, sig63]) + picklist = _write_file(runtmp, "pl.csv", ["md5short", "09a08691", "38729c63"]) + + c.run_sourmash( + "signature", + "merge", + "--from-file", + from_file, + "--picklist", + f"{picklist}:md5short:md5short", + ) # stdout should be new signature out = c.last_result.out @@ -109,17 +116,23 @@ def test_sig_merge_1_fromfile_picklist_gz(runtmp): c = runtmp # merge of 47 & 63 should be union of mins - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - sig47and63 = utils.get_test_data('47+63.fa.sig') - - from_file = _write_file(runtmp, 'list.txt', [sig47, sig63]) - picklist = _write_file(runtmp, 'pl.csv', - ['md5short', '09a08691', '38729c63'], - gz=True) - - c.run_sourmash('signature', 'merge', '--from-file', from_file, - '--picklist', f'{picklist}:md5short:md5short') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + sig47and63 = utils.get_test_data("47+63.fa.sig") + + from_file = _write_file(runtmp, "list.txt", [sig47, sig63]) + picklist = _write_file( + runtmp, "pl.csv", ["md5short", "09a08691", "38729c63"], gz=True + ) + + c.run_sourmash( + "signature", + "merge", + "--from-file", + from_file, + "--picklist", + f"{picklist}:md5short:md5short", + ) # stdout should be new signature out = c.last_result.out @@ -137,10 +150,10 @@ def test_sig_merge_1_fromfile_picklist_gz(runtmp): @utils.in_tempdir def test_sig_merge_1(c): # merge of 47 & 63 should be union of mins - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - sig47and63 = utils.get_test_data('47+63.fa.sig') - c.run_sourmash('sig', 'merge', sig47, sig63) + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + sig47and63 = utils.get_test_data("47+63.fa.sig") + c.run_sourmash("sig", "merge", sig47, sig63) # stdout should be new signature out = c.last_result.out @@ -158,9 +171,9 @@ def test_sig_merge_1(c): @utils.in_tempdir def test_sig_merge_1_multisig(c): # merge of 47 & 63 should be union of mins; here, sigs are in same file. 
- multisig = utils.get_test_data('47+63-multisig.sig') - sig47and63 = utils.get_test_data('47+63.fa.sig') - c.run_sourmash('sig', 'merge', multisig, '--flatten') + multisig = utils.get_test_data("47+63-multisig.sig") + sig47and63 = utils.get_test_data("47+63.fa.sig") + c.run_sourmash("sig", "merge", multisig, "--flatten") # stdout should be new signature out = c.last_result.out @@ -178,13 +191,25 @@ def test_sig_merge_1_multisig(c): @utils.in_tempdir def test_sig_merge_1_name(c): # check name arg - sig2 = utils.get_test_data('2.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - - assignedSigName = 'SIG_NAME' - outsig = c.output('merged2and63.sig') - - c.run_sourmash('sig', 'merge', sig2, sig63, '--dna', '-k', '31', '-o', "merged2and63.sig", '--name', assignedSigName ) + sig2 = utils.get_test_data("2.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + + assignedSigName = "SIG_NAME" + outsig = c.output("merged2and63.sig") + + c.run_sourmash( + "sig", + "merge", + sig2, + sig63, + "--dna", + "-k", + "31", + "-o", + "merged2and63.sig", + "--name", + assignedSigName, + ) test_merge_sig = sourmash.load_one_signature(outsig) @@ -197,10 +222,10 @@ def test_sig_merge_1_name(c): @utils.in_tempdir def test_sig_merge_1_ksize_moltype(c): # check ksize, moltype args - sig2 = utils.get_test_data('2.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - sig2and63 = utils.get_test_data('2+63.fa.sig') - c.run_sourmash('sig', 'merge', sig2, sig63, '--dna', '-k', '31') + sig2 = utils.get_test_data("2.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + sig2and63 = utils.get_test_data("2+63.fa.sig") + c.run_sourmash("sig", "merge", sig2, sig63, "--dna", "-k", "31") # stdout should be new signature out = c.last_result.out @@ -218,12 +243,12 @@ def test_sig_merge_1_ksize_moltype(c): @utils.in_tempdir def test_sig_merge_1_ksize_moltype_fail(c): # check ksize, moltype args - sig2 = utils.get_test_data('2.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - sig2and63 = utils.get_test_data('2+63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + utils.get_test_data("2+63.fa.sig") with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('sig', 'merge', sig2, sig63) + c.run_sourmash("sig", "merge", sig2, sig63) assert "ERROR when merging signature" in str(exc.value) @@ -231,8 +256,8 @@ def test_sig_merge_1_ksize_moltype_fail(c): @utils.in_tempdir def test_sig_merge_2(c): # merge of 47 with nothing should be 47 - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'merge', sig47) + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "merge", sig47) # stdout should be new signature out = c.last_result.out @@ -248,46 +273,52 @@ def test_sig_merge_2(c): @utils.in_tempdir def test_sig_merge_3_abund_ab_ok(c): # merge of 47 and 63 with abund should work - sig47abund = utils.get_test_data('track_abund/47.fa.sig') - sig63abund = utils.get_test_data('track_abund/63.fa.sig') + sig47abund = utils.get_test_data("track_abund/47.fa.sig") + sig63abund = utils.get_test_data("track_abund/63.fa.sig") - c.run_sourmash('sig', 'merge', sig47abund, sig63abund) - actual_merge_sig = sourmash.load_one_signature(c.last_result.out) + c.run_sourmash("sig", "merge", sig47abund, sig63abund) + sourmash.load_one_signature(c.last_result.out) # CTB: should check that this merge did what we think it should do! 
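(The two tests that follow pin down `sig merge`'s abundance-compatibility rule: merging an abundance-tracking signature with a flat one fails in either order, unless `--flatten` is given. A minimal sketch of that rule, assuming `sig_a`/`sig_b` are loaded SourmashSignature objects; `can_merge` is illustrative, not sourmash API:

    # illustrative restatement of the check exercised by the tests below;
    # MinHash.track_abundance is real sourmash API, can_merge() is hypothetical
    def can_merge(sig_a, sig_b, flatten=False):
        if flatten:
            return True  # --flatten drops abundances, so any pair merges
        return sig_a.minhash.track_abundance == sig_b.minhash.track_abundance
)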
@utils.in_tempdir def test_sig_merge_3_abund_ab(c): # merge of 47 with abund, with 63 without, should fail; and vice versa - sig47 = utils.get_test_data('47.fa.sig') - sig63abund = utils.get_test_data('track_abund/63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63abund = utils.get_test_data("track_abund/63.fa.sig") - with pytest.raises(SourmashCommandFailed) as e: - c.run_sourmash('sig', 'merge', sig47, sig63abund) + with pytest.raises(SourmashCommandFailed): + c.run_sourmash("sig", "merge", sig47, sig63abund) print(c.last_result) - assert 'incompatible signatures: track_abundance is False in first sig, True in second' in c.last_result.err + assert ( + "incompatible signatures: track_abundance is False in first sig, True in second" + in c.last_result.err + ) @utils.in_tempdir def test_sig_merge_3_abund_ba(c): # merge of 47 without abund, with 63 with, should fail - sig47 = utils.get_test_data('47.fa.sig') - sig63abund = utils.get_test_data('track_abund/63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63abund = utils.get_test_data("track_abund/63.fa.sig") - with pytest.raises(SourmashCommandFailed) as e: - c.run_sourmash('sig', 'merge', sig63abund, sig47) + with pytest.raises(SourmashCommandFailed): + c.run_sourmash("sig", "merge", sig63abund, sig47) print(c.last_result) - assert 'incompatible signatures: track_abundance is True in first sig, False in second' in c.last_result.err + assert ( + "incompatible signatures: track_abundance is True in first sig, False in second" + in c.last_result.err + ) @utils.in_tempdir def test_sig_filter_1(c): # test basic filtering - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') - c.run_sourmash('sig', 'filter', sig47, sig63) + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") + c.run_sourmash("sig", "filter", sig47, sig63) # stdout should be new signature out = c.last_result.out @@ -307,8 +338,8 @@ def test_sig_filter_1(c): @utils.in_tempdir def test_sig_filter_2(c): # test basic filtering - sig47 = utils.get_test_data('track_abund/47.fa.sig') - c.run_sourmash('sig', 'filter', '-m', '2', '-M', '5', sig47) + sig47 = utils.get_test_data("track_abund/47.fa.sig") + c.run_sourmash("sig", "filter", "-m", "2", "-M", "5", sig47) # stdout should be new signature out = c.last_result.out @@ -317,7 +348,7 @@ def test_sig_filter_2(c): test_sig = sourmash.load_one_signature(sig47) abunds = test_sig.minhash.hashes - abunds = { k: v for (k, v) in abunds.items() if v >= 2 and v <= 5 } + abunds = {k: v for (k, v) in abunds.items() if v >= 2 and v <= 5} assert abunds assert filtered_sig.minhash.hashes == abunds @@ -326,8 +357,8 @@ def test_sig_filter_2(c): @utils.in_tempdir def test_sig_filter_3(c): # test basic filtering - sig47 = utils.get_test_data('track_abund/47.fa.sig') - c.run_sourmash('sig', 'filter', '-m', '2', sig47) + sig47 = utils.get_test_data("track_abund/47.fa.sig") + c.run_sourmash("sig", "filter", "-m", "2", sig47) # stdout should be new signature out = c.last_result.out @@ -336,7 +367,7 @@ def test_sig_filter_3(c): test_sig = sourmash.load_one_signature(sig47) abunds = test_sig.minhash.hashes - abunds = { k: v for (k, v) in abunds.items() if v >= 2 } + abunds = {k: v for (k, v) in abunds.items() if v >= 2} assert abunds assert filtered_sig.minhash.hashes == abunds @@ -345,8 +376,8 @@ def test_sig_filter_3(c): @utils.in_tempdir def test_sig_filter_3_ksize_select(c): # test filtering with ksize selectiong - psw_mag = 
utils.get_test_data('lca/TARA_PSW_MAG_00136.sig') - c.run_sourmash('sig', 'filter', '-m', '2', psw_mag, '-k', '31') + psw_mag = utils.get_test_data("lca/TARA_PSW_MAG_00136.sig") + c.run_sourmash("sig", "filter", "-m", "2", psw_mag, "-k", "31") # stdout should be new signature out = c.last_result.out @@ -355,7 +386,7 @@ def test_sig_filter_3_ksize_select(c): test_sig = sourmash.load_one_signature(psw_mag, ksize=31) abunds = test_sig.minhash.hashes - abunds = { k: v for (k, v) in abunds.items() if v >= 2 } + abunds = {k: v for (k, v) in abunds.items() if v >= 2} assert abunds assert filtered_sig.minhash.hashes == abunds @@ -364,11 +395,11 @@ def test_sig_filter_3_ksize_select(c): @utils.in_tempdir def test_sig_merge_flatten(c): # merge of 47 without abund, with 63 with, will succeed with --flatten - sig47 = utils.get_test_data('47.fa.sig') - sig63abund = utils.get_test_data('track_abund/63.fa.sig') - sig47and63 = utils.get_test_data('47+63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63abund = utils.get_test_data("track_abund/63.fa.sig") + sig47and63 = utils.get_test_data("47+63.fa.sig") - c.run_sourmash('sig', 'merge', sig63abund, sig47, '--flatten') + c.run_sourmash("sig", "merge", sig63abund, sig47, "--flatten") print(c.last_result) out = c.last_result.out @@ -386,11 +417,11 @@ def test_sig_merge_flatten(c): @utils.in_tempdir def test_sig_merge_flatten_2(c): # merge of 47 with abund, with 63 with, will succeed with --flatten - sig47abund = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - sig47and63 = utils.get_test_data('47+63.fa.sig') + sig47abund = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + sig47and63 = utils.get_test_data("47+63.fa.sig") - c.run_sourmash('sig', 'merge', sig63, sig47abund, '--flatten') + c.run_sourmash("sig", "merge", sig63, sig47abund, "--flatten") print(c.last_result) out = c.last_result.out @@ -410,7 +441,7 @@ def test_sig_intersect_0(runtmp): c = runtmp with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'intersect') + c.run_sourmash("sig", "intersect") err = c.last_result.err assert "no signatures provided to intersect!?" 
in err @@ -420,10 +451,10 @@ def test_sig_intersect_1(runtmp): c = runtmp # intersect of 47 and 63 should be intersection of mins - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - sig47and63 = utils.get_test_data('47+63-intersect.fa.sig') - c.run_sourmash('sig', 'intersect', sig47, sig63) + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + sig47and63 = utils.get_test_data("47+63-intersect.fa.sig") + c.run_sourmash("sig", "intersect", sig47, sig63) # stdout should be new signature out = c.last_result.out @@ -442,16 +473,21 @@ def test_sig_intersect_1_fromfile_picklist(runtmp): c = runtmp # intersect of 47 and 63 should be intersection of mins - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - sig47and63 = utils.get_test_data('47+63-intersect.fa.sig') - - from_file = _write_file(runtmp, 'list.txt', [sig47, sig63]) - picklist = _write_file(runtmp, 'pl.csv', - ['md5short', '09a08691', '38729c63']) - - c.run_sourmash('signature', 'intersect', '--from-file', from_file, - '--picklist', f'{picklist}:md5short:md5short') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + sig47and63 = utils.get_test_data("47+63-intersect.fa.sig") + + from_file = _write_file(runtmp, "list.txt", [sig47, sig63]) + picklist = _write_file(runtmp, "pl.csv", ["md5short", "09a08691", "38729c63"]) + + c.run_sourmash( + "signature", + "intersect", + "--from-file", + from_file, + "--picklist", + f"{picklist}:md5short:md5short", + ) # stdout should be new signature out = c.last_result.out @@ -470,10 +506,10 @@ def test_sig_intersect_1_fromfile_picklist(runtmp): def test_sig_intersect_2(c): # intersect of 47 with abund and 63 with abund should be same # as without abund, i.e. 
intersect 'flattens' - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') - sig47and63 = utils.get_test_data('47+63-intersect.fa.sig') - c.run_sourmash('sig', 'intersect', sig47, sig63) + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") + sig47and63 = utils.get_test_data("47+63-intersect.fa.sig") + c.run_sourmash("sig", "intersect", sig47, sig63) # stdout should be new signature out = c.last_result.out @@ -491,9 +527,9 @@ def test_sig_intersect_2(c): @utils.in_tempdir def test_sig_intersect_3(c): # use --abundances-from to preserve abundances from sig #47 - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') - c.run_sourmash('sig', 'intersect', '--abundances-from', sig47, sig63) + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") + c.run_sourmash("sig", "intersect", "--abundances-from", sig47, sig63) # stdout should be new signature out = c.last_result.out @@ -510,7 +546,7 @@ def test_sig_intersect_3(c): mh63_mins.intersection_update(mh47_abunds) # take abundances from mh47 & create new sig - mh47_abunds = { k: mh47_abunds[k] for k in mh63_mins } + mh47_abunds = {k: mh47_abunds[k] for k in mh63_mins} test_mh = mh47.copy_and_clear() test_mh.set_abundances(mh47_abunds) @@ -523,9 +559,9 @@ def test_sig_intersect_3(c): @utils.in_tempdir def test_sig_intersect_4(c): # use --abundances-from to preserve abundances from sig #47 - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') - c.run_sourmash('sig', 'intersect', '--abundances-from', sig47, sig63) + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") + c.run_sourmash("sig", "intersect", "--abundances-from", sig47, sig63) # stdout should be new signature out = c.last_result.out @@ -542,7 +578,7 @@ def test_sig_intersect_4(c): mh63_mins.intersection_update(mh47_abunds) # take abundances from mh47 & create new sig - mh47_abunds = { k: mh47_abunds[k] for k in mh63_mins } + mh47_abunds = {k: mh47_abunds[k] for k in mh63_mins} test_mh = mh47.copy_and_clear() test_mh.set_abundances(mh47_abunds) @@ -556,41 +592,41 @@ def test_sig_intersect_4(c): def test_sig_intersect_5(c): # use --abundances-from to preserve abundances from sig #47 # make sure that you can't specify a flat sig for --abundances-from - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'intersect', '--abundances-from', sig47, sig63) + c.run_sourmash("sig", "intersect", "--abundances-from", sig47, sig63) @utils.in_tempdir def test_sig_intersect_6_ksize_fail(c): # specify ksize to intersect 2.fa.sig with 47.fa.sig - 2.fa.sig contains # multiple ksizes. - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'intersect', sig2, sig47) + c.run_sourmash("sig", "intersect", sig2, sig47) @utils.in_tempdir def test_sig_intersect_6_ksize_succeed(c): # specify ksize to intersect 2.fa.sig with 47.fa.sig - 2.fa.sig contains # multiple ksizes. 
- sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") - c.run_sourmash('sig', 'intersect', '-k', '31', sig2, sig47) + c.run_sourmash("sig", "intersect", "-k", "31", sig2, sig47) - assert 'loaded and intersected 2 signatures' in c.last_result.err + assert "loaded and intersected 2 signatures" in c.last_result.err @utils.in_tempdir def test_sig_intersect_7(c): # intersect of 47 and nothing should be self - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'intersect', sig47) + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "intersect", sig47) # stdout should be new signature out = c.last_result.out @@ -608,8 +644,8 @@ def test_sig_intersect_7(c): @utils.in_tempdir def test_sig_intersect_8_multisig(c): # intersect of all the multisig stuff should be nothing - sig47 = utils.get_test_data('47+63-multisig.sig') - c.run_sourmash('sig', 'intersect', sig47) + sig47 = utils.get_test_data("47+63-multisig.sig") + c.run_sourmash("sig", "intersect", sig47) # stdout should be new signature out = c.last_result.out @@ -621,9 +657,9 @@ def test_sig_intersect_8_multisig(c): def test_sig_inflate_1(runtmp): # basic inflate test - inflate 47 flat with 47 abund - sig47_flat = utils.get_test_data('47.fa.sig') - sig47_abund = utils.get_test_data('track_abund/47.fa.sig') - runtmp.run_sourmash('sig', 'inflate', sig47_abund, sig47_flat) + sig47_flat = utils.get_test_data("47.fa.sig") + sig47_abund = utils.get_test_data("track_abund/47.fa.sig") + runtmp.run_sourmash("sig", "inflate", sig47_abund, sig47_flat) # stdout should be new signature out = runtmp.last_result.out @@ -641,9 +677,9 @@ def test_sig_inflate_1(runtmp): def test_sig_inflate_2(runtmp): # use abundances from sig #47 - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - runtmp.run_sourmash('sig', 'inflate', sig47, sig63) + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + runtmp.run_sourmash("sig", "inflate", sig47, sig63) # stdout should be new signature out = runtmp.last_result.out @@ -660,7 +696,7 @@ def test_sig_inflate_2(runtmp): mh63_mins.intersection_update(mh47_abunds) # take abundances from mh47 & create new sig - mh47_abunds = { k: mh47_abunds[k] for k in mh63_mins } + mh47_abunds = {k: mh47_abunds[k] for k in mh63_mins} test_mh = mh47.copy_and_clear() test_mh.set_abundances(mh47_abunds) @@ -672,34 +708,33 @@ def test_sig_inflate_2(runtmp): def test_sig_inflate_3(runtmp): # should fail on flat first sig - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('sig', 'inflate', sig63, sig47) + with pytest.raises(SourmashCommandFailed): + runtmp.run_sourmash("sig", "inflate", sig63, sig47) - assert 'has no abundances' in runtmp.last_result.err + assert "has no abundances" in runtmp.last_result.err def test_sig_inflate_4_picklist(runtmp): # try out picklists - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - sig47_flat = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + sig47_flat = utils.get_test_data("47.fa.sig") ss63 = sourmash.load_one_signature(sig63, ksize=31) - 
picklist = _write_file(runtmp, 'pl.csv', ['md5', ss63.md5sum()]) + _write_file(runtmp, "pl.csv", ["md5", ss63.md5sum()]) print(ss63.md5sum()) - - runtmp.run_sourmash('sig', 'inflate', sig47, sig63, sig47_flat, - '--picklist', f'pl.csv:md5:md5') + runtmp.run_sourmash( + "sig", "inflate", sig47, sig63, sig47_flat, "--picklist", "pl.csv:md5:md5" + ) # stdout should be new signature out = runtmp.last_result.out - err = runtmp.last_result.err actual_inflate_sig = sourmash.load_one_signature(out) @@ -713,7 +748,7 @@ def test_sig_inflate_4_picklist(runtmp): mh63_mins.intersection_update(mh47_abunds) # take abundances from mh47 & create new sig - mh47_abunds = { k: mh47_abunds[k] for k in mh63_mins } + mh47_abunds = {k: mh47_abunds[k] for k in mh63_mins} test_mh = mh47.copy_and_clear() test_mh.set_abundances(mh47_abunds) @@ -725,21 +760,21 @@ def test_sig_inflate_4_picklist(runtmp): def test_sig_inflate_5_bad_moltype(runtmp): # should fail when no signatures match moltype - sig47 = utils.get_test_data('track_abund/47.fa.sig') - prot = utils.get_test_data('prot/protein.zip') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + prot = utils.get_test_data("prot/protein.zip") - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('sig', 'inflate', sig47, prot) + with pytest.raises(SourmashCommandFailed): + runtmp.run_sourmash("sig", "inflate", sig47, prot) - assert 'no signatures to inflate' in runtmp.last_result.err + assert "no signatures to inflate" in runtmp.last_result.err @utils.in_tempdir def test_sig_subtract_1(c): # subtract of 63 from 47 - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - c.run_sourmash('sig', 'subtract', sig47, sig63) + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + c.run_sourmash("sig", "subtract", sig47, sig63) # stdout should be new signature out = c.last_result.out @@ -758,9 +793,9 @@ def test_sig_subtract_1_abund(runtmp): # subtract 63 from 47, with abundances borrowed from 47 c = runtmp - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') - c.run_sourmash('sig', 'subtract', sig47, sig63, '-A', sig47) + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") + c.run_sourmash("sig", "subtract", sig47, sig63, "-A", sig47) # stdout should be new signature out = c.last_result.out @@ -791,21 +826,21 @@ def test_sig_subtract_1_abund_is_flat(runtmp): # subtract 63 from 47, with abundances borrowed from 47 c = runtmp - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') - sig47_flat = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") + sig47_flat = utils.get_test_data("47.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'subtract', sig47, sig63, '-A', sig47_flat) + c.run_sourmash("sig", "subtract", sig47, sig63, "-A", sig47_flat) def test_sig_subtract_1_flatten(runtmp): # subtract 63 from 47, with abund signatures originally and --flatten c = runtmp - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') - c.run_sourmash('sig', 'subtract', sig47, sig63, '--flatten') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") + c.run_sourmash("sig", "subtract", sig47, sig63, "--flatten") # stdout 
should be new signature out = c.last_result.out @@ -824,9 +859,9 @@ def test_sig_subtract_1_flatten(runtmp): @utils.in_tempdir def test_sig_subtract_1_multisig(c): # subtract of everything from 47 - sig47 = utils.get_test_data('47.fa.sig') - multisig = utils.get_test_data('47+63-multisig.sig') - c.run_sourmash('sig', 'subtract', sig47, multisig, '--flatten') + sig47 = utils.get_test_data("47.fa.sig") + multisig = utils.get_test_data("47+63-multisig.sig") + c.run_sourmash("sig", "subtract", sig47, multisig, "--flatten") # stdout should be new signature out = c.last_result.out @@ -839,60 +874,60 @@ def test_sig_subtract_1_multisig(c): @utils.in_tempdir def test_sig_subtract_2(c): # subtract of 63 from 47 should fail if 47 has abund - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'subtract', sig47, sig63) + c.run_sourmash("sig", "subtract", sig47, sig63) @utils.in_tempdir def test_sig_subtract_3(c): # subtract of 63 from 47 should fail if 63 has abund - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'subtract', sig47, sig63) + c.run_sourmash("sig", "subtract", sig47, sig63) @utils.in_tempdir def test_sig_subtract_4_ksize_fail(c): # subtract of 2 from 47 should fail without -k specified - sig47 = utils.get_test_data('47.fa.sig') - sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig2 = utils.get_test_data("2.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'subtract', sig47, sig2) + c.run_sourmash("sig", "subtract", sig47, sig2) @utils.in_tempdir def test_sig_subtract_4_ksize_succeed(c): # subtract of 2 from 47 should fail without -k specified - sig47 = utils.get_test_data('47.fa.sig') - sig2 = utils.get_test_data('2.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig2 = utils.get_test_data("2.fa.sig") - c.run_sourmash('sig', 'subtract', sig47, sig2, '-k', '31') - assert 'loaded and subtracted 1 signatures' in c.last_result.err + c.run_sourmash("sig", "subtract", sig47, sig2, "-k", "31") + assert "loaded and subtracted 1 signatures" in c.last_result.err def test_sig_subtract_5_bad_moltype(runtmp): # should fail when no matching sigs - sig47 = utils.get_test_data('47.fa.sig') - prot = utils.get_test_data('prot/protein.zip') + sig47 = utils.get_test_data("47.fa.sig") + prot = utils.get_test_data("prot/protein.zip") - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('sig', 'subtract', '-k', '31', sig47, prot) + with pytest.raises(SourmashCommandFailed): + runtmp.run_sourmash("sig", "subtract", "-k", "31", sig47, prot) - assert 'no signatures to subtract' in runtmp.last_result.err + assert "no signatures to subtract" in runtmp.last_result.err def test_sig_rename_1(runtmp): c = runtmp # set new name for 47 - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'rename', sig47, 'fiz bar') + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "rename", sig47, "fiz bar") # stdout should be new signature out = c.last_result.out @@ -905,20 +940,27 @@ def test_sig_rename_1(runtmp): assert actual_rename_sig.minhash == test_rename_sig.minhash assert test_rename_sig.name 
-    assert actual_rename_sig.name == 'fiz bar'
+    assert actual_rename_sig.name == "fiz bar"


 def test_sig_rename_1_fromfile_picklist(runtmp):
     c = runtmp
     # set new name for 47
-    sig47 = utils.get_test_data('47.fa.sig')
+    sig47 = utils.get_test_data("47.fa.sig")

-    from_file = _write_file(runtmp, 'list.txt', [sig47])
-    picklist = _write_file(runtmp, 'pl.csv', ['md5short', '09a08691'])
+    from_file = _write_file(runtmp, "list.txt", [sig47])
+    picklist = _write_file(runtmp, "pl.csv", ["md5short", "09a08691"])

-    c.run_sourmash('sig', 'rename', '--from-file', from_file, 'fiz bar',
-                   '--picklist', f'{picklist}:md5short:md5short')
+    c.run_sourmash(
+        "sig",
+        "rename",
+        "--from-file",
+        from_file,
+        "fiz bar",
+        "--picklist",
+        f"{picklist}:md5short:md5short",
+    )

     # stdout should be new signature
     out = c.last_result.out

@@ -931,22 +973,22 @@ def test_sig_rename_1_fromfile_picklist(runtmp):
     assert actual_rename_sig.minhash == test_rename_sig.minhash
     assert test_rename_sig.name != actual_rename_sig.name
-    assert actual_rename_sig.name == 'fiz bar'
+    assert actual_rename_sig.name == "fiz bar"


 @utils.in_tempdir
 def test_sig_rename_1_multisig(c):
     # set new name for multiple signatures/files
-    multisig = utils.get_test_data('47+63-multisig.sig')
-    other_sig = utils.get_test_data('2.fa.sig')
-    c.run_sourmash('sig', 'rename', multisig, other_sig, 'fiz bar')
+    multisig = utils.get_test_data("47+63-multisig.sig")
+    other_sig = utils.get_test_data("2.fa.sig")
+    c.run_sourmash("sig", "rename", multisig, other_sig, "fiz bar")

     # stdout should be new signature
     out = c.last_result.out

     n = 0
     for sig in load_signatures(out):
-        assert sig.name == 'fiz bar'
+        assert sig.name == "fiz bar"
         n += 1

     assert n == 9, n

@@ -955,16 +997,16 @@ def test_sig_rename_1_multisig(c):
 @utils.in_tempdir
 def test_sig_rename_1_multisig_ksize(c):
     # set new name for multiple signatures/files; select k=31
-    multisig = utils.get_test_data('47+63-multisig.sig')
-    other_sig = utils.get_test_data('2.fa.sig')
-    c.run_sourmash('sig', 'rename', multisig, other_sig, 'fiz bar', '-k', '31')
+    multisig = utils.get_test_data("47+63-multisig.sig")
+    other_sig = utils.get_test_data("2.fa.sig")
+    c.run_sourmash("sig", "rename", multisig, other_sig, "fiz bar", "-k", "31")

     # stdout should be new signature
     out = c.last_result.out

     n = 0
     for sig in load_signatures(out):
-        assert sig.name == 'fiz bar'
+        assert sig.name == "fiz bar"
         n += 1

     assert n == 7, n

@@ -973,23 +1015,23 @@ def test_sig_rename_1_multisig_ksize(c):
 @utils.in_tempdir
 def test_sig_rename_2_output_to_same(c):
     # change name of signature "in place", same output file
-    sig47 = utils.get_test_data('47.fa.sig')
-    inplace = c.output('inplace.sig')
+    sig47 = utils.get_test_data("47.fa.sig")
+    inplace = c.output("inplace.sig")
     shutil.copyfile(sig47, inplace)

     print(inplace)

-    c.run_sourmash('sig', 'rename', '-d', inplace, 'fiz bar', '-o', inplace)
+    c.run_sourmash("sig", "rename", "-d", inplace, "fiz bar", "-o", inplace)

     actual_rename_sig = sourmash.load_one_signature(inplace)
-    assert actual_rename_sig.name == 'fiz bar'
+    assert actual_rename_sig.name == "fiz bar"


 @utils.in_tempdir
 def test_sig_rename_3_file_dne(c):
     # rename on a file that does not exist should fail!
-    with pytest.raises(SourmashCommandFailed) as e:
-        c.run_sourmash('sig', 'rename', 'no-such-sig', 'fiz bar')
+    with pytest.raises(SourmashCommandFailed):
+        c.run_sourmash("sig", "rename", "no-such-sig", "fiz bar")

     assert "Error while reading signatures from 'no-such-sig'" in c.last_result.err

@@ -997,7 +1039,7 @@ def test_sig_rename_3_file_dne(c):
 @utils.in_tempdir
 def test_sig_rename_3_file_dne_force(c):
     # rename on a file that does not exist should succeed with -f (force)
-    c.run_sourmash('sig', 'rename', 'no-such-sig', 'fiz bar', '-f')
+    c.run_sourmash("sig", "rename", "no-such-sig", "fiz bar", "-f")

     print(c.last_result.err)
     assert "Error while reading signatures from 'no-such-sig'" in c.last_result.err

@@ -1005,35 +1047,37 @@ def test_sig_rename_3_file_dne_force(c):

 def test_sig_rename_4_pattern_include(runtmp):
     # test sig rename --include-db-pattern
-    sigfiles = glob.glob(utils.get_test_data('prot/*.zip'))
-    runtmp.sourmash('sig', 'rename', '--include', 'shewanella',
-                    *sigfiles, 'SHEWME', '-o', 'out.zip')
+    sigfiles = glob.glob(utils.get_test_data("prot/*.zip"))
+    runtmp.sourmash(
+        "sig", "rename", "--include", "shewanella", *sigfiles, "SHEWME", "-o", "out.zip"
+    )

-    idx = sourmash.load_file_as_index(runtmp.output('out.zip'))
-    names = [ ss.name for ss in idx.signatures() ]
+    idx = sourmash.load_file_as_index(runtmp.output("out.zip"))
+    names = [ss.name for ss in idx.signatures()]
     for n in names:
-        assert n == 'SHEWME'
+        assert n == "SHEWME"
     assert len(names) == 2


 def test_sig_rename_4_pattern_exclude(runtmp):
     # test sig rename --exclude-db-pattern
-    sigfiles = glob.glob(utils.get_test_data('prot/*.zip'))
-    runtmp.sourmash('sig', 'rename', '--exclude', 'shewanella',
-                    *sigfiles, 'NOSHEW', '-o', 'out.zip')
+    sigfiles = glob.glob(utils.get_test_data("prot/*.zip"))
+    runtmp.sourmash(
+        "sig", "rename", "--exclude", "shewanella", *sigfiles, "NOSHEW", "-o", "out.zip"
+    )

-    idx = sourmash.load_file_as_index(runtmp.output('out.zip'))
-    names = [ ss.name for ss in idx.signatures() ]
+    idx = sourmash.load_file_as_index(runtmp.output("out.zip"))
+    names = [ss.name for ss in idx.signatures()]
     for n in names:
-        assert n == 'NOSHEW'
+        assert n == "NOSHEW"
     assert len(names) == 6


 @utils.in_thisdir
 def test_sig_cat_1(c):
     # cat 47 to 47...
-    sig47 = utils.get_test_data('47.fa.sig')
-    c.run_sourmash('sig', 'cat', sig47)
+    sig47 = utils.get_test_data("47.fa.sig")
+    c.run_sourmash("sig", "cat", sig47)

     # stdout should be same signature
     out = c.last_result.out

@@ -1047,8 +1091,8 @@ def test_sig_cat_1(c):
 @utils.in_thisdir
 def test_sig_cat_1_no_unique(c):
     # cat 47 to 47... twice
-    sig47 = utils.get_test_data('47.fa.sig')
-    c.run_sourmash('sig', 'cat', sig47, sig47)
+    sig47 = utils.get_test_data("47.fa.sig")
+    c.run_sourmash("sig", "cat", sig47, sig47)

     # stdout should be same signature
     out = c.last_result.out

@@ -1059,15 +1103,15 @@ def test_sig_cat_1_no_unique(c):
     for n, sig in enumerate(actual_cat_sigs):
         assert sig == test_cat_sig

-    assert n == 1 # two signatures, but enumerate stops at 1.
-    assert 'encountered 1 MinHashes multiple times' in c.last_result.err
+    assert n == 1  # two signatures, but enumerate stops at 1.
+    assert "encountered 1 MinHashes multiple times" in c.last_result.err


 @utils.in_thisdir
 def test_sig_cat_1_unique(c):
     # cat 47 to 47... twice... and get unique
-    sig47 = utils.get_test_data('47.fa.sig')
-    c.run_sourmash('sig', 'cat', sig47, sig47, '--unique')
+    sig47 = utils.get_test_data("47.fa.sig")
+    c.run_sourmash("sig", "cat", sig47, sig47, "--unique")

     # stdout should be same signature
     out = c.last_result.out

@@ -1079,18 +1123,18 @@ def test_sig_cat_1_unique(c):
     for n, sig in enumerate(actual_cat_sigs):
         assert sig == test_cat_sig

-    assert n == 0 # enumerate stops at 0, first sig.
-    assert 'encountered 1 MinHashes multiple times' in err
-    assert '...and removed the duplicates, because --unique was specified.' in err
+    assert n == 0  # enumerate stops at 0, first sig.
+    assert "encountered 1 MinHashes multiple times" in err
+    assert "...and removed the duplicates, because --unique was specified." in err


 @utils.in_thisdir
 def test_sig_cat_2(c):
     # cat several
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig47abund = utils.get_test_data('track_abund/47.fa.sig')
-    multisig = utils.get_test_data('47+63-multisig.sig')
-    c.run_sourmash('sig', 'cat', sig47, sig47abund, multisig)
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig47abund = utils.get_test_data("track_abund/47.fa.sig")
+    multisig = utils.get_test_data("47+63-multisig.sig")
+    c.run_sourmash("sig", "cat", sig47, sig47abund, multisig)

     # stdout should be same signatures
     out = c.last_result.out

@@ -1098,40 +1142,44 @@ def test_sig_cat_2(c):
     siglist = list(load_signatures(out))
     print(len(siglist))

-    assert repr(siglist) == """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 57e2b22f), SourmashSignature('NC_009661.1 Shewanella baltica OS185 plasmid pS18501, complete sequence', bde81a41), SourmashSignature('NC_011663.1 Shewanella baltica OS223, complete genome', f033bbd8), SourmashSignature('NC_011664.1 Shewanella baltica OS223 plasmid pS22301, complete sequence', 87a9aec4), SourmashSignature('NC_011668.1 Shewanella baltica OS223 plasmid pS22302, complete sequence', 837bf2a7), SourmashSignature('NC_011665.1 Shewanella baltica OS223 plasmid pS22303, complete sequence', 485c3377)]"""
+    assert (
+        repr(siglist)
+        == """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 57e2b22f), SourmashSignature('NC_009661.1 Shewanella baltica OS185 plasmid pS18501, complete sequence', bde81a41), SourmashSignature('NC_011663.1 Shewanella baltica OS223, complete genome', f033bbd8), SourmashSignature('NC_011664.1 Shewanella baltica OS223 plasmid pS22301, complete sequence', 87a9aec4), SourmashSignature('NC_011668.1 Shewanella baltica OS223 plasmid pS22302, complete sequence', 837bf2a7), SourmashSignature('NC_011665.1 Shewanella baltica OS223 plasmid pS22303, complete sequence', 485c3377)]"""
+    )


 @utils.in_tempdir
 def test_sig_cat_2_out(c):
     # cat several
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig47abund = utils.get_test_data('track_abund/47.fa.sig')
-    multisig = utils.get_test_data('47+63-multisig.sig')
-    c.run_sourmash('sig', 'cat', sig47, sig47abund, multisig,
-                   '-o', 'out.sig')
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig47abund = utils.get_test_data("track_abund/47.fa.sig")
+    multisig = utils.get_test_data("47+63-multisig.sig")
+    c.run_sourmash("sig", "cat", sig47, sig47abund, multisig, "-o", "out.sig")
     # stdout should be same signatures
-    out = c.output('out.sig')
+    out = c.output("out.sig")

     siglist = list(load_signatures(out))
     print(len(siglist))

-    assert repr(siglist) == """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 57e2b22f), SourmashSignature('NC_009661.1 Shewanella baltica OS185 plasmid pS18501, complete sequence', bde81a41), SourmashSignature('NC_011663.1 Shewanella baltica OS223, complete genome', f033bbd8), SourmashSignature('NC_011664.1 Shewanella baltica OS223 plasmid pS22301, complete sequence', 87a9aec4), SourmashSignature('NC_011668.1 Shewanella baltica OS223 plasmid pS22302, complete sequence', 837bf2a7), SourmashSignature('NC_011665.1 Shewanella baltica OS223 plasmid pS22303, complete sequence', 485c3377)]"""
+    assert (
+        repr(siglist)
+        == """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 57e2b22f), SourmashSignature('NC_009661.1 Shewanella baltica OS185 plasmid pS18501, complete sequence', bde81a41), SourmashSignature('NC_011663.1 Shewanella baltica OS223, complete genome', f033bbd8), SourmashSignature('NC_011664.1 Shewanella baltica OS223 plasmid pS22301, complete sequence', 87a9aec4), SourmashSignature('NC_011668.1 Shewanella baltica OS223 plasmid pS22302, complete sequence', 837bf2a7), SourmashSignature('NC_011665.1 Shewanella baltica OS223 plasmid pS22303, complete sequence', 485c3377)]"""
+    )


 @utils.in_tempdir
 def test_sig_cat_2_out_inplace(c):
     # cat several; check that we can overwrite one of the input files.
-    sig47 = utils.get_test_data('47.fa.sig')
-    input_sig = c.output('inp.sig')
+    sig47 = utils.get_test_data("47.fa.sig")
+    input_sig = c.output("inp.sig")
     shutil.copyfile(sig47, input_sig)

-    sig47abund = utils.get_test_data('track_abund/47.fa.sig')
-    multisig = utils.get_test_data('47+63-multisig.sig')
+    sig47abund = utils.get_test_data("track_abund/47.fa.sig")
+    multisig = utils.get_test_data("47+63-multisig.sig")

     # write out to input.
-    c.run_sourmash('sig', 'cat', input_sig, sig47abund, multisig,
-                   '-o', input_sig)
+    c.run_sourmash("sig", "cat", input_sig, sig47abund, multisig, "-o", input_sig)

     # stdout should be same signatures
     out = input_sig

@@ -1139,25 +1187,27 @@ def test_sig_cat_2_out_inplace(c):
     siglist = list(load_signatures(out))
     print(len(siglist))

-    assert repr(siglist) == """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 57e2b22f), SourmashSignature('NC_009661.1 Shewanella baltica OS185 plasmid pS18501, complete sequence', bde81a41), SourmashSignature('NC_011663.1 Shewanella baltica OS223, complete genome', f033bbd8), SourmashSignature('NC_011664.1 Shewanella baltica OS223 plasmid pS22301, complete sequence', 87a9aec4), SourmashSignature('NC_011668.1 Shewanella baltica OS223 plasmid pS22302, complete sequence', 837bf2a7), SourmashSignature('NC_011665.1 Shewanella baltica OS223 plasmid pS22303, complete sequence', 485c3377)]"""
+    assert (
+        repr(siglist)
+        == """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 57e2b22f), SourmashSignature('NC_009661.1 Shewanella baltica OS185 plasmid pS18501, complete sequence', bde81a41), SourmashSignature('NC_011663.1 Shewanella baltica OS223, complete genome', f033bbd8), SourmashSignature('NC_011664.1 Shewanella baltica OS223 plasmid pS22301, complete sequence', 87a9aec4), SourmashSignature('NC_011668.1 Shewanella baltica OS223 plasmid pS22302, complete sequence', 837bf2a7), SourmashSignature('NC_011665.1 Shewanella baltica OS223 plasmid pS22303, complete sequence', 485c3377)]"""
+    )


 @utils.in_tempdir
 def test_sig_cat_3_filelist(c):
     # cat using a file list as input
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig47abund = utils.get_test_data('track_abund/47.fa.sig')
-    multisig = utils.get_test_data('47+63-multisig.sig')
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig47abund = utils.get_test_data("track_abund/47.fa.sig")
+    multisig = utils.get_test_data("47+63-multisig.sig")

     filelist = c.output("filelist")
-    with open(filelist, 'w') as f:
+    with open(filelist, "w") as f:
         f.write("\n".join((sig47, sig47abund, multisig)))

-    c.run_sourmash('sig', 'cat', filelist,
-                   '-o', 'out.sig')
+    c.run_sourmash("sig", "cat", filelist, "-o", "out.sig")

     # stdout should be same signatures
-    out = c.output('out.sig')
+    out = c.output("out.sig")

     # make this a list, not a set, because a set will collapse identical
     # signatures. `sig cat` does not collapse identical signatures, although
@@ -1174,27 +1224,29 @@ def test_sig_cat_3_filelist(c):
     assert len(all_sigs) == len(siglist)

     # sort the signatures by something deterministic and unique
-    siglist.sort(key = lambda x: x.md5sum())
+    siglist.sort(key=lambda x: x.md5sum())

-    assert repr(siglist) == """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_011665.1 Shewanella baltica OS223 plasmid pS22303, complete sequence', 485c3377), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 57e2b22f), SourmashSignature('NC_011668.1 Shewanella baltica OS223 plasmid pS22302, complete sequence', 837bf2a7), SourmashSignature('NC_011664.1 Shewanella baltica OS223 plasmid pS22301, complete sequence', 87a9aec4), SourmashSignature('NC_009661.1 Shewanella baltica OS185 plasmid pS18501, complete sequence', bde81a41), SourmashSignature('NC_011663.1 Shewanella baltica OS223, complete genome', f033bbd8)]"""
+    assert (
+        repr(siglist)
+        == """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_011665.1 Shewanella baltica OS223 plasmid pS22303, complete sequence', 485c3377), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 57e2b22f), SourmashSignature('NC_011668.1 Shewanella baltica OS223 plasmid pS22302, complete sequence', 837bf2a7), SourmashSignature('NC_011664.1 Shewanella baltica OS223 plasmid pS22301, complete sequence', 87a9aec4), SourmashSignature('NC_009661.1 Shewanella baltica OS185 plasmid pS18501, complete sequence', bde81a41), SourmashSignature('NC_011663.1 Shewanella baltica OS223, complete genome', f033bbd8)]"""
+    )


 @utils.in_tempdir
 def test_sig_cat_4_filelist_with_dbs(c):
     # cat using a file list as input
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig47abund = utils.get_test_data('track_abund/47.fa.sig')
-    sbt = utils.get_test_data('v6.sbt.zip')
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig47abund = utils.get_test_data("track_abund/47.fa.sig")
+    sbt = utils.get_test_data("v6.sbt.zip")

     filelist = c.output("filelist")
-    with open(filelist, 'w') as f:
+    with open(filelist, "w") as f:
         f.write("\n".join((sig47, sig47abund, sbt)))

-    c.run_sourmash('sig', 'cat', filelist,
-                   '-o', 'out.sig')
+    c.run_sourmash("sig", "cat", filelist, "-o", "out.sig")

     # stdout should be same signatures
-    out = c.output('out.sig')
+    out = c.output("out.sig")

     siglist = list(load_signatures(out))
     print(len(siglist))

@@ -1211,27 +1263,29 @@ def test_sig_cat_4_filelist_with_dbs(c):
     assert len(all_sigs) == len(siglist)

     # sort the signatures by something deterministic and unique
-    siglist.sort(key = lambda x: x.md5sum())
+    siglist.sort(key=lambda x: x.md5sum())

-    assert repr(siglist) == """[SourmashSignature('', 0107d767), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('', 4e94e602), SourmashSignature('', 60f7e23c), SourmashSignature('', 6d6e87e1), SourmashSignature('', b59473c9), SourmashSignature('', f0c834bc), SourmashSignature('', f71e7817)]"""
+    assert (
+        repr(siglist)
+        == """[SourmashSignature('', 0107d767), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('', 4e94e602), SourmashSignature('', 60f7e23c), SourmashSignature('', 6d6e87e1), SourmashSignature('', b59473c9), SourmashSignature('', f0c834bc), SourmashSignature('', f71e7817)]"""
+    )


 @utils.in_tempdir
 def test_sig_cat_5_from_file(c):
     # cat using a file list as input
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig47abund = utils.get_test_data('track_abund/47.fa.sig')
-    sbt = utils.get_test_data('v6.sbt.zip')
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig47abund = utils.get_test_data("track_abund/47.fa.sig")
+    sbt = utils.get_test_data("v6.sbt.zip")

     filelist = c.output("filelist")
-    with open(filelist, 'w') as f:
+    with open(filelist, "w") as f:
         f.write("\n".join((sig47, sig47abund, sbt)))

-    c.run_sourmash('sig', 'cat', '--from-file', filelist,
-                   '-o', 'out.sig')
+    c.run_sourmash("sig", "cat", "--from-file", filelist, "-o", "out.sig")

     # stdout should be same signatures
-    out = c.output('out.sig')
+    out = c.output("out.sig")

     siglist = list(load_signatures(out))
     print(len(siglist))

@@ -1248,30 +1302,40 @@ def test_sig_cat_5_from_file(c):
     assert len(all_sigs) == len(siglist)

     # sort the signatures by something deterministic and unique
-    siglist.sort(key = lambda x: x.md5sum())
+    siglist.sort(key=lambda x: x.md5sum())

-    assert repr(siglist) == """[SourmashSignature('', 0107d767), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('', 4e94e602), SourmashSignature('', 60f7e23c), SourmashSignature('', 6d6e87e1), SourmashSignature('', b59473c9), SourmashSignature('', f0c834bc), SourmashSignature('', f71e7817)]"""
+    assert (
+        repr(siglist)
+        == """[SourmashSignature('', 0107d767), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('', 4e94e602), SourmashSignature('', 60f7e23c), SourmashSignature('', 6d6e87e1), SourmashSignature('', b59473c9), SourmashSignature('', f0c834bc), SourmashSignature('', f71e7817)]"""
+    )


 def test_sig_cat_5_from_file_picklist(runtmp):
     c = runtmp
     # cat using a file list as input
-    sig47 = utils.get_test_data('47.fa.sig')
-    sbt = utils.get_test_data('v6.sbt.zip')
+    sig47 = utils.get_test_data("47.fa.sig")
+    sbt = utils.get_test_data("v6.sbt.zip")

     filelist = c.output("filelist")
-    with open(filelist, 'w') as f:
+    with open(filelist, "w") as f:
         f.write("\n".join((sig47, sbt)))

-    picklist = _write_file(runtmp, 'pl.csv', ['md5short', '09a08691'])
+    picklist = _write_file(runtmp, "pl.csv", ["md5short", "09a08691"])

-    c.run_sourmash('sig', 'cat', '--from-file', filelist,
-                   '--picklist', f'{picklist}:md5short:md5short',
-                   '-o', 'out.sig')
+    c.run_sourmash(
+        "sig",
+        "cat",
+        "--from-file",
+        filelist,
+        "--picklist",
+        f"{picklist}:md5short:md5short",
+        "-o",
+        "out.sig",
+    )

     # stdout should be same signatures
-    out = c.output('out.sig')
+    out = c.output("out.sig")

     siglist = list(load_signatures(out))
     print(len(siglist))

@@ -1286,46 +1350,46 @@ def test_sig_cat_5_from_file_picklist(runtmp):
     assert len(all_sigs) == len(siglist)

     # sort the signatures by something deterministic and unique
-    siglist.sort(key = lambda x: x.md5sum())
+    siglist.sort(key=lambda x: x.md5sum())

-    assert repr(siglist) == """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691)]"""
+    assert (
+        repr(siglist)
+        == """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691)]"""
"""[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691)]""" + ) def test_sig_cat_6_pattern_include(runtmp): # test --include-db-pattern - sigfiles = glob.glob(utils.get_test_data('prot/*.zip')) + sigfiles = glob.glob(utils.get_test_data("prot/*.zip")) - runtmp.sourmash('sig', 'cat', '--include', 'shewanella', *sigfiles, - '-o', 'out.zip') + runtmp.sourmash("sig", "cat", "--include", "shewanella", *sigfiles, "-o", "out.zip") - idx = sourmash.load_file_as_index(runtmp.output('out.zip')) + idx = sourmash.load_file_as_index(runtmp.output("out.zip")) assert len(idx) == 2 - names = [ ss.name for ss in idx.signatures() ] + names = [ss.name for ss in idx.signatures()] for n in names: - assert 'shewanella' in n.lower(), n + assert "shewanella" in n.lower(), n def test_sig_cat_6_pattern_exclude(runtmp): # test --exclude-db-pattern - sigfiles = glob.glob(utils.get_test_data('prot/*.zip')) + sigfiles = glob.glob(utils.get_test_data("prot/*.zip")) - runtmp.sourmash('sig', 'cat', '--exclude', 'shewanella', *sigfiles, - '-o', 'out.zip') + runtmp.sourmash("sig", "cat", "--exclude", "shewanella", *sigfiles, "-o", "out.zip") - idx = sourmash.load_file_as_index(runtmp.output('out.zip')) + idx = sourmash.load_file_as_index(runtmp.output("out.zip")) assert len(idx) == 18 - names = [ ss.name for ss in idx.signatures() ] + names = [ss.name for ss in idx.signatures()] for n in names: - assert 'shewanella' not in n.lower(), n + assert "shewanella" not in n.lower(), n def test_sig_cat_6_pattern_exclude_no_manifest(runtmp): # test --exclude-db-pattern - db = utils.get_test_data('v6.sbt.zip') + db = utils.get_test_data("v6.sbt.zip") with pytest.raises(SourmashCommandFailed) as e: - runtmp.sourmash('sig', 'cat', '--exclude', 'shewanella', db, - '-o', 'out.zip') + runtmp.sourmash("sig", "cat", "--exclude", "shewanella", db, "-o", "out.zip") assert "require a manifest" in str(e) @@ -1333,10 +1397,10 @@ def test_sig_cat_6_pattern_exclude_no_manifest(runtmp): def test_sig_split_1(runtmp): c = runtmp # split 47 into 1 sig :) - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'split', sig47) + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "split", sig47) - outname = '09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig' + outname = "09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig" assert os.path.exists(c.output(outname)) @@ -1349,15 +1413,21 @@ def test_sig_split_1(runtmp): def test_sig_split_1_fromfile_picklist(runtmp): c = runtmp # split 47 into 1 sig :) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") - from_file = _write_file(runtmp, 'list.txt', [sig47]) - picklist = _write_file(runtmp, 'pl.csv', ['md5short', '09a08691']) + from_file = _write_file(runtmp, "list.txt", [sig47]) + picklist = _write_file(runtmp, "pl.csv", ["md5short", "09a08691"]) - c.run_sourmash('sig', 'split', '--from-file', from_file, - '--picklist', f'{picklist}:md5short:md5short') + c.run_sourmash( + "sig", + "split", + "--from-file", + from_file, + "--picklist", + f"{picklist}:md5short:md5short", + ) - outname = '09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig' + outname = "09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig" assert os.path.exists(c.output(outname)) @@ -1370,27 +1440,27 @@ def test_sig_split_1_fromfile_picklist(runtmp): @utils.in_tempdir def test_sig_split_1_overwrite(c): # check message about overwriting - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'split', sig47) + sig47 = utils.get_test_data("47.fa.sig") + 
c.run_sourmash("sig", "split", sig47) - outname = '09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig' + outname = "09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig" assert os.path.exists(c.output(outname)) - c.run_sourmash('sig', 'split', sig47) + c.run_sourmash("sig", "split", sig47) err = c.last_result.err print(err) - assert '** overwriting existing file ' + outname in err + assert "** overwriting existing file " + outname in err @utils.in_tempdir def test_sig_split_2(c): # split 47 twice - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'split', sig47, sig47) + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "split", sig47, sig47) - outname1 = '09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig' - outname2 = '09a08691.k=31.scaled=1000.DNA.dup=1.47.fa.sig' + outname1 = "09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig" + outname2 = "09a08691.k=31.scaled=1000.DNA.dup=1.47.fa.sig" assert os.path.exists(c.output(outname1)) assert os.path.exists(c.output(outname2)) @@ -1407,12 +1477,12 @@ def test_sig_split_2(c): @utils.in_tempdir def test_sig_split_2_outdir(c): # split 47 twice, put in outdir - sig47 = utils.get_test_data('47.fa.sig') - outdir = c.output('sigout/') - c.run_sourmash('sig', 'split', sig47, sig47, '--outdir', outdir) + sig47 = utils.get_test_data("47.fa.sig") + outdir = c.output("sigout/") + c.run_sourmash("sig", "split", sig47, sig47, "--outdir", outdir) - outname1 = 'sigout/09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig' - outname2 = 'sigout/09a08691.k=31.scaled=1000.DNA.dup=1.47.fa.sig' + outname1 = "sigout/09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig" + outname2 = "sigout/09a08691.k=31.scaled=1000.DNA.dup=1.47.fa.sig" assert os.path.exists(c.output(outname1)) assert os.path.exists(c.output(outname2)) @@ -1429,12 +1499,12 @@ def test_sig_split_2_outdir(c): @utils.in_tempdir def test_sig_split_2_output_dir(c): # split 47 twice, put in outdir via --output-dir instead of --outdir - sig47 = utils.get_test_data('47.fa.sig') - outdir = c.output('sigout/') - c.run_sourmash('sig', 'split', sig47, sig47, '--output-dir', outdir) + sig47 = utils.get_test_data("47.fa.sig") + outdir = c.output("sigout/") + c.run_sourmash("sig", "split", sig47, sig47, "--output-dir", outdir) - outname1 = 'sigout/09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig' - outname2 = 'sigout/09a08691.k=31.scaled=1000.DNA.dup=1.47.fa.sig' + outname1 = "sigout/09a08691.k=31.scaled=1000.DNA.dup=0.47.fa.sig" + outname2 = "sigout/09a08691.k=31.scaled=1000.DNA.dup=1.47.fa.sig" assert os.path.exists(c.output(outname1)) assert os.path.exists(c.output(outname2)) @@ -1451,16 +1521,18 @@ def test_sig_split_2_output_dir(c): @utils.in_tempdir def test_sig_split_3_multisig(c): # split 47 and 47+63-multisig.sig - sig47 = utils.get_test_data('47.fa.sig') - multisig = utils.get_test_data('47+63-multisig.sig') - c.run_sourmash('sig', 'split', sig47, multisig) - - outlist = ['57e2b22f.k=31.scaled=1000.DNA.dup=0.none.sig', - 'bde81a41.k=31.scaled=1000.DNA.dup=0.none.sig', - 'f033bbd8.k=31.scaled=1000.DNA.dup=0.none.sig', - '87a9aec4.k=31.scaled=1000.DNA.dup=0.none.sig', - '837bf2a7.k=31.scaled=1000.DNA.dup=0.none.sig', - '485c3377.k=31.scaled=1000.DNA.dup=0.none.sig'] + sig47 = utils.get_test_data("47.fa.sig") + multisig = utils.get_test_data("47+63-multisig.sig") + c.run_sourmash("sig", "split", sig47, multisig) + + outlist = [ + "57e2b22f.k=31.scaled=1000.DNA.dup=0.none.sig", + "bde81a41.k=31.scaled=1000.DNA.dup=0.none.sig", + "f033bbd8.k=31.scaled=1000.DNA.dup=0.none.sig", + 
"87a9aec4.k=31.scaled=1000.DNA.dup=0.none.sig", + "837bf2a7.k=31.scaled=1000.DNA.dup=0.none.sig", + "485c3377.k=31.scaled=1000.DNA.dup=0.none.sig", + ] for filename in outlist: assert os.path.exists(c.output(filename)) @@ -1469,16 +1541,18 @@ def test_sig_split_3_multisig_sig_gz(runtmp): # split 47 and 47+63-multisig.sig with a .sig.gz extension c = runtmp - sig47 = utils.get_test_data('47.fa.sig') - multisig = utils.get_test_data('47+63-multisig.sig') - c.run_sourmash('sig', 'split', sig47, multisig, '-E', '.sig.gz') - - outlist = ['57e2b22f.k=31.scaled=1000.DNA.dup=0.none.sig.gz', - 'bde81a41.k=31.scaled=1000.DNA.dup=0.none.sig.gz', - 'f033bbd8.k=31.scaled=1000.DNA.dup=0.none.sig.gz', - '87a9aec4.k=31.scaled=1000.DNA.dup=0.none.sig.gz', - '837bf2a7.k=31.scaled=1000.DNA.dup=0.none.sig.gz', - '485c3377.k=31.scaled=1000.DNA.dup=0.none.sig.gz'] + sig47 = utils.get_test_data("47.fa.sig") + multisig = utils.get_test_data("47+63-multisig.sig") + c.run_sourmash("sig", "split", sig47, multisig, "-E", ".sig.gz") + + outlist = [ + "57e2b22f.k=31.scaled=1000.DNA.dup=0.none.sig.gz", + "bde81a41.k=31.scaled=1000.DNA.dup=0.none.sig.gz", + "f033bbd8.k=31.scaled=1000.DNA.dup=0.none.sig.gz", + "87a9aec4.k=31.scaled=1000.DNA.dup=0.none.sig.gz", + "837bf2a7.k=31.scaled=1000.DNA.dup=0.none.sig.gz", + "485c3377.k=31.scaled=1000.DNA.dup=0.none.sig.gz", + ] for filename in outlist: assert os.path.exists(c.output(filename)) @@ -1487,16 +1561,18 @@ def test_sig_split_3_multisig_zip(runtmp): # split 47 and 47+63-multisig.sig with a .zip extension c = runtmp - sig47 = utils.get_test_data('47.fa.sig') - multisig = utils.get_test_data('47+63-multisig.sig') - c.run_sourmash('sig', 'split', sig47, multisig, '-E', '.zip') - - outlist = ['57e2b22f.k=31.scaled=1000.DNA.dup=0.none.zip', - 'bde81a41.k=31.scaled=1000.DNA.dup=0.none.zip', - 'f033bbd8.k=31.scaled=1000.DNA.dup=0.none.zip', - '87a9aec4.k=31.scaled=1000.DNA.dup=0.none.zip', - '837bf2a7.k=31.scaled=1000.DNA.dup=0.none.zip', - '485c3377.k=31.scaled=1000.DNA.dup=0.none.zip'] + sig47 = utils.get_test_data("47.fa.sig") + multisig = utils.get_test_data("47+63-multisig.sig") + c.run_sourmash("sig", "split", sig47, multisig, "-E", ".zip") + + outlist = [ + "57e2b22f.k=31.scaled=1000.DNA.dup=0.none.zip", + "bde81a41.k=31.scaled=1000.DNA.dup=0.none.zip", + "f033bbd8.k=31.scaled=1000.DNA.dup=0.none.zip", + "87a9aec4.k=31.scaled=1000.DNA.dup=0.none.zip", + "837bf2a7.k=31.scaled=1000.DNA.dup=0.none.zip", + "485c3377.k=31.scaled=1000.DNA.dup=0.none.zip", + ] for filename in outlist: assert os.path.exists(c.output(filename)) @@ -1504,17 +1580,19 @@ def test_sig_split_3_multisig_zip(runtmp): @utils.in_tempdir def test_sig_split_4_sbt_prot(c): # split sbt - sbt1 = utils.get_test_data('prot/protein.sbt.zip') - sbt2 = utils.get_test_data('prot/dayhoff.sbt.zip') - sbt3 = utils.get_test_data('prot/hp.sbt.zip') - c.run_sourmash('sig', 'split', sbt1, sbt2, sbt3) - - outlist = ['16869d2c.k=19.scaled=100.protein.dup=0.GCA_001593925.1_ASM159392v1_protein.faa.gz.sig', - '120d311c.k=19.scaled=100.protein.dup=0.GCA_001593935.1_ASM159393v1_protein.faa.gz.sig', - 'fbca5e52.k=19.scaled=100.dayhoff.dup=0.GCA_001593925.1_ASM159392v1_protein.faa.gz.sig', - '1cbd888b.k=19.scaled=100.dayhoff.dup=0.GCA_001593935.1_ASM159393v1_protein.faa.gz.sig', - 'ea2a1ad2.k=19.scaled=100.hp.dup=0.GCA_001593925.1_ASM159392v1_protein.faa.gz.sig', - 'bb0e6d90.k=19.scaled=100.hp.dup=0.GCA_001593935.1_ASM159393v1_protein.faa.gz.sig'] + sbt1 = utils.get_test_data("prot/protein.sbt.zip") + sbt2 = 
utils.get_test_data("prot/dayhoff.sbt.zip") + sbt3 = utils.get_test_data("prot/hp.sbt.zip") + c.run_sourmash("sig", "split", sbt1, sbt2, sbt3) + + outlist = [ + "16869d2c.k=19.scaled=100.protein.dup=0.GCA_001593925.1_ASM159392v1_protein.faa.gz.sig", + "120d311c.k=19.scaled=100.protein.dup=0.GCA_001593935.1_ASM159393v1_protein.faa.gz.sig", + "fbca5e52.k=19.scaled=100.dayhoff.dup=0.GCA_001593925.1_ASM159392v1_protein.faa.gz.sig", + "1cbd888b.k=19.scaled=100.dayhoff.dup=0.GCA_001593935.1_ASM159393v1_protein.faa.gz.sig", + "ea2a1ad2.k=19.scaled=100.hp.dup=0.GCA_001593925.1_ASM159392v1_protein.faa.gz.sig", + "bb0e6d90.k=19.scaled=100.hp.dup=0.GCA_001593935.1_ASM159393v1_protein.faa.gz.sig", + ] for filename in outlist: assert os.path.exists(c.output(filename)) @@ -1522,20 +1600,22 @@ def test_sig_split_4_sbt_prot(c): @utils.in_tempdir def test_sig_split_4_lca_prot(c): # split lca - lca1 = utils.get_test_data('prot/protein.lca.json.gz') - lca2 = utils.get_test_data('prot/dayhoff.lca.json.gz') - lca3 = utils.get_test_data('prot/hp.lca.json.gz') - c.run_sourmash('sig', 'split', lca1, lca2, lca3) + lca1 = utils.get_test_data("prot/protein.lca.json.gz") + lca2 = utils.get_test_data("prot/dayhoff.lca.json.gz") + lca3 = utils.get_test_data("prot/hp.lca.json.gz") + c.run_sourmash("sig", "split", lca1, lca2, lca3) print(c.last_result.out) print(c.last_result.err) - outlist = ['16869d2c.k=19.scaled=100.protein.dup=0.none.sig', - '120d311c.k=19.scaled=100.protein.dup=0.none.sig', - 'fbca5e52.k=19.scaled=100.dayhoff.dup=0.none.sig', - '1cbd888b.k=19.scaled=100.dayhoff.dup=0.none.sig', - 'ea2a1ad2.k=19.scaled=100.hp.dup=0.none.sig', - 'bb0e6d90.k=19.scaled=100.hp.dup=0.none.sig'] + outlist = [ + "16869d2c.k=19.scaled=100.protein.dup=0.none.sig", + "120d311c.k=19.scaled=100.protein.dup=0.none.sig", + "fbca5e52.k=19.scaled=100.dayhoff.dup=0.none.sig", + "1cbd888b.k=19.scaled=100.dayhoff.dup=0.none.sig", + "ea2a1ad2.k=19.scaled=100.hp.dup=0.none.sig", + "bb0e6d90.k=19.scaled=100.hp.dup=0.none.sig", + ] for filename in outlist: assert os.path.exists(c.output(filename)) @@ -1543,23 +1623,25 @@ def test_sig_split_4_lca_prot(c): @utils.in_tempdir def test_sig_split_5_no_exist(c): # no such file - with pytest.raises(SourmashCommandFailed) as e: - c.run_sourmash('sig', 'split', 'foo') + with pytest.raises(SourmashCommandFailed): + c.run_sourmash("sig", "split", "foo") def test_sig_split_6_numsigs(runtmp): c = runtmp - sigs11 = utils.get_test_data('genome-s11.fa.gz.sig') - c.run_sourmash('sig', 'split', sigs11) + sigs11 = utils.get_test_data("genome-s11.fa.gz.sig") + c.run_sourmash("sig", "split", sigs11) print(c.last_result.out) print(c.last_result.err) - outlist = ['1437d8ea.k=21.num=500.DNA.dup=0.genome-s11.fa.gz.sig', - '37aea787.k=7.num=500.protein.dup=0.genome-s11.fa.gz.sig', - '68c565be.k=30.num=500.DNA.dup=0.genome-s11.fa.gz.sig', - '73b6df1c.k=10.num=500.protein.dup=0.genome-s11.fa.gz.sig'] + outlist = [ + "1437d8ea.k=21.num=500.DNA.dup=0.genome-s11.fa.gz.sig", + "37aea787.k=7.num=500.protein.dup=0.genome-s11.fa.gz.sig", + "68c565be.k=30.num=500.DNA.dup=0.genome-s11.fa.gz.sig", + "73b6df1c.k=10.num=500.protein.dup=0.genome-s11.fa.gz.sig", + ] for filename in outlist: assert os.path.exists(c.output(filename)) @@ -1569,8 +1651,8 @@ def test_sig_extract_1(runtmp): c = runtmp # extract 47 from 47... 
-    sig47 = utils.get_test_data('47.fa.sig')
-    c.run_sourmash('sig', 'extract', sig47)
+    sig47 = utils.get_test_data("47.fa.sig")
+    c.run_sourmash("sig", "extract", sig47)

     # stdout should be new signature
     out = c.last_result.out

@@ -1586,9 +1668,9 @@ def test_sig_extract_1_from_file(runtmp):
     c = runtmp
     # extract 47 from 47... :)
-    sig47 = utils.get_test_data('47.fa.sig')
-    from_file = _write_file(runtmp, 'list.txt', [sig47])
-    c.run_sourmash('sig', 'extract', '--from-file', from_file)
+    sig47 = utils.get_test_data("47.fa.sig")
+    from_file = _write_file(runtmp, "list.txt", [sig47])
+    c.run_sourmash("sig", "extract", "--from-file", from_file)

     # stdout should be new signature
     out = c.last_result.out

@@ -1602,9 +1684,9 @@ def test_sig_extract_1_from_file(runtmp):
 @utils.in_tempdir
 def test_sig_extract_2(c):
     # extract matches to 47's md5sum from among several
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
-    c.run_sourmash('sig', 'extract', sig47, sig63, '--md5', '09a0869')
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig63 = utils.get_test_data("63.fa.sig")
+    c.run_sourmash("sig", "extract", sig47, sig63, "--md5", "09a0869")

     # stdout should be new signature
     out = c.last_result.out

@@ -1621,10 +1703,10 @@ def test_sig_extract_2(c):
 @utils.in_tempdir
 def test_sig_extract_2_zipfile(c):
     # extract matches to 47's md5sum from among several in a zipfile
-    all_zip = utils.get_test_data('prot/all.zip')
-    sig47 = utils.get_test_data('47.fa.sig')
+    all_zip = utils.get_test_data("prot/all.zip")
+    sig47 = utils.get_test_data("47.fa.sig")

-    c.run_sourmash('sig', 'extract', all_zip, '--md5', '09a0869')
+    c.run_sourmash("sig", "extract", all_zip, "--md5", "09a0869")

     # stdout should be new signature
     out = c.last_result.out

@@ -1641,17 +1723,17 @@ def test_sig_extract_2_zipfile(c):
 @utils.in_tempdir
 def test_sig_extract_3(c):
     # extract nothing (no md5 match)
-    sig47 = utils.get_test_data('47.fa.sig')
-    with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('sig', 'extract', sig47, '--md5', 'FOO')
+    sig47 = utils.get_test_data("47.fa.sig")
+    with pytest.raises(SourmashCommandFailed):
+        c.run_sourmash("sig", "extract", sig47, "--md5", "FOO")


 @utils.in_tempdir
 def test_sig_extract_4(c):
     # extract matches to 47's name from among several signatures
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
-    c.run_sourmash('sig', 'extract', sig47, sig63, '--name', 'NC_009665.1')
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig63 = utils.get_test_data("63.fa.sig")
+    c.run_sourmash("sig", "extract", sig47, sig63, "--name", "NC_009665.1")

     # stdout should be new signature
     out = c.last_result.out

@@ -1668,17 +1750,17 @@ def test_sig_extract_4(c):
 @utils.in_tempdir
 def test_sig_extract_5(c):
     # extract nothing (no name match)
-    sig47 = utils.get_test_data('47.fa.sig')
-    with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('sig', 'extract', sig47, '--name', 'FOO')
+    sig47 = utils.get_test_data("47.fa.sig")
+    with pytest.raises(SourmashCommandFailed):
+        c.run_sourmash("sig", "extract", sig47, "--name", "FOO")


 @utils.in_tempdir
 def test_sig_extract_6(c):
     # extract matches to several names from among several signatures
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
-    c.run_sourmash('sig', 'extract', sig47, sig63, '--name', 'Shewanella')
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig63 = utils.get_test_data("63.fa.sig")
+    c.run_sourmash("sig", "extract", sig47, sig63, "--name", "Shewanella")

     # stdout should be new signature
     out = c.last_result.out

@@ -1692,8 +1774,8 @@ def test_sig_extract_6(c):
 @utils.in_tempdir
 def test_sig_extract_7(c):
     # extract matches based on ksize
-    sig2 = utils.get_test_data('2.fa.sig')
-    c.run_sourmash('sig', 'extract', sig2, '-k', '31')
+    sig2 = utils.get_test_data("2.fa.sig")
+    c.run_sourmash("sig", "extract", sig2, "-k", "31")

     # stdout should be new signature
     out = c.last_result.out

@@ -1707,8 +1789,8 @@ def test_sig_extract_7(c):
 @utils.in_tempdir
 def test_sig_extract_7_no_ksize(c):
     # extract all three matches when -k not specified
-    sig2 = utils.get_test_data('2.fa.sig')
-    c.run_sourmash('sig', 'extract', sig2)
+    sig2 = utils.get_test_data("2.fa.sig")
+    c.run_sourmash("sig", "extract", sig2)

     # stdout should be new signature
     out = c.last_result.out

@@ -1721,18 +1803,18 @@ def test_sig_extract_7_no_ksize(c):

 def test_sig_extract_8_empty_picklist_fail(runtmp):
     # what happens with an empty picklist?
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig63 = utils.get_test_data("63.fa.sig")

     # make empty picklist
-    picklist_csv = runtmp.output('pick.csv')
-    with open(picklist_csv, 'w', newline='') as csvfp:
+    picklist_csv = runtmp.output("pick.csv")
+    with open(picklist_csv, "w", newline=""):
         pass

     picklist_arg = f"{picklist_csv}:md5full:md5"

     with pytest.raises(SourmashCommandFailed):
-        runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+        runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg)

     err = runtmp.last_result.err
     print(err)

@@ -1742,15 +1824,15 @@ def test_sig_extract_8_empty_picklist_fail(runtmp):

 def test_sig_extract_8_nofile_picklist_fail(runtmp):
     # what happens when picklist file does not exist?
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig63 = utils.get_test_data("63.fa.sig")

     # picklist file does not exist
-    picklist_csv = runtmp.output('pick.csv')
+    picklist_csv = runtmp.output("pick.csv")

     picklist_arg = f"{picklist_csv}:md5full:md5"

     with pytest.raises(SourmashCommandFailed):
-        runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+        runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg)

     err = runtmp.last_result.err
     print(err)

@@ -1760,25 +1842,27 @@ def test_sig_extract_8_nofile_picklist_fail(runtmp):

 def test_sig_extract_8_picklist_md5(runtmp):
     # extract 47 from 47, using a picklist w/full md5
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig63 = utils.get_test_data("63.fa.sig")

     # select on any of these attributes
-    row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
-               md5full='09a08691ce52952152f0e866a59f6261',
-               md5short='09a08691ce5295215',
-               fullIdent='NC_009665.1',
-               nodotIdent='NC_009665')
+    row = dict(
+        exactName="NC_009665.1 Shewanella baltica OS185, complete genome",
+        md5full="09a08691ce52952152f0e866a59f6261",
+        md5short="09a08691ce5295215",
+        fullIdent="NC_009665.1",
+        nodotIdent="NC_009665",
+    )

     # make picklist
-    picklist_csv = runtmp.output('pick.csv')
-    with open(picklist_csv, 'w', newline='') as csvfp:
+    picklist_csv = runtmp.output("pick.csv")
+    with open(picklist_csv, "w", newline="") as csvfp:
         w = csv.DictWriter(csvfp, fieldnames=row.keys())
         w.writeheader()
         w.writerow(row)

     picklist_arg = f"{picklist_csv}:md5full:md5"

-    runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+    runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg)

     # stdout should be new signature
     out = runtmp.last_result.out

@@ -1799,26 +1883,28 @@ def test_sig_extract_8_picklist_md5(runtmp):

 def test_sig_extract_8_picklist_md5_zipfile(runtmp):
     # extract 47 from a zipfile, using a picklist w/full md5
-    allzip = utils.get_test_data('prot/all.zip')
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
+    allzip = utils.get_test_data("prot/all.zip")
+    sig47 = utils.get_test_data("47.fa.sig")
+    utils.get_test_data("63.fa.sig")

     # select on any of these attributes
-    row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
-               md5full='09a08691ce52952152f0e866a59f6261',
-               md5short='09a08691ce5295215',
-               fullIdent='NC_009665.1',
-               nodotIdent='NC_009665')
+    row = dict(
+        exactName="NC_009665.1 Shewanella baltica OS185, complete genome",
+        md5full="09a08691ce52952152f0e866a59f6261",
+        md5short="09a08691ce5295215",
+        fullIdent="NC_009665.1",
+        nodotIdent="NC_009665",
+    )

     # make picklist
-    picklist_csv = runtmp.output('pick.csv')
-    with open(picklist_csv, 'w', newline='') as csvfp:
+    picklist_csv = runtmp.output("pick.csv")
+    with open(picklist_csv, "w", newline="") as csvfp:
         w = csv.DictWriter(csvfp, fieldnames=row.keys())
         w.writeheader()
         w.writerow(row)

     picklist_arg = f"{picklist_csv}:md5full:md5"

-    runtmp.sourmash('sig', 'extract', allzip, '--picklist', picklist_arg)
+    runtmp.sourmash("sig", "extract", allzip, "--picklist", picklist_arg)

     # stdout should be new signature
     out = runtmp.last_result.out

@@ -1840,54 +1926,68 @@ def test_sig_extract_8_picklist_md5_zipfile(runtmp):

 def test_sig_extract_8_picklist_md5_lca_fail(runtmp):
     # try to extract 47 from an LCA database, using a picklist w/full md5; will
     # fail.
-    allzip = utils.get_test_data('lca/47+63.lca.json')
+    allzip = utils.get_test_data("lca/47+63.lca.json")

     # select on any of these attributes
-    row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
-               md5full='50a9274021e43eda8b2e77f8fa60ae8e',
-               md5short='50a9274021e43eda8b2e77f8fa60ae8e'[:8],
-               fullIdent='NC_009665.1',
-               nodotIdent='NC_009665')
+    row = dict(
+        exactName="NC_009665.1 Shewanella baltica OS185, complete genome",
+        md5full="50a9274021e43eda8b2e77f8fa60ae8e",
+        md5short="50a9274021e43eda8b2e77f8fa60ae8e"[:8],
+        fullIdent="NC_009665.1",
+        nodotIdent="NC_009665",
+    )

     # make picklist
-    picklist_csv = runtmp.output('pick.csv')
-    with open(picklist_csv, 'w', newline='') as csvfp:
+    picklist_csv = runtmp.output("pick.csv")
+    with open(picklist_csv, "w", newline="") as csvfp:
         w = csv.DictWriter(csvfp, fieldnames=row.keys())
         w.writeheader()
         w.writerow(row)

     picklist_arg = f"{picklist_csv}:md5full:md5"

-    with pytest.raises(SourmashCommandFailed) as exc:
-        runtmp.sourmash('sig', 'extract', allzip, '--picklist', picklist_arg,
-                        '--md5', '50a9274021e4')
+    with pytest.raises(SourmashCommandFailed):
+        runtmp.sourmash(
+            "sig",
+            "extract",
+            allzip,
+            "--picklist",
+            picklist_arg,
+            "--md5",
+            "50a9274021e4",
+        )

     # this happens b/c the implementation of 'extract' uses picklists, and
     # LCA databases don't support multiple picklists.
     print(runtmp.last_result.err)
-    assert "This input collection doesn't support 'extract' with picklists or patterns." in runtmp.last_result.err
+    assert (
+        "This input collection doesn't support 'extract' with picklists or patterns."
+        in runtmp.last_result.err
+    )


 def test_sig_extract_8_picklist_md5_include(runtmp):
     # extract 47 from 47, using a picklist w/full md5: explicit include
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig63 = utils.get_test_data("63.fa.sig")

     # select on any of these attributes
-    row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
-               md5full='09a08691ce52952152f0e866a59f6261',
-               md5short='09a08691ce5295215',
-               fullIdent='NC_009665.1',
-               nodotIdent='NC_009665')
+    row = dict(
+        exactName="NC_009665.1 Shewanella baltica OS185, complete genome",
+        md5full="09a08691ce52952152f0e866a59f6261",
+        md5short="09a08691ce5295215",
+        fullIdent="NC_009665.1",
+        nodotIdent="NC_009665",
+    )

     # make picklist
-    picklist_csv = runtmp.output('pick.csv')
-    with open(picklist_csv, 'w', newline='') as csvfp:
+    picklist_csv = runtmp.output("pick.csv")
+    with open(picklist_csv, "w", newline="") as csvfp:
         w = csv.DictWriter(csvfp, fieldnames=row.keys())
         w.writeheader()
         w.writerow(row)

     picklist_arg = f"{picklist_csv}:md5full:md5:include"

-    runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+    runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg)

     # stdout should be new signature
     out = runtmp.last_result.out

@@ -1908,25 +2008,27 @@ def test_sig_extract_8_picklist_md5_include(runtmp):

 def test_sig_extract_8_picklist_md5_exclude(runtmp):
     # extract 63 from 47,63 by excluding 47, using a picklist w/full md5
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig63 = utils.get_test_data("63.fa.sig")

     # select on any of these attributes
-    row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
-               md5full='09a08691ce52952152f0e866a59f6261',
-               md5short='09a08691ce5295215',
-               fullIdent='NC_009665.1',
-               nodotIdent='NC_009665')
+    row = dict(
+        exactName="NC_009665.1 Shewanella baltica OS185, complete genome",
+        md5full="09a08691ce52952152f0e866a59f6261",
+        md5short="09a08691ce5295215",
+        fullIdent="NC_009665.1",
+        nodotIdent="NC_009665",
+    )

     # make picklist
-    picklist_csv = runtmp.output('pick.csv')
-    with open(picklist_csv, 'w', newline='') as csvfp:
+    picklist_csv = runtmp.output("pick.csv")
+    with open(picklist_csv, "w", newline="") as csvfp:
         w = csv.DictWriter(csvfp, fieldnames=row.keys())
         w.writeheader()
         w.writerow(row)

     picklist_arg = f"{picklist_csv}:md5full:md5:exclude"

-    runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+    runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg)

     # stdout should be new signature
     out = runtmp.last_result.out

@@ -1948,30 +2050,45 @@ def test_sig_extract_8_picklist_md5_exclude(runtmp):

 def test_sig_extract_8_picklist_md5_require_all(runtmp):
     # extract 47 from 47, using a picklist w/full md5;
     # confirm that check missing picklist val errors out on --picklist-require
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig63 = utils.get_test_data("63.fa.sig")

     # select on any of these attributes
-    row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
-               md5full='09a08691ce52952152f0e866a59f6261',
-               md5short='09a08691ce5295215',
-               fullIdent='NC_009665.1',
-               nodotIdent='NC_009665')
+    row = dict(
+        exactName="NC_009665.1 Shewanella baltica OS185, complete genome",
+        md5full="09a08691ce52952152f0e866a59f6261",
+        md5short="09a08691ce5295215",
+        fullIdent="NC_009665.1",
+        nodotIdent="NC_009665",
+    )

     # make picklist
-    picklist_csv = runtmp.output('pick.csv')
-    with open(picklist_csv, 'w', newline='') as csvfp:
+    picklist_csv = runtmp.output("pick.csv")
+    with open(picklist_csv, "w", newline="") as csvfp:
         w = csv.DictWriter(csvfp, fieldnames=row.keys())
         w.writeheader()
         w.writerow(row)
-        w.writerow(dict(exactName='', md5full='BAD MD5',
-                        md5short='', fullIdent='', nodotIdent=''))
+        w.writerow(
+            dict(
+                exactName="",
+                md5full="BAD MD5",
+                md5short="",
+                fullIdent="",
+                nodotIdent="",
+            )
+        )

     picklist_arg = f"{picklist_csv}:md5full:md5"

     with pytest.raises(SourmashCommandFailed):
-        runtmp.sourmash('sig', 'extract', sig47, sig63,
-                        '--picklist', picklist_arg,
-                        '--picklist-require-all')
+        runtmp.sourmash(
+            "sig",
+            "extract",
+            sig47,
+            sig63,
+            "--picklist",
+            picklist_arg,
+            "--picklist-require-all",
+        )

     # stdout should be new signature
     out = runtmp.last_result.out

@@ -1988,31 +2105,33 @@ def test_sig_extract_8_picklist_md5_require_all(runtmp):
     assert "loaded 1 total that matched ksize & molecule type" in err
     assert "extracted 1 signatures from 2 file(s)" in err
     assert "for given picklist, found 1 matches to 2 distinct values" in err
-    assert 'WARNING: 1 missing picklist values.' in err
-    assert 'ERROR: failing because --picklist-require-all was set' in err
+    assert "WARNING: 1 missing picklist values." in err
+    assert "ERROR: failing because --picklist-require-all was set" in err


 def test_sig_extract_8_picklist_name(runtmp):
     # extract 47 from 47, using a picklist w/exact name
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig63 = utils.get_test_data("63.fa.sig")

     # select on any of these attributes
-    row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
-               md5full='09a08691ce52952152f0e866a59f6261',
-               md5short='09a08691ce5295215',
-               fullIdent='NC_009665.1',
-               nodotIdent='NC_009665')
+    row = dict(
+        exactName="NC_009665.1 Shewanella baltica OS185, complete genome",
+        md5full="09a08691ce52952152f0e866a59f6261",
+        md5short="09a08691ce5295215",
+        fullIdent="NC_009665.1",
+        nodotIdent="NC_009665",
+    )

     # make picklist
-    picklist_csv = runtmp.output('pick.csv')
-    with open(picklist_csv, 'w', newline='') as csvfp:
+    picklist_csv = runtmp.output("pick.csv")
+    with open(picklist_csv, "w", newline="") as csvfp:
         w = csv.DictWriter(csvfp, fieldnames=row.keys())
         w.writeheader()
         w.writerow(row)

     picklist_arg = f"{picklist_csv}:exactName:name"

-    runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+    runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg)

     # stdout should be new signature
     out = runtmp.last_result.out

@@ -2025,25 +2144,27 @@ def test_sig_extract_8_picklist_name(runtmp):

 def test_sig_extract_8_picklist_name_exclude(runtmp):
     # exclude 47 based on name
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig63 = utils.get_test_data("63.fa.sig")

     # select on any of these attributes
-    row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
-               md5full='09a08691ce52952152f0e866a59f6261',
-               md5short='09a08691ce5295215',
-               fullIdent='NC_009665.1',
-               nodotIdent='NC_009665')
+    row = dict(
+        exactName="NC_009665.1 Shewanella baltica OS185, complete genome",
+        md5full="09a08691ce52952152f0e866a59f6261",
+        md5short="09a08691ce5295215",
+        fullIdent="NC_009665.1",
+        nodotIdent="NC_009665",
+    )

     # make picklist
-    picklist_csv = runtmp.output('pick.csv')
-    with open(picklist_csv, 'w', newline='') as csvfp:
+    picklist_csv = runtmp.output("pick.csv")
+    with open(picklist_csv, "w", newline="") as csvfp:
         w = csv.DictWriter(csvfp, fieldnames=row.keys())
         w.writeheader()
         w.writerow(row)

     picklist_arg = f"{picklist_csv}:exactName:name:exclude"

-    runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+    runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg)

     # stdout should be new signature
     out = runtmp.last_result.out

@@ -2056,25 +2177,27 @@ def test_sig_extract_8_picklist_name_exclude(runtmp):

 def test_sig_extract_8_picklist_ident(runtmp):
     # extract 47 from 47, using a picklist w/full ident
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig63 = utils.get_test_data("63.fa.sig")

     # select on any of these attributes
-    row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
-               md5full='09a08691ce52952152f0e866a59f6261',
-               md5short='09a08691ce5295215',
-               fullIdent='NC_009665.1',
-               nodotIdent='NC_009665')
+    row = dict(
+        exactName="NC_009665.1 Shewanella baltica OS185, complete genome",
+        md5full="09a08691ce52952152f0e866a59f6261",
+        md5short="09a08691ce5295215",
+        fullIdent="NC_009665.1",
+        nodotIdent="NC_009665",
+    )

     # make picklist
-    picklist_csv = runtmp.output('pick.csv')
-    with open(picklist_csv, 'w', newline='') as csvfp:
+    picklist_csv = runtmp.output("pick.csv")
+    with open(picklist_csv, "w", newline="") as csvfp:
         w = csv.DictWriter(csvfp, fieldnames=row.keys())
         w.writeheader()
         w.writerow(row)

     picklist_arg = f"{picklist_csv}:fullIdent:ident"

-    runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+    runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg)

     # stdout should be new signature
     out = runtmp.last_result.out

@@ -2087,25 +2210,27 @@ def test_sig_extract_8_picklist_ident(runtmp):

 def test_sig_extract_8_picklist_ident_exclude(runtmp):
     # exclude 47 based on ident
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig63 = utils.get_test_data("63.fa.sig")

     # select on any of these attributes
-    row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
-               md5full='09a08691ce52952152f0e866a59f6261',
-               md5short='09a08691ce5295215',
-               fullIdent='NC_009665.1',
-               nodotIdent='NC_009665')
+    row = dict(
+        exactName="NC_009665.1 Shewanella baltica OS185, complete genome",
+        md5full="09a08691ce52952152f0e866a59f6261",
+        md5short="09a08691ce5295215",
+        fullIdent="NC_009665.1",
+        nodotIdent="NC_009665",
+    )

     # make picklist
-    picklist_csv = runtmp.output('pick.csv')
-    with open(picklist_csv, 'w', newline='') as csvfp:
+    picklist_csv = runtmp.output("pick.csv")
+    with open(picklist_csv, "w", newline="") as csvfp:
         w = csv.DictWriter(csvfp, fieldnames=row.keys())
         w.writeheader()
         w.writerow(row)

     picklist_arg = f"{picklist_csv}:fullIdent:ident:exclude"

-    runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+    runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg)

     # stdout should be new signature
     out = runtmp.last_result.out

@@ -2118,25 +2243,27 @@ def test_sig_extract_8_picklist_ident_exclude(runtmp):

 def test_sig_extract_8_picklist_ident_dot(runtmp):
     # extract 47 from 47, using a picklist w/ident prefix
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig63 = utils.get_test_data("63.fa.sig")

     # select on any of these attributes
-    row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome',
-               md5full='09a08691ce52952152f0e866a59f6261',
-               md5short='09a08691ce5295215',
-               fullIdent='NC_009665.1',
-               nodotIdent='NC_009665')
+    row = dict(
+        exactName="NC_009665.1 Shewanella baltica OS185, complete genome",
+        md5full="09a08691ce52952152f0e866a59f6261",
+        md5short="09a08691ce5295215",
+        fullIdent="NC_009665.1",
+        nodotIdent="NC_009665",
+    )

     # make picklist
-    picklist_csv = runtmp.output('pick.csv')
-    with open(picklist_csv, 'w', newline='') as csvfp:
+    picklist_csv = runtmp.output("pick.csv")
+    with open(picklist_csv, "w", newline="") as csvfp:
         w = csv.DictWriter(csvfp, fieldnames=row.keys())
         w.writeheader()
         w.writerow(row)

     picklist_arg = f"{picklist_csv}:nodotIdent:identprefix"

-    runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg)
+    runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg)

     # stdout should be new signature
     out = runtmp.last_result.out

@@ -2149,25 +2276,27 @@ def test_sig_extract_8_picklist_ident_dot(runtmp):

 def test_sig_extract_8_picklist_ident_dot_exclude(runtmp):
     # exclude 47 based on identprefix
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
+    sig47 = utils.get_test_data("47.fa.sig")
utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='09a08691ce52952152f0e866a59f6261', - md5short='09a08691ce5295215', - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="09a08691ce52952152f0e866a59f6261", + md5short="09a08691ce5295215", + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) w.writeheader() w.writerow(row) picklist_arg = f"{picklist_csv}:nodotIdent:identprefix:exclude" - runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg) + runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg) # stdout should be new signature out = runtmp.last_result.out @@ -2180,25 +2309,27 @@ def test_sig_extract_8_picklist_ident_dot_exclude(runtmp): def test_sig_extract_8_picklist_md5_short(runtmp): # extract 47 from 47, using a picklist w/full md5 - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='09a08691ce52952152f0e866a59f6261', - md5short='09a08691ce5295215', - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="09a08691ce52952152f0e866a59f6261", + md5short="09a08691ce5295215", + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) w.writeheader() w.writerow(row) picklist_arg = f"{picklist_csv}:md5short:md5prefix8" - runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg) + runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg) # stdout should be new signature out = runtmp.last_result.out @@ -2211,25 +2342,27 @@ def test_sig_extract_8_picklist_md5_short(runtmp): def test_sig_extract_8_picklist_md5_short_exclude(runtmp): # exclude 47 based on md5prefix8 - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='09a08691ce52952152f0e866a59f6261', - md5short='09a08691ce5295215', - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="09a08691ce52952152f0e866a59f6261", + md5short="09a08691ce5295215", + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) 
w.writeheader() w.writerow(row) picklist_arg = f"{picklist_csv}:md5short:md5prefix8:exclude" - runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg) + runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg) # stdout should be new signature out = runtmp.last_result.out @@ -2242,25 +2375,27 @@ def test_sig_extract_8_picklist_md5_short_exclude(runtmp): def test_sig_extract_8_picklist_md5_short_alias(runtmp): # extract 47 from 47, using a picklist w/full md5 - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='09a08691ce52952152f0e866a59f6261', - md5short='09a08691ce5295215', - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="09a08691ce52952152f0e866a59f6261", + md5short="09a08691ce5295215", + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) w.writeheader() w.writerow(row) picklist_arg = f"{picklist_csv}:md5short:md5short" - runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg) + runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg) # stdout should be new signature out = runtmp.last_result.out @@ -2273,25 +2408,27 @@ def test_sig_extract_8_picklist_md5_short_alias(runtmp): def test_sig_extract_8_picklist_md5_short_alias_exclude(runtmp): # exclude 47 based on md5prefix8 alias, md5short - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='09a08691ce52952152f0e866a59f6261', - md5short='09a08691ce5295215', - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="09a08691ce52952152f0e866a59f6261", + md5short="09a08691ce5295215", + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) w.writeheader() w.writerow(row) picklist_arg = f"{picklist_csv}:md5short:md5short:exclude" - runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg) + runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg) # stdout should be new signature out = runtmp.last_result.out @@ -2304,57 +2441,63 @@ def test_sig_extract_8_picklist_md5_short_alias_exclude(runtmp): def test_sig_extract_8_picklist_md5_short_alias_with_md5_selector_nomatch(runtmp): # extract 47 from 47, using a picklist w/full md5 and also md5 selector - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # select on any of these attributes - row =
dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='09a08691ce52952152f0e866a59f6261', - md5short='09a08691ce5295215', - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="09a08691ce52952152f0e866a59f6261", + md5short="09a08691ce5295215", + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) w.writeheader() w.writerow(row) picklist_arg = f"{picklist_csv}:md5short:md5short" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'extract', sig47, sig63, - '--picklist', picklist_arg, - '--md5', 'XXX') # no match to md5 selector here + runtmp.sourmash( + "sig", "extract", sig47, sig63, "--picklist", picklist_arg, "--md5", "XXX" + ) # no match to md5 selector here err = runtmp.last_result.err assert "no matching signatures to save!" in err -def test_sig_extract_8_picklist_md5_short_alias_with_md5_selector_nomatch_exclude(runtmp): +def test_sig_extract_8_picklist_md5_short_alias_with_md5_selector_nomatch_exclude( + runtmp, +): # exclude 47 using a picklist w/full md5 and also md5 selector - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='09a08691ce52952152f0e866a59f6261', - md5short='09a08691ce5295215', - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="09a08691ce52952152f0e866a59f6261", + md5short="09a08691ce5295215", + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) w.writeheader() w.writerow(row) picklist_arg = f"{picklist_csv}:md5short:md5short:exclude" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'extract', sig47, sig63, - '--picklist', picklist_arg, - '--md5', 'XXX') # no match to md5 selector here + runtmp.sourmash( + "sig", "extract", sig47, sig63, "--picklist", picklist_arg, "--md5", "XXX" + ) # no match to md5 selector here err = runtmp.last_result.err assert "no matching signatures to save!" 
in err @@ -2362,26 +2505,36 @@ def test_sig_extract_8_picklist_md5_short_alias_with_md5_selector_nomatch_exclud def test_sig_extract_8_picklist_md5_short_alias_with_md5_selector(runtmp): # extract 47 from 47, using a picklist w/full md5 and also md5 selector - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='09a08691ce52952152f0e866a59f6261', - md5short='09a08691ce5295215', - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="09a08691ce52952152f0e866a59f6261", + md5short="09a08691ce5295215", + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) w.writeheader() w.writerow(row) picklist_arg = f"{picklist_csv}:md5short:md5short" - runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg, - '--md5', '09a08691ce5295215') + runtmp.sourmash( + "sig", + "extract", + sig47, + sig63, + "--picklist", + picklist_arg, + "--md5", + "09a08691ce5295215", + ) # stdout should be new signature out = runtmp.last_result.out @@ -2391,54 +2544,64 @@ def test_sig_extract_8_picklist_md5_short_alias_with_md5_selector(runtmp): assert actual_extract_sig == test_extract_sig + def test_sig_extract_8_picklist_md5_short_alias_with_md5_selector_exclude(runtmp): # exclude 47, using a picklist w/full md5; but try to select with md5 selector - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='09a08691ce52952152f0e866a59f6261', - md5short='09a08691ce5295215', - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="09a08691ce52952152f0e866a59f6261", + md5short="09a08691ce5295215", + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) w.writeheader() w.writerow(row) picklist_arg = f"{picklist_csv}:md5short:md5short:exclude" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', picklist_arg, - '--md5', '09a08691ce5295215') + runtmp.sourmash( + "sig", + "extract", + sig47, + sig63, + "--picklist", + picklist_arg, + "--md5", + "09a08691ce5295215", + ) # NTP: do we want to emit a more informative "conflicting selectors" type of msg? err = runtmp.last_result.err print(err) assert "loaded 1 distinct values into picklist." in err assert "loaded 1 total that matched ksize & molecule type" in err - assert 'no matching signatures to save!' in err + assert "no matching signatures to save!" 
in err def test_sig_extract_8_picklist_md5_nomatch(runtmp): # use an empty picklist => no match - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: - w = csv.DictWriter(csvfp, fieldnames=['md5short']) + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: + w = csv.DictWriter(csvfp, fieldnames=["md5short"]) w.writeheader() picklist_arg = f"{picklist_csv}:md5short:md5prefix8" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', - picklist_arg) + runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg) # stdout should be new signature out = runtmp.last_result.out @@ -2451,19 +2614,18 @@ def test_sig_extract_8_picklist_md5_nomatch(runtmp): def test_sig_extract_8_picklist_md5_nomatch_exclude(runtmp): # use an empty picklist to exclude => no match => include everything - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: - w = csv.DictWriter(csvfp, fieldnames=['md5short']) + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: + w = csv.DictWriter(csvfp, fieldnames=["md5short"]) w.writeheader() picklist_arg = f"{picklist_csv}:md5short:md5prefix8:exclude" - runtmp.sourmash('sig', 'extract', sig47, sig63, '--picklist', - picklist_arg) + runtmp.sourmash("sig", "extract", sig47, sig63, "--picklist", picklist_arg) # stdout should be both signatures out = runtmp.last_result.out @@ -2478,91 +2640,94 @@ def test_sig_extract_8_picklist_md5_nomatch_exclude(runtmp): err = runtmp.last_result.err print(err) assert runtmp.last_result.status == 0 - assert 'loaded 0 distinct values into picklist.' in err - assert 'loaded 2 total that matched ksize & molecule type' in err - assert 'extracted 2 signatures from 2 file(s)' in err - assert 'for given picklist, found 2 matches by excluding 0 distinct values' in err + assert "loaded 0 distinct values into picklist." 
in err + assert "loaded 2 total that matched ksize & molecule type" in err + assert "extracted 2 signatures from 2 file(s)" in err + assert "for given picklist, found 2 matches by excluding 0 distinct values" in err def test_sig_extract_9_picklist_md5_ksize_hp_select(runtmp): # test with -k and moltype selector - sigdir = utils.get_test_data('prot/') + sigdir = utils.get_test_data("prot/") # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: - w = csv.DictWriter(csvfp, fieldnames=['md5']) + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: + w = csv.DictWriter(csvfp, fieldnames=["md5"]) w.writeheader() - w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) + w.writerow(dict(md5="ea2a1ad233c2908529d124a330bcb672")) picklist_arg = f"{picklist_csv}:md5:md5" - runtmp.sourmash('sig', 'extract', sigdir, '--picklist', - picklist_arg, '-k', '19', '--hp') + runtmp.sourmash( + "sig", "extract", sigdir, "--picklist", picklist_arg, "-k", "19", "--hp" + ) # stdout should be new signature out = runtmp.last_result.out actual_extract_sig = sourmash.load_one_signature(out) print(actual_extract_sig.md5sum) - assert str(actual_extract_sig) == 'GCA_001593925' - assert actual_extract_sig.md5sum() == 'ea2a1ad233c2908529d124a330bcb672' + assert str(actual_extract_sig) == "GCA_001593925" + assert actual_extract_sig.md5sum() == "ea2a1ad233c2908529d124a330bcb672" assert actual_extract_sig.minhash.ksize == 19 - assert actual_extract_sig.minhash.moltype == 'hp' + assert actual_extract_sig.minhash.moltype == "hp" def test_sig_extract_9_picklist_md5_ksize_hp_select_exclude(runtmp): # test picklist exclude with -k and moltype selector - sigdir = utils.get_test_data('prot/') + sigdir = utils.get_test_data("prot/") # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: - w = csv.DictWriter(csvfp, fieldnames=['md5']) + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: + w = csv.DictWriter(csvfp, fieldnames=["md5"]) w.writeheader() - w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) + w.writerow(dict(md5="ea2a1ad233c2908529d124a330bcb672")) picklist_arg = f"{picklist_csv}:md5:md5:exclude" - runtmp.sourmash('sig', 'extract', sigdir, '--picklist', - picklist_arg, '-k', '19', '--hp') + runtmp.sourmash( + "sig", "extract", sigdir, "--picklist", picklist_arg, "-k", "19", "--hp" + ) # stdout should be new signature out = runtmp.last_result.out actual_extract_sig = sourmash.load_one_signature(out) print(actual_extract_sig.md5sum) - assert str(actual_extract_sig) == 'GCA_001593935' - assert actual_extract_sig.md5sum() == 'bb0e6d90df01b7bd5d0956a5f9e3ed12' + assert str(actual_extract_sig) == "GCA_001593935" + assert actual_extract_sig.md5sum() == "bb0e6d90df01b7bd5d0956a5f9e3ed12" assert actual_extract_sig.minhash.ksize == 19 - assert actual_extract_sig.minhash.moltype == 'hp' + assert actual_extract_sig.minhash.moltype == "hp" def test_sig_extract_10_picklist_md5_dups_and_empty(runtmp): # test empty picklist values, and duplicate picklist values - sigdir = utils.get_test_data('prot/') + sigdir = utils.get_test_data("prot/") # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: - w = csv.DictWriter(csvfp, fieldnames=['md5']) + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: + w = csv.DictWriter(csvfp, 
fieldnames=["md5"]) w.writeheader() - w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) - w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) - w.writerow(dict(md5='')) + w.writerow(dict(md5="ea2a1ad233c2908529d124a330bcb672")) + w.writerow(dict(md5="ea2a1ad233c2908529d124a330bcb672")) + w.writerow(dict(md5="")) picklist_arg = f"{picklist_csv}:md5:md5" - runtmp.sourmash('sig', 'extract', sigdir, '--picklist', - picklist_arg, '-k', '19', '--hp') + runtmp.sourmash( + "sig", "extract", sigdir, "--picklist", picklist_arg, "-k", "19", "--hp" + ) # stdout should be new signature out = runtmp.last_result.out actual_extract_sig = sourmash.load_one_signature(out) assert actual_extract_sig.minhash.ksize == 19 - assert actual_extract_sig.minhash.moltype == 'hp' - assert actual_extract_sig.md5sum() == 'ea2a1ad233c2908529d124a330bcb672' + assert actual_extract_sig.minhash.moltype == "hp" + assert actual_extract_sig.md5sum() == "ea2a1ad233c2908529d124a330bcb672" err = runtmp.last_result.err print(err) @@ -2573,29 +2738,30 @@ def test_sig_extract_10_picklist_md5_dups_and_empty(runtmp): def test_sig_extract_10_picklist_md5_dups_and_empty_exclude(runtmp): # test empty picklist values, and duplicate picklist values for exclude - sigdir = utils.get_test_data('prot/') + sigdir = utils.get_test_data("prot/") # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: - w = csv.DictWriter(csvfp, fieldnames=['md5']) + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: + w = csv.DictWriter(csvfp, fieldnames=["md5"]) w.writeheader() - w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) - w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) - w.writerow(dict(md5='')) + w.writerow(dict(md5="ea2a1ad233c2908529d124a330bcb672")) + w.writerow(dict(md5="ea2a1ad233c2908529d124a330bcb672")) + w.writerow(dict(md5="")) picklist_arg = f"{picklist_csv}:md5:md5:exclude" - runtmp.sourmash('sig', 'extract', sigdir, '--picklist', - picklist_arg, '-k', '19', '--hp') + runtmp.sourmash( + "sig", "extract", sigdir, "--picklist", picklist_arg, "-k", "19", "--hp" + ) # stdout should be new signature out = runtmp.last_result.out actual_extract_sig = sourmash.load_one_signature(out) assert actual_extract_sig.minhash.ksize == 19 - assert actual_extract_sig.minhash.moltype == 'hp' - assert actual_extract_sig.md5sum() == 'bb0e6d90df01b7bd5d0956a5f9e3ed12' + assert actual_extract_sig.minhash.moltype == "hp" + assert actual_extract_sig.md5sum() == "bb0e6d90df01b7bd5d0956a5f9e3ed12" err = runtmp.last_result.err print(err) @@ -2606,20 +2772,21 @@ def test_sig_extract_10_picklist_md5_dups_and_empty_exclude(runtmp): def test_sig_extract_11_picklist_bad_coltype(runtmp): # test with invalid picklist coltype - sigdir = utils.get_test_data('prot/') + sigdir = utils.get_test_data("prot/") # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: - w = csv.DictWriter(csvfp, fieldnames=['md5']) + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: + w = csv.DictWriter(csvfp, fieldnames=["md5"]) w.writeheader() - w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) + w.writerow(dict(md5="ea2a1ad233c2908529d124a330bcb672")) picklist_arg = f"{picklist_csv}:md5:BADCOLTYPE" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'extract', sigdir, '--picklist', - picklist_arg, '-k', '19', '--hp') + runtmp.sourmash( + "sig", "extract", 
sigdir, "--picklist", picklist_arg, "-k", "19", "--hp" + ) err = runtmp.last_result.err print(err) @@ -2628,20 +2795,21 @@ def test_sig_extract_11_picklist_bad_coltype(runtmp): def test_sig_extract_11_picklist_bad_coltype_exclude(runtmp): # test with invalid picklist coltype - sigdir = utils.get_test_data('prot/') + sigdir = utils.get_test_data("prot/") # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: - w = csv.DictWriter(csvfp, fieldnames=['md5']) + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: + w = csv.DictWriter(csvfp, fieldnames=["md5"]) w.writeheader() - w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) + w.writerow(dict(md5="ea2a1ad233c2908529d124a330bcb672")) picklist_arg = f"{picklist_csv}:md5:BADCOLTYPE:exclude" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'extract', sigdir, '--picklist', - picklist_arg, '-k', '19', '--hp') + runtmp.sourmash( + "sig", "extract", sigdir, "--picklist", picklist_arg, "-k", "19", "--hp" + ) err = runtmp.last_result.err print(err) @@ -2650,20 +2818,21 @@ def test_sig_extract_11_picklist_bad_coltype_exclude(runtmp): def test_sig_extract_12_picklist_bad_argstr(runtmp): # test with invalid argument format to --picklist - sigdir = utils.get_test_data('prot/') + sigdir = utils.get_test_data("prot/") # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: - w = csv.DictWriter(csvfp, fieldnames=['md5']) + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: + w = csv.DictWriter(csvfp, fieldnames=["md5"]) w.writeheader() - w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) + w.writerow(dict(md5="ea2a1ad233c2908529d124a330bcb672")) picklist_arg = f"{picklist_csv}" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'extract', sigdir, '--picklist', - picklist_arg, '-k', '19', '--hp') + runtmp.sourmash( + "sig", "extract", sigdir, "--picklist", picklist_arg, "-k", "19", "--hp" + ) err = runtmp.last_result.err print(err) @@ -2672,42 +2841,47 @@ def test_sig_extract_12_picklist_bad_argstr(runtmp): def test_sig_extract_12_picklist_bad_pickstyle(runtmp): # test with invalid argument format to --picklist - sigdir = utils.get_test_data('prot/') + sigdir = utils.get_test_data("prot/") # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: - w = csv.DictWriter(csvfp, fieldnames=['md5']) + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: + w = csv.DictWriter(csvfp, fieldnames=["md5"]) w.writeheader() - w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) + w.writerow(dict(md5="ea2a1ad233c2908529d124a330bcb672")) picklist_arg = f"{picklist_csv}:md5:md5:XXX" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'extract', sigdir, '--picklist', - picklist_arg, '-k', '19', '--hp') + runtmp.sourmash( + "sig", "extract", sigdir, "--picklist", picklist_arg, "-k", "19", "--hp" + ) err = runtmp.last_result.err print(err) - assert "invalid picklist 'pickstyle' argument 4: 'XXX' must be 'include' or 'exclude'" in err + assert ( + "invalid picklist 'pickstyle' argument 4: 'XXX' must be 'include' or 'exclude'" + in err + ) def test_sig_extract_12_picklist_bad_colname(runtmp): # test with invalid picklist colname - sigdir = utils.get_test_data('prot/') + sigdir = utils.get_test_data("prot/") # make picklist - 
picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: - w = csv.DictWriter(csvfp, fieldnames=['md5']) + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: + w = csv.DictWriter(csvfp, fieldnames=["md5"]) w.writeheader() - w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) + w.writerow(dict(md5="ea2a1ad233c2908529d124a330bcb672")) picklist_arg = f"{picklist_csv}:BADCOLNAME:md5" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'extract', sigdir, '--picklist', - picklist_arg, '-k', '19', '--hp') + runtmp.sourmash( + "sig", "extract", sigdir, "--picklist", picklist_arg, "-k", "19", "--hp" + ) err = runtmp.last_result.err print(err) @@ -2716,20 +2890,21 @@ def test_sig_extract_12_picklist_bad_colname(runtmp): def test_sig_extract_12_picklist_bad_colname_exclude(runtmp): # test with invalid picklist colname - sigdir = utils.get_test_data('prot/') + sigdir = utils.get_test_data("prot/") # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: - w = csv.DictWriter(csvfp, fieldnames=['md5']) + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: + w = csv.DictWriter(csvfp, fieldnames=["md5"]) w.writeheader() - w.writerow(dict(md5='ea2a1ad233c2908529d124a330bcb672')) + w.writerow(dict(md5="ea2a1ad233c2908529d124a330bcb672")) picklist_arg = f"{picklist_csv}:BADCOLNAME:md5:exclude" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'extract', sigdir, '--picklist', - picklist_arg, '-k', '19', '--hp') + runtmp.sourmash( + "sig", "extract", sigdir, "--picklist", picklist_arg, "-k", "19", "--hp" + ) err = runtmp.last_result.err print(err) @@ -2738,45 +2913,47 @@ def test_sig_extract_12_picklist_bad_colname_exclude(runtmp): def test_sig_extract_11_pattern_include(runtmp): # test --include-db-pattern - sigfiles = glob.glob(utils.get_test_data('prot/*.zip')) + sigfiles = glob.glob(utils.get_test_data("prot/*.zip")) - runtmp.sourmash('sig', 'extract', '--include', 'shewanella', *sigfiles, - '-o', 'out.zip') + runtmp.sourmash( + "sig", "extract", "--include", "shewanella", *sigfiles, "-o", "out.zip" + ) - idx = sourmash.load_file_as_index(runtmp.output('out.zip')) + idx = sourmash.load_file_as_index(runtmp.output("out.zip")) assert len(idx) == 2 - names = [ ss.name for ss in idx.signatures() ] + names = [ss.name for ss in idx.signatures()] for n in names: - assert 'shewanella' in n.lower(), n + assert "shewanella" in n.lower(), n def test_sig_extract_11_pattern_exclude(runtmp): # test --exclude-db-pattern - sigfiles = glob.glob(utils.get_test_data('prot/*.zip')) + sigfiles = glob.glob(utils.get_test_data("prot/*.zip")) - runtmp.sourmash('sig', 'extract', '--exclude', 'shewanella', *sigfiles, - '-o', 'out.zip') + runtmp.sourmash( + "sig", "extract", "--exclude", "shewanella", *sigfiles, "-o", "out.zip" + ) - idx = sourmash.load_file_as_index(runtmp.output('out.zip')) + idx = sourmash.load_file_as_index(runtmp.output("out.zip")) assert len(idx) == 18 - names = [ ss.name for ss in idx.signatures() ] + names = [ss.name for ss in idx.signatures()] for n in names: - assert 'shewanella' not in n.lower(), n + assert "shewanella" not in n.lower(), n def test_sig_extract_identical_md5s(runtmp): # test that we properly handle different signatures with identical md5s - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss = load_signatures(sig47) sig = list(ss)[0] new_sig = 
sig.to_mutable() - new_sig.name = 'foo' - sig47foo = runtmp.output('foo.sig') + new_sig.name = "foo" + sig47foo = runtmp.output("foo.sig") # this was only a problem when the signatures are stored in the same file - with open(sig47foo, 'wt') as fp: + with open(sig47foo, "w") as fp: sourmash.save_signatures([new_sig, sig], fp) - runtmp.run_sourmash('sig', 'extract', '--name', 'foo', sig47foo) + runtmp.run_sourmash("sig", "extract", "--name", "foo", sig47foo) out = runtmp.last_result.out print(out) @@ -2784,18 +2961,18 @@ def test_sig_extract_identical_md5s(runtmp): ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella' not in ss.name - assert 'foo' in ss.name - assert ss.md5sum() == '09a08691ce52952152f0e866a59f6261' + assert "Shewanella" not in ss.name + assert "foo" in ss.name + assert ss.md5sum() == "09a08691ce52952152f0e866a59f6261" def test_sig_flatten_1(runtmp): c = runtmp # extract matches to several names from among several signatures & flatten - sig47abund = utils.get_test_data('track_abund/47.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'flatten', sig47abund, '--name', 'Shewanella') + sig47abund = utils.get_test_data("track_abund/47.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "flatten", sig47abund, "--name", "Shewanella") # stdout should be new signature out = c.last_result.out @@ -2813,14 +2990,20 @@ def test_sig_flatten_1_from_file(runtmp): c = runtmp # extract matches to several names from among several signatures & flatten - sig47abund = utils.get_test_data('track_abund/47.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') + sig47abund = utils.get_test_data("track_abund/47.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") - from_file = _write_file(runtmp, 'list.txt', [sig47abund]) - picklist = _write_file(runtmp, 'pl.csv', ['md5short', '09a08691']) + from_file = _write_file(runtmp, "list.txt", [sig47abund]) + picklist = _write_file(runtmp, "pl.csv", ["md5short", "09a08691"]) - c.run_sourmash('sig', 'flatten', '--from-file', from_file, - '--picklist', f'{picklist}:md5short:md5short') + c.run_sourmash( + "sig", + "flatten", + "--from-file", + from_file, + "--picklist", + f"{picklist}:md5short:md5short", + ) # stdout should be new signature out = c.last_result.out @@ -2837,10 +3020,10 @@ def test_sig_flatten_1_from_file(runtmp): @utils.in_tempdir def test_sig_flatten_1_select_name(c): # extract matches to several names from among several signatures & flatten - sig47abund = utils.get_test_data('track_abund/47.fa.sig') - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'flatten', sig2, sig47abund, '--name', 'Shewanella') + sig47abund = utils.get_test_data("track_abund/47.fa.sig") + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "flatten", sig2, sig47abund, "--name", "Shewanella") # stdout should be new signature out = c.last_result.out @@ -2858,10 +3041,10 @@ def test_sig_flatten_1_select_md5(runtmp): c = runtmp # extract matches to several names from among several signatures & flatten - sig47abund = utils.get_test_data('track_abund/47.fa.sig') - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'flatten', sig2, sig47abund, '--md5', '09a08691c') + sig47abund = utils.get_test_data("track_abund/47.fa.sig") + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "flatten", sig2, 
sig47abund, "--md5", "09a08691c") # stdout should be new signature out = c.last_result.out @@ -2878,8 +3061,8 @@ def test_sig_flatten_1_select_md5(runtmp): def test_sig_flatten_2_ksize(runtmp): c = runtmp # flatten only one signature selected using ksize - psw_mag = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig') - c.run_sourmash('sig', 'flatten', psw_mag, '-k', '31') + psw_mag = utils.get_test_data("lca/TARA_PSW_MAG_00136.sig") + c.run_sourmash("sig", "flatten", psw_mag, "-k", "31") # stdout should be new signature out = c.last_result.out @@ -2893,8 +3076,8 @@ def test_sig_flatten_2_ksize(runtmp): @utils.in_tempdir def test_sig_downsample_1_scaled(c): # downsample a scaled signature - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'downsample', '--scaled', '10000', sig47) + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "downsample", "--scaled", "10000", sig47) # stdout should be new signature out = c.last_result.out @@ -2910,8 +3093,8 @@ def test_sig_downsample_1_scaled(c): @utils.in_tempdir def test_sig_downsample_1_scaled_downsample_multisig(c): # downsample many scaled signatures in one file - multisig = utils.get_test_data('47+63-multisig.sig') - c.run_sourmash('sig', 'downsample', '--scaled', '10000', multisig) + multisig = utils.get_test_data("47+63-multisig.sig") + c.run_sourmash("sig", "downsample", "--scaled", "10000", multisig) # stdout should be new signatures out = c.last_result.out @@ -2923,8 +3106,8 @@ def test_sig_downsample_1_scaled_downsample_multisig(c): @utils.in_tempdir def test_sig_downsample_1_scaled_to_num(c): # downsample a scaled signature - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'downsample', '--num', '500', sig47) + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "downsample", "--num", "500", sig47) # stdout should be new signature out = c.last_result.out @@ -2938,70 +3121,72 @@ def test_sig_downsample_1_scaled_to_num(c): test_mins = test_downsample_sig.minhash.hashes.keys() test_mins = list(test_mins) test_mins.sort() - test_mins = test_mins[:500] # take 500 smallest + test_mins = test_mins[:500] # take 500 smallest assert actual_mins == test_mins def test_sig_downsample_check_num_bounds_negative(runtmp): - c=runtmp - sig47 = utils.get_test_data('47.fa.sig') + c = runtmp + sig47 = utils.get_test_data("47.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'downsample', '--num', '-5', sig47) + c.run_sourmash("sig", "downsample", "--num", "-5", sig47) assert "ERROR: num value must be positive" in c.last_result.err def test_sig_downsample_check_num_bounds_less_than_minimum(runtmp): - c=runtmp - sig47 = utils.get_test_data('47.fa.sig') + c = runtmp + sig47 = utils.get_test_data("47.fa.sig") - c.run_sourmash('sig', 'downsample', '--num', '25', sig47) + c.run_sourmash("sig", "downsample", "--num", "25", sig47) assert "WARNING: num value should be >= 50. Continuing anyway." in c.last_result.err def test_sig_downsample_check_num_bounds_more_than_maximum(runtmp): - c=runtmp - sig47 = utils.get_test_data('47.fa.sig') + c = runtmp + sig47 = utils.get_test_data("47.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'downsample', '--num', '100000', sig47) + c.run_sourmash("sig", "downsample", "--num", "100000", sig47) - assert "WARNING: num value should be <= 50000. Continuing anyway." in c.last_result.err + assert ( + "WARNING: num value should be <= 50000. Continuing anyway." 
in c.last_result.err + ) @utils.in_tempdir def test_sig_downsample_1_scaled_to_num_fail(c): # downsample a scaled signature - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'downsample', '--num', '50000', sig47) + c.run_sourmash("sig", "downsample", "--num", "50000", sig47) @utils.in_tempdir def test_sig_downsample_1_scaled_empty(c): # downsample a scaled signature - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'downsample', sig47) + c.run_sourmash("sig", "downsample", sig47) @utils.in_tempdir def test_sig_downsample_2_num(c): # downsample a num signature - sigs11 = utils.get_test_data('genome-s11.fa.gz.sig') - c.run_sourmash('sig', 'downsample', '--num', '500', - '-k', '21', '--dna', sigs11) + sigs11 = utils.get_test_data("genome-s11.fa.gz.sig") + c.run_sourmash("sig", "downsample", "--num", "500", "-k", "21", "--dna", sigs11) # stdout should be new signature out = c.last_result.out - test_downsample_sig = sourmash.load_one_signature(sigs11, ksize=21, - select_moltype='DNA') + test_downsample_sig = sourmash.load_one_signature( + sigs11, ksize=21, select_moltype="DNA" + ) actual_downsample_sig = sourmash.load_one_signature(out) test_mh = test_downsample_sig.minhash.downsample(num=500) @@ -3011,15 +3196,17 @@ def test_sig_downsample_2_num(c): @utils.in_tempdir def test_sig_downsample_2_num_to_scaled(c): # downsample a num signature and convert it into a scaled sig - sigs11 = utils.get_test_data('genome-s11.fa.gz.sig') - c.run_sourmash('sig', 'downsample', '--scaled', '10000', - '-k', '21', '--dna', sigs11) + sigs11 = utils.get_test_data("genome-s11.fa.gz.sig") + c.run_sourmash( + "sig", "downsample", "--scaled", "10000", "-k", "21", "--dna", sigs11 + ) # stdout should be new signature out = c.last_result.out - test_downsample_sig = sourmash.load_one_signature(sigs11, ksize=21, - select_moltype='DNA') + test_downsample_sig = sourmash.load_one_signature( + sigs11, ksize=21, select_moltype="DNA" + ) actual_downsample_sig = sourmash.load_one_signature(out) test_mins = test_downsample_sig.minhash.hashes.keys() @@ -3027,7 +3214,7 @@ def test_sig_downsample_2_num_to_scaled(c): # select those mins that are beneath the new max hash... 
max_hash = actual_downsample_sig.minhash._max_hash - test_mins_down = { k for k in test_mins if k < max_hash } + test_mins_down = {k for k in test_mins if k < max_hash} assert test_mins_down == set(actual_mins) @@ -3035,38 +3222,49 @@ def test_sig_downsample_2_num_to_scaled(c): def test_sig_downsample_2_num_to_scaled_fail(c): # downsample a num signature and FAIL to convert it into a scaled sig # because new scaled is too low - sigs11 = utils.get_test_data('genome-s11.fa.gz.sig') + sigs11 = utils.get_test_data("genome-s11.fa.gz.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'downsample', '--scaled', '100', - '-k', '21', '--dna', sigs11) + c.run_sourmash( + "sig", "downsample", "--scaled", "100", "-k", "21", "--dna", sigs11 + ) @utils.in_tempdir def test_sig_downsample_2_num_and_scaled_both_fail(c): # cannot specify both --num and --scaled - sigs11 = utils.get_test_data('genome-s11.fa.gz.sig') + sigs11 = utils.get_test_data("genome-s11.fa.gz.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'downsample', '--scaled', '100', '--num', '50', - '-k', '21', '--dna', sigs11) + c.run_sourmash( + "sig", + "downsample", + "--scaled", + "100", + "--num", + "50", + "-k", + "21", + "--dna", + sigs11, + ) @utils.in_tempdir def test_sig_downsample_2_num_empty(c): # downsample a num signature - sigs11 = utils.get_test_data('genome-s11.fa.gz.sig') + sigs11 = utils.get_test_data("genome-s11.fa.gz.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sig', 'downsample', '-k', '21', '--dna', sigs11) + c.run_sourmash("sig", "downsample", "-k", "21", "--dna", sigs11) def test_sig_describe_1(runtmp): c = runtmp # get basic info on a signature - sig47 = utils.get_test_data('47.fa.sig') - c.run_sourmash('sig', 'describe', sig47) + sig47 = utils.get_test_data("47.fa.sig") + c.run_sourmash("sig", "describe", sig47) out = c.last_result.out print(c.last_result) @@ -3087,12 +3285,18 @@ def test_sig_describe_1_fromfile_picklist(runtmp): c = runtmp # get basic info on a signature - sig47 = utils.get_test_data('47.fa.sig') - from_file = _write_file(runtmp, 'list.txt', [sig47]) - picklist = _write_file(runtmp, 'pl.csv', ['md5short', '09a08691']) - - c.run_sourmash('sig', 'describe', '--from-file', from_file, - '--picklist', f'{picklist}:md5short:md5short') + sig47 = utils.get_test_data("47.fa.sig") + from_file = _write_file(runtmp, "list.txt", [sig47]) + picklist = _write_file(runtmp, "pl.csv", ["md5short", "09a08691"]) + + c.run_sourmash( + "sig", + "describe", + "--from-file", + from_file, + "--picklist", + f"{picklist}:md5short:md5short", + ) out = c.last_result.out print(c.last_result) @@ -3112,41 +3316,55 @@ def test_sig_describe_1_fromfile_picklist(runtmp): @utils.in_thisdir def test_sig_describe_protein(c): # test describe on a singleton protein signature - testdata = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - c.run_sourmash('sig', 'describe', testdata) + testdata = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + c.run_sourmash("sig", "describe", testdata) - assert 'k=19 molecule=protein num=0 scaled=100 seed=42 track_abundance=0' in c.last_result.out + assert ( + "k=19 molecule=protein num=0 scaled=100 seed=42 track_abundance=0" + in c.last_result.out + ) @utils.in_thisdir def test_sig_describe_hp(c): # test describe on a singleton hp signature - testdata = utils.get_test_data('prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - c.run_sourmash('sig', 'describe', 
testdata) + testdata = utils.get_test_data( + "prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + c.run_sourmash("sig", "describe", testdata) - assert 'k=19 molecule=hp num=0 scaled=100 seed=42 track_abundance=0' in c.last_result.out + assert ( + "k=19 molecule=hp num=0 scaled=100 seed=42 track_abundance=0" + in c.last_result.out + ) @utils.in_thisdir def test_sig_describe_dayhoff(c): # test describe on a singleton dayhoff signature - testdata = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - c.run_sourmash('sig', 'describe', testdata) + testdata = utils.get_test_data( + "prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + c.run_sourmash("sig", "describe", testdata) - assert 'k=19 molecule=dayhoff num=0 scaled=100 seed=42 track_abundance=0' in c.last_result.out + assert ( + "k=19 molecule=dayhoff num=0 scaled=100 seed=42 track_abundance=0" + in c.last_result.out + ) @utils.in_tempdir def test_sig_describe_1_hp(c): # get basic info on a signature - testdata = utils.get_test_data('short.fa') - c.run_sourmash('compute', '-k', '21,30', - '--dayhoff', '--hp', '--protein', - '--dna', - testdata) + testdata = utils.get_test_data("short.fa") + c.run_sourmash( + "compute", "-k", "21,30", "--dayhoff", "--hp", "--protein", "--dna", testdata + ) # stdout should be new signature - computed_sig = os.path.join(c.location, 'short.fa.sig') - c.run_sourmash('sig', 'describe', computed_sig) + computed_sig = os.path.join(c.location, "short.fa.sig") + c.run_sourmash("sig", "describe", computed_sig) out = c.last_result.out print(c.last_result.out) @@ -3237,16 +3455,15 @@ def test_sig_describe_1_hp(c): """.splitlines() for line in out.splitlines(): - cleaned_line = line.strip().replace( - testdata_dirname, '').replace(location, '') + cleaned_line = line.strip().replace(testdata_dirname, "").replace(location, "") assert cleaned_line in expected_output, cleaned_line @utils.in_tempdir def test_sig_describe_1_multisig(c): # get basic info on multiple signatures in a single file - sigs = utils.get_test_data('47+63-multisig.sig') - c.run_sourmash('sig', 'describe', sigs) + sigs = utils.get_test_data("47+63-multisig.sig") + c.run_sourmash("sig", "describe", sigs) out = c.last_result.out print(c.last_result) @@ -3265,8 +3482,8 @@ def test_sig_describe_1_multisig(c): @utils.in_tempdir def test_sig_describe_1_sbt(c): # get basic info on multiple signatures in an SBT - sigs = utils.get_test_data('prot/protein.sbt.zip') - c.run_sourmash('sig', 'describe', sigs) + sigs = utils.get_test_data("prot/protein.sbt.zip") + c.run_sourmash("sig", "describe", sigs) out = c.last_result.out print(c.last_result) @@ -3282,8 +3499,8 @@ def test_sig_describe_1_sbt(c): @utils.in_tempdir def test_sig_describe_1_lca(c): # get basic info on multiple signatures in an LCA database - sigs = utils.get_test_data('prot/protein.lca.json.gz') - c.run_sourmash('sig', 'describe', sigs) + sigs = utils.get_test_data("prot/protein.lca.json.gz") + c.run_sourmash("sig", "describe", sigs) out = c.last_result.out print(c.last_result) @@ -3299,8 +3516,8 @@ def test_sig_describe_1_lca(c): @utils.in_tempdir def test_sig_describe_1_dir(c): # get basic info on multiple signatures in a directory - sigs = utils.get_test_data('prot/protein/') - c.run_sourmash('sig', 'describe', sigs) + sigs = utils.get_test_data("prot/protein/") + c.run_sourmash("sig", "describe", sigs) out = c.last_result.out print(c.last_result) @@ -3320,8 +3537,8 @@ def test_sig_describe_1_dir(c): @utils.in_tempdir def 
test_sig_describe_1_zipfile(c): # get basic info on multiple signatures in a zipfile - sigs = utils.get_test_data('prot/all.zip') - c.run_sourmash('sig', 'describe', sigs) + sigs = utils.get_test_data("prot/all.zip") + c.run_sourmash("sig", "describe", sigs) out = c.last_result.out print(c.last_result) @@ -3342,8 +3559,8 @@ def test_sig_describe_1_sig_abund(runtmp): # check output of sig describe on a sketch with abundances c = runtmp - sigfile = utils.get_test_data('track_abund/47.fa.sig') - c.run_sourmash('sig', 'describe', sigfile) + sigfile = utils.get_test_data("track_abund/47.fa.sig") + c.run_sourmash("sig", "describe", sigfile) out = c.last_result.out print(c.last_result.out) @@ -3363,18 +3580,22 @@ def test_sig_describe_1_sig_abund(runtmp): @utils.in_thisdir def test_sig_describe_stdin(c): - sig = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - with open(sig, 'rt') as fp: + sig = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + with open(sig) as fp: data = fp.read() - c.run_sourmash('sig', 'describe', '-', stdin_data=data) + c.run_sourmash("sig", "describe", "-", stdin_data=data) - assert 'signature: GCA_001593925' in c.last_result.out + assert "signature: GCA_001593925" in c.last_result.out @utils.in_tempdir def test_sig_describe_empty(c): - sig = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') + sig = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) ss = sourmash.load_file_as_signatures(sig) ss = list(ss) @@ -3382,34 +3603,34 @@ def test_sig_describe_empty(c): ss = ss[0] ss = ss.to_mutable() - ss.name = '' - ss.filename = '' + ss.name = "" + ss.filename = "" - outsig = c.output('xxx.sig') - with open(outsig, 'wt') as fp: + outsig = c.output("xxx.sig") + with open(outsig, "w") as fp: sourmash.save_signatures([ss], fp) ss = sourmash.load_file_as_signatures(outsig) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert ss.name == '' - assert ss.filename == '' + assert ss.name == "" + assert ss.filename == "" - c.run_sourmash('sig', 'describe', outsig) + c.run_sourmash("sig", "describe", outsig) print(c.last_result.out) - assert 'signature: ** no name **' in c.last_result.out - assert 'source file: ** no name **' in c.last_result.out + assert "signature: ** no name **" in c.last_result.out + assert "source file: ** no name **" in c.last_result.out def test_sig_describe_sqldb(runtmp): # make a sqldb and run fileinfo on it - gcf_all = glob.glob(utils.get_test_data('gather/GCF*.sig')) - sqldb = runtmp.output('some.sqldb') + gcf_all = glob.glob(utils.get_test_data("gather/GCF*.sig")) + sqldb = runtmp.output("some.sqldb") - runtmp.sourmash('sig', 'cat', '-k', '31', *gcf_all, '-o', sqldb) + runtmp.sourmash("sig", "cat", "-k", "31", *gcf_all, "-o", sqldb) - runtmp.sourmash('sig', 'describe', sqldb) + runtmp.sourmash("sig", "describe", sqldb) err = runtmp.last_result.err print(err) @@ -3417,28 +3638,30 @@ def test_sig_describe_sqldb(runtmp): out = runtmp.last_result.out print(out) - assert 'md5: 4289d4241be8573145282352215ca3c4' in out - assert 'md5: 85c3aeec6457c0b1d210472ddeb67714' in out + assert "md5: 4289d4241be8573145282352215ca3c4" in out + assert "md5: 85c3aeec6457c0b1d210472ddeb67714" in out def test_sig_describe_2_csv(runtmp): # output info in CSV spreadsheet c = runtmp - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - c.run_sourmash('sig', 'describe', sig47, sig63, '--csv', 'out.csv') + sig47 
= utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + c.run_sourmash("sig", "describe", sig47, sig63, "--csv", "out.csv") - expected_md5 = ['09a08691ce52952152f0e866a59f6261', - '38729c6374925585db28916b82a6f513'] + expected_md5 = [ + "09a08691ce52952152f0e866a59f6261", + "38729c6374925585db28916b82a6f513", + ] - with open(c.output('out.csv'), 'rt') as fp: + with open(c.output("out.csv")) as fp: r = csv.DictReader(fp) n = 0 for row, md5 in zip(r, expected_md5): - assert row['md5'] == md5 + assert row["md5"] == md5 n += 1 assert n == 2 @@ -3448,20 +3671,22 @@ def test_sig_describe_2_csv_gz(runtmp): # output info in CSV spreadsheet, gzipped c = runtmp - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - c.run_sourmash('sig', 'describe', sig47, sig63, '--csv', 'out.csv.gz') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + c.run_sourmash("sig", "describe", sig47, sig63, "--csv", "out.csv.gz") - expected_md5 = ['09a08691ce52952152f0e866a59f6261', - '38729c6374925585db28916b82a6f513'] + expected_md5 = [ + "09a08691ce52952152f0e866a59f6261", + "38729c6374925585db28916b82a6f513", + ] - with gzip.open(c.output('out.csv.gz'), 'rt', newline="") as fp: + with gzip.open(c.output("out.csv.gz"), "rt", newline="") as fp: r = csv.DictReader(fp) n = 0 for row, md5 in zip(r, expected_md5): - assert row['md5'] == md5 + assert row["md5"] == md5 n += 1 assert n == 2 @@ -3471,31 +3696,29 @@ def test_sig_describe_2_csv_abund(runtmp): # output info in CSV spreadsheet, for abund sig c = runtmp - sig47 = utils.get_test_data('track_abund/47.fa.sig') - c.run_sourmash('sig', 'describe', sig47, '--csv', 'out.csv') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + c.run_sourmash("sig", "describe", sig47, "--csv", "out.csv") - with open(c.output('out.csv'), 'rt') as fp: + with open(c.output("out.csv")) as fp: r = csv.DictReader(fp) - n = 0 - rows = list(r) assert len(rows) == 1 row = rows[0] - assert row['signature_file'] == sig47 - assert row['md5'] == "09a08691ce52952152f0e866a59f6261" - assert row['ksize'] == "31" - assert row['moltype'] == "DNA" - assert row['num'] == "0" - assert row['scaled'] == "1000" - assert row['n_hashes'] == "5177" - assert row['seed'] == "42" - assert row['with_abundance'] == "1" - assert row['name'] == "NC_009665.1 Shewanella baltica OS185, complete genome" - assert row['filename'] == "podar-ref/47.fa" - assert row['license'] == "CC0" - assert row['sum_hashes'] == "5292" + assert row["signature_file"] == sig47 + assert row["md5"] == "09a08691ce52952152f0e866a59f6261" + assert row["ksize"] == "31" + assert row["moltype"] == "DNA" + assert row["num"] == "0" + assert row["scaled"] == "1000" + assert row["n_hashes"] == "5177" + assert row["seed"] == "42" + assert row["with_abundance"] == "1" + assert row["name"] == "NC_009665.1 Shewanella baltica OS185, complete genome" + assert row["filename"] == "podar-ref/47.fa" + assert row["license"] == "CC0" + assert row["sum_hashes"] == "5292" def test_sig_describe_2_csv_as_picklist(runtmp): @@ -3503,14 +3726,12 @@ def test_sig_describe_2_csv_as_picklist(runtmp): # pickfile c = runtmp - sig47 = utils.get_test_data('47.fa.sig') - outcsv = runtmp.output('out.csv') + sig47 = utils.get_test_data("47.fa.sig") + outcsv = runtmp.output("out.csv") - c.run_sourmash('sig', 'describe', sig47, - '--csv', outcsv) + c.run_sourmash("sig", "describe", sig47, "--csv", outcsv) - c.run_sourmash('sig', 'describe', sig47, - '--picklist', f'{outcsv}::manifest') + 
c.run_sourmash("sig", "describe", sig47, "--picklist", f"{outcsv}::manifest") out = c.last_result.out print(c.last_result) @@ -3531,10 +3752,9 @@ def test_sig_describe_2_include_db_pattern(runtmp): # test sig describe --include-db-pattern c = runtmp - allzip = utils.get_test_data('prot/all.zip') + allzip = utils.get_test_data("prot/all.zip") - c.run_sourmash('sig', 'describe', allzip, - '--include-db-pattern', 'os185') + c.run_sourmash("sig", "describe", allzip, "--include-db-pattern", "os185") out = c.last_result.out print(c.last_result) @@ -3555,10 +3775,11 @@ def test_sig_describe_2_exclude_db_pattern(runtmp): # test sig describe --exclude-db-pattern c = runtmp - allzip = utils.get_test_data('prot/all.zip') + allzip = utils.get_test_data("prot/all.zip") - c.run_sourmash('sig', 'describe', allzip, '--dna', '-k', '31', - '--exclude-db-pattern', 'os223') + c.run_sourmash( + "sig", "describe", allzip, "--dna", "-k", "31", "--exclude-db-pattern", "os223" + ) out = c.last_result.out print(c.last_result) @@ -3577,13 +3798,13 @@ def test_sig_describe_2_exclude_db_pattern(runtmp): def test_sig_describe_3_manifest_works(runtmp): # test on a manifest with relative paths, in proper location - mf = utils.get_test_data('scaled/mf.csv') - runtmp.sourmash('sig', 'describe', mf, '--csv', 'out.csv') + mf = utils.get_test_data("scaled/mf.csv") + runtmp.sourmash("sig", "describe", mf, "--csv", "out.csv") out = runtmp.last_result.out print(out) - with open(runtmp.output('out.csv'), newline='') as fp: + with open(runtmp.output("out.csv"), newline="") as fp: r = csv.reader(fp) rows = list(r) assert len(rows) == 16 # 15 signatures, plus head @@ -3593,41 +3814,41 @@ def test_sig_describe_3_manifest_fails_when_moved(runtmp): # test on a manifest with relative paths, when in wrong place; # should fail, because actual signatures cannot be loaded now. # note: this tests lazy loading. - mf = utils.get_test_data('scaled/mf.csv') - shutil.copyfile(mf, runtmp.output('mf.csv')) + mf = utils.get_test_data("scaled/mf.csv") + shutil.copyfile(mf, runtmp.output("mf.csv")) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'describe', 'mf.csv') + runtmp.sourmash("sig", "describe", "mf.csv") + - @utils.in_tempdir def test_sig_overlap(c): # get overlap details - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - c.run_sourmash('sig', 'overlap', sig47, sig63) + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + c.run_sourmash("sig", "overlap", sig47, sig63) out = c.last_result.out print(out) # md5s - assert '09a08691ce52952152f0e866a59f6261' in out - assert '38729c6374925585db28916b82a6f513' in out + assert "09a08691ce52952152f0e866a59f6261" in out + assert "38729c6374925585db28916b82a6f513" in out - assert 'similarity: 0.32069' in out - assert 'number of hashes in common: 2529' in out + assert "similarity: 0.32069" in out + assert "number of hashes in common: 2529" in out @utils.in_tempdir def test_import_export_1(c): # check to make sure we can import what we've exported! 
- inp = utils.get_test_data('genome-s11.fa.gz.sig') - outp = c.output('export.json') + inp = utils.get_test_data("genome-s11.fa.gz.sig") + outp = c.output("export.json") - c.run_sourmash('sig', 'export', inp, '-o', outp, '-k', '21', '--dna') - c.run_sourmash('sig', 'import', outp) + c.run_sourmash("sig", "export", inp, "-o", outp, "-k", "21", "--dna") + c.run_sourmash("sig", "import", outp) - original = sourmash.load_one_signature(inp, ksize=21, select_moltype='DNA') + original = sourmash.load_one_signature(inp, ksize=21, select_moltype="DNA") roundtrip = sourmash.load_one_signature(c.last_result.out) assert original.minhash == roundtrip.minhash @@ -3636,13 +3857,13 @@ def test_import_export_1(c): @utils.in_tempdir def test_import_export_1_by_md5(c): # check to make sure we can import what we've exported! - inp = utils.get_test_data('genome-s11.fa.gz.sig') - outp = c.output('export.json') + inp = utils.get_test_data("genome-s11.fa.gz.sig") + outp = c.output("export.json") - c.run_sourmash('sig', 'export', inp, '-o', outp, '--md5', '1437d8eae6') - c.run_sourmash('sig', 'import', outp) + c.run_sourmash("sig", "export", inp, "-o", outp, "--md5", "1437d8eae6") + c.run_sourmash("sig", "import", outp) - original = sourmash.load_one_signature(inp, ksize=21, select_moltype='DNA') + original = sourmash.load_one_signature(inp, ksize=21, select_moltype="DNA") roundtrip = sourmash.load_one_signature(c.last_result.out) assert original.minhash == roundtrip.minhash @@ -3655,271 +3876,259 @@ def test_import_export_2(c): # mash sketch -s 500 -k 21 ./tests/test-data/genome-s11.fa.gz # mash info -d ./tests/test-data/genome-s11.fa.gz.msh > tests/test-data/genome-s11.fa.gz.msh.json_dump # - sig1 = utils.get_test_data('genome-s11.fa.gz.sig') - msh_sig = utils.get_test_data('genome-s11.fa.gz.msh.json_dump') + sig1 = utils.get_test_data("genome-s11.fa.gz.sig") + msh_sig = utils.get_test_data("genome-s11.fa.gz.msh.json_dump") - c.run_sourmash('sig', 'import', msh_sig) + c.run_sourmash("sig", "import", msh_sig) imported = sourmash.load_one_signature(c.last_result.out) - compare = sourmash.load_one_signature(sig1, ksize=21, select_moltype='DNA') + compare = sourmash.load_one_signature(sig1, ksize=21, select_moltype="DNA") assert imported.minhash == compare.minhash def test_import_mash_csv_to_sig(runtmp): # test copied over from 'sourmash import_csv'. 
- testdata1 = utils.get_test_data('short.fa.msh.dump') - testdata2 = utils.get_test_data('short.fa') + testdata1 = utils.get_test_data("short.fa.msh.dump") + testdata2 = utils.get_test_data("short.fa") - runtmp.sourmash('sig', 'import', '--csv', testdata1, '-o', 'xxx.sig') + runtmp.sourmash("sig", "import", "--csv", testdata1, "-o", "xxx.sig") - runtmp.sourmash('sketch', 'dna', '-p', 'k=31,num=970', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=970", testdata2) - runtmp.sourmash('search', '-k', '31', 'short.fa.sig', 'xxx.sig') + runtmp.sourmash("search", "-k", "31", "short.fa.sig", "xxx.sig") print("RUNTEMP", runtmp) - assert '1 matches' in runtmp.last_result.out - assert '100.0% short.fa' in runtmp.last_result.out + assert "1 matches" in runtmp.last_result.out + assert "100.0% short.fa" in runtmp.last_result.out def test_sig_manifest_1_zipfile(runtmp): # make a manifest from a .zip file - protzip = utils.get_test_data('prot/protein.zip') - runtmp.sourmash('sig', 'manifest', protzip, '-o', 'SOURMASH-MANIFEST.csv') + protzip = utils.get_test_data("prot/protein.zip") + runtmp.sourmash("sig", "manifest", protzip, "-o", "SOURMASH-MANIFEST.csv") - manifest_fn = runtmp.output('SOURMASH-MANIFEST.csv') - with open(manifest_fn, newline='') as csvfp: + manifest_fn = runtmp.output("SOURMASH-MANIFEST.csv") + with open(manifest_fn, newline="") as csvfp: manifest = CollectionManifest.load_from_csv(csvfp) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list def test_sig_manifest_1_zipfile_csv_gz(runtmp): # make a gzipped manifest from a .zip file - protzip = utils.get_test_data('prot/protein.zip') - runtmp.sourmash('sig', 'manifest', protzip, - '-o', 'SOURMASH-MANIFEST.csv.gz') + protzip = utils.get_test_data("prot/protein.zip") + runtmp.sourmash("sig", "manifest", protzip, "-o", "SOURMASH-MANIFEST.csv.gz") - manifest_fn = runtmp.output('SOURMASH-MANIFEST.csv.gz') - with gzip.open(manifest_fn, "rt", newline='') as csvfp: + manifest_fn = runtmp.output("SOURMASH-MANIFEST.csv.gz") + with gzip.open(manifest_fn, "rt", newline="") as csvfp: manifest = CollectionManifest.load_from_csv(csvfp) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list def test_sig_manifest_1_zipfile_already_exists(runtmp): # make a manifest from a .zip file; fail if the output file already exists - protzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") - mf_csv = runtmp.output('mf.csv') + mf_csv = runtmp.output("mf.csv") with open(mf_csv, "w") as fp: fp.write("hello, world") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'manifest', protzip, '-o', 'mf.csv') + runtmp.sourmash("sig", "manifest", protzip, "-o", "mf.csv") def test_sig_manifest_1_zipfile_already_exists_force(runtmp): # make a manifest from a .zip file - protzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") - mf_csv = runtmp.output('mf.csv') + mf_csv = runtmp.output("mf.csv") with
open(mf_csv, "w") as fp: fp.write("hello, world") - runtmp.sourmash('sig', 'manifest', protzip, '-o', 'mf.csv', '-f') + runtmp.sourmash("sig", "manifest", protzip, "-o", "mf.csv", "-f") - with open(mf_csv, newline='') as csvfp: + with open(mf_csv, newline="") as csvfp: manifest = CollectionManifest.load_from_csv(csvfp) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list def test_sig_manifest_1_zipfile_already_exists_sql(runtmp): # make a manifest from a .zip file - protzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") - mf_csv = runtmp.output('mf.mfsql') - runtmp.sourmash('sig', 'manifest', protzip, '-o', 'mf.mfsql', '-F', 'sql') - runtmp.sourmash('sig', 'manifest', protzip, '-o', 'mf.mfsql', '-F', 'sql', - '-f') + mf_csv = runtmp.output("mf.mfsql") + runtmp.sourmash("sig", "manifest", protzip, "-o", "mf.mfsql", "-F", "sql") + runtmp.sourmash("sig", "manifest", protzip, "-o", "mf.mfsql", "-F", "sql", "-f") manifest = CollectionManifest.load_from_filename(mf_csv) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list def test_sig_manifest_2_sigfile(runtmp): # make a manifest from a .sig file - sigfile = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') + sigfile = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) - runtmp.sourmash('sig', 'manifest', sigfile, '-o', 'SOURMASH-MANIFEST.csv') + runtmp.sourmash("sig", "manifest", sigfile, "-o", "SOURMASH-MANIFEST.csv") - status = runtmp.last_result.status - out = runtmp.last_result.out - err = runtmp.last_result.err - - manifest_fn = runtmp.output('SOURMASH-MANIFEST.csv') - with open(manifest_fn, newline='') as csvfp: + manifest_fn = runtmp.output("SOURMASH-MANIFEST.csv") + with open(manifest_fn, newline="") as csvfp: manifest = CollectionManifest.load_from_csv(csvfp) assert len(manifest) == 1 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list def test_sig_manifest_3_sbt(runtmp): # make a manifest from an SBT - protzip = utils.get_test_data('prot/protein.sbt.zip') - runtmp.sourmash('sig', 'manifest', protzip, '-o', 'SOURMASH-MANIFEST.csv') + protzip = utils.get_test_data("prot/protein.sbt.zip") + runtmp.sourmash("sig", "manifest", protzip, "-o", "SOURMASH-MANIFEST.csv") - manifest_fn = runtmp.output('SOURMASH-MANIFEST.csv') - with open(manifest_fn, newline='') as csvfp: + manifest_fn = runtmp.output("SOURMASH-MANIFEST.csv") + with open(manifest_fn, newline="") as csvfp: manifest = CollectionManifest.load_from_csv(csvfp) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert 
"16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list def test_sig_manifest_4_lca(runtmp): # make a manifest from a .lca.json file - sigfile = utils.get_test_data('prot/protein.lca.json.gz') - runtmp.sourmash('sig', 'manifest', sigfile, '-o', - 'SOURMASH-MANIFEST.csv') + sigfile = utils.get_test_data("prot/protein.lca.json.gz") + runtmp.sourmash("sig", "manifest", sigfile, "-o", "SOURMASH-MANIFEST.csv") - manifest_fn = runtmp.output('SOURMASH-MANIFEST.csv') - with open(manifest_fn, newline='') as csvfp: + manifest_fn = runtmp.output("SOURMASH-MANIFEST.csv") + with open(manifest_fn, newline="") as csvfp: manifest = CollectionManifest.load_from_csv(csvfp) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list def test_sig_manifest_5_dir(runtmp): # make a manifest from a directory - sigfile = utils.get_test_data('prot/protein/') - runtmp.sourmash('sig', 'manifest', sigfile, '-o', 'SOURMASH-MANIFEST.csv') - - status = runtmp.last_result.status - out = runtmp.last_result.out - err = runtmp.last_result.err + sigfile = utils.get_test_data("prot/protein/") + runtmp.sourmash("sig", "manifest", sigfile, "-o", "SOURMASH-MANIFEST.csv") - manifest_fn = runtmp.output('SOURMASH-MANIFEST.csv') - with open(manifest_fn, newline='') as csvfp: + manifest_fn = runtmp.output("SOURMASH-MANIFEST.csv") + with open(manifest_fn, newline="") as csvfp: manifest = CollectionManifest.load_from_csv(csvfp) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list def test_sig_manifest_6_pathlist(runtmp): # make a manifest from a pathlist file - sigfiles = utils.get_test_data('prot/protein/*.sig') + sigfiles = utils.get_test_data("prot/protein/*.sig") sigfiles = glob.glob(sigfiles) - pathlist = runtmp.output('pathlist.txt') - with open(pathlist, 'wt') as fp: + pathlist = runtmp.output("pathlist.txt") + with open(pathlist, "w") as fp: fp.write("\n".join(sigfiles)) - runtmp.sourmash('sig', 'manifest', pathlist, '-o', 'SOURMASH-MANIFEST.csv') + runtmp.sourmash("sig", "manifest", pathlist, "-o", "SOURMASH-MANIFEST.csv") - status = runtmp.last_result.status - out = runtmp.last_result.out - err = runtmp.last_result.err - - manifest_fn = runtmp.output('SOURMASH-MANIFEST.csv') - with open(manifest_fn, newline='') as csvfp: + manifest_fn = runtmp.output("SOURMASH-MANIFEST.csv") + with open(manifest_fn, newline="") as csvfp: manifest = CollectionManifest.load_from_csv(csvfp) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list # note: the manifest output for pathlists will contain the locations # used in the pathlist. This is required by StandaloneManifestIndex. 
for row in manifest.rows: - iloc = row['internal_location'] + iloc = row["internal_location"] print(iloc) - assert iloc.startswith('/'), iloc + assert iloc.startswith("/"), iloc def test_sig_manifest_does_not_exist(runtmp): with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('sig', 'manifest', 'does-not-exist', - '-o', 'out.csv') + runtmp.run_sourmash("sig", "manifest", "does-not-exist", "-o", "out.csv") - assert "Cannot open 'does-not-exist' as a sourmash signature collection." in runtmp.last_result.err + assert ( + "Cannot open 'does-not-exist' as a sourmash signature collection." + in runtmp.last_result.err + ) def test_sig_manifest_7_allzip_1(runtmp): # the rebuilt manifest w/o '-f' will miss dna-sig.noext - allzip = utils.get_test_data('prot/all.zip') - runtmp.sourmash('sig', 'manifest', allzip, '-o', 'xyz.csv') + allzip = utils.get_test_data("prot/all.zip") + runtmp.sourmash("sig", "manifest", allzip, "-o", "xyz.csv") - manifest_fn = runtmp.output('xyz.csv') - with open(manifest_fn, newline='') as csvfp: + manifest_fn = runtmp.output("xyz.csv") + with open(manifest_fn, newline="") as csvfp: manifest = CollectionManifest.load_from_csv(csvfp) assert len(manifest) == 7 - filenames = set( row['internal_location'] for row in manifest.rows ) - assert 'dna-sig.noext' not in filenames + filenames = set(row["internal_location"] for row in manifest.rows) + assert "dna-sig.noext" not in filenames def test_sig_manifest_7_allzip_2(runtmp): # the rebuilt manifest w/ '-f' will contain dna-sig.noext - allzip = utils.get_test_data('prot/all.zip') - runtmp.sourmash('sig', 'manifest', allzip, '-o', 'xyz.csv', '-f') + allzip = utils.get_test_data("prot/all.zip") + runtmp.sourmash("sig", "manifest", allzip, "-o", "xyz.csv", "-f") - manifest_fn = runtmp.output('xyz.csv') - with open(manifest_fn, newline='') as csvfp: + manifest_fn = runtmp.output("xyz.csv") + with open(manifest_fn, newline="") as csvfp: manifest = CollectionManifest.load_from_csv(csvfp) assert len(manifest) == 8 - filenames = set( row['internal_location'] for row in manifest.rows ) - assert 'dna-sig.noext' in filenames + filenames = set(row["internal_location"] for row in manifest.rows) + assert "dna-sig.noext" in filenames def test_sig_manifest_7_allzip_3(runtmp): # the existing manifest contains 'dna-sig.noext' whether or not -f is # used. - allzip = utils.get_test_data('prot/all.zip') - runtmp.sourmash('sig', 'manifest', allzip, '-o', 'xyz.csv', - '--no-rebuild') + allzip = utils.get_test_data("prot/all.zip") + runtmp.sourmash("sig", "manifest", allzip, "-o", "xyz.csv", "--no-rebuild") - manifest_fn = runtmp.output('xyz.csv') - with open(manifest_fn, newline='') as csvfp: + manifest_fn = runtmp.output("xyz.csv") + with open(manifest_fn, newline="") as csvfp: manifest = CollectionManifest.load_from_csv(csvfp) assert len(manifest) == 8 - filenames = set( row['internal_location'] for row in manifest.rows ) - assert 'dna-sig.noext' in filenames + filenames = set(row["internal_location"] for row in manifest.rows) + assert "dna-sig.noext" in filenames def test_sig_manifest_8_sqldb(runtmp): # make a sqldb and then run sig manifest on it.
- gcf_all = glob.glob(utils.get_test_data('gather/GCF*.sig')) - sqldb = runtmp.output('some.sqldb') + gcf_all = glob.glob(utils.get_test_data("gather/GCF*.sig")) + sqldb = runtmp.output("some.sqldb") - runtmp.sourmash('sig', 'cat', '-k', '31', *gcf_all, '-o', sqldb) + runtmp.sourmash("sig", "cat", "-k", "31", *gcf_all, "-o", sqldb) # need to use '--no-rebuild-manifest' with 'sig manifest' on sqldb, # because it has a manifest but not the _signatures_with_internal @@ -3927,11 +4136,10 @@ def test_sig_manifest_8_sqldb(runtmp): # so, this should fail... with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'manifest', sqldb, '-o', 'mf.csv') + runtmp.sourmash("sig", "manifest", sqldb, "-o", "mf.csv") # ...and this should succeed: - runtmp.sourmash('sig', 'manifest', sqldb, '-o', 'mf.csv', - '--no-rebuild') + runtmp.sourmash("sig", "manifest", sqldb, "-o", "mf.csv", "--no-rebuild") err = runtmp.last_result.err print(err) @@ -3939,23 +4147,22 @@ def test_sig_manifest_8_sqldb(runtmp): out = runtmp.last_result.out print(out) - assert 'manifest contains 12 signatures total.' in err + assert "manifest contains 12 signatures total." in err assert "wrote manifest to 'mf.csv'" in err - mf = CollectionManifest.load_from_filename(runtmp.output('mf.csv')) + mf = CollectionManifest.load_from_filename(runtmp.output("mf.csv")) assert len(mf) == 12 def test_sig_manifest_8_sqldb_out(runtmp): # make a zip and run manifest out on it to make a sql format manifest. - gcf_all = glob.glob(utils.get_test_data('gather/GCF*.sig')) - zipfile = runtmp.output('some.zip') + gcf_all = glob.glob(utils.get_test_data("gather/GCF*.sig")) + zipfile = runtmp.output("some.zip") - runtmp.sourmash('sig', 'cat', '-k', '31', *gcf_all, '-o', zipfile) + runtmp.sourmash("sig", "cat", "-k", "31", *gcf_all, "-o", zipfile) # ...and this should succeed: - runtmp.sourmash('sig', 'manifest', zipfile, '-o', 'mf.sqldb', - '-F', 'sql') + runtmp.sourmash("sig", "manifest", zipfile, "-o", "mf.sqldb", "-F", "sql") err = runtmp.last_result.err print(err) @@ -3963,38 +4170,46 @@ def test_sig_manifest_8_sqldb_out(runtmp): out = runtmp.last_result.out print(out) - assert 'manifest contains 12 signatures total.' in err + assert "manifest contains 12 signatures total." 
in err assert "wrote manifest to 'mf.sqldb'" in err - mf = CollectionManifest.load_from_filename(runtmp.output('mf.sqldb')) + mf = CollectionManifest.load_from_filename(runtmp.output("mf.sqldb")) assert len(mf) == 12 def test_sig_kmers_1_dna(runtmp): # test sig kmers on dna - seqfile = utils.get_test_data('short.fa') + seqfile = utils.get_test_data("short.fa") - runtmp.sourmash('sketch', 'dna', seqfile, '-p', 'scaled=1') - ss = sourmash.load_one_signature(runtmp.output('short.fa.sig')) + runtmp.sourmash("sketch", "dna", seqfile, "-p", "scaled=1") + ss = sourmash.load_one_signature(runtmp.output("short.fa.sig")) mh = ss.minhash - assert mh.moltype == 'DNA' - - runtmp.sourmash('sig', 'kmers', '--sig', 'short.fa.sig', - '--seq', seqfile, - '--save-kmers', 'short.csv', - '--save-sequences', 'matched.fa') + assert mh.moltype == "DNA" + + runtmp.sourmash( + "sig", + "kmers", + "--sig", + "short.fa.sig", + "--seq", + seqfile, + "--save-kmers", + "short.csv", + "--save-sequences", + "matched.fa", + ) out = runtmp.last_result.out print(out) err = runtmp.last_result.err print(err) - assert 'total hashes in merged signature: 970' in err - assert 'found 970 distinct matching hashes (100.0%)' in err + assert "total hashes in merged signature: 970" in err + assert "found 970 distinct matching hashes (100.0%)" in err # check FASTA output - assert os.path.exists(runtmp.output('matched.fa')) - with screed.open(runtmp.output('matched.fa')) as f: + assert os.path.exists(runtmp.output("matched.fa")) + with screed.open(runtmp.output("matched.fa")) as f: records = list(f) assert len(records) == 1 assert len(records[0].sequence) == 1000, len(records[0].sequence) @@ -4005,8 +4220,8 @@ def test_sig_kmers_1_dna(runtmp): assert seq_mh.similarity(mh) == 1.0 # check CSV output w/k-mers and hashes etc - assert os.path.exists(runtmp.output('short.csv')) - with open(runtmp.output('short.csv'), newline='') as fp: + assert os.path.exists(runtmp.output("short.csv")) + with open(runtmp.output("short.csv"), newline="") as fp: r = csv.DictReader(fp) rows = list(r) assert len(rows) == 970 @@ -4014,58 +4229,56 @@ def test_sig_kmers_1_dna(runtmp): check_mh = mh.copy_and_clear() check_mh2 = mh.copy_and_clear() for row in rows: - check_mh.add_sequence(row['kmer']) - check_mh2.add_hash(int(row['hashval'])) + check_mh.add_sequence(row["kmer"]) + check_mh2.add_hash(int(row["hashval"])) assert check_mh.similarity(mh) == 1.0 assert check_mh2.similarity(mh) == 1.0 def test_sig_kmers_1_dna_more_in_query(runtmp): # test sig kmers on dna, where query has more than matches - seqfile = utils.get_test_data('short.fa') + seqfile = utils.get_test_data("short.fa") - runtmp.sourmash('sketch', 'dna', seqfile, '-p', 'scaled=1') - ss = sourmash.load_one_signature(runtmp.output('short.fa.sig')) + runtmp.sourmash("sketch", "dna", seqfile, "-p", "scaled=1") + ss = sourmash.load_one_signature(runtmp.output("short.fa.sig")) mh = ss.minhash - assert mh.moltype == 'DNA' + assert mh.moltype == "DNA" # make a new sequence for query, with more k-mers - query_seqfile = runtmp.output('query.fa') - with open(query_seqfile, 'wt') as fp: + query_seqfile = runtmp.output("query.fa") + with open(query_seqfile, "w") as fp: with screed.open(seqfile) as screed_iter: for record in screed_iter: fp.write(f">{record.name}\n{record.sequence}AGTTACGATC\n") - runtmp.sourmash('sig', 'kmers', '--sig', 'short.fa.sig', - '--seq', query_seqfile) + runtmp.sourmash("sig", "kmers", "--sig", "short.fa.sig", "--seq", query_seqfile) out = runtmp.last_result.out print(out) err = 
runtmp.last_result.err print(err) - assert 'total hashes in merged signature: 970' in err + assert "total hashes in merged signature: 970" in err # should only find 970 overlapping hashes here -- - assert 'found 970 distinct matching hashes (100.0%)' in err + assert "found 970 distinct matching hashes (100.0%)" in err def test_sig_kmers_1_dna_empty_seq(runtmp): # test sig kmers with empty query seq - seqfile = utils.get_test_data('short.fa') + seqfile = utils.get_test_data("short.fa") - runtmp.sourmash('sketch', 'dna', seqfile, '-p', 'scaled=1') - ss = sourmash.load_one_signature(runtmp.output('short.fa.sig')) + runtmp.sourmash("sketch", "dna", seqfile, "-p", "scaled=1") + ss = sourmash.load_one_signature(runtmp.output("short.fa.sig")) mh = ss.minhash - assert mh.moltype == 'DNA' + assert mh.moltype == "DNA" # make an empty query sequence file - query_seqfile = runtmp.output('query.fa') - with open(query_seqfile, 'wt') as fp: + query_seqfile = runtmp.output("query.fa") + with open(query_seqfile, "w"): pass with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'kmers', '--sig', 'short.fa.sig', - '--seq', query_seqfile) + runtmp.sourmash("sig", "kmers", "--sig", "short.fa.sig", "--seq", query_seqfile) out = runtmp.last_result.out print(out) @@ -4077,16 +4290,15 @@ def test_sig_kmers_1_dna_empty_seq(runtmp): def test_sig_kmers_1_dna_empty_sig(runtmp): # test sig kmers with empty query sig - seqfile = utils.get_test_data('short.fa') + seqfile = utils.get_test_data("short.fa") mh = sourmash.MinHash(ksize=31, n=0, scaled=1) ss = sourmash.SourmashSignature(mh, name="empty") - with open(runtmp.output('empty.sig'), 'wt') as fp: + with open(runtmp.output("empty.sig"), "w") as fp: sourmash.save_signatures([ss], fp) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'kmers', '--sig', 'empty.sig', - '--seq', seqfile) + runtmp.sourmash("sig", "kmers", "--sig", "empty.sig", "--seq", seqfile) out = runtmp.last_result.out print(out) @@ -4098,51 +4310,58 @@ def test_sig_kmers_1_dna_empty_sig(runtmp): def test_sig_kmers_1_dna_single_sig(runtmp): # test sig kmers with a fabricated query sig with a single hash - seqfile = utils.get_test_data('short.fa') + seqfile = utils.get_test_data("short.fa") mh = sourmash.MinHash(ksize=31, n=0, scaled=1) mh.add_hash(1070961951490202715) ss = sourmash.SourmashSignature(mh, name="small") - with open(runtmp.output('small.sig'), 'wt') as fp: + with open(runtmp.output("small.sig"), "w") as fp: sourmash.save_signatures([ss], fp) - runtmp.sourmash('sig', 'kmers', '--sig', 'small.sig', - '--seq', seqfile) + runtmp.sourmash("sig", "kmers", "--sig", "small.sig", "--seq", seqfile) out = runtmp.last_result.out print(out) err = runtmp.last_result.err print(err) - assert 'total hashes in merged signature: 1' in err - assert 'found 1 distinct matching hashes (100.0%)' in err + assert "total hashes in merged signature: 1" in err + assert "found 1 distinct matching hashes (100.0%)" in err def test_sig_kmers_1_dna_lowscaled(runtmp): # test sig kmers on dna with a scaled of 100, so not all k-mers - seqfile = utils.get_test_data('short.fa') + seqfile = utils.get_test_data("short.fa") - runtmp.sourmash('sketch', 'dna', seqfile, '-p', 'scaled=100') - ss = sourmash.load_one_signature(runtmp.output('short.fa.sig')) + runtmp.sourmash("sketch", "dna", seqfile, "-p", "scaled=100") + ss = sourmash.load_one_signature(runtmp.output("short.fa.sig")) mh = ss.minhash - assert mh.moltype == 'DNA' - - runtmp.sourmash('sig', 'kmers', '--sig', 'short.fa.sig', -
'--seq', seqfile, - '--save-kmers', 'short.csv', - '--save-sequences', 'matched.fa') + assert mh.moltype == "DNA" + + runtmp.sourmash( + "sig", + "kmers", + "--sig", + "short.fa.sig", + "--seq", + seqfile, + "--save-kmers", + "short.csv", + "--save-sequences", + "matched.fa", + ) out = runtmp.last_result.out print(out) err = runtmp.last_result.err print(err) - assert 'total hashes in merged signature: 5' in err - assert 'found 5 distinct matching hashes (100.0%)' in err + assert "total hashes in merged signature: 5" in err + assert "found 5 distinct matching hashes (100.0%)" in err # check FASTA output - assert os.path.exists(runtmp.output('matched.fa')) - with screed.open(runtmp.output('matched.fa')) as f: + assert os.path.exists(runtmp.output("matched.fa")) + with screed.open(runtmp.output("matched.fa")) as f: records = list(f) assert len(records) == 1 assert len(records[0].sequence) == 1000, len(records[0].sequence) @@ -4153,8 +4372,8 @@ def test_sig_kmers_1_dna_lowscaled(runtmp): assert seq_mh.similarity(mh) == 1.0 # check CSV output w/k-mers and hashes etc - assert os.path.exists(runtmp.output('short.csv')) - with open(runtmp.output('short.csv'), newline='') as fp: + assert os.path.exists(runtmp.output("short.csv")) + with open(runtmp.output("short.csv"), newline="") as fp: r = csv.DictReader(fp) rows = list(r) assert len(rows) == 5 @@ -4162,37 +4381,45 @@ def test_sig_kmers_1_dna_lowscaled(runtmp): check_mh = mh.copy_and_clear() check_mh2 = mh.copy_and_clear() for row in rows: - check_mh.add_sequence(row['kmer']) - check_mh2.add_hash(int(row['hashval'])) + check_mh.add_sequence(row["kmer"]) + check_mh2.add_hash(int(row["hashval"])) assert check_mh.similarity(mh) == 1.0 assert check_mh2.similarity(mh) == 1.0 def test_sig_kmers_1_dna_num(runtmp): # test sig kmers on dna with num=50, so not all k-mers - seqfile = utils.get_test_data('short.fa') + seqfile = utils.get_test_data("short.fa") - runtmp.sourmash('sketch', 'dna', seqfile, '-p', 'num=50') - ss = sourmash.load_one_signature(runtmp.output('short.fa.sig')) + runtmp.sourmash("sketch", "dna", seqfile, "-p", "num=50") + ss = sourmash.load_one_signature(runtmp.output("short.fa.sig")) mh = ss.minhash - assert mh.moltype == 'DNA' - - runtmp.sourmash('sig', 'kmers', '--sig', 'short.fa.sig', - '--seq', seqfile, - '--save-kmers', 'short.csv', - '--save-sequences', 'matched.fa') + assert mh.moltype == "DNA" + + runtmp.sourmash( + "sig", + "kmers", + "--sig", + "short.fa.sig", + "--seq", + seqfile, + "--save-kmers", + "short.csv", + "--save-sequences", + "matched.fa", + ) out = runtmp.last_result.out print(out) err = runtmp.last_result.err print(err) - assert 'total hashes in merged signature: 50' in err - assert 'found 50 distinct matching hashes (100.0%)' in err + assert "total hashes in merged signature: 50" in err + assert "found 50 distinct matching hashes (100.0%)" in err # check FASTA output - assert os.path.exists(runtmp.output('matched.fa')) - with screed.open(runtmp.output('matched.fa')) as f: + assert os.path.exists(runtmp.output("matched.fa")) + with screed.open(runtmp.output("matched.fa")) as f: records = list(f) assert len(records) == 1 assert len(records[0].sequence) == 1000, len(records[0].sequence) @@ -4203,8 +4430,8 @@ def test_sig_kmers_1_dna_num(runtmp): assert seq_mh.similarity(mh) == 1.0 # check CSV output w/k-mers and hashes etc - assert os.path.exists(runtmp.output('short.csv')) - with open(runtmp.output('short.csv'), newline='') as fp: + assert os.path.exists(runtmp.output("short.csv")) + with
open(runtmp.output("short.csv"), newline="") as fp: r = csv.DictReader(fp) rows = list(r) assert len(rows) == 50 @@ -4212,37 +4439,46 @@ def test_sig_kmers_1_dna_num(runtmp): check_mh = mh.copy_and_clear() check_mh2 = mh.copy_and_clear() for row in rows: - check_mh.add_sequence(row['kmer']) - check_mh2.add_hash(int(row['hashval'])) + check_mh.add_sequence(row["kmer"]) + check_mh2.add_hash(int(row["hashval"])) assert check_mh.similarity(mh) == 1.0 assert check_mh2.similarity(mh) == 1.0 def test_sig_kmers_1_dna_translate_protein(runtmp): # test sig kmers on dna - seqfile = utils.get_test_data('short.fa') + seqfile = utils.get_test_data("short.fa") - runtmp.sourmash('sketch', 'translate', seqfile, '-p', 'scaled=1') - ss = sourmash.load_one_signature(runtmp.output('short.fa.sig')) + runtmp.sourmash("sketch", "translate", seqfile, "-p", "scaled=1") + ss = sourmash.load_one_signature(runtmp.output("short.fa.sig")) mh = ss.minhash - assert mh.moltype == 'protein' - - runtmp.sourmash('sig', 'kmers', '--sig', 'short.fa.sig', - '--seq', seqfile, - '--save-kmers', 'short.csv', - '--save-sequences', 'matched.fa', '--translate') + assert mh.moltype == "protein" + + runtmp.sourmash( + "sig", + "kmers", + "--sig", + "short.fa.sig", + "--seq", + seqfile, + "--save-kmers", + "short.csv", + "--save-sequences", + "matched.fa", + "--translate", + ) out = runtmp.last_result.out print(out) err = runtmp.last_result.err print(err) - assert 'total hashes in merged signature: 1942' in err - assert 'found 1942 distinct matching hashes (100.0%)' in err + assert "total hashes in merged signature: 1942" in err + assert "found 1942 distinct matching hashes (100.0%)" in err # check FASTA output - assert os.path.exists(runtmp.output('matched.fa')) - with screed.open(runtmp.output('matched.fa')) as f: + assert os.path.exists(runtmp.output("matched.fa")) + with screed.open(runtmp.output("matched.fa")) as f: records = list(f) assert len(records) == 1 assert len(records[0].sequence) == 1000, len(records[0].sequence) @@ -4253,8 +4489,8 @@ def test_sig_kmers_1_dna_translate_protein(runtmp): assert seq_mh.similarity(mh) == 1.0 # check CSV output w/k-mers and hashes etc - assert os.path.exists(runtmp.output('short.csv')) - with open(runtmp.output('short.csv'), newline='') as fp: + assert os.path.exists(runtmp.output("short.csv")) + with open(runtmp.output("short.csv"), newline="") as fp: r = csv.DictReader(fp) rows = list(r) assert len(rows) == 1942 @@ -4262,37 +4498,46 @@ def test_sig_kmers_1_dna_translate_protein(runtmp): check_mh = mh.copy_and_clear() check_mh2 = mh.copy_and_clear() for row in rows: - check_mh.add_sequence(row['kmer']) - check_mh2.add_hash(int(row['hashval'])) + check_mh.add_sequence(row["kmer"]) + check_mh2.add_hash(int(row["hashval"])) assert check_mh.similarity(mh) == 1.0 assert check_mh2.similarity(mh) == 1.0 def test_sig_kmers_1_dna_translate_dayhoff(runtmp): # test sig kmers on dna - seqfile = utils.get_test_data('short.fa') + seqfile = utils.get_test_data("short.fa") - runtmp.sourmash('sketch', 'translate', seqfile, '-p', 'scaled=1,dayhoff') - ss = sourmash.load_one_signature(runtmp.output('short.fa.sig')) + runtmp.sourmash("sketch", "translate", seqfile, "-p", "scaled=1,dayhoff") + ss = sourmash.load_one_signature(runtmp.output("short.fa.sig")) mh = ss.minhash - assert mh.moltype == 'dayhoff' - - runtmp.sourmash('sig', 'kmers', '--sig', 'short.fa.sig', - '--seq', seqfile, - '--save-kmers', 'short.csv', - '--save-sequences', 'matched.fa', '--translate') + assert mh.moltype == "dayhoff" + + 
runtmp.sourmash( + "sig", + "kmers", + "--sig", + "short.fa.sig", + "--seq", + seqfile, + "--save-kmers", + "short.csv", + "--save-sequences", + "matched.fa", + "--translate", + ) out = runtmp.last_result.out print(out) err = runtmp.last_result.err print(err) - assert 'total hashes in merged signature: 1906' in err - assert 'found 1906 distinct matching hashes (100.0%)' in err + assert "total hashes in merged signature: 1906" in err + assert "found 1906 distinct matching hashes (100.0%)" in err # check FASTA output - assert os.path.exists(runtmp.output('matched.fa')) - with screed.open(runtmp.output('matched.fa')) as f: + assert os.path.exists(runtmp.output("matched.fa")) + with screed.open(runtmp.output("matched.fa")) as f: records = list(f) assert len(records) == 1 assert len(records[0].sequence) == 1000, len(records[0].sequence) @@ -4303,8 +4548,8 @@ def test_sig_kmers_1_dna_translate_dayhoff(runtmp): assert seq_mh.similarity(mh) == 1.0 # check CSV output w/k-mers and hashes etc - assert os.path.exists(runtmp.output('short.csv')) - with open(runtmp.output('short.csv'), newline='') as fp: + assert os.path.exists(runtmp.output("short.csv")) + with open(runtmp.output("short.csv"), newline="") as fp: r = csv.DictReader(fp) rows = list(r) assert len(rows) == 1906 @@ -4312,37 +4557,46 @@ def test_sig_kmers_1_dna_translate_dayhoff(runtmp): check_mh = mh.copy_and_clear() check_mh2 = mh.copy_and_clear() for row in rows: - check_mh.add_sequence(row['kmer']) - check_mh2.add_hash(int(row['hashval'])) + check_mh.add_sequence(row["kmer"]) + check_mh2.add_hash(int(row["hashval"])) assert check_mh.similarity(mh) == 1.0 assert check_mh2.similarity(mh) == 1.0 def test_sig_kmers_1_dna_translate_hp(runtmp): # test sig kmers on dna - seqfile = utils.get_test_data('short.fa') + seqfile = utils.get_test_data("short.fa") - runtmp.sourmash('sketch', 'translate', seqfile, '-p', 'scaled=1,hp') - ss = sourmash.load_one_signature(runtmp.output('short.fa.sig')) + runtmp.sourmash("sketch", "translate", seqfile, "-p", "scaled=1,hp") + ss = sourmash.load_one_signature(runtmp.output("short.fa.sig")) mh = ss.minhash - assert mh.moltype == 'hp' - - runtmp.sourmash('sig', 'kmers', '--sig', 'short.fa.sig', - '--seq', seqfile, - '--save-kmers', 'short.csv', - '--save-sequences', 'matched.fa', '--translate') + assert mh.moltype == "hp" + + runtmp.sourmash( + "sig", + "kmers", + "--sig", + "short.fa.sig", + "--seq", + seqfile, + "--save-kmers", + "short.csv", + "--save-sequences", + "matched.fa", + "--translate", + ) out = runtmp.last_result.out print(out) err = runtmp.last_result.err print(err) - assert 'total hashes in merged signature: 1750' in err - assert 'found 1750 distinct matching hashes (100.0%)' in err + assert "total hashes in merged signature: 1750" in err + assert "found 1750 distinct matching hashes (100.0%)" in err # check FASTA output - assert os.path.exists(runtmp.output('matched.fa')) - with screed.open(runtmp.output('matched.fa')) as f: + assert os.path.exists(runtmp.output("matched.fa")) + with screed.open(runtmp.output("matched.fa")) as f: records = list(f) assert len(records) == 1 assert len(records[0].sequence) == 1000, len(records[0].sequence) @@ -4353,8 +4607,8 @@ def test_sig_kmers_1_dna_translate_hp(runtmp): assert seq_mh.similarity(mh) == 1.0 # check CSV output w/k-mers and hashes etc - assert os.path.exists(runtmp.output('short.csv')) - with open(runtmp.output('short.csv'), newline='') as fp: + assert os.path.exists(runtmp.output("short.csv")) + with open(runtmp.output("short.csv"), newline="") as 
fp: r = csv.DictReader(fp) rows = list(r) assert len(rows) == 1750 @@ -4362,37 +4616,45 @@ def test_sig_kmers_1_dna_translate_hp(runtmp): check_mh = mh.copy_and_clear() check_mh2 = mh.copy_and_clear() for row in rows: - check_mh.add_sequence(row['kmer']) - check_mh2.add_hash(int(row['hashval'])) + check_mh.add_sequence(row["kmer"]) + check_mh2.add_hash(int(row["hashval"])) assert check_mh.similarity(mh) == 1.0 assert check_mh2.similarity(mh) == 1.0 def test_sig_kmers_2_protein(runtmp): # test out sig kmers on an faa file - seqfile = utils.get_test_data('ecoli.faa') + seqfile = utils.get_test_data("ecoli.faa") - runtmp.sourmash('sketch', 'protein', seqfile, '-p', 'scaled=1') - ss = sourmash.load_one_signature(runtmp.output('ecoli.faa.sig')) + runtmp.sourmash("sketch", "protein", seqfile, "-p", "scaled=1") + ss = sourmash.load_one_signature(runtmp.output("ecoli.faa.sig")) mh = ss.minhash - assert mh.moltype == 'protein' - - runtmp.sourmash('sig', 'kmers', '--sig', 'ecoli.faa.sig', - '--seq', seqfile, - '--save-kmers', 'ecoli.csv', - '--save-sequences', 'matched.fa') + assert mh.moltype == "protein" + + runtmp.sourmash( + "sig", + "kmers", + "--sig", + "ecoli.faa.sig", + "--seq", + seqfile, + "--save-kmers", + "ecoli.csv", + "--save-sequences", + "matched.fa", + ) out = runtmp.last_result.out print(out) err = runtmp.last_result.err print(err) - assert 'total hashes in merged signature: 1112' in err - assert 'found 1112 distinct matching hashes (100.0%)' in err + assert "total hashes in merged signature: 1112" in err + assert "found 1112 distinct matching hashes (100.0%)" in err # check FASTA output - assert os.path.exists(runtmp.output('matched.fa')) - with screed.open(runtmp.output('matched.fa')) as f: + assert os.path.exists(runtmp.output("matched.fa")) + with screed.open(runtmp.output("matched.fa")) as f: records = list(f) assert len(records) == 2 assert len(records[0].sequence) == 820, len(records[0].sequence) @@ -4404,8 +4666,8 @@ def test_sig_kmers_2_protein(runtmp): assert seq_mh.similarity(mh) == 1.0 # check CSV output w/k-mers and hashes etc - assert os.path.exists(runtmp.output('ecoli.csv')) - with open(runtmp.output('ecoli.csv'), newline='') as fp: + assert os.path.exists(runtmp.output("ecoli.csv")) + with open(runtmp.output("ecoli.csv"), newline="") as fp: r = csv.DictReader(fp) rows = list(r) assert len(rows) == 1112 @@ -4413,37 +4675,45 @@ def test_sig_kmers_2_protein(runtmp): check_mh = mh.copy_and_clear() check_mh2 = mh.copy_and_clear() for row in rows: - check_mh.add_protein(row['kmer']) - check_mh2.add_hash(int(row['hashval'])) + check_mh.add_protein(row["kmer"]) + check_mh2.add_hash(int(row["hashval"])) assert check_mh.similarity(mh) == 1.0 assert check_mh2.similarity(mh) == 1.0 def test_sig_kmers_2_dayhoff(runtmp): # test out sig kmers on an faa file - seqfile = utils.get_test_data('ecoli.faa') + seqfile = utils.get_test_data("ecoli.faa") - runtmp.sourmash('sketch', 'protein', seqfile, '-p', 'scaled=1,dayhoff') - ss = sourmash.load_one_signature(runtmp.output('ecoli.faa.sig')) + runtmp.sourmash("sketch", "protein", seqfile, "-p", "scaled=1,dayhoff") + ss = sourmash.load_one_signature(runtmp.output("ecoli.faa.sig")) mh = ss.minhash - assert mh.moltype == 'dayhoff' - - runtmp.sourmash('sig', 'kmers', '--sig', 'ecoli.faa.sig', - '--seq', seqfile, - '--save-kmers', 'ecoli.csv', - '--save-sequences', 'matched.fa') + assert mh.moltype == "dayhoff" + + runtmp.sourmash( + "sig", + "kmers", + "--sig", + "ecoli.faa.sig", + "--seq", + seqfile, + "--save-kmers", + "ecoli.csv", + 
"--save-sequences", + "matched.fa", + ) out = runtmp.last_result.out print(out) err = runtmp.last_result.err print(err) - assert 'total hashes in merged signature: 1100' in err - assert 'found 1100 distinct matching hashes (100.0%)' in err + assert "total hashes in merged signature: 1100" in err + assert "found 1100 distinct matching hashes (100.0%)" in err # check FASTA output - assert os.path.exists(runtmp.output('matched.fa')) - with screed.open(runtmp.output('matched.fa')) as f: + assert os.path.exists(runtmp.output("matched.fa")) + with screed.open(runtmp.output("matched.fa")) as f: records = list(f) assert len(records) == 2 assert len(records[0].sequence) == 820, len(records[0].sequence) @@ -4455,8 +4725,8 @@ def test_sig_kmers_2_dayhoff(runtmp): assert seq_mh.similarity(mh) == 1.0 # check CSV output w/k-mers and hashes etc - assert os.path.exists(runtmp.output('ecoli.csv')) - with open(runtmp.output('ecoli.csv'), newline='') as fp: + assert os.path.exists(runtmp.output("ecoli.csv")) + with open(runtmp.output("ecoli.csv"), newline="") as fp: r = csv.DictReader(fp) rows = list(r) assert len(rows) == 1100 @@ -4464,37 +4734,45 @@ def test_sig_kmers_2_dayhoff(runtmp): check_mh = mh.copy_and_clear() check_mh2 = mh.copy_and_clear() for row in rows: - check_mh.add_protein(row['kmer']) - check_mh2.add_hash(int(row['hashval'])) + check_mh.add_protein(row["kmer"]) + check_mh2.add_hash(int(row["hashval"])) assert check_mh.similarity(mh) == 1.0 assert check_mh2.similarity(mh) == 1.0 def test_sig_kmers_2_hp(runtmp): # test out sig kmers on an faa file - seqfile = utils.get_test_data('ecoli.faa') + seqfile = utils.get_test_data("ecoli.faa") - runtmp.sourmash('sketch', 'protein', seqfile, '-p', 'scaled=1,hp') - ss = sourmash.load_one_signature(runtmp.output('ecoli.faa.sig')) + runtmp.sourmash("sketch", "protein", seqfile, "-p", "scaled=1,hp") + ss = sourmash.load_one_signature(runtmp.output("ecoli.faa.sig")) mh = ss.minhash - assert mh.moltype == 'hp' - - runtmp.sourmash('sig', 'kmers', '--sig', 'ecoli.faa.sig', - '--seq', seqfile, - '--save-kmers', 'ecoli.csv', - '--save-sequences', 'matched.fa') + assert mh.moltype == "hp" + + runtmp.sourmash( + "sig", + "kmers", + "--sig", + "ecoli.faa.sig", + "--seq", + seqfile, + "--save-kmers", + "ecoli.csv", + "--save-sequences", + "matched.fa", + ) out = runtmp.last_result.out print(out) err = runtmp.last_result.err print(err) - assert 'total hashes in merged signature: 1048' in err - assert 'found 1048 distinct matching hashes (100.0%)' in err + assert "total hashes in merged signature: 1048" in err + assert "found 1048 distinct matching hashes (100.0%)" in err # check FASTA output - assert os.path.exists(runtmp.output('matched.fa')) - with screed.open(runtmp.output('matched.fa')) as f: + assert os.path.exists(runtmp.output("matched.fa")) + with screed.open(runtmp.output("matched.fa")) as f: records = list(f) assert len(records) == 2 assert len(records[0].sequence) == 820, len(records[0].sequence) @@ -4506,8 +4784,8 @@ def test_sig_kmers_2_hp(runtmp): assert seq_mh.similarity(mh) == 1.0 # check CSV output w/k-mers and hashes etc - assert os.path.exists(runtmp.output('ecoli.csv')) - with open(runtmp.output('ecoli.csv'), newline='') as fp: + assert os.path.exists(runtmp.output("ecoli.csv")) + with open(runtmp.output("ecoli.csv"), newline="") as fp: r = csv.DictReader(fp) rows = list(r) assert len(rows) == 1048 @@ -4515,33 +4793,33 @@ def test_sig_kmers_2_hp(runtmp): check_mh = mh.copy_and_clear() check_mh2 = mh.copy_and_clear() for row in rows: - 
check_mh.add_protein(row['kmer']) - check_mh2.add_hash(int(row['hashval'])) + check_mh.add_protein(row["kmer"]) + check_mh2.add_hash(int(row["hashval"])) assert check_mh.similarity(mh) == 1.0 assert check_mh2.similarity(mh) == 1.0 def test_sig_check_1(runtmp): # basic check functionality - sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/salmonella-picklist.csv') + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist.csv") - runtmp.sourmash('sig', 'check', *sigfiles, - "--picklist", f"{picklist}::manifest", - "-m", "mf.csv") + runtmp.sourmash( + "sig", "check", *sigfiles, "--picklist", f"{picklist}::manifest", "-m", "mf.csv" + ) - out_mf = runtmp.output('mf.csv') + out_mf = runtmp.output("mf.csv") assert os.path.exists(out_mf) # all should match. - with open(out_mf, newline='') as fp: + with open(out_mf, newline="") as fp: mf = CollectionManifest.load_from_csv(fp) assert len(mf) == 24 idx = sourmash.load_file_as_index(out_mf) siglist = list(idx.signatures()) assert len(siglist) == 24 - ksizes = set([ ss.minhash.ksize for ss in siglist ]) + ksizes = set([ss.minhash.ksize for ss in siglist]) assert len(ksizes) == 3 assert 11 in ksizes assert 21 in ksizes @@ -4550,25 +4828,31 @@ def test_sig_check_1(runtmp): def test_sig_check_1_mf_csv_gz(runtmp): # basic check functionality, with gzipped manifest output - sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/salmonella-picklist.csv') - - runtmp.sourmash('sig', 'check', *sigfiles, - "--picklist", f"{picklist}::manifest", - "-m", "mf.csv.gz") - - out_mf = runtmp.output('mf.csv.gz') + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist.csv") + + runtmp.sourmash( + "sig", + "check", + *sigfiles, + "--picklist", + f"{picklist}::manifest", + "-m", + "mf.csv.gz", + ) + + out_mf = runtmp.output("mf.csv.gz") assert os.path.exists(out_mf) # all should match. - with gzip.open(out_mf, "rt", newline='') as fp: + with gzip.open(out_mf, "rt", newline="") as fp: mf = CollectionManifest.load_from_csv(fp) assert len(mf) == 24 idx = sourmash.load_file_as_index(out_mf) siglist = list(idx.signatures()) assert len(siglist) == 24 - ksizes = set([ ss.minhash.ksize for ss in siglist ]) + ksizes = set([ss.minhash.ksize for ss in siglist]) assert len(ksizes) == 3 assert 11 in ksizes assert 21 in ksizes @@ -4577,30 +4861,36 @@ def test_sig_check_1_mf_csv_gz(runtmp): def test_sig_check_1_gz(runtmp): # basic check functionality with gzipped picklist - sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/salmonella-picklist.csv') - picklist_gz = runtmp.output('salmonella.csv.gz') + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist.csv") + picklist_gz = runtmp.output("salmonella.csv.gz") with gzip.open(picklist_gz, "w") as outfp: with open(picklist, "rb") as infp: outfp.write(infp.read()) - runtmp.sourmash('sig', 'check', *sigfiles, - "--picklist", "salmonella.csv.gz::manifest", - "-m", "mf.csv") - - out_mf = runtmp.output('mf.csv') + runtmp.sourmash( + "sig", + "check", + *sigfiles, + "--picklist", + "salmonella.csv.gz::manifest", + "-m", + "mf.csv", + ) + + out_mf = runtmp.output("mf.csv") assert os.path.exists(out_mf) # all should match. 
- with open(out_mf, newline='') as fp: + with open(out_mf, newline="") as fp: mf = CollectionManifest.load_from_csv(fp) assert len(mf) == 24 idx = sourmash.load_file_as_index(out_mf) siglist = list(idx.signatures()) assert len(siglist) == 24 - ksizes = set([ ss.minhash.ksize for ss in siglist ]) + ksizes = set([ss.minhash.ksize for ss in siglist]) assert len(ksizes) == 3 assert 11 in ksizes assert 21 in ksizes @@ -4609,25 +4899,32 @@ def test_sig_check_1_gz(runtmp): def test_sig_check_1_nofail(runtmp): # basic check functionality with --fail-if-missing - sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/salmonella-picklist.csv') - - runtmp.sourmash('sig', 'check', *sigfiles, - "--picklist", f"{picklist}::manifest", - "-m", "mf.csv", '--fail-if-missing') - - out_mf = runtmp.output('mf.csv') + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist.csv") + + runtmp.sourmash( + "sig", + "check", + *sigfiles, + "--picklist", + f"{picklist}::manifest", + "-m", + "mf.csv", + "--fail-if-missing", + ) + + out_mf = runtmp.output("mf.csv") assert os.path.exists(out_mf) # all should match. - with open(out_mf, newline='') as fp: + with open(out_mf, newline="") as fp: mf = CollectionManifest.load_from_csv(fp) assert len(mf) == 24 idx = sourmash.load_file_as_index(out_mf) siglist = list(idx.signatures()) assert len(siglist) == 24 - ksizes = set([ ss.minhash.ksize for ss in siglist ]) + ksizes = set([ss.minhash.ksize for ss in siglist]) assert len(ksizes) == 3 assert 11 in ksizes assert 21 in ksizes @@ -4636,44 +4933,54 @@ def test_sig_check_1_nofail(runtmp): def test_sig_check_1_no_picklist(runtmp): # basic check functionality - sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/salmonella-picklist.csv') + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + utils.get_test_data("gather/salmonella-picklist.csv") with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('sig', 'check', *sigfiles) + runtmp.sourmash("sig", "check", *sigfiles) assert "No picklist provided?! Exiting." in str(exc) -@pytest.mark.parametrize("column, coltype", - (('md5', 'md5'), - ('md5', 'md5prefix8'), - ('name', 'name'), - ('name', 'ident'), - ('name', 'identprefix'), - )) +@pytest.mark.parametrize( + "column, coltype", + ( + ("md5", "md5"), + ("md5", "md5prefix8"), + ("name", "name"), + ("name", "ident"), + ("name", "identprefix"), + ), +) def test_sig_check_1_column(runtmp, column, coltype): # basic check functionality for various columns/coltypes - sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/salmonella-picklist.csv') - - runtmp.sourmash('sig', 'check', *sigfiles, - "--picklist", f"{picklist}:{column}:{coltype}", - "-m", "mf.csv", - "-o", "missing.csv") - - out_mf = runtmp.output('mf.csv') + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist.csv") + + runtmp.sourmash( + "sig", + "check", + *sigfiles, + "--picklist", + f"{picklist}:{column}:{coltype}", + "-m", + "mf.csv", + "-o", + "missing.csv", + ) + + out_mf = runtmp.output("mf.csv") assert os.path.exists(out_mf) # all should match. 
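As the parametrization above shows, the '--picklist' argument has the general form path:column:coltype, with an optional fourth field; an empty column plus coltype 'manifest' (path::manifest) matches against full manifest rows. A short sketch of the variants these tests exercise (the picklist path is illustrative):

picklist = "gather/salmonella-picklist.csv"
variants = [
    f"{picklist}::manifest",           # match rows against the whole manifest
    f"{picklist}:md5:md5prefix8",      # match the md5 column as an 8-char prefix
    f"{picklist}:name:identprefix",    # match the name column by identifier prefix
    f"{picklist}:name2:name:exclude",  # custom column name; exclude matches
]

The checks that follow confirm that all 24 signatures match each include-style picklist.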
- with open(out_mf, newline='') as fp: + with open(out_mf, newline="") as fp: mf = CollectionManifest.load_from_csv(fp) assert len(mf) == 24 idx = sourmash.load_file_as_index(out_mf) siglist = list(idx.signatures()) assert len(siglist) == 24 - ksizes = set([ ss.minhash.ksize for ss in siglist ]) + ksizes = set([ss.minhash.ksize for ss in siglist]) assert len(ksizes) == 3 assert 11 in ksizes assert 21 in ksizes @@ -4682,113 +4989,133 @@ def test_sig_check_1_column(runtmp, column, coltype): def test_sig_check_1_diff_col_name(runtmp): # 'sig check' with 'name2' column instead of default name - sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/salmonella-picklist-diffcolumn.csv') - - runtmp.sourmash('sig', 'check', *sigfiles, - "--picklist", f"{picklist}:name2:name", - "-o", "missing.csv", - '-m', 'mf.csv') - - out_mf = runtmp.output('mf.csv') + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist-diffcolumn.csv") + + runtmp.sourmash( + "sig", + "check", + *sigfiles, + "--picklist", + f"{picklist}:name2:name", + "-o", + "missing.csv", + "-m", + "mf.csv", + ) + + out_mf = runtmp.output("mf.csv") assert os.path.exists(out_mf) - missing_csv = runtmp.output('missing.csv') + missing_csv = runtmp.output("missing.csv") assert os.path.exists(missing_csv) # should be 24 matching manifest rows - with open(out_mf, newline='') as fp: + with open(out_mf, newline="") as fp: mf = CollectionManifest.load_from_csv(fp) assert len(mf) == 24 # internal locations should match sigfile_set = set(sigfiles) for row in mf.rows: - assert row['internal_location'] in sigfile_set + assert row["internal_location"] in sigfile_set idx = sourmash.load_file_as_index(out_mf) siglist = list(idx.signatures()) assert len(siglist) == 24 - ksizes = set([ ss.minhash.ksize for ss in siglist ]) + ksizes = set([ss.minhash.ksize for ss in siglist]) assert len(ksizes) == 3 assert 11 in ksizes assert 21 in ksizes assert 31 in ksizes # should be one non-matching picklist row - with open(missing_csv, newline='') as fp: + with open(missing_csv, newline="") as fp: rows = list(csv.reader(fp)) - assert len(rows) == 2 # header row + data row - assert rows[1][0] == 'NOT THERE' + assert len(rows) == 2 # header row + data row + assert rows[1][0] == "NOT THERE" def test_sig_check_1_diff_col_name_zip(runtmp): # 'sig check' with 'name2' column instead of default name, on a zip file - sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/salmonella-picklist-diffcolumn.csv') + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist-diffcolumn.csv") # first create a zip db - runtmp.sourmash('sig', 'cat', *sigfiles, '-o', 'gcf.zip') + runtmp.sourmash("sig", "cat", *sigfiles, "-o", "gcf.zip") # now run against this zip - runtmp.sourmash('sig', 'check', 'gcf.zip', - "--picklist", f"{picklist}:name2:name", - "-o", "missing.csv", - '-m', 'mf.csv') - - out_mf = runtmp.output('mf.csv') + runtmp.sourmash( + "sig", + "check", + "gcf.zip", + "--picklist", + f"{picklist}:name2:name", + "-o", + "missing.csv", + "-m", + "mf.csv", + ) + + out_mf = runtmp.output("mf.csv") assert os.path.exists(out_mf) - missing_csv = runtmp.output('missing.csv') + missing_csv = runtmp.output("missing.csv") assert os.path.exists(missing_csv) # should be 24 matching manifest rows - with open(out_mf, newline='') as fp: + with open(out_mf, newline="") as fp: mf 
= CollectionManifest.load_from_csv(fp) assert len(mf) == 24 # internal locations should all point to zip - ilocs = set(( row['internal_location'] for row in mf.rows )) + ilocs = set(row["internal_location"] for row in mf.rows) assert len(ilocs) == 1 # can we get 'em? idx = sourmash.load_file_as_index(out_mf) siglist = list(idx.signatures()) assert len(siglist) == 24 - ksizes = set([ ss.minhash.ksize for ss in siglist ]) + ksizes = set([ss.minhash.ksize for ss in siglist]) assert len(ksizes) == 3 assert 11 in ksizes assert 21 in ksizes assert 31 in ksizes # should be one non-matching picklist row - with open(missing_csv, newline='') as fp: + with open(missing_csv, newline="") as fp: rows = list(csv.reader(fp)) - assert len(rows) == 2 # header row + data row - assert rows[1][0] == 'NOT THERE' + assert len(rows) == 2 # header row + data row + assert rows[1][0] == "NOT THERE" def test_sig_check_1_diff_col_name_exclude(runtmp): # 'sig check' with 'name2' column, :exclude picklist - sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/salmonella-picklist-diffcolumn.csv') - - runtmp.sourmash('sig', 'check', *sigfiles, - "--picklist", f"{picklist}:name2:name:exclude", - '-m', 'mf.csv') - - out_mf = runtmp.output('mf.csv') + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist-diffcolumn.csv") + + runtmp.sourmash( + "sig", + "check", + *sigfiles, + "--picklist", + f"{picklist}:name2:name:exclude", + "-m", + "mf.csv", + ) + + out_mf = runtmp.output("mf.csv") assert os.path.exists(out_mf) # should be 12 matching manifest rows - with open(out_mf, newline='') as fp: + with open(out_mf, newline="") as fp: mf = CollectionManifest.load_from_csv(fp) assert len(mf) == 12 idx = sourmash.load_file_as_index(out_mf) siglist = list(idx.signatures()) assert len(siglist) == 12 - ksizes = set([ ss.minhash.ksize for ss in siglist ]) + ksizes = set([ss.minhash.ksize for ss in siglist]) assert len(ksizes) == 3 assert 11 in ksizes assert 21 in ksizes @@ -4797,72 +5124,98 @@ def test_sig_check_1_diff_col_name_exclude(runtmp): def test_sig_check_1_ksize(runtmp): # basic check functionality with selection for ksize - sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/salmonella-picklist.csv') - - runtmp.sourmash('sig', 'check', *sigfiles, '-k', '31', - "--picklist", f"{picklist}::manifest", - "-m", "mf.csv") - - out_mf = runtmp.output('mf.csv') + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist.csv") + + runtmp.sourmash( + "sig", + "check", + *sigfiles, + "-k", + "31", + "--picklist", + f"{picklist}::manifest", + "-m", + "mf.csv", + ) + + out_mf = runtmp.output("mf.csv") assert os.path.exists(out_mf) # 8 of the 24 should match. 
- with open(out_mf, newline='') as fp: + with open(out_mf, newline="") as fp: mf = CollectionManifest.load_from_csv(fp) assert len(mf) == 8 idx = sourmash.load_file_as_index(out_mf) siglist = list(idx.signatures()) assert len(siglist) == 8 - ksizes = set([ ss.minhash.ksize for ss in siglist ]) + ksizes = set([ss.minhash.ksize for ss in siglist]) assert len(ksizes) == 1 assert 31 in ksizes def test_sig_check_1_ksize_output_sql(runtmp): # basic check functionality with selection for ksize - sigfiles = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/salmonella-picklist.csv') - - runtmp.sourmash('sig', 'check', *sigfiles, '-k', '31', - "--picklist", f"{picklist}::manifest", - "-m", "mf.mfsql", "-F", "sql") - - out_mf = runtmp.output('mf.mfsql') + sigfiles = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/salmonella-picklist.csv") + + runtmp.sourmash( + "sig", + "check", + *sigfiles, + "-k", + "31", + "--picklist", + f"{picklist}::manifest", + "-m", + "mf.mfsql", + "-F", + "sql", + ) + + out_mf = runtmp.output("mf.mfsql") assert os.path.exists(out_mf) # 8 of the 24 should match. mf = CollectionManifest.load_from_filename(out_mf) assert len(mf) == 8 - assert mf.conn # check that it's a sqlite manifest! hacky... + assert mf.conn # check that it's a sqlite manifest! hacky... idx = sourmash.load_file_as_index(out_mf) siglist = list(idx.signatures()) assert len(siglist) == 8 - ksizes = set([ ss.minhash.ksize for ss in siglist ]) + ksizes = set([ss.minhash.ksize for ss in siglist]) assert len(ksizes) == 1 assert 31 in ksizes def test_sig_check_2_output_missing(runtmp): # output missing all as identical to input picklist - sigfiles = utils.get_test_data('gather/combined.sig') - picklist = utils.get_test_data('gather/salmonella-picklist.csv') - - runtmp.sourmash('sig', 'check', sigfiles, - "--picklist", f"{picklist}::manifest", - "-o", "missing.csv", "-m", "mf.csv") - - out_csv = runtmp.output('missing.csv') + sigfiles = utils.get_test_data("gather/combined.sig") + picklist = utils.get_test_data("gather/salmonella-picklist.csv") + + runtmp.sourmash( + "sig", + "check", + sigfiles, + "--picklist", + f"{picklist}::manifest", + "-o", + "missing.csv", + "-m", + "mf.csv", + ) + + out_csv = runtmp.output("missing.csv") assert os.path.exists(out_csv) - mf_csv = runtmp.output('mf.csv') + mf_csv = runtmp.output("mf.csv") assert not os.path.exists(mf_csv) assert "not saving matching manifest" in runtmp.last_result.err # everything is missing with 'combined.sig' - with open(out_csv, newline='') as fp: + with open(out_csv, newline="") as fp: r = csv.DictReader(fp) rows = list(r) @@ -4871,51 +5224,67 @@ def test_sig_check_2_output_missing(runtmp): def test_sig_check_2_output_missing_error_exit(runtmp): # output missing all as identical to input picklist - sigfiles = utils.get_test_data('gather/combined.sig') - picklist = utils.get_test_data('gather/salmonella-picklist.csv') + sigfiles = utils.get_test_data("gather/combined.sig") + picklist = utils.get_test_data("gather/salmonella-picklist.csv") # should error exit... with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'check', sigfiles, - "--picklist", f"{picklist}::manifest", - "-o", "missing.csv", '--fail') + runtmp.sourmash( + "sig", + "check", + sigfiles, + "--picklist", + f"{picklist}::manifest", + "-o", + "missing.csv", + "--fail", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) # ...and also output stuff! 
- out_csv = runtmp.output('missing.csv') + out_csv = runtmp.output("missing.csv") assert os.path.exists(out_csv) # everything is missing with 'combined.sig' - with open(out_csv, newline='') as fp: + with open(out_csv, newline="") as fp: r = csv.DictReader(fp) rows = list(r) assert len(rows) == 24 -@pytest.mark.parametrize("column, coltype", - (('md5', 'md5'), - ('md5', 'md5prefix8'), - ('name', 'name'), - ('name', 'ident'), - ('name', 'identprefix'), - )) +@pytest.mark.parametrize( + "column, coltype", + ( + ("md5", "md5"), + ("md5", "md5prefix8"), + ("name", "name"), + ("name", "ident"), + ("name", "identprefix"), + ), +) def test_sig_check_2_output_missing_column(runtmp, column, coltype): # output missing all as identical to input picklist - sigfiles = utils.get_test_data('gather/combined.sig') - picklist = utils.get_test_data('gather/salmonella-picklist.csv') - - runtmp.sourmash('sig', 'check', sigfiles, - "--picklist", f"{picklist}::manifest", - "-o", "missing.csv") - - out_csv = runtmp.output('missing.csv') + sigfiles = utils.get_test_data("gather/combined.sig") + picklist = utils.get_test_data("gather/salmonella-picklist.csv") + + runtmp.sourmash( + "sig", + "check", + sigfiles, + "--picklist", + f"{picklist}::manifest", + "-o", + "missing.csv", + ) + + out_csv = runtmp.output("missing.csv") assert os.path.exists(out_csv) # everything is missing with 'combined.sig' - with open(out_csv, newline='') as fp: + with open(out_csv, newline="") as fp: r = csv.DictReader(fp) rows = list(r) @@ -4924,25 +5293,33 @@ def test_sig_check_2_output_missing_column(runtmp, column, coltype): def test_sig_check_2_output_missing_exclude(runtmp): # 'exclude' with '-o' shouldn't work - sigfiles = utils.get_test_data('gather/combined.sig') - picklist = utils.get_test_data('gather/salmonella-picklist.csv') + sigfiles = utils.get_test_data("gather/combined.sig") + picklist = utils.get_test_data("gather/salmonella-picklist.csv") with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('sig', 'check', sigfiles, - "--picklist", f"{picklist}:name:name:exclude", - "-o", "missing.csv") - - assert "** ERROR: Cannot use an 'exclude' picklist with '-o/--output-missing'" in str(exc) + runtmp.sourmash( + "sig", + "check", + sigfiles, + "--picklist", + f"{picklist}:name:name:exclude", + "-o", + "missing.csv", + ) + + assert ( + "** ERROR: Cannot use an 'exclude' picklist with '-o/--output-missing'" + in str(exc) + ) def test_sig_check_3_no_manifest(runtmp): # fail check when no manifest, by default - sbt = utils.get_test_data('v6.sbt.zip') - picklist = utils.get_test_data('v6.sbt.zip.mf.csv') + sbt = utils.get_test_data("v6.sbt.zip") + picklist = utils.get_test_data("v6.sbt.zip.mf.csv") - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('sig', 'check', sbt, - '--picklist', f"{picklist}::manifest") + with pytest.raises(SourmashCommandFailed): + runtmp.run_sourmash("sig", "check", sbt, "--picklist", f"{picklist}::manifest") print(runtmp.last_result.out) print(runtmp.last_result.err) @@ -4953,12 +5330,21 @@ def test_sig_check_3_no_manifest(runtmp): def test_sig_check_3_no_manifest_ok(runtmp): # generate manifest if --no-require-manifest - sbt = utils.get_test_data('v6.sbt.zip') - picklist = utils.get_test_data('v6.sbt.zip.mf.csv') - - runtmp.run_sourmash('sig', 'check', sbt, "--no-require-manifest", - '--picklist', f"{picklist}::manifest") + sbt = utils.get_test_data("v6.sbt.zip") + picklist = utils.get_test_data("v6.sbt.zip.mf.csv") + + runtmp.run_sourmash( + "sig", + "check", + sbt, + 
"--no-require-manifest", + "--picklist", + f"{picklist}::manifest", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert "for given picklist, found 7 matches to 7 distinct values" in runtmp.last_result.err + assert ( + "for given picklist, found 7 matches to 7 distinct values" + in runtmp.last_result.err + ) diff --git a/tests/test_cmd_signature_collect.py b/tests/test_cmd_signature_collect.py index 61f703080f..edd7c16a29 100644 --- a/tests/test_cmd_signature_collect.py +++ b/tests/test_cmd_signature_collect.py @@ -15,13 +15,13 @@ def test_sig_collect_0_nothing(runtmp, manifest_db_format): # run with just output - ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' - if manifest_db_format != 'sql': return + ext = "sqlmf" if manifest_db_format == "sql" else "csv" + if manifest_db_format != "sql": + return - runtmp.sourmash('sig', 'collect', '-o', f'mf.{ext}', - '-F', manifest_db_format) + runtmp.sourmash("sig", "collect", "-o", f"mf.{ext}", "-F", manifest_db_format) - manifest_fn = runtmp.output(f'mf.{ext}') + manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 0 @@ -29,124 +29,125 @@ def test_sig_collect_0_nothing(runtmp, manifest_db_format): def test_sig_collect_1_zipfile(runtmp, manifest_db_format): # collect a manifest from a .zip file - protzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") - ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' + ext = "sqlmf" if manifest_db_format == "sql" else "csv" - runtmp.sourmash('sig', 'collect', protzip, '-o', f'mf.{ext}', - '-F', manifest_db_format) + runtmp.sourmash( + "sig", "collect", protzip, "-o", f"mf.{ext}", "-F", manifest_db_format + ) - manifest_fn = runtmp.output(f'mf.{ext}') + manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list def test_sig_collect_1_zipfile_csv_gz(runtmp): # collect a manifest from a .zip file, save to csv.gz - protzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") - runtmp.sourmash('sig', 'collect', protzip, '-o', 'mf.csv.gz', - '-F', 'csv') + runtmp.sourmash("sig", "collect", protzip, "-o", "mf.csv.gz", "-F", "csv") - manifest_fn = runtmp.output('mf.csv.gz') + manifest_fn = runtmp.output("mf.csv.gz") # gzip, yes? 
- print('XXX', manifest_fn) - with gzip.open(manifest_fn, 'rt', newline='') as fp: + print("XXX", manifest_fn) + with gzip.open(manifest_fn, "rt", newline="") as fp: fp.read() manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list def test_sig_collect_1_zipfile_csv_gz_roundtrip(runtmp): # collect a manifest from a .zip file, save to csv.gz; then load again - protzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") - runtmp.sourmash('sig', 'collect', protzip, '-o', 'mf.csv.gz', - '-F', 'csv') + runtmp.sourmash("sig", "collect", protzip, "-o", "mf.csv.gz", "-F", "csv") - manifest_fn = runtmp.output('mf.csv.gz') + manifest_fn = runtmp.output("mf.csv.gz") # gzip, yes? - print('XXX', manifest_fn) - with gzip.open(manifest_fn, 'rt', newline='') as fp: + print("XXX", manifest_fn) + with gzip.open(manifest_fn, "rt", newline="") as fp: fp.read() manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list # can we read a csv.gz? - runtmp.sourmash('sig', 'collect', 'mf.csv.gz', '-o', 'mf2.csv', - '-F', 'csv') + runtmp.sourmash("sig", "collect", "mf.csv.gz", "-o", "mf2.csv", "-F", "csv") - manifest_fn2 = runtmp.output('mf2.csv') + manifest_fn2 = runtmp.output("mf2.csv") manifest2 = BaseCollectionManifest.load_from_filename(manifest_fn2) assert len(manifest2) == 2 - md5_list = [ row['md5'] for row in manifest2.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list - + md5_list = [row["md5"] for row in manifest2.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list def test_sig_collect_2_exists_fail(runtmp, manifest_db_format): # collect a manifest from two .zip files - protzip = utils.get_test_data('prot/protein.zip') - allzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") + allzip = utils.get_test_data("prot/protein.zip") - ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' + ext = "sqlmf" if manifest_db_format == "sql" else "csv" - runtmp.sourmash('sig', 'collect', protzip, '-o', f'mf.{ext}', - '-F', manifest_db_format) + runtmp.sourmash( + "sig", "collect", protzip, "-o", f"mf.{ext}", "-F", manifest_db_format + ) - manifest_fn = runtmp.output(f'mf.{ext}') + manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list # now run with same filename - should fail with 
pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'collect', allzip, '-o', manifest_fn, - '-F', manifest_db_format) + runtmp.sourmash( + "sig", "collect", allzip, "-o", manifest_fn, "-F", manifest_db_format + ) def test_sig_collect_2_exists_merge(runtmp, manifest_db_format): # collect a manifest from two .zip files - protzip = utils.get_test_data('prot/protein.zip') - allzip = utils.get_test_data('prot/all.zip') + protzip = utils.get_test_data("prot/protein.zip") + allzip = utils.get_test_data("prot/all.zip") - ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' + ext = "sqlmf" if manifest_db_format == "sql" else "csv" - runtmp.sourmash('sig', 'collect', protzip, '-o', f'mf.{ext}', - '-F', manifest_db_format) + runtmp.sourmash( + "sig", "collect", protzip, "-o", f"mf.{ext}", "-F", manifest_db_format + ) - manifest_fn = runtmp.output(f'mf.{ext}') + manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list # now run with same filename - should merge - runtmp.sourmash('sig', 'collect', allzip, '-o', manifest_fn, - '-F', manifest_db_format, '--merge') + runtmp.sourmash( + "sig", "collect", allzip, "-o", manifest_fn, "-F", manifest_db_format, "--merge" + ) manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 10 @@ -154,67 +155,68 @@ def test_sig_collect_2_exists_merge(runtmp, manifest_db_format): def test_sig_collect_2_exists_sql_merge_csv(runtmp, manifest_db_format): # try to merge csv into sql - protzip = utils.get_test_data('prot/protein.zip') - allzip = utils.get_test_data('prot/all.zip') + protzip = utils.get_test_data("prot/protein.zip") + allzip = utils.get_test_data("prot/all.zip") - ext = 'sqlmf' + ext = "sqlmf" # save as sql... - runtmp.sourmash('sig', 'collect', protzip, '-o', f'mf.{ext}', - '-F', 'sql') + runtmp.sourmash("sig", "collect", protzip, "-o", f"mf.{ext}", "-F", "sql") - manifest_fn = runtmp.output(f'mf.{ext}') + manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'collect', allzip, '-o', manifest_fn, - '-F', 'csv', '--merge') + runtmp.sourmash( + "sig", "collect", allzip, "-o", manifest_fn, "-F", "csv", "--merge" + ) assert "ERROR loading" in runtmp.last_result.err def test_sig_collect_2_exists_csv_merge_sql(runtmp): # try to merge sql into csv - protzip = utils.get_test_data('prot/protein.zip') - allzip = utils.get_test_data('prot/all.zip') + protzip = utils.get_test_data("prot/protein.zip") + allzip = utils.get_test_data("prot/all.zip") - ext = 'csv' + ext = "csv" # save as csv... 
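    # note: '--merge' seems to require that the existing output load in the
    # format given by '-F'; the manifest on disk here is CSV, so merging with
    # '-F sql' should fail with the "ERROR loading" message checked below.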
- runtmp.sourmash('sig', 'collect', protzip, '-o', f'mf.{ext}', - '-F', 'csv') + runtmp.sourmash("sig", "collect", protzip, "-o", f"mf.{ext}", "-F", "csv") - manifest_fn = runtmp.output(f'mf.{ext}') + manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'collect', allzip, '-o', manifest_fn, - '-F', 'sql', '--merge') + runtmp.sourmash( + "sig", "collect", allzip, "-o", manifest_fn, "-F", "sql", "--merge" + ) assert "ERROR loading" in runtmp.last_result.err def test_sig_collect_2_no_exists_merge(runtmp, manifest_db_format): # test 'merge' when args.output doesn't already exist => warning - protzip = utils.get_test_data('prot/protein.zip') - allzip = utils.get_test_data('prot/all.zip') + utils.get_test_data("prot/protein.zip") + allzip = utils.get_test_data("prot/all.zip") - ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' - manifest_fn = runtmp.output(f'mf.{ext}') + ext = "sqlmf" if manifest_db_format == "sql" else "csv" + manifest_fn = runtmp.output(f"mf.{ext}") # run with --merge but no previous: - runtmp.sourmash('sig', 'collect', allzip, '-o', manifest_fn, - '-F', manifest_db_format, '--merge') + runtmp.sourmash( + "sig", "collect", allzip, "-o", manifest_fn, "-F", manifest_db_format, "--merge" + ) manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 8 @@ -226,28 +228,37 @@ def test_sig_collect_2_no_exists_merge(runtmp, manifest_db_format): def test_sig_collect_3_multiple(runtmp, manifest_db_format): # collect a manifest from two .zip files - protzip = utils.get_test_data('prot/protein.zip') - hpzip = utils.get_test_data('prot/hp.zip') - dayzip = utils.get_test_data('prot/dayhoff.zip') - - ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' - - runtmp.sourmash('sig', 'collect', protzip, hpzip, dayzip, - '-o', f'mf.{ext}', '-F', manifest_db_format) - - manifest_fn = runtmp.output(f'mf.{ext}') + protzip = utils.get_test_data("prot/protein.zip") + hpzip = utils.get_test_data("prot/hp.zip") + dayzip = utils.get_test_data("prot/dayhoff.zip") + + ext = "sqlmf" if manifest_db_format == "sql" else "csv" + + runtmp.sourmash( + "sig", + "collect", + protzip, + hpzip, + dayzip, + "-o", + f"mf.{ext}", + "-F", + manifest_db_format, + ) + + manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 6 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list - assert 'ea2a1ad233c2908529d124a330bcb672' in md5_list - assert 'bb0e6d90df01b7bd5d0956a5f9e3ed12' in md5_list - assert 'fbca5e5211e4d58427997fd5c8343e9a' in md5_list - assert '1cbd888bf910f83ad8f1715509183223' in md5_list - - locations = set([ row['internal_location'] for row in manifest.rows ]) + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list + assert "ea2a1ad233c2908529d124a330bcb672" in md5_list + assert "bb0e6d90df01b7bd5d0956a5f9e3ed12" in 
md5_list + assert "fbca5e5211e4d58427997fd5c8343e9a" in md5_list + assert "1cbd888bf910f83ad8f1715509183223" in md5_list + + locations = set([row["internal_location"] for row in manifest.rows]) assert protzip in locations assert hpzip in locations assert dayzip in locations @@ -256,34 +267,42 @@ def test_sig_collect_3_multiple(runtmp, manifest_db_format): def test_sig_collect_3_multiple_use_fromfile(runtmp, manifest_db_format): # collect a manifest from two .zip files using --from-file - protzip = utils.get_test_data('prot/protein.zip') - hpzip = utils.get_test_data('prot/hp.zip') - dayzip = utils.get_test_data('prot/dayhoff.zip') + protzip = utils.get_test_data("prot/protein.zip") + hpzip = utils.get_test_data("prot/hp.zip") + dayzip = utils.get_test_data("prot/dayhoff.zip") - ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' + ext = "sqlmf" if manifest_db_format == "sql" else "csv" - fromfile = runtmp.output('fromfile.txt') - with open(fromfile, 'wt') as fp: + fromfile = runtmp.output("fromfile.txt") + with open(fromfile, "w") as fp: print(protzip, file=fp) print(hpzip, file=fp) print(dayzip, file=fp) - runtmp.sourmash('sig', 'collect', '--from-file', 'fromfile.txt', - '-o', f'mf.{ext}', '-F', manifest_db_format) - - manifest_fn = runtmp.output(f'mf.{ext}') + runtmp.sourmash( + "sig", + "collect", + "--from-file", + "fromfile.txt", + "-o", + f"mf.{ext}", + "-F", + manifest_db_format, + ) + + manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 6 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list - assert 'ea2a1ad233c2908529d124a330bcb672' in md5_list - assert 'bb0e6d90df01b7bd5d0956a5f9e3ed12' in md5_list - assert 'fbca5e5211e4d58427997fd5c8343e9a' in md5_list - assert '1cbd888bf910f83ad8f1715509183223' in md5_list - - locations = set([ row['internal_location'] for row in manifest.rows ]) + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list + assert "ea2a1ad233c2908529d124a330bcb672" in md5_list + assert "bb0e6d90df01b7bd5d0956a5f9e3ed12" in md5_list + assert "fbca5e5211e4d58427997fd5c8343e9a" in md5_list + assert "1cbd888bf910f83ad8f1715509183223" in md5_list + + locations = set([row["internal_location"] for row in manifest.rows]) assert protzip in locations assert hpzip in locations assert dayzip in locations @@ -292,23 +311,24 @@ def test_sig_collect_3_multiple_use_fromfile(runtmp, manifest_db_format): def test_sig_collect_4_multiple_from_sig(runtmp, manifest_db_format): # collect a manifest from sig files - sig43 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig43 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") - ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' + ext = "sqlmf" if manifest_db_format == "sql" else "csv" - runtmp.sourmash('sig', 'collect', sig43, sig63, - '-o', f'mf.{ext}', '-F', manifest_db_format) + runtmp.sourmash( + "sig", "collect", sig43, sig63, "-o", f"mf.{ext}", "-F", manifest_db_format + ) - manifest_fn = runtmp.output(f'mf.{ext}') + manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '09a08691ce52952152f0e866a59f6261' in md5_list - assert 
'38729c6374925585db28916b82a6f513' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "09a08691ce52952152f0e866a59f6261" in md5_list + assert "38729c6374925585db28916b82a6f513" in md5_list - locations = set([ row['internal_location'] for row in manifest.rows ]) + locations = set([row["internal_location"] for row in manifest.rows]) assert sig43 in locations assert sig63 in locations assert len(locations) == 2, locations @@ -316,89 +336,115 @@ def test_sig_collect_4_multiple_from_sig(runtmp, manifest_db_format): def test_sig_collect_4_multiple_from_sig_abspath(runtmp, manifest_db_format): # collect a manifest from sig files, forcing abspath - sig43 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - - shutil.copyfile(sig43, runtmp.output('47.fa.sig')) - shutil.copyfile(sig63, runtmp.output('63.fa.sig')) - - ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' - - runtmp.sourmash('sig', 'collect', '47.fa.sig', '63.fa.sig', '--abspath', - '-o', f'mf.{ext}', '-F', manifest_db_format) + sig43 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + + shutil.copyfile(sig43, runtmp.output("47.fa.sig")) + shutil.copyfile(sig63, runtmp.output("63.fa.sig")) + + ext = "sqlmf" if manifest_db_format == "sql" else "csv" + + runtmp.sourmash( + "sig", + "collect", + "47.fa.sig", + "63.fa.sig", + "--abspath", + "-o", + f"mf.{ext}", + "-F", + manifest_db_format, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - manifest_fn = runtmp.output(f'mf.{ext}') + manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '09a08691ce52952152f0e866a59f6261' in md5_list - assert '38729c6374925585db28916b82a6f513' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "09a08691ce52952152f0e866a59f6261" in md5_list + assert "38729c6374925585db28916b82a6f513" in md5_list - locations = set([ row['internal_location'] for row in manifest.rows ]) + locations = set([row["internal_location"] for row in manifest.rows]) print(locations) assert len(locations) == 2, locations for xx in locations: - assert xx.startswith('/') + assert xx.startswith("/") def test_sig_collect_4_multiple_no_abspath(runtmp, manifest_db_format): # collect a manifest from sig files, no abspath - sig43 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig43 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") # copy files to tmp, where they will not have full paths - shutil.copyfile(sig43, runtmp.output('47.fa.sig')) - shutil.copyfile(sig63, runtmp.output('63.fa.sig')) - - ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' - - runtmp.sourmash('sig', 'collect', '47.fa.sig', '63.fa.sig', - '-o', f'mf.{ext}', '-F', manifest_db_format) - - manifest_fn = runtmp.output(f'mf.{ext}') + shutil.copyfile(sig43, runtmp.output("47.fa.sig")) + shutil.copyfile(sig63, runtmp.output("63.fa.sig")) + + ext = "sqlmf" if manifest_db_format == "sql" else "csv" + + runtmp.sourmash( + "sig", + "collect", + "47.fa.sig", + "63.fa.sig", + "-o", + f"mf.{ext}", + "-F", + manifest_db_format, + ) + + manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '09a08691ce52952152f0e866a59f6261' in md5_list - assert '38729c6374925585db28916b82a6f513' in 
md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "09a08691ce52952152f0e866a59f6261" in md5_list + assert "38729c6374925585db28916b82a6f513" in md5_list - locations = set([ row['internal_location'] for row in manifest.rows ]) + locations = set([row["internal_location"] for row in manifest.rows]) print(locations) assert len(locations) == 2, locations - assert '47.fa.sig' in locations - assert '63.fa.sig' in locations + assert "47.fa.sig" in locations + assert "63.fa.sig" in locations def test_sig_collect_5_no_manifest_sbt_fail(runtmp, manifest_db_format): # collect a manifest from files that don't have one - sbt_zip = utils.get_test_data('v6.sbt.zip') + sbt_zip = utils.get_test_data("v6.sbt.zip") - ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' + ext = "sqlmf" if manifest_db_format == "sql" else "csv" with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'collect', sbt_zip, - '-o', f'mf.{ext}', '-F', manifest_db_format) + runtmp.sourmash( + "sig", "collect", sbt_zip, "-o", f"mf.{ext}", "-F", manifest_db_format + ) def test_sig_collect_5_no_manifest_sbt_succeed(runtmp, manifest_db_format): # generate a manifest from files that don't have one when --no-require - sbt_zip = utils.get_test_data('v6.sbt.zip') - - ext = 'sqlmf' if manifest_db_format == 'sql' else 'csv' - - runtmp.sourmash('sig', 'collect', sbt_zip, '--no-require-manifest', - '-o', f'mf.{ext}', '-F', manifest_db_format) - - manifest_fn = runtmp.output(f'mf.{ext}') + sbt_zip = utils.get_test_data("v6.sbt.zip") + + ext = "sqlmf" if manifest_db_format == "sql" else "csv" + + runtmp.sourmash( + "sig", + "collect", + sbt_zip, + "--no-require-manifest", + "-o", + f"mf.{ext}", + "-F", + manifest_db_format, + ) + + manifest_fn = runtmp.output(f"mf.{ext}") manifest = BaseCollectionManifest.load_from_filename(manifest_fn) assert len(manifest) == 7 - locations = set([ row['internal_location'] for row in manifest.rows ]) + locations = set([row["internal_location"] for row in manifest.rows]) assert len(locations) == 1, locations assert sbt_zip in locations diff --git a/tests/test_cmd_signature_fileinfo.py b/tests/test_cmd_signature_fileinfo.py index 33bd649748..25e29a5b4f 100644 --- a/tests/test_cmd_signature_fileinfo.py +++ b/tests/test_cmd_signature_fileinfo.py @@ -16,10 +16,10 @@ def test_fileinfo_1_sig(runtmp): # get basic info on a signature - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") - shutil.copyfile(sig47, runtmp.output('sig47.sig')) - runtmp.run_sourmash('sig', 'fileinfo', 'sig47.sig') + shutil.copyfile(sig47, runtmp.output("sig47.sig")) + runtmp.run_sourmash("sig", "fileinfo", "sig47.sig") out = runtmp.last_result.out print(runtmp.last_result.out) @@ -40,10 +40,10 @@ def test_fileinfo_1_sig(runtmp): def test_fileinfo_1_sig_summarize(runtmp): # get basic info on a signature with 'summarize' as alias for fileinfo - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") - shutil.copyfile(sig47, runtmp.output('sig47.sig')) - runtmp.run_sourmash('sig', 'summarize', 'sig47.sig') + shutil.copyfile(sig47, runtmp.output("sig47.sig")) + runtmp.run_sourmash("sig", "summarize", "sig47.sig") out = runtmp.last_result.out print(runtmp.last_result.out) @@ -64,10 +64,10 @@ def test_fileinfo_1_sig_summarize(runtmp): def test_fileinfo_1_sig_abund(runtmp): # get basic info on a signature with abundance - sig47 = utils.get_test_data('track_abund/47.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") - shutil.copyfile(sig47, 
runtmp.output('sig47.sig')) - runtmp.run_sourmash('sig', 'fileinfo', 'sig47.sig') + shutil.copyfile(sig47, runtmp.output("sig47.sig")) + runtmp.run_sourmash("sig", "fileinfo", "sig47.sig") out = runtmp.last_result.out print(runtmp.last_result.out) @@ -88,10 +88,10 @@ def test_fileinfo_1_sig_abund(runtmp): def test_fileinfo_2_lca(runtmp): # get basic info on an LCA database - prot = utils.get_test_data('prot/protein.lca.json.gz') + prot = utils.get_test_data("prot/protein.lca.json.gz") - shutil.copyfile(prot, runtmp.output('protein.lca.json.gz')) - runtmp.run_sourmash('sig', 'fileinfo', 'protein.lca.json.gz') + shutil.copyfile(prot, runtmp.output("protein.lca.json.gz")) + runtmp.run_sourmash("sig", "fileinfo", "protein.lca.json.gz") out = runtmp.last_result.out print(runtmp.last_result.out) @@ -112,10 +112,10 @@ def test_fileinfo_2_lca(runtmp): def test_fileinfo_3_sbt_zip(runtmp): # test on an SBT.zip - prot = utils.get_test_data('prot/protein.sbt.zip') + prot = utils.get_test_data("prot/protein.sbt.zip") - shutil.copyfile(prot, runtmp.output('protein.sbt.zip')) - runtmp.run_sourmash('sig', 'fileinfo', 'protein.sbt.zip') + shutil.copyfile(prot, runtmp.output("protein.sbt.zip")) + runtmp.run_sourmash("sig", "fileinfo", "protein.sbt.zip") out = runtmp.last_result.out print(runtmp.last_result.out) @@ -136,10 +136,10 @@ def test_fileinfo_3_sbt_zip(runtmp): def test_fileinfo_4_zip(runtmp): # test on a ZipFileLinearIndex - prot = utils.get_test_data('prot/all.zip') + prot = utils.get_test_data("prot/all.zip") - shutil.copyfile(prot, runtmp.output('all.zip')) - runtmp.run_sourmash('sig', 'fileinfo', 'all.zip') + shutil.copyfile(prot, runtmp.output("all.zip")) + runtmp.run_sourmash("sig", "fileinfo", "all.zip") out = runtmp.last_result.out print(runtmp.last_result.out) @@ -163,10 +163,10 @@ def test_fileinfo_4_zip(runtmp): def test_fileinfo_4_zip_json_out(runtmp): # check --json-out - prot = utils.get_test_data('prot/all.zip') + prot = utils.get_test_data("prot/all.zip") - shutil.copyfile(prot, runtmp.output('all.zip')) - runtmp.run_sourmash('sig', 'fileinfo', 'all.zip', '--json-out') + shutil.copyfile(prot, runtmp.output("all.zip")) + runtmp.run_sourmash("sig", "fileinfo", "all.zip", "--json-out") out = runtmp.last_result.out print(runtmp.last_result.out) @@ -174,30 +174,62 @@ def test_fileinfo_4_zip_json_out(runtmp): # should succeed as loading as JSON, with correct info vals = json.loads(out) - assert vals['has_manifest'] - assert vals['is_database'] - assert vals['num_sketches'] == 8 - assert vals['path_filetype'] == 'ZipFileLinearIndex' - assert vals['total_hashes'] == 31758 - - d1 = {'ksize': 19, 'moltype': 'dayhoff', 'scaled': 100, 'num': 0, 'abund': False, 'count': 2, 'n_hashes': 7945} - d2 = {'ksize': 19, 'moltype': 'hp', 'scaled': 100, 'num': 0, 'abund': False, 'count': 2, 'n_hashes': 5184} - d3 = {'ksize': 19, 'moltype': 'protein', 'scaled': 100, 'num': 0, 'abund': False, 'count': 2, 'n_hashes': 8214} - d4 = {'ksize': 31, 'moltype': 'DNA', 'scaled': 1000, 'num': 0, 'abund': False, 'count': 2, 'n_hashes': 10415} - - assert d1 in vals['sketch_info'] - assert d2 in vals['sketch_info'] - assert d3 in vals['sketch_info'] - assert d4 in vals['sketch_info'] - assert len(vals['sketch_info']) == 4 + assert vals["has_manifest"] + assert vals["is_database"] + assert vals["num_sketches"] == 8 + assert vals["path_filetype"] == "ZipFileLinearIndex" + assert vals["total_hashes"] == 31758 + + d1 = { + "ksize": 19, + "moltype": "dayhoff", + "scaled": 100, + "num": 0, + "abund": False, + "count": 2, + 
"n_hashes": 7945, + } + d2 = { + "ksize": 19, + "moltype": "hp", + "scaled": 100, + "num": 0, + "abund": False, + "count": 2, + "n_hashes": 5184, + } + d3 = { + "ksize": 19, + "moltype": "protein", + "scaled": 100, + "num": 0, + "abund": False, + "count": 2, + "n_hashes": 8214, + } + d4 = { + "ksize": 31, + "moltype": "DNA", + "scaled": 1000, + "num": 0, + "abund": False, + "count": 2, + "n_hashes": 10415, + } + + assert d1 in vals["sketch_info"] + assert d2 in vals["sketch_info"] + assert d3 in vals["sketch_info"] + assert d4 in vals["sketch_info"] + assert len(vals["sketch_info"]) == 4 def test_fileinfo_4_zip_rebuild(runtmp): # test --rebuild - prot = utils.get_test_data('prot/all.zip') + prot = utils.get_test_data("prot/all.zip") - shutil.copyfile(prot, runtmp.output('all.zip')) - runtmp.run_sourmash('sig', 'fileinfo', 'all.zip', '--rebuild') + shutil.copyfile(prot, runtmp.output("all.zip")) + runtmp.run_sourmash("sig", "fileinfo", "all.zip", "--rebuild") out = runtmp.last_result.out print(runtmp.last_result.out) @@ -224,12 +256,12 @@ def test_fileinfo_4_zip_rebuild(runtmp): def test_fileinfo_5_dir(runtmp): # test on a directory - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") - os.mkdir(runtmp.output('subdir')) + os.mkdir(runtmp.output("subdir")) - shutil.copyfile(sig47, runtmp.output('subdir/sig47.sig')) - runtmp.run_sourmash('sig', 'fileinfo', 'subdir/') + shutil.copyfile(sig47, runtmp.output("subdir/sig47.sig")) + runtmp.run_sourmash("sig", "fileinfo", "subdir/") out = runtmp.last_result.out print(runtmp.last_result.out) @@ -250,13 +282,13 @@ def test_fileinfo_5_dir(runtmp): def test_fileinfo_6_pathlist(runtmp): # test on a pathlist - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") shutil.copyfile(sig47, runtmp.output("47.fa.sig")) - with open(runtmp.output('pathlist.txt'), 'wt') as fp: + with open(runtmp.output("pathlist.txt"), "w") as fp: fp.write("47.fa.sig\n") - runtmp.run_sourmash('sig', 'fileinfo', 'pathlist.txt') + runtmp.run_sourmash("sig", "fileinfo", "pathlist.txt") out = runtmp.last_result.out print(runtmp.last_result.out) @@ -275,13 +307,22 @@ def test_fileinfo_6_pathlist(runtmp): assert line.strip() in out -@pytest.mark.parametrize("db", ['v6.sbt.json', 'v5.sbt.json', 'v4.sbt.json', - 'v3.sbt.json', 'v2.sbt.json', 'v1.sbt.json']) +@pytest.mark.parametrize( + "db", + [ + "v6.sbt.json", + "v5.sbt.json", + "v4.sbt.json", + "v3.sbt.json", + "v2.sbt.json", + "v1.sbt.json", + ], +) def test_fileinfo_7_sbt_json(runtmp, db): # test on multiple versions of SBT JSON files dbfile = utils.get_test_data(db) - runtmp.run_sourmash('sig', 'fileinfo', dbfile) + runtmp.run_sourmash("sig", "fileinfo", dbfile) out = runtmp.last_result.out print(runtmp.last_result.out) @@ -302,11 +343,13 @@ def test_fileinfo_7_sbt_json(runtmp, db): def test_sig_fileinfo_stdin(runtmp): # test on stdin - sig = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - with open(sig, 'rt') as fp: + sig = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + with open(sig) as fp: data = fp.read() - runtmp.run_sourmash('sig', 'fileinfo', '-', stdin_data=data) + runtmp.run_sourmash("sig", "fileinfo", "-", stdin_data=data) out = runtmp.last_result.out print(out) @@ -328,53 +371,56 @@ def test_sig_fileinfo_stdin(runtmp): def test_sig_fileinfo_does_not_exist(runtmp): # test on file that does not exist with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('sig', 
'fileinfo', 'does-not-exist') + runtmp.run_sourmash("sig", "fileinfo", "does-not-exist") - assert "Cannot open 'does-not-exist' as a sourmash signature collection" in runtmp.last_result.err + assert ( + "Cannot open 'does-not-exist' as a sourmash signature collection" + in runtmp.last_result.err + ) def test_sig_fileinfo_8_manifest_works(runtmp): # test on a manifest with relative paths, in proper location - mf = utils.get_test_data('scaled/mf.csv') - runtmp.sourmash('sig', 'fileinfo', mf) + mf = utils.get_test_data("scaled/mf.csv") + runtmp.sourmash("sig", "fileinfo", mf) out = runtmp.last_result.out print(out) - assert '15 sketches with DNA, k=31, scaled=10000 717 total hashes' in out - assert 'num signatures: 15' in out - assert 'has manifest? yes' in out - assert 'is database? yes' in out - assert 'path filetype: StandaloneManifestIndex' in out + assert "15 sketches with DNA, k=31, scaled=10000 717 total hashes" in out + assert "num signatures: 15" in out + assert "has manifest? yes" in out + assert "is database? yes" in out + assert "path filetype: StandaloneManifestIndex" in out def test_sig_fileinfo_8_manifest_works_when_moved(runtmp): # test on a manifest with relative paths, when in wrong place # note: this works, unlike 'describe', because all the necessary info # for 'fileinfo' is in the manifest. - mf = utils.get_test_data('scaled/mf.csv') - shutil.copyfile(mf, runtmp.output('mf.csv')) + mf = utils.get_test_data("scaled/mf.csv") + shutil.copyfile(mf, runtmp.output("mf.csv")) - runtmp.sourmash('sig', 'fileinfo', 'mf.csv') + runtmp.sourmash("sig", "fileinfo", "mf.csv") out = runtmp.last_result.out print(out) - assert '15 sketches with DNA, k=31, scaled=10000 717 total hashes' in out - assert 'num signatures: 15' in out - assert 'has manifest? yes' in out - assert 'is database? yes' in out - assert 'path filetype: StandaloneManifestIndex' in out + assert "15 sketches with DNA, k=31, scaled=10000 717 total hashes" in out + assert "num signatures: 15" in out + assert "has manifest? yes" in out + assert "is database? 
yes" in out + assert "path filetype: StandaloneManifestIndex" in out def test_sig_fileinfo_9_sqldb_make(runtmp): # make a sqldb and run fileinfo on it - gcf_all = glob.glob(utils.get_test_data('gather/GCF*.sig')) - sqldb = runtmp.output('some.sqldb') + gcf_all = glob.glob(utils.get_test_data("gather/GCF*.sig")) + sqldb = runtmp.output("some.sqldb") - runtmp.sourmash('sig', 'cat', '-k', '31', *gcf_all, '-o', sqldb) + runtmp.sourmash("sig", "cat", "-k", "31", *gcf_all, "-o", sqldb) - runtmp.sourmash('sig', 'fileinfo', sqldb) + runtmp.sourmash("sig", "fileinfo", sqldb) err = runtmp.last_result.err print(err) @@ -387,8 +433,8 @@ def test_sig_fileinfo_9_sqldb_make(runtmp): def test_sig_fileinfo_9_sqldb_exists(runtmp): # run fileinfo on existing sqldb - sqldb = utils.get_test_data('sqlite/index.sqldb') - runtmp.sourmash('sig', 'fileinfo', sqldb) + sqldb = utils.get_test_data("sqlite/index.sqldb") + runtmp.sourmash("sig", "fileinfo", sqldb) err = runtmp.last_result.err print(err) @@ -397,13 +443,15 @@ def test_sig_fileinfo_9_sqldb_exists(runtmp): print(out) assert "path filetype: SqliteIndex" in out - assert "2 sketches with DNA, k=31, scaled=1000 10415 total hashes" in out + assert ( + "2 sketches with DNA, k=31, scaled=1000 10415 total hashes" in out + ) def test_sig_fileinfo_9_sql_manifest(runtmp): # run fileinfo on existing sqldb - sqldb = utils.get_test_data('sqlite/prot.sqlmf') - runtmp.sourmash('sig', 'fileinfo', sqldb) + sqldb = utils.get_test_data("sqlite/prot.sqlmf") + runtmp.sourmash("sig", "fileinfo", sqldb) err = runtmp.last_result.err print(err) @@ -421,8 +469,8 @@ def test_sig_fileinfo_9_sql_manifest(runtmp): def test_sig_fileinfo_9_sql_lca_db(runtmp): # run fileinfo on existing sqldb - sqldb = utils.get_test_data('sqlite/lca.sqldb') - runtmp.sourmash('sig', 'fileinfo', sqldb) + sqldb = utils.get_test_data("sqlite/lca.sqldb") + runtmp.sourmash("sig", "fileinfo", sqldb) err = runtmp.last_result.err print(err) diff --git a/tests/test_cmd_signature_grep.py b/tests/test_cmd_signature_grep.py index 17dd5ee2dc..fa1a5b7dfb 100644 --- a/tests/test_cmd_signature_grep.py +++ b/tests/test_cmd_signature_grep.py @@ -18,299 +18,308 @@ def test_grep_1_sig_name(runtmp): # search on substring in name - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") - runtmp.run_sourmash('sig', 'grep', 'Shewanella', sig47) + runtmp.run_sourmash("sig", "grep", "Shewanella", sig47) out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella' in ss.name - assert ss.md5sum() == '09a08691ce52952152f0e866a59f6261' + assert "Shewanella" in ss.name + assert ss.md5sum() == "09a08691ce52952152f0e866a59f6261" def test_grep_1_sig_name_case_sensitive(runtmp): # search on substring in name - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('sig', 'grep', 'shewanella', sig47) + runtmp.run_sourmash("sig", "grep", "shewanella", sig47) def test_grep_1_sig_name_case_insensitive(runtmp): # search on substring in name, case insensitive - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") - runtmp.run_sourmash('sig', 'grep', '-i', 'shewanella', sig47) + runtmp.run_sourmash("sig", "grep", "-i", "shewanella", sig47) out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella' in ss.name - assert ss.md5sum() == '09a08691ce52952152f0e866a59f6261' + assert 
"Shewanella" in ss.name + assert ss.md5sum() == "09a08691ce52952152f0e866a59f6261" def test_grep_1_sig_name_exclude(runtmp): # search on substring in name, case insensitive - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") # no matches! with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('sig', 'grep', '-v', 'Shewanella', sig47) + runtmp.run_sourmash("sig", "grep", "-v", "Shewanella", sig47) def test_grep_2_sig_md5(runtmp): # search on substring in md5 - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") - runtmp.run_sourmash('sig', 'grep', 'ce52952152f0', sig47) + runtmp.run_sourmash("sig", "grep", "ce52952152f0", sig47) out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert ss.md5sum() == '09a08691ce52952152f0e866a59f6261' + assert ss.md5sum() == "09a08691ce52952152f0e866a59f6261" def test_grep_2_sig_md5_case_sensitive(runtmp): # case sensitive no match - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('sig', 'grep', 'CE52952152f0', sig47) + runtmp.run_sourmash("sig", "grep", "CE52952152f0", sig47) def test_grep_2_sig_md5_case_insensitive(runtmp): # search on substring in md5, case insensitive - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") - runtmp.run_sourmash('sig', 'grep', '-i', 'CE52952152f0', sig47) + runtmp.run_sourmash("sig", "grep", "-i", "CE52952152f0", sig47) out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert ss.md5sum() == '09a08691ce52952152f0e866a59f6261' + assert ss.md5sum() == "09a08691ce52952152f0e866a59f6261" def test_grep_3_filename(runtmp): # filename match - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") - runtmp.run_sourmash('sig', 'grep', '47.fa', sig47) + runtmp.run_sourmash("sig", "grep", "47.fa", sig47) out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert '47.fa' in ss.filename - assert ss.md5sum() == '09a08691ce52952152f0e866a59f6261' + assert "47.fa" in ss.filename + assert ss.md5sum() == "09a08691ce52952152f0e866a59f6261" def test_grep_3_filename_regexp(runtmp): # search for a regexp on filename - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") - runtmp.run_sourmash('sig', 'grep', '^47.fa', sig47) + runtmp.run_sourmash("sig", "grep", "^47.fa", sig47) out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert '7.fa' in ss.filename - assert ss.md5sum() == '09a08691ce52952152f0e866a59f6261' + assert "7.fa" in ss.filename + assert ss.md5sum() == "09a08691ce52952152f0e866a59f6261" def test_grep_4_no_manifest(runtmp): # fail search when no manifest, by default - sbt = utils.get_test_data('v6.sbt.zip') + sbt = utils.get_test_data("v6.sbt.zip") - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('sig', 'grep', 'e60265', sbt) + with pytest.raises(SourmashCommandFailed): + runtmp.run_sourmash("sig", "grep", "e60265", sbt) print(runtmp.last_result.err) - assert 'ERROR on filename' in runtmp.last_result.err - assert 'sig grep requires a manifest by default, but no manifest present.' in runtmp.last_result.err + assert "ERROR on filename" in runtmp.last_result.err + assert ( + "sig grep requires a manifest by default, but no manifest present." 
+ in runtmp.last_result.err + ) def test_grep_4_no_manifest_ok(runtmp): # generate manifest if --no-require-manifest - sbt = utils.get_test_data('v6.sbt.zip') + sbt = utils.get_test_data("v6.sbt.zip") - runtmp.run_sourmash('sig', 'grep', 'e60265', sbt, '--no-require-manifest') + runtmp.run_sourmash("sig", "grep", "e60265", sbt, "--no-require-manifest") ss = load_signatures(runtmp.last_result.out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'e60265' in ss.md5sum() + assert "e60265" in ss.md5sum() def test_grep_5_zip_include(runtmp): # search zip, include on case sensitive match to name - allzip = utils.get_test_data('prot/all.zip') + allzip = utils.get_test_data("prot/all.zip") - runtmp.run_sourmash('sig', 'grep', '--dna', 'OS223', allzip) + runtmp.run_sourmash("sig", "grep", "--dna", "OS223", allzip) out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella baltica OS223' in ss.name - assert ss.md5sum() == '38729c6374925585db28916b82a6f513' + assert "Shewanella baltica OS223" in ss.name + assert ss.md5sum() == "38729c6374925585db28916b82a6f513" def test_grep_5_zip_include_picklist(runtmp): # search zip, include on case sensitive match to name - allzip = utils.get_test_data('prot/all.zip') + allzip = utils.get_test_data("prot/all.zip") - pickfile = runtmp.output('pick.csv') - with open(pickfile, 'w', newline="") as fp: - w = csv.DictWriter(fp, fieldnames=['md5']) + pickfile = runtmp.output("pick.csv") + with open(pickfile, "w", newline="") as fp: + w = csv.DictWriter(fp, fieldnames=["md5"]) w.writeheader() - w.writerow(dict(md5='09a08691ce52952152f0e866a59f6261')) - w.writerow(dict(md5='38729c6374925585db28916b82a6f513')) + w.writerow(dict(md5="09a08691ce52952152f0e866a59f6261")) + w.writerow(dict(md5="38729c6374925585db28916b82a6f513")) - runtmp.run_sourmash('sig', 'grep', '--dna', 'OS223', allzip, - '--picklist', f"{pickfile}:md5:md5") + runtmp.run_sourmash( + "sig", "grep", "--dna", "OS223", allzip, "--picklist", f"{pickfile}:md5:md5" + ) out = runtmp.last_result.out print(out) err = runtmp.last_result.err print(err) - assert 'for given picklist, found 2 matches to 2 distinct values' in err + assert "for given picklist, found 2 matches to 2 distinct values" in err ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella baltica OS223' in ss.name - assert ss.md5sum() == '38729c6374925585db28916b82a6f513' + assert "Shewanella baltica OS223" in ss.name + assert ss.md5sum() == "38729c6374925585db28916b82a6f513" def test_grep_5_zip_include_case_insensitive(runtmp): # search zip, include on case insensitive match to name - allzip = utils.get_test_data('prot/all.zip') + allzip = utils.get_test_data("prot/all.zip") - runtmp.run_sourmash('sig', 'grep', '--dna', '-i', 'os223', allzip) + runtmp.run_sourmash("sig", "grep", "--dna", "-i", "os223", allzip) out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella baltica OS223' in ss.name - assert ss.md5sum() == '38729c6374925585db28916b82a6f513' + assert "Shewanella baltica OS223" in ss.name + assert ss.md5sum() == "38729c6374925585db28916b82a6f513" def test_grep_5_zip_exclude(runtmp): # search zip, exclude on case-sensitive match - allzip = utils.get_test_data('prot/all.zip') + allzip = utils.get_test_data("prot/all.zip") - runtmp.run_sourmash('sig', 'grep', '--dna', '-v', 'OS185', allzip) + runtmp.run_sourmash("sig", "grep", "--dna", "-v", "OS185", allzip) out = 
runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella baltica OS223' in ss.name - assert ss.md5sum() == '38729c6374925585db28916b82a6f513' + assert "Shewanella baltica OS223" in ss.name + assert ss.md5sum() == "38729c6374925585db28916b82a6f513" def test_grep_5_zip_exclude_case_insensitive(runtmp): # search zip, exclude on case-insensitive match - allzip = utils.get_test_data('prot/all.zip') + allzip = utils.get_test_data("prot/all.zip") - runtmp.run_sourmash('sig', 'grep', '--dna', '-vi', 'os185', allzip) + runtmp.run_sourmash("sig", "grep", "--dna", "-vi", "os185", allzip) out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella baltica OS223' in ss.name - assert ss.md5sum() == '38729c6374925585db28916b82a6f513' + assert "Shewanella baltica OS223" in ss.name + assert ss.md5sum() == "38729c6374925585db28916b82a6f513" def test_grep_6_zip_manifest_csv(runtmp): # do --csv and use result as picklist - allzip = utils.get_test_data('prot/all.zip') + allzip = utils.get_test_data("prot/all.zip") - runtmp.run_sourmash('sig', 'grep', '--dna', 'OS223', allzip, - '--csv', 'match.csv') + runtmp.run_sourmash("sig", "grep", "--dna", "OS223", allzip, "--csv", "match.csv") out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella baltica OS223' in ss.name - assert ss.md5sum() == '38729c6374925585db28916b82a6f513' + assert "Shewanella baltica OS223" in ss.name + assert ss.md5sum() == "38729c6374925585db28916b82a6f513" # now run cat with picklist - runtmp.run_sourmash('sig', 'cat', allzip, - '--picklist', 'match.csv::manifest') + runtmp.run_sourmash("sig", "cat", allzip, "--picklist", "match.csv::manifest") out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella baltica OS223' in ss.name - assert ss.md5sum() == '38729c6374925585db28916b82a6f513' + assert "Shewanella baltica OS223" in ss.name + assert ss.md5sum() == "38729c6374925585db28916b82a6f513" def test_grep_6_zip_manifest_csv_gz(runtmp): # do --csv and use result as picklist - allzip = utils.get_test_data('prot/all.zip') + allzip = utils.get_test_data("prot/all.zip") - runtmp.run_sourmash('sig', 'grep', '--dna', 'OS223', allzip, - '--csv', 'match.csv.gz') + runtmp.run_sourmash( + "sig", "grep", "--dna", "OS223", allzip, "--csv", "match.csv.gz" + ) out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella baltica OS223' in ss.name - assert ss.md5sum() == '38729c6374925585db28916b82a6f513' + assert "Shewanella baltica OS223" in ss.name + assert ss.md5sum() == "38729c6374925585db28916b82a6f513" # check that match.csv.gz is a gzip file - with gzip.open(runtmp.output('match.csv.gz'), 'rt', newline='') as fp: + with gzip.open(runtmp.output("match.csv.gz"), "rt", newline="") as fp: fp.read() # now run cat with picklist - runtmp.run_sourmash('sig', 'cat', allzip, - '--picklist', 'match.csv.gz::manifest') + runtmp.run_sourmash("sig", "cat", allzip, "--picklist", "match.csv.gz::manifest") out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella baltica OS223' in ss.name - assert ss.md5sum() == '38729c6374925585db28916b82a6f513' + assert "Shewanella baltica OS223" in ss.name + assert ss.md5sum() == "38729c6374925585db28916b82a6f513" def test_sig_grep_7_lca(runtmp): # extract 47 from an LCA 
database, with --no-require-manifest - allzip = utils.get_test_data('lca/47+63.lca.json') - sig47 = utils.get_test_data('47.fa.sig') - - runtmp.sourmash('sig', 'grep', "50a9274021e4", allzip, - '--no-require-manifest', '-o', 'matches.sig') - - match = sourmash.load_file_as_signatures(runtmp.output('matches.sig')) + allzip = utils.get_test_data("lca/47+63.lca.json") + sig47 = utils.get_test_data("47.fa.sig") + + runtmp.sourmash( + "sig", + "grep", + "50a9274021e4", + allzip, + "--no-require-manifest", + "-o", + "matches.sig", + ) + + match = sourmash.load_file_as_signatures(runtmp.output("matches.sig")) match = list(match)[0] ss47 = sourmash.load_file_as_signatures(sig47) @@ -324,50 +333,63 @@ def test_sig_grep_7_lca(runtmp): def test_sig_grep_7_picklist_md5_lca_fail(runtmp): # extract 47 from an LCA database, using a picklist w/full md5 => fail - allzip = utils.get_test_data('lca/47+63.lca.json') + allzip = utils.get_test_data("lca/47+63.lca.json") # select on any of these attributes - row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', - md5full='50a9274021e43eda8b2e77f8fa60ae8e', - md5short='50a9274021e43eda8b2e77f8fa60ae8e'[:8], - fullIdent='NC_009665.1', - nodotIdent='NC_009665') + row = dict( + exactName="NC_009665.1 Shewanella baltica OS185, complete genome", + md5full="50a9274021e43eda8b2e77f8fa60ae8e", + md5short="50a9274021e43eda8b2e77f8fa60ae8e"[:8], + fullIdent="NC_009665.1", + nodotIdent="NC_009665", + ) # make picklist - picklist_csv = runtmp.output('pick.csv') - with open(picklist_csv, 'w', newline='') as csvfp: + picklist_csv = runtmp.output("pick.csv") + with open(picklist_csv, "w", newline="") as csvfp: w = csv.DictWriter(csvfp, fieldnames=row.keys()) w.writeheader() w.writerow(row) picklist_arg = f"{picklist_csv}:md5full:md5" - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('sig', 'grep', '50a92740', allzip, - '--picklist', picklist_arg, - '--no-require-manifest') + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash( + "sig", + "grep", + "50a92740", + allzip, + "--picklist", + picklist_arg, + "--no-require-manifest", + ) # this happens b/c the implementation of 'grep' uses picklists, and # LCA databases don't support multiple picklists. print(runtmp.last_result.err) - assert "This input collection doesn't support 'grep' with picklists." in runtmp.last_result.err + assert ( + "This input collection doesn't support 'grep' with picklists." 
+ in runtmp.last_result.err + ) def test_sig_grep_8_count(runtmp): - zips = ['prot/all.zip', - 'prot/dayhoff.sbt.zip', - 'prot/dayhoff.zip', - 'prot/hp.sbt.zip', - 'prot/hp.zip', - 'prot/protein.sbt.zip', - 'prot/protein.zip'] - - zip_src = [ utils.get_test_data(x) for x in zips ] - - os.mkdir(runtmp.output('prot')) + zips = [ + "prot/all.zip", + "prot/dayhoff.sbt.zip", + "prot/dayhoff.zip", + "prot/hp.sbt.zip", + "prot/hp.zip", + "prot/protein.sbt.zip", + "prot/protein.zip", + ] + + zip_src = [utils.get_test_data(x) for x in zips] + + os.mkdir(runtmp.output("prot")) for src, dest in zip(zip_src, zips): shutil.copyfile(src, runtmp.output(dest)) - - runtmp.sourmash('sig', 'grep', '-c', '0015939', *zips) + + runtmp.sourmash("sig", "grep", "-c", "0015939", *zips) out = runtmp.last_result.out err = runtmp.last_result.err @@ -391,23 +413,23 @@ def test_sig_grep_8_count(runtmp): def test_sig_grep_identical_md5s(runtmp): # test that we properly handle different signatures with identical md5s - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss = load_signatures(sig47) sig = list(ss)[0] new_sig = sig.to_mutable() - new_sig.name = 'foo' - sig47foo = runtmp.output('foo.sig') + new_sig.name = "foo" + sig47foo = runtmp.output("foo.sig") # this was only a problem when the signatures are stored in the same file - with open(sig47foo, 'wt') as fp: + with open(sig47foo, "w") as fp: sourmash.save_signatures([new_sig, sig], fp) - runtmp.run_sourmash('sig', 'grep', '-i', 'foo', sig47foo) + runtmp.run_sourmash("sig", "grep", "-i", "foo", sig47foo) out = runtmp.last_result.out ss = load_signatures(out) ss = list(ss) assert len(ss) == 1 ss = ss[0] - assert 'Shewanella' not in ss.name - assert 'foo' in ss.name - assert ss.md5sum() == '09a08691ce52952152f0e866a59f6261' + assert "Shewanella" not in ss.name + assert "foo" in ss.name + assert ss.md5sum() == "09a08691ce52952152f0e866a59f6261" diff --git a/tests/test_compare.py b/tests/test_compare.py index bc25e98e3c..9821295cac 100644 --- a/tests/test_compare.py +++ b/tests/test_compare.py @@ -5,9 +5,14 @@ import pytest import sourmash -from sourmash.compare import (compare_all_pairs, compare_parallel, - compare_serial, compare_serial_containment, - compare_serial_max_containment, compare_serial_avg_containment) +from sourmash.compare import ( + compare_all_pairs, + compare_parallel, + compare_serial, + compare_serial_containment, + compare_serial_max_containment, + compare_serial_avg_containment, +) import sourmash_tst_utils as utils @@ -44,66 +49,90 @@ def test_compare_serial(siglist, ignore_abundance): similarities = compare_serial(siglist, ignore_abundance, downsample=False) true_similarities = np.array( - [[1., 0.356, 0.078, 0.086, 0., 0., 0.], - [0.356, 1., 0.072, 0.078, 0., 0., 0.], - [0.078, 0.072, 1., 0.074, 0., 0., 0.], - [0.086, 0.078, 0.074, 1., 0., 0., 0.], - [0., 0., 0., 0., 1., 0.382, 0.364], - [0., 0., 0., 0., 0.382, 1., 0.386], - [0., 0., 0., 0., 0.364, 0.386, 1.]]) + [ + [1.0, 0.356, 0.078, 0.086, 0.0, 0.0, 0.0], + [0.356, 1.0, 0.072, 0.078, 0.0, 0.0, 0.0], + [0.078, 0.072, 1.0, 0.074, 0.0, 0.0, 0.0], + [0.086, 0.078, 0.074, 1.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 1.0, 0.382, 0.364], + [0.0, 0.0, 0.0, 0.0, 0.382, 1.0, 0.386], + [0.0, 0.0, 0.0, 0.0, 0.364, 0.386, 1.0], + ] + ) np.testing.assert_array_equal(similarities, true_similarities) def test_compare_parallel(siglist, ignore_abundance): - similarities = compare_parallel(siglist, ignore_abundance, downsample=False, n_jobs=2) + similarities = 
compare_parallel( + siglist, ignore_abundance, downsample=False, n_jobs=2 + ) true_similarities = np.array( - [[1., 0.356, 0.078, 0.086, 0., 0., 0.], - [0.356, 1., 0.072, 0.078, 0., 0., 0.], - [0.078, 0.072, 1., 0.074, 0., 0., 0.], - [0.086, 0.078, 0.074, 1., 0., 0., 0.], - [0., 0., 0., 0., 1., 0.382, 0.364], - [0., 0., 0., 0., 0.382, 1., 0.386], - [0., 0., 0., 0., 0.364, 0.386, 1.]]) + [ + [1.0, 0.356, 0.078, 0.086, 0.0, 0.0, 0.0], + [0.356, 1.0, 0.072, 0.078, 0.0, 0.0, 0.0], + [0.078, 0.072, 1.0, 0.074, 0.0, 0.0, 0.0], + [0.086, 0.078, 0.074, 1.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 1.0, 0.382, 0.364], + [0.0, 0.0, 0.0, 0.0, 0.382, 1.0, 0.386], + [0.0, 0.0, 0.0, 0.0, 0.364, 0.386, 1.0], + ] + ) np.testing.assert_array_equal(similarities, true_similarities) def test_compare_all_pairs(siglist, ignore_abundance): - similarities_parallel = compare_all_pairs(siglist, ignore_abundance, downsample=False, n_jobs=2) + similarities_parallel = compare_all_pairs( + siglist, ignore_abundance, downsample=False, n_jobs=2 + ) similarities_serial = compare_serial(siglist, ignore_abundance, downsample=False) np.testing.assert_array_equal(similarities_parallel, similarities_serial) def test_compare_serial_jaccardANI(scaled_siglist, ignore_abundance): - jANI = compare_serial(scaled_siglist, ignore_abundance, downsample=False, return_ani=True) + jANI = compare_serial( + scaled_siglist, ignore_abundance, downsample=False, return_ani=True + ) print(jANI) - + true_jaccard_ANI = np.array( - [[1., 0.978, 0., 0.], - [0.978, 1., 0.96973012, 0.99262776], - [0., 0.96973012, 1., 0.97697011], - [0., 0.99262776, 0.97697011, 1.]]) + [ + [1.0, 0.978, 0.0, 0.0], + [0.978, 1.0, 0.96973012, 0.99262776], + [0.0, 0.96973012, 1.0, 0.97697011], + [0.0, 0.99262776, 0.97697011, 1.0], + ] + ) np.testing.assert_array_almost_equal(jANI, true_jaccard_ANI, decimal=3) def test_compare_parallel_jaccardANI(scaled_siglist, ignore_abundance): - jANI = compare_parallel(scaled_siglist, ignore_abundance, downsample=False, n_jobs=2, return_ani=True) + jANI = compare_parallel( + scaled_siglist, ignore_abundance, downsample=False, n_jobs=2, return_ani=True + ) true_jaccard_ANI = np.array( - [[1., 0.978, 0., 0.], - [0.978, 1., 0.96973012, 0.99262776], - [0., 0.96973012, 1., 0.97697011], - [0., 0.99262776, 0.97697011, 1.]]) + [ + [1.0, 0.978, 0.0, 0.0], + [0.978, 1.0, 0.96973012, 0.99262776], + [0.0, 0.96973012, 1.0, 0.97697011], + [0.0, 0.99262776, 0.97697011, 1.0], + ] + ) np.testing.assert_array_almost_equal(jANI, true_jaccard_ANI, decimal=3) def test_compare_all_pairs_jaccardANI(scaled_siglist, ignore_abundance): - similarities_parallel = compare_all_pairs(scaled_siglist, ignore_abundance, downsample=False, n_jobs=2, return_ani=True) - similarities_serial = compare_serial(scaled_siglist, ignore_abundance, downsample=False, return_ani=True) + similarities_parallel = compare_all_pairs( + scaled_siglist, ignore_abundance, downsample=False, n_jobs=2, return_ani=True + ) + similarities_serial = compare_serial( + scaled_siglist, ignore_abundance, downsample=False, return_ani=True + ) np.testing.assert_array_equal(similarities_parallel, similarities_serial) @@ -112,39 +141,56 @@ def test_compare_serial_containmentANI(scaled_siglist): print(containment_ANI) true_containment_ANI = np.array( - [[1, 0.966, 0., 0.], - [1, 1., 0.97715525, 1.], - [0., 0.96377054, 1., 0.97678608], - [0., 0.98667513, 0.97715525, 1.]]) + [ + [1, 0.966, 0.0, 0.0], + [1, 1.0, 0.97715525, 1.0], + [0.0, 0.96377054, 1.0, 0.97678608], + [0.0, 0.98667513, 0.97715525, 1.0], + ] + ) 
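    # unlike the symmetric Jaccard matrices above, this containment-ANI matrix
    # is asymmetric because containment is directional (containment of A in B
    # need not equal containment of B in A); the max- and avg-containment
    # tests below recover a symmetric matrix.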
- np.testing.assert_array_almost_equal(containment_ANI, true_containment_ANI, decimal=3) + np.testing.assert_array_almost_equal( + containment_ANI, true_containment_ANI, decimal=3 + ) def test_compare_serial_maxcontainmentANI(scaled_siglist): - # check max_containment ANI - max_containment_ANI = compare_serial_max_containment(scaled_siglist, return_ani=True) + max_containment_ANI = compare_serial_max_containment( + scaled_siglist, return_ani=True + ) print(max_containment_ANI) true_max_containment_ANI = np.array( - [[1., 1., 0., 0.], - [1., 1., 0.97715525, 1.], - [0., 0.97715525, 1., 0.97715525], - [0., 1., 0.97715525, 1.]]) + [ + [1.0, 1.0, 0.0, 0.0], + [1.0, 1.0, 0.97715525, 1.0], + [0.0, 0.97715525, 1.0, 0.97715525], + [0.0, 1.0, 0.97715525, 1.0], + ] + ) - np.testing.assert_array_almost_equal(max_containment_ANI, true_max_containment_ANI, decimal=3) + np.testing.assert_array_almost_equal( + max_containment_ANI, true_max_containment_ANI, decimal=3 + ) def test_compare_serial_avg_containmentANI(scaled_siglist): - # check avg_containment ANI - avg_containment_ANI = compare_serial_avg_containment(scaled_siglist, return_ani=True) + avg_containment_ANI = compare_serial_avg_containment( + scaled_siglist, return_ani=True + ) print(avg_containment_ANI) true_avg_containment_ANI = np.array( - [[1., 0.983, 0., 0.], - [0.983, 1., 0.97046289, 0.99333757], - [0., 0.97046289, 1., 0.97697067], - [0., 0.99333757, 0.97697067, 1.]]) - - np.testing.assert_array_almost_equal(avg_containment_ANI, true_avg_containment_ANI, decimal=3) + [ + [1.0, 0.983, 0.0, 0.0], + [0.983, 1.0, 0.97046289, 0.99333757], + [0.0, 0.97046289, 1.0, 0.97697067], + [0.0, 0.99333757, 0.97697067, 1.0], + ] + ) + + np.testing.assert_array_almost_equal( + avg_containment_ANI, true_avg_containment_ANI, decimal=3 + ) diff --git a/tests/test_deprecated.py b/tests/test_deprecated.py index fdd9acc53c..34097dd695 100644 --- a/tests/test_deprecated.py +++ b/tests/test_deprecated.py @@ -1,13 +1,14 @@ from sourmash import signature import sourmash_tst_utils as utils + def test_load_textmode(track_abundance): # ijson required a file in binary mode or bytes, # but we had an API example in the docs using 'rt'. 
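# A hedged sketch of the loading call this test exercises: load_signatures()
# accepts an open file handle (text or binary mode both work, per the test)
# and yields SourmashSignature objects. The path below is illustrative.
from sourmash import signature

with open("genome-s10+s11.sig") as fp:
    siglist = list(signature.load_signatures(fp))
print(siglist[0].name)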
# I fixed the docs, but I'm keeping this test here # to make sure we still support it =/ - sigfile = utils.get_test_data('genome-s10+s11.sig') - with open(sigfile, 'rt') as sigfp: + sigfile = utils.get_test_data("genome-s10+s11.sig") + with open(sigfile) as sigfp: siglist = list(signature.load_signatures(sigfp)) loaded_sig = siglist[0] - assert loaded_sig.name == 'genome-s10+s11' + assert loaded_sig.name == "genome-s10+s11" diff --git a/tests/test_distance_utils.py b/tests/test_distance_utils.py index 22067dcc68..6b44064a9e 100644 --- a/tests/test_distance_utils.py +++ b/tests/test_distance_utils.py @@ -3,23 +3,33 @@ """ import pytest import numpy as np -from sourmash.distance_utils import (containment_to_distance, get_exp_probability_nothing_common, - handle_seqlen_nkmers, jaccard_to_distance, - ANIResult, ciANIResult, jaccardANIResult, var_n_mutated, - set_size_chernoff, set_size_exact_prob) +from sourmash.distance_utils import ( + containment_to_distance, + get_exp_probability_nothing_common, + handle_seqlen_nkmers, + jaccard_to_distance, + ANIResult, + ciANIResult, + jaccardANIResult, + var_n_mutated, + set_size_chernoff, + set_size_exact_prob, +) + def test_aniresult(): res = ANIResult(0.4, 0.1) assert res.dist == 0.4 assert res.ani == 0.6 assert res.p_nothing_in_common == 0.1 - assert res.p_exceeds_threshold ==True + assert res.p_exceeds_threshold == True # check that they're equivalent res2 = ANIResult(0.4, 0.1) assert res == res2 res3 = ANIResult(0.5, 0) assert res != res3 - assert res3.p_exceeds_threshold ==False + assert res3.p_exceeds_threshold == False + def test_aniresult_bad_distance(): """ @@ -38,18 +48,18 @@ def test_aniresult_bad_distance(): def test_jaccard_aniresult(): res = jaccardANIResult(0.4, 0.1, jaccard_error=0.03) assert res.dist == 0.4 - assert res.ani == None + assert res.ani is None assert res.p_nothing_in_common == 0.1 assert res.jaccard_error == 0.03 - assert res.p_exceeds_threshold ==True - assert res.je_exceeds_threshold ==True + assert res.p_exceeds_threshold == True + assert res.je_exceeds_threshold == True res3 = jaccardANIResult(0.4, 0.1, jaccard_error=0.03, je_threshold=0.1) - assert res3.je_exceeds_threshold ==False + assert res3.je_exceeds_threshold == False assert res3.ani == 0.6 def test_jaccard_aniresult_nojaccarderror(): - #jaccard error is None + # jaccard error is None with pytest.raises(Exception) as exc: jaccardANIResult(0.4, 0.1, None) print("\n", str(exc.value)) @@ -57,14 +67,14 @@ def test_jaccard_aniresult_nojaccarderror(): def test_ci_aniresult(): - res = ciANIResult(0.4, 0.1, dist_low=0.3,dist_high=0.5) + res = ciANIResult(0.4, 0.1, dist_low=0.3, dist_high=0.5) print(res) assert res.dist == 0.4 assert res.ani == 0.6 assert res.p_nothing_in_common == 0.1 assert res.ani_low == 0.5 assert res.ani_high == 0.7 - res2 = ciANIResult(0.4, 0.1, dist_low=0.3,dist_high=0.5) + res2 = ciANIResult(0.4, 0.1, dist_low=0.3, dist_high=0.5) assert res == res2 res3 = ciANIResult(0.4, 0.2, dist_low=0.3, dist_high=0.5) assert res != res3 @@ -74,12 +84,14 @@ def test_containment_to_distance_zero(): contain = 0 scaled = 1 nkmers = 10000 - ksize=21 - res = containment_to_distance(contain,ksize,scaled, n_unique_kmers=nkmers, estimate_ci=True) + ksize = 21 + res = containment_to_distance( + contain, ksize, scaled, n_unique_kmers=nkmers, estimate_ci=True + ) print(res) # check results - exp_dist,exp_low,exp_high,pnc = 1.0,1.0,1.0,1.0 - exp_id, exp_idlow,exp_idhigh,pnc = 0.0,0.0,0.0,1.0 + exp_dist, exp_low, exp_high, pnc = 1.0, 1.0, 1.0, 1.0 + exp_id, exp_idlow, 
exp_idhigh, pnc = 0.0, 0.0, 0.0, 1.0 assert res.dist == exp_dist assert res.dist_low == exp_low assert res.dist_high == exp_high @@ -88,9 +100,15 @@ def test_containment_to_distance_zero(): assert res.ani_low == exp_idlow assert res.ani_high == exp_idhigh # check without returning ci - res2 = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers) + res2 = containment_to_distance(contain, ksize, scaled, n_unique_kmers=nkmers) print(res2) - exp_res = ciANIResult(dist=1.0, dist_low=1.0, dist_high=1.0, p_nothing_in_common=1.0, p_threshold=0.001) + exp_res = ciANIResult( + dist=1.0, + dist_low=1.0, + dist_high=1.0, + p_nothing_in_common=1.0, + p_threshold=0.001, + ) assert res2 == exp_res @@ -98,11 +116,13 @@ def test_containment_to_distance_one(): contain = 1 scaled = 1 nkmers = 10000 - ksize=21 - res = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,estimate_ci=True) + ksize = 21 + res = containment_to_distance( + contain, ksize, scaled, n_unique_kmers=nkmers, estimate_ci=True + ) print(res) - exp_dist, exp_low,exp_high,pnc = 0.0,0.0,0.0,0.0 - exp_id, exp_idlow,exp_idhigh,pnc = 1.0,1.0,1.0,0.0 + exp_dist, exp_low, exp_high, pnc = 0.0, 0.0, 0.0, 0.0 + exp_id, exp_idlow, exp_idhigh, pnc = 1.0, 1.0, 1.0, 0.0 assert res.dist == exp_dist assert res.dist_low == exp_low assert res.dist_high == exp_high @@ -112,7 +132,7 @@ def test_containment_to_distance_one(): assert res.ani_high == exp_idhigh # check without returning ci - res = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers) + res = containment_to_distance(contain, ksize, scaled, n_unique_kmers=nkmers) assert res.dist == exp_dist assert res.ani == exp_id assert res.p_nothing_in_common == pnc @@ -124,8 +144,10 @@ def test_containment_to_distance_scaled1(): contain = 0.5 scaled = 1 nkmers = 10000 - ksize=21 - res = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,estimate_ci=True) + ksize = 21 + res = containment_to_distance( + contain, ksize, scaled, n_unique_kmers=nkmers, estimate_ci=True + ) print(res) # check results assert res.dist == 0.032468221476108394 @@ -136,17 +158,27 @@ def test_containment_to_distance_scaled1(): assert res.ani_low == 0.9635213980271021 assert res.p_nothing_in_common == 0.0 # without returning ci - res2 = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers) - assert (res2.dist,res2.ani,res2.p_nothing_in_common) == (0.032468221476108394, 0.9675317785238916, 0.0) - assert (res2.dist,res2.ani,res2.p_nothing_in_common) == (res.dist, res.ani, res.p_nothing_in_common) + res2 = containment_to_distance(contain, ksize, scaled, n_unique_kmers=nkmers) + assert (res2.dist, res2.ani, res2.p_nothing_in_common) == ( + 0.032468221476108394, + 0.9675317785238916, + 0.0, + ) + assert (res2.dist, res2.ani, res2.p_nothing_in_common) == ( + res.dist, + res.ani, + res.p_nothing_in_common, + ) def test_containment_to_distance_scaled100(): contain = 0.1 scaled = 100 nkmers = 10000 - ksize=31 - res = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,estimate_ci=True) + ksize = 31 + res = containment_to_distance( + contain, ksize, scaled, n_unique_kmers=nkmers, estimate_ci=True + ) print(res) # check results assert res.dist == 0.07158545548052564 @@ -160,8 +192,10 @@ def test_containment_to_distance_scaled100_2(): contain = 0.5 scaled = 100 nkmers = 10000 - ksize=21 - res= containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,estimate_ci=True) + ksize = 21 + res = containment_to_distance( + contain, ksize, scaled, 
n_unique_kmers=nkmers, estimate_ci=True + ) print(res) # check results assert res.dist == 0.032468221476108394 @@ -174,8 +208,10 @@ def test_containment_to_distance_k10(): contain = 0.5 scaled = 100 nkmers = 10000 - ksize=10 - res = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,estimate_ci=True) + ksize = 10 + res = containment_to_distance( + contain, ksize, scaled, n_unique_kmers=nkmers, estimate_ci=True + ) print(res) # check results assert res.dist == 0.06696700846319259 @@ -188,17 +224,31 @@ def test_containment_to_distance_confidence(): contain = 0.1 scaled = 100 nkmers = 10000 - ksize=31 - confidence=0.99 - res = containment_to_distance(contain,ksize,scaled,confidence=confidence,n_unique_kmers=nkmers, estimate_ci=True) + ksize = 31 + confidence = 0.99 + res = containment_to_distance( + contain, + ksize, + scaled, + confidence=confidence, + n_unique_kmers=nkmers, + estimate_ci=True, + ) print(res) # check results assert res.dist == 0.07158545548052564 assert res.dist_low == 0.04802880300938562 assert res.dist_high == 0.09619930040790341 assert res.p_exceeds_threshold == False - confidence=0.90 - res2 = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,confidence=confidence, estimate_ci=True) + confidence = 0.90 + res2 = containment_to_distance( + contain, + ksize, + scaled, + n_unique_kmers=nkmers, + confidence=confidence, + estimate_ci=True, + ) print(res2) # check results assert res2.dist == res.dist @@ -211,16 +261,30 @@ def test_nkmers_to_bp_containment(): containment = 0.1 scaled = 100 bp_len = 10030 - ksize=31 - nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp= bp_len) + ksize = 31 + nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp=bp_len) print("nkmers_from_bp:", nkmers) - confidence=0.99 - kmer_res = containment_to_distance(containment,ksize,scaled,confidence=confidence,n_unique_kmers=nkmers,estimate_ci=True) - bp_res = containment_to_distance(containment,ksize,scaled,confidence=confidence,sequence_len_bp=bp_len,estimate_ci=True) + confidence = 0.99 + kmer_res = containment_to_distance( + containment, + ksize, + scaled, + confidence=confidence, + n_unique_kmers=nkmers, + estimate_ci=True, + ) + bp_res = containment_to_distance( + containment, + ksize, + scaled, + confidence=confidence, + sequence_len_bp=bp_len, + estimate_ci=True, + ) print(f"\nkDIST: {kmer_res}") print(f"\nbpDIST:,{bp_res}") # check results - assert kmer_res==bp_res + assert kmer_res == bp_res assert kmer_res.dist == 0.07158545548052564 assert kmer_res.dist_low == 0.04802880300938562 assert kmer_res.dist_high == 0.09619930040790341 @@ -230,8 +294,8 @@ def test_jaccard_to_distance_zero(): jaccard = 0 scaled = 1 nkmers = 10000 - ksize=21 - res= jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers) + ksize = 21 + res = jaccard_to_distance(jaccard, ksize, scaled, n_unique_kmers=nkmers) print(res) # check results assert res.dist == 1.0 @@ -244,8 +308,8 @@ def test_jaccard_to_distance_one(): jaccard = 1 scaled = 1 nkmers = 10000 - ksize=21 - res= jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers) + ksize = 21 + res = jaccard_to_distance(jaccard, ksize, scaled, n_unique_kmers=nkmers) print(res) # check results assert res.dist == 0.0 @@ -259,36 +323,38 @@ def test_jaccard_to_distance_scaled(): jaccard = 0.5 scaled = 1 nkmers = 10000 - ksize=21 - res = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers) + ksize = 21 + res = jaccard_to_distance(jaccard, ksize, scaled, n_unique_kmers=nkmers) print(res) # check results assert round(res.dist, 
3) == round(0.019122659390482077, 3) - assert res.ani == None + assert res.ani is None assert res.p_exceeds_threshold == False assert res.jaccard_error == 0.00018351337045518042 - assert res.je_exceeds_threshold ==True + assert res.je_exceeds_threshold == True scaled = 100 - res2 = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers) + res2 = jaccard_to_distance(jaccard, ksize, scaled, n_unique_kmers=nkmers) print(res2) assert res2.dist == res.dist assert res2.jaccard_error == res.jaccard_error assert res2.p_nothing_in_common != res.p_nothing_in_common - assert res2.p_exceeds_threshold ==False + assert res2.p_exceeds_threshold == False def test_jaccard_to_distance_k31(): jaccard = 0.5 scaled = 100 nkmers = 10000 - ksize=31 - res = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers) + ksize = 31 + res = jaccard_to_distance(jaccard, ksize, scaled, n_unique_kmers=nkmers) print(res) # check results - assert res.je_exceeds_threshold ==True - assert res.ani == None + assert res.je_exceeds_threshold == True + assert res.ani is None assert res.p_exceeds_threshold == False - res2 = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers, err_threshold=0.1) + res2 = jaccard_to_distance( + jaccard, ksize, scaled, n_unique_kmers=nkmers, err_threshold=0.1 + ) assert res2.je_exceeds_threshold == False assert res2.ani == 0.9870056455892898 @@ -297,8 +363,8 @@ def test_jaccard_to_distance_k31_2(): jaccard = 0.1 scaled = 100 nkmers = 10000 - ksize=31 - res = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers) + ksize = 31 + res = jaccard_to_distance(jaccard, ksize, scaled, n_unique_kmers=nkmers) print(res) # check results assert res.ani == 0.9464928391768298 @@ -310,11 +376,11 @@ def test_nkmers_to_bp_jaccard(): jaccard = 0.1 scaled = 100 bp_len = 10030 - ksize=31 - nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp= bp_len) + ksize = 31 + nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp=bp_len) print("nkmers_from_bp:", nkmers) - kmer_res = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers) - bp_res = jaccard_to_distance(jaccard,ksize,scaled,sequence_len_bp=bp_len) + kmer_res = jaccard_to_distance(jaccard, ksize, scaled, n_unique_kmers=nkmers) + bp_res = jaccard_to_distance(jaccard, ksize, scaled, sequence_len_bp=bp_len) print(f"\nkmer_res: {kmer_res}") print(f"\nbp_res: {bp_res}") # check results @@ -329,12 +395,16 @@ def test_exp_prob_nothing_common(): ksize = 31 scaled = 10 bp_len = 1000030 - nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp= bp_len) + nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp=bp_len) print("nkmers_from_bp:", nkmers) - nkmers_pnc = get_exp_probability_nothing_common(dist,ksize,scaled,n_unique_kmers=nkmers) + nkmers_pnc = get_exp_probability_nothing_common( + dist, ksize, scaled, n_unique_kmers=nkmers + ) print(f"prob nothing in common: {nkmers_pnc}") - bp_pnc = get_exp_probability_nothing_common(dist,ksize,scaled,sequence_len_bp=bp_len) + bp_pnc = get_exp_probability_nothing_common( + dist, ksize, scaled, sequence_len_bp=bp_len + ) assert nkmers_pnc == bp_pnc == 7.437016945722123e-07 @@ -347,15 +417,17 @@ def test_containment_to_distance_tinytestdata_var0(): contain = 0.9 scaled = 1 nkmers = 4 - ksize=31 - res = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers, estimate_ci=True) + ksize = 31 + res = containment_to_distance( + contain, ksize, scaled, n_unique_kmers=nkmers, estimate_ci=True + ) print(res) # check results assert res.dist == 0.003392957179023992 - assert res.dist_low == None - 
assert res.dist_high == None - assert res.ani_low == None - assert res.ani_high == None + assert res.dist_low is None + assert res.dist_high is None + assert res.ani_low is None + assert res.ani_high is None assert res.p_exceeds_threshold == False @@ -364,7 +436,7 @@ def test_var_n_mutated(): r = 0 ksize = 31 nkmers = 200 - var_n_mut = var_n_mutated(nkmers,ksize,r) + var_n_mut = var_n_mutated(nkmers, ksize, r) print(f"var_n_mutated: {var_n_mut}") assert var_n_mut == 0 # check var 0.0 valuerror @@ -372,51 +444,71 @@ def test_var_n_mutated(): ksize = 31 nkmers = 200 with pytest.raises(ValueError) as exc: - var_n_mut = var_n_mutated(nkmers,ksize,r) + var_n_mut = var_n_mutated(nkmers, ksize, r) assert "Error: varN <0.0!" in str(exc) # check successful r = 0.4 ksize = 31 nkmers = 200000 - var_n_mut = var_n_mutated(nkmers,ksize,r) + var_n_mut = var_n_mutated(nkmers, ksize, r) print(f"var_n_mutated: {var_n_mut}") assert var_n_mut == 0.10611425440741508 def test_handle_seqlen_nkmers(): bp_len = 10030 - ksize=31 + ksize = 31 # convert seqlen to nkmers - nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp= bp_len) + nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp=bp_len) assert nkmers == 10000 # if nkmers is provided, just use that - nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp= bp_len, n_unique_kmers= bp_len) + nkmers = handle_seqlen_nkmers(ksize, sequence_len_bp=bp_len, n_unique_kmers=bp_len) assert nkmers == 10030 # if neither seqlen or nkmers provided, complain with pytest.raises(ValueError) as exc: nkmers = handle_seqlen_nkmers(ksize) - assert("Error: distance estimation requires input of either 'sequence_len_bp' or 'n_unique_kmers'") in str(exc) + assert ( + "Error: distance estimation requires input of either 'sequence_len_bp' or 'n_unique_kmers'" + ) in str(exc) def test_set_size_chernoff(): - eps = 10**(-6) + eps = 10 ** (-6) rel_error = 0.01 set_size = 1000000 - s = 1/0.1 # I'm used to using a scale value between 0 and 1 + s = 1 / 0.1 # I'm used to using a scale value between 0 and 1 value_from_mathematica = 0.928652 - assert np.abs(set_size_chernoff(set_size, s, relative_error=rel_error) - value_from_mathematica) < eps + assert ( + np.abs( + set_size_chernoff(set_size, s, relative_error=rel_error) + - value_from_mathematica + ) + < eps + ) rel_error = 0.05 set_size = 10000 s = 1 value_from_mathematica = 0.999519 - assert np.abs(set_size_chernoff(set_size, s, relative_error=rel_error) - value_from_mathematica) < eps + assert ( + np.abs( + set_size_chernoff(set_size, s, relative_error=rel_error) + - value_from_mathematica + ) + < eps + ) rel_error = 0.001 set_size = 10 - s = 1/.01 + s = 1 / 0.01 value_from_mathematica = -1 - assert np.abs(set_size_chernoff(set_size, s, relative_error=rel_error) - value_from_mathematica) < eps + assert ( + np.abs( + set_size_chernoff(set_size, s, relative_error=rel_error) + - value_from_mathematica + ) + < eps + ) def test_set_size_exact_prob(): diff --git a/tests/test_hll.py b/tests/test_hll.py index da8d3aad68..d49336bf7a 100644 --- a/tests/test_hll.py +++ b/tests/test_hll.py @@ -11,7 +11,7 @@ K = 21 # size of kmer ERR_RATE = 0.01 N_UNIQUE = 3356 -TRANSLATE = {'A': 'T', 'C': 'G', 'T': 'A', 'G': 'C'} +TRANSLATE = {"A": "T", "C": "G", "T": "A", "G": "C"} def test_hll_add_python(): @@ -19,16 +19,16 @@ def test_hll_add_python(): # use the lower level add() method, which accepts anything, # and compare to an exact count using collections.Counter - filename = utils.get_test_data('ecoli.genes.fna') + filename = utils.get_test_data("ecoli.genes.fna") 
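# A condensed sketch of the distance_utils API covered by the tests above;
# the inputs (containment/jaccard, ksize, scaled, n_unique_kmers) mirror the
# test values, and sequence_len_bp may be passed instead of n_unique_kmers.
from sourmash.distance_utils import containment_to_distance, jaccard_to_distance

res = containment_to_distance(0.1, 31, 100, n_unique_kmers=10000, estimate_ci=True)
print(res.dist, res.ani, res.dist_low, res.dist_high)

jres = jaccard_to_distance(0.1, 31, 100, n_unique_kmers=10000)
print(jres.dist, jres.ani, jres.p_exceeds_threshold)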
hll = HLL(ERR_RATE, K) counter = set() with open(filename) as f: for n, record in enumerate(fasta_iter(f)): - sequence = record['sequence'] + sequence = record["sequence"] seq_len = len(sequence) for n in range(0, seq_len + 1 - K): - kmer = sequence[n:n + K] + kmer = sequence[n : n + K] rc = "".join(TRANSLATE[c] for c in kmer[::-1]) hll.add(kmer) @@ -47,12 +47,12 @@ def test_hll_consume_string(): # test rust code to count unique kmers using HyperLogLog, # using screed to feed each read to the counter. - filename = utils.get_test_data('ecoli.genes.fna') + filename = utils.get_test_data("ecoli.genes.fna") hll = HLL(ERR_RATE, K) - n_consumed = n = 0 + n = 0 with open(filename) as f: for n, record in enumerate(fasta_iter(f), 1): - hll.add_sequence(record['sequence']) + hll.add_sequence(record["sequence"]) assert abs(1 - float(len(hll)) / N_UNIQUE) < ERR_RATE @@ -60,10 +60,9 @@ def test_hll_consume_string(): def test_hll_similarity_containment(): N_UNIQUE_H1 = 500741 N_UNIQUE_H2 = 995845 - N_UNIQUE_U = 995845 SIMILARITY = 0.502783 - CONTAINMENT_H1 = 1. + CONTAINMENT_H1 = 1.0 CONTAINMENT_H2 = 0.502783 INTERSECTION = 500838 @@ -72,23 +71,23 @@ def test_hll_similarity_containment(): hll2 = HLL(ERR_RATE, K) hllu = HLL(ERR_RATE, K) - filename = utils.get_test_data('genome-s10.fa.gz') + filename = utils.get_test_data("genome-s10.fa.gz") with gzip.GzipFile(filename) as f: for n, record in enumerate(fasta_iter(f)): - sequence = record['sequence'] + sequence = record["sequence"] seq_len = len(sequence) for n in range(0, seq_len + 1 - K): - kmer = sequence[n:n + K] + kmer = sequence[n : n + K] hll1.add(kmer) hllu.add(kmer) - filename = utils.get_test_data('genome-s10+s11.fa.gz') + filename = utils.get_test_data("genome-s10+s11.fa.gz") with gzip.GzipFile(filename) as f: for n, record in enumerate(fasta_iter(f)): - sequence = record['sequence'] + sequence = record["sequence"] seq_len = len(sequence) for n in range(0, seq_len + 1 - K): - kmer = sequence[n:n + K] + kmer = sequence[n : n + K] hll2.add(kmer) hllu.add(kmer) @@ -113,13 +112,14 @@ def test_hll_similarity_containment(): assert abs(1 - float(hll1.intersection(hllu)) / N_UNIQUE_U) < ERR_RATE """ + def test_hll_save_load(): - filename = utils.get_test_data('ecoli.genes.fna') + filename = utils.get_test_data("ecoli.genes.fna") hll = HLL(ERR_RATE, K) - n_consumed = n = 0 + n = 0 with open(filename) as f: for n, record in enumerate(fasta_iter(f), 1): - hll.add_sequence(record['sequence']) + hll.add_sequence(record["sequence"]) assert abs(1 - float(len(hll)) / N_UNIQUE) < ERR_RATE diff --git a/tests/test_index.py b/tests/test_index.py index af0c1da890..b207376443 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -9,10 +9,15 @@ import sourmash from sourmash import load_one_signature, SourmashSignature -from sourmash.index import (LinearIndex, ZipFileLinearIndex, - make_jaccard_search_query, CounterGather, - LazyLinearIndex, MultiIndex, - StandaloneManifestIndex) +from sourmash.index import ( + LinearIndex, + ZipFileLinearIndex, + make_jaccard_search_query, + CounterGather, + LazyLinearIndex, + MultiIndex, + StandaloneManifestIndex, +) from sourmash.index.revindex import RevIndex from sourmash.sbt import SBT, GraphFactory from sourmash import sourmash_args @@ -90,7 +95,7 @@ def test_simple_index(n_children): def test_linear_index_prefetch_empty(): # check that an exception is raised upon for an empty LinearIndex - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) 
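# A minimal sketch of the HyperLogLog wrapper from test_hll.py above; the
# constructor takes (error rate, ksize), add_sequence() consumes a DNA
# string, and len() reports the approximate number of distinct k-mers.
from sourmash.hll import HLL

hll = HLL(0.01, 21)
hll.add_sequence("ACGTACGTACGTACGTACGTACGTA")
print(len(hll))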
lidx = LinearIndex() @@ -111,8 +116,8 @@ class FakeSignature: def minhash(self): raise Exception("don't touch me!") - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss47 = sourmash.load_one_signature(sig47) ss63 = sourmash.load_one_signature(sig63) @@ -142,8 +147,8 @@ def minhash(self): def test_linear_index_search_subj_has_abundance(): # check that search signatures in the index are flattened appropriately. - queryfile = utils.get_test_data('47.fa.sig') - subjfile = utils.get_test_data('track_abund/47.fa.sig') + queryfile = utils.get_test_data("47.fa.sig") + subjfile = utils.get_test_data("track_abund/47.fa.sig") qs = sourmash.load_one_signature(queryfile) ss = sourmash.load_one_signature(subjfile) @@ -159,8 +164,8 @@ def test_linear_index_search_subj_has_abundance(): def test_linear_index_gather_subj_has_abundance(): # check that target signatures in the index are flattened appropriately. - queryfile = utils.get_test_data('47.fa.sig') - subjfile = utils.get_test_data('track_abund/47.fa.sig') + queryfile = utils.get_test_data("47.fa.sig") + subjfile = utils.get_test_data("track_abund/47.fa.sig") qs = sourmash.load_one_signature(queryfile) ss = sourmash.load_one_signature(subjfile) @@ -178,7 +183,9 @@ def test_linear_index_gather_subj_has_abundance(): def test_index_search_subj_scaled_is_lower(): # check that subject sketches are appropriately downsampled for scaled # sketches. - sigfile = utils.get_test_data('scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz') + sigfile = utils.get_test_data( + "scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz" + ) ss = sourmash.load_one_signature(sigfile) # double check :) @@ -201,7 +208,7 @@ def test_index_search_subj_scaled_is_lower(): def test_index_search_subj_num_is_lower(): # check that subject sketches are appropriately downsampled for num # sketches - sigfile = utils.get_test_data('num/47.fa.sig') + sigfile = utils.get_test_data("num/47.fa.sig") ss = sourmash.load_one_signature(sigfile, ksize=31) # double check :) @@ -223,7 +230,7 @@ def test_index_search_subj_num_is_lower(): def test_index_search_query_num_is_lower(): # check that query sketches are appropriately downsampled for num. 
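# A minimal sketch of the LinearIndex pattern these tests build on; ss47 and
# ss63 are assumed to be signatures loaded as in the surrounding tests, and
# search() yields (score, signature, location) results above the threshold.
from sourmash.index import LinearIndex

lidx = LinearIndex()
lidx.insert(ss47)
lidx.insert(ss63)
results = list(lidx.search(ss47, threshold=0.1))
best_score, best_sig, _ = results[0]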
- sigfile = utils.get_test_data('num/47.fa.sig') + sigfile = utils.get_test_data("num/47.fa.sig") qs = sourmash.load_one_signature(sigfile, ksize=31) # double check :) @@ -244,8 +251,8 @@ def test_index_search_query_num_is_lower(): def test_linear_index_search_abund(): # test Index.search_abund - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") ss47 = sourmash.load_one_signature(sig47) ss63 = sourmash.load_one_signature(sig63) @@ -262,8 +269,8 @@ def test_linear_index_search_abund(): def test_linear_index_search_abund_downsample_query(): # test Index.search_abund with query with higher scaled - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") ss47 = sourmash.load_one_signature(sig47) ss63 = sourmash.load_one_signature(sig63) @@ -285,8 +292,8 @@ def test_linear_index_search_abund_downsample_query(): def test_linear_index_search_abund_downsample_subj(): # test Index.search_abund with subj with higher scaled - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") ss47 = sourmash.load_one_signature(sig47) ss63 = sourmash.load_one_signature(sig63) @@ -308,8 +315,8 @@ def test_linear_index_search_abund_downsample_subj(): def test_linear_index_search_abund_requires_threshold(): # test that Index.search_abund requires a 'threshold' - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") ss47 = sourmash.load_one_signature(sig47) ss63 = sourmash.load_one_signature(sig63) @@ -319,15 +326,15 @@ def test_linear_index_search_abund_requires_threshold(): lidx.insert(ss63) with pytest.raises(TypeError) as exc: - results = list(lidx.search_abund(ss47, threshold=None)) + list(lidx.search_abund(ss47, threshold=None)) assert "'search_abund' requires 'threshold'" in str(exc.value) def test_linear_index_search_abund_query_flat(): # test that Index.search_abund requires an abund query sig - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) ss63 = sourmash.load_one_signature(sig63) @@ -337,15 +344,17 @@ def test_linear_index_search_abund_query_flat(): lidx.insert(ss63) with pytest.raises(TypeError) as exc: - results = list(lidx.search_abund(ss47, threshold=0)) + list(lidx.search_abund(ss47, threshold=0)) - assert "'search_abund' requires query signature with abundance information" in str(exc.value) + assert "'search_abund' requires query signature with abundance information" in str( + exc.value + ) def test_linear_index_search_abund_subj_flat(): # test Index.search_abund requires an abund subj - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss47 = sourmash.load_one_signature(sig47) ss63 = sourmash.load_one_signature(sig63) 
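# search_abund(), sketched under the constraints asserted in the tests above:
# it requires an explicit threshold and abundance-weighted (track_abund)
# sketches on both the query and subject sides; lidx and ss47 are assumed
# built as in the tests.
results = list(lidx.search_abund(ss47, threshold=0.1))
for score, match, location in results:
    print(f"{score:.3f}", match.name)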
@@ -355,16 +364,19 @@ def test_linear_index_search_abund_subj_flat(): lidx.insert(ss63) with pytest.raises(TypeError) as exc: - results = list(lidx.search_abund(ss47, threshold=0)) + list(lidx.search_abund(ss47, threshold=0)) - assert "'search_abund' requires subject signatures with abundance information" in str(exc.value) + assert ( + "'search_abund' requires subject signatures with abundance information" + in str(exc.value) + ) def test_linear_index_save(runtmp): # test save output from LinearIndex => JSON - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47) @@ -375,7 +387,7 @@ def test_linear_index_save(runtmp): linear.insert(ss47) linear.insert(ss63) - filename = runtmp.output('foo') + filename = runtmp.output("foo") linear.save(filename) si = set(sourmash.load_file_as_signatures(filename)) @@ -385,24 +397,24 @@ def test_linear_index_save(runtmp): print(len(si)) print(len(x)) - print('si: ', si) - print('x: ', x) + print("si: ", si) + print("x: ", x) assert si == x, si def test_linear_index_load(runtmp): # test .load class method of LinearIndex - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47) ss63 = sourmash.load_one_signature(sig63) - filename = runtmp.output('foo') - with open(filename, 'wt') as fp: + filename = runtmp.output("foo") + with open(filename, "w") as fp: sourmash.save_signatures([ss2, ss47, ss63], fp) linear = LinearIndex.load(filename) @@ -414,9 +426,9 @@ def test_linear_index_load(runtmp): def test_linear_index_save_load(runtmp): # LinearIndex save/load round trip - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47) @@ -427,7 +439,7 @@ def test_linear_index_save_load(runtmp): linear.insert(ss47) linear.insert(ss63) - filename = runtmp.output('foo') + filename = runtmp.output("foo") linear.save(filename) linear2 = LinearIndex.load(filename) @@ -440,9 +452,9 @@ def test_linear_index_save_load(runtmp): def test_linear_gather_threshold_1(): # test gather() method, in some detail - sig2 = load_one_signature(utils.get_test_data('2.fa.sig'), ksize=31) - sig47 = load_one_signature(utils.get_test_data('47.fa.sig'), ksize=31) - sig63 = load_one_signature(utils.get_test_data('63.fa.sig'), ksize=31) + sig2 = load_one_signature(utils.get_test_data("2.fa.sig"), ksize=31) + sig47 = load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + sig63 = load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) linear = LinearIndex() @@ -498,11 +510,11 @@ def test_linear_gather_threshold_1(): def test_linear_gather_threshold_5(): # test gather() method above threshold - sig2 = load_one_signature(utils.get_test_data('2.fa.sig'), ksize=31) - sig47 = load_one_signature(utils.get_test_data('47.fa.sig'), ksize=31) - 
sig63 = load_one_signature(utils.get_test_data('63.fa.sig'), ksize=31) + sig2 = load_one_signature(utils.get_test_data("2.fa.sig"), ksize=31) + sig47 = load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + sig63 = load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) - linear = LinearIndex(filename='foo') + linear = LinearIndex(filename="foo") linear.insert(sig47) linear.insert(sig63) @@ -528,21 +540,20 @@ def test_linear_gather_threshold_5(): containment, match_sig, name = result assert containment == 1.0 assert match_sig == sig2 - assert name == 'foo' + assert name == "foo" # now, check with a threshold_bp that should be meet-able. - result = linear.best_containment(SourmashSignature(new_mh), - threshold_bp=5000) + result = linear.best_containment(SourmashSignature(new_mh), threshold_bp=5000) assert result containment, match_sig, name = result assert containment == 1.0 assert match_sig == sig2 - assert name == 'foo' + assert name == "foo" def test_linear_index_multik_select(): # test that LinearIndx can load multiple (three) ksizes, 21/31/51 - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") siglist = sourmash.load_file_as_signatures(sig2) linear = LinearIndex() @@ -550,17 +561,17 @@ def test_linear_index_multik_select(): linear.insert(ss) # select most specifically - linear2 = linear.select(ksize=31, moltype='DNA') + linear2 = linear.select(ksize=31, moltype="DNA") assert len(linear2) == 1 # all are DNA: - linear2 = linear.select(moltype='DNA') + linear2 = linear.select(moltype="DNA") assert len(linear2) == 3 def test_linear_index_moltype_select(): # this loads two ksizes(21, 10), and two moltypes (DNA and protein) - filename = utils.get_test_data('genome-s10+s11.sig') + filename = utils.get_test_data("genome-s10+s11.sig") siglist = sourmash.load_file_as_signatures(filename) linear = LinearIndex() @@ -568,19 +579,19 @@ def test_linear_index_moltype_select(): linear.insert(ss) # select most specific DNA - linear2 = linear.select(ksize=30, moltype='DNA') + linear2 = linear.select(ksize=30, moltype="DNA") assert len(linear2) == 1 # select most specific protein - linear2 = linear.select(ksize=10, moltype='protein') + linear2 = linear.select(ksize=10, moltype="protein") assert len(linear2) == 1 # can leave off ksize, selects all ksizes - linear2 = linear.select(moltype='DNA') + linear2 = linear.select(moltype="DNA") assert len(linear2) == 2 # can leave off ksize, selects all ksizes - linear2 = linear.select(moltype='protein') + linear2 = linear.select(moltype="protein") assert len(linear2) == 2 # select something impossible @@ -592,7 +603,7 @@ def test_linear_index_picklist_select(): # test LinearIndex.select with a picklist # this loads three ksizes, 21/31/51 - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") siglist = sourmash.load_file_as_signatures(sig2) linear = LinearIndex() @@ -600,22 +611,22 @@ def test_linear_index_picklist_select(): linear.insert(ss) # construct a picklist... 
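# Index.select(), sketched from the tests above: selection narrows an index
# by ksize and/or moltype (and, as below, by picklist) without copying the
# signatures; `linear` is the index built in the surrounding tests.
dna31 = linear.select(ksize=31, moltype="DNA")  # most specific selection
all_dna = linear.select(moltype="DNA")  # every ksize for one moltype
assert len(dna31) <= len(all_dna)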
- picklist = SignaturePicklist('md5prefix8') - picklist.init(['f3a90d4e']) + picklist = SignaturePicklist("md5prefix8") + picklist.init(["f3a90d4e"]) # select on picklist linear2 = linear.select(picklist=picklist) assert len(linear2) == 1 ss = list(linear2.signatures())[0] assert ss.minhash.ksize == 31 - assert ss.md5sum().startswith('f3a90d4e55') + assert ss.md5sum().startswith("f3a90d4e55") def test_linear_index_picklist_select_exclude(): # test select with a picklist, but exclude # this loads three ksizes, 21/31/51 - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") siglist = sourmash.load_file_as_signatures(sig2) linear = LinearIndex() @@ -623,8 +634,8 @@ def test_linear_index_picklist_select_exclude(): linear.insert(ss) # construct a picklist... - picklist = SignaturePicklist('md5prefix8', pickstyle=PickStyle.EXCLUDE) - picklist.init(['f3a90d4e']) + picklist = SignaturePicklist("md5prefix8", pickstyle=PickStyle.EXCLUDE) + picklist.init(["f3a90d4e"]) # select on picklist linear2 = linear.select(picklist=picklist) @@ -634,37 +645,39 @@ def test_linear_index_picklist_select_exclude(): for ss in list(linear2.signatures()): md5s.add(ss.md5sum()) ksizes.add(ss.minhash.ksize) - assert md5s == set(['f372e47893edd349e5956f8b0d8dcbf7','43f3b48e59443092850964d355a20ac0']) - assert ksizes == set([21,51]) + assert md5s == set( + ["f372e47893edd349e5956f8b0d8dcbf7", "43f3b48e59443092850964d355a20ac0"] + ) + assert ksizes == set([21, 51]) def test_index_same_md5sum_fsstorage(runtmp): # check SBT directory 'save' with two signatures that have identical md5 c = runtmp - testdata1 = utils.get_test_data('img/2706795855.sig') - testdata2 = utils.get_test_data('img/638277004.sig') + testdata1 = utils.get_test_data("img/2706795855.sig") + testdata2 = utils.get_test_data("img/638277004.sig") - c.run_sourmash('index', '-k', '21', 'zzz.sbt.json', testdata1, testdata2) + c.run_sourmash("index", "-k", "21", "zzz.sbt.json", testdata1, testdata2) assert c.last_result.status == 0 - outfile = c.output('zzz.sbt.json') + outfile = c.output("zzz.sbt.json") assert os.path.exists(outfile) - storage = c.output('.sbt.zzz') + storage = c.output(".sbt.zzz") assert len(glob.glob(storage + "/*")) == 4 def test_index_same_md5sum_sbt_zipstorage(runtmp): # check SBT zipfile 'save' with two signatures w/identical md5 c = runtmp - testdata1 = utils.get_test_data('img/2706795855.sig') - testdata2 = utils.get_test_data('img/638277004.sig') + testdata1 = utils.get_test_data("img/2706795855.sig") + testdata2 = utils.get_test_data("img/638277004.sig") - c.run_sourmash('index', '-k', '21', 'zzz.sbt.zip', testdata1, testdata2) + c.run_sourmash("index", "-k", "21", "zzz.sbt.zip", testdata1, testdata2) assert c.last_result.status == 0 - outfile = c.output('zzz.sbt.zip') + outfile = c.output("zzz.sbt.zip") assert os.path.exists(outfile) - zout = zipfile.ZipFile(outfile, mode='r') + zout = zipfile.ZipFile(outfile, mode="r") # should have 3 files, 1 internal and two sigs. 
We check for 4 because the # directory also shows in namelist() assert len([f for f in zout.namelist() if f.startswith(".sbt.zzz/")]) == 5 @@ -672,11 +685,11 @@ def test_index_same_md5sum_sbt_zipstorage(runtmp): def test_zipfile_does_not_exist(runtmp): with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('sig', 'describe', 'no-exist.zip') + runtmp.sourmash("sig", "describe", "no-exist.zip") # old behavior, pre PR #1777 - assert 'FileNotFoundError: SOURMASH-MANIFEST.csv' not in str(exc) - assert not os.path.exists(runtmp.output('no-exist.zip')) + assert "FileNotFoundError: SOURMASH-MANIFEST.csv" not in str(exc) + assert not os.path.exists(runtmp.output("no-exist.zip")) # correct behavior assert "ERROR: Error while reading signatures from 'no-exist.zip'." in str(exc) @@ -686,90 +699,102 @@ def test_zipfile_protein_command_search(runtmp): # test command-line search/gather of zipfile with protein sigs c = runtmp - sigfile1 = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db_out = utils.get_test_data('prot/protein.zip') + sigfile1 = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + db_out = utils.get_test_data("prot/protein.zip") - c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches' in c.last_result.out + c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0") + assert "2 matches" in c.last_result.out - c.run_sourmash('gather', sigfile1, db_out) - assert 'found 1 matches total' in c.last_result.out - assert 'the recovered matches hit 100.0% of the query' in c.last_result.out + c.run_sourmash("gather", sigfile1, db_out) + assert "found 1 matches total" in c.last_result.out + assert "the recovered matches hit 100.0% of the query" in c.last_result.out def test_zipfile_hp_command_search(runtmp): # test command-line search/gather of zipfile with hp sigs c = runtmp - sigfile1 = utils.get_test_data('prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db_out = utils.get_test_data('prot/hp.zip') + sigfile1 = utils.get_test_data( + "prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + db_out = utils.get_test_data("prot/hp.zip") - c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches' in c.last_result.out + c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0") + assert "2 matches" in c.last_result.out - c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0') - assert 'found 1 matches total' in c.last_result.out - assert 'the recovered matches hit 100.0% of the query' in c.last_result.out + c.run_sourmash("gather", sigfile1, db_out, "--threshold", "0.0") + assert "found 1 matches total" in c.last_result.out + assert "the recovered matches hit 100.0% of the query" in c.last_result.out def test_zipfile_dayhoff_command_search(runtmp): # test command-line search/gather of zipfile with dayhoff sigs c = runtmp - sigfile1 = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db_out = utils.get_test_data('prot/dayhoff.zip') + sigfile1 = utils.get_test_data( + "prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + db_out = utils.get_test_data("prot/dayhoff.zip") - c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches' in c.last_result.out + c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0") + assert "2 matches" in c.last_result.out - c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0') - assert 'found 1 matches 
total' in c.last_result.out - assert 'the recovered matches hit 100.0% of the query' in c.last_result.out + c.run_sourmash("gather", sigfile1, db_out, "--threshold", "0.0") + assert "found 1 matches total" in c.last_result.out + assert "the recovered matches hit 100.0% of the query" in c.last_result.out def test_zipfile_protein_command_search_combined(runtmp): # test command-line search/gather of combined zipfile with protein sigs c = runtmp - sigfile1 = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db_out = utils.get_test_data('prot/all.zip') + sigfile1 = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + db_out = utils.get_test_data("prot/all.zip") - c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches' in c.last_result.out + c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0") + assert "2 matches" in c.last_result.out - c.run_sourmash('gather', sigfile1, db_out) - assert 'found 1 matches total' in c.last_result.out - assert 'the recovered matches hit 100.0% of the query' in c.last_result.out + c.run_sourmash("gather", sigfile1, db_out) + assert "found 1 matches total" in c.last_result.out + assert "the recovered matches hit 100.0% of the query" in c.last_result.out def test_zipfile_hp_command_search_combined(runtmp): # test command-line search/gather of combined zipfile with hp sigs c = runtmp - sigfile1 = utils.get_test_data('prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db_out = utils.get_test_data('prot/all.zip') + sigfile1 = utils.get_test_data( + "prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + db_out = utils.get_test_data("prot/all.zip") - c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches' in c.last_result.out + c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0") + assert "2 matches" in c.last_result.out - c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0') - assert 'found 1 matches total' in c.last_result.out - assert 'the recovered matches hit 100.0% of the query' in c.last_result.out + c.run_sourmash("gather", sigfile1, db_out, "--threshold", "0.0") + assert "found 1 matches total" in c.last_result.out + assert "the recovered matches hit 100.0% of the query" in c.last_result.out def test_zipfile_dayhoff_command_search_combined(runtmp): # test command-line search/gather of combined zipfile with dayhoff sigs c = runtmp - sigfile1 = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db_out = utils.get_test_data('prot/all.zip') + sigfile1 = utils.get_test_data( + "prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + db_out = utils.get_test_data("prot/all.zip") - c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches' in c.last_result.out + c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0") + assert "2 matches" in c.last_result.out - c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0') - assert 'found 1 matches total' in c.last_result.out - assert 'the recovered matches hit 100.0% of the query' in c.last_result.out + c.run_sourmash("gather", sigfile1, db_out, "--threshold", "0.0") + assert "found 1 matches total" in c.last_result.out + assert "the recovered matches hit 100.0% of the query" in c.last_result.out def test_zipfile_dayhoff_command_search_protein(runtmp): @@ -777,21 +802,23 @@ def test_zipfile_dayhoff_command_search_protein(runtmp): c = runtmp # with dayhoff 
query - sigfile1 = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db_out = utils.get_test_data('prot/protein.zip') + sigfile1 = utils.get_test_data( + "prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + db_out = utils.get_test_data("prot/protein.zip") - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') + with pytest.raises(SourmashCommandFailed): + c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0") print(c.last_result.out) print(c.last_result.err) - assert 'no compatible signatures found in ' in c.last_result.err + assert "no compatible signatures found in " in c.last_result.err def test_zipfile_API_signatures(use_manifest): # return all of the .sig and .sig.gz files in all.zip - zipfile_db = utils.get_test_data('prot/all.zip') + zipfile_db = utils.get_test_data("prot/all.zip") zipidx = ZipFileLinearIndex.load(zipfile_db, use_manifest=use_manifest) siglist = list(zipidx.signatures()) @@ -814,7 +841,7 @@ def __init__(self): pass def signatures(self): - yield 'a' + yield "a" raise Exception("don't touch me!") def __len__(self): @@ -832,10 +859,11 @@ def __len__(self): def test_zipfile_API_signatures_traverse_yield_all(use_manifest): # include dna-sig.noext, but not build.sh (cannot be loaded as signature) - zipfile_db = utils.get_test_data('prot/all.zip') + zipfile_db = utils.get_test_data("prot/all.zip") - zipidx = ZipFileLinearIndex.load(zipfile_db, traverse_yield_all=True, - use_manifest=use_manifest) + zipidx = ZipFileLinearIndex.load( + zipfile_db, traverse_yield_all=True, use_manifest=use_manifest + ) siglist = list(zipidx.signatures()) assert len(siglist) == 8 assert len(zipidx) == 8 @@ -848,11 +876,12 @@ def test_zipfile_API_signatures_traverse_yield_all(use_manifest): def test_zipfile_API_signatures_traverse_yield_all_select(use_manifest): # include dna-sig.noext - zipfile_db = utils.get_test_data('prot/all.zip') + zipfile_db = utils.get_test_data("prot/all.zip") - zipidx = ZipFileLinearIndex.load(zipfile_db, traverse_yield_all=True, - use_manifest=use_manifest) - zipidx = zipidx.select(moltype='DNA') + zipidx = ZipFileLinearIndex.load( + zipfile_db, traverse_yield_all=True, use_manifest=use_manifest + ) + zipidx = zipidx.select(moltype="DNA") siglist = list(zipidx.signatures()) assert len(siglist) == 2 assert len(zipidx) == 2 @@ -860,14 +889,15 @@ def test_zipfile_API_signatures_traverse_yield_all_select(use_manifest): def test_zipfile_API_signatures_traverse_yield_all_manifest(): # check that manifest len is correct - zipfile_db = utils.get_test_data('prot/all.zip') + zipfile_db = utils.get_test_data("prot/all.zip") - zipidx = ZipFileLinearIndex.load(zipfile_db, traverse_yield_all=True, - use_manifest=True) + zipidx = ZipFileLinearIndex.load( + zipfile_db, traverse_yield_all=True, use_manifest=True + ) assert len(zipidx) == 8, len(zipidx) assert len(zipidx.manifest) == 8, len(zipidx.manifest) - zipidx = zipidx.select(moltype='DNA') + zipidx = zipidx.select(moltype="DNA") siglist = list(zipidx.signatures()) assert len(siglist) == 2 assert len(zipidx) == 2 @@ -876,13 +906,13 @@ def test_zipfile_API_signatures_traverse_yield_all_manifest(): def test_zipfile_API_signatures_select(use_manifest): # include dna-sig.noext - zipfile_db = utils.get_test_data('prot/all.zip') + zipfile_db = utils.get_test_data("prot/all.zip") zipidx = ZipFileLinearIndex.load(zipfile_db, use_manifest=use_manifest) ziplist_pre = LinearIndex(zipidx.signatures()) - ziplist_pre = 
ziplist_pre.select(moltype='DNA') + ziplist_pre = ziplist_pre.select(moltype="DNA") - zipidx = zipidx.select(moltype='DNA') + zipidx = zipidx.select(moltype="DNA") siglist = list(zipidx.signatures()) if use_manifest: @@ -897,7 +927,7 @@ def test_zipfile_API_signatures_select(use_manifest): def test_zipfile_API_signatures_select_abund_false(use_manifest): # check for abund=False (all signatures match b/c can convert) - zipfile_db = utils.get_test_data('track_abund/track_abund.zip') + zipfile_db = utils.get_test_data("track_abund/track_abund.zip") zipidx = ZipFileLinearIndex.load(zipfile_db, use_manifest=use_manifest) ziplist_pre = LinearIndex(zipidx.signatures()) @@ -913,7 +943,7 @@ def test_zipfile_API_signatures_select_abund_false(use_manifest): def test_zipfile_API_signatures_select_abund_true(use_manifest): # find all abund=True (all signatures match, b/c abund) - zipfile_db = utils.get_test_data('track_abund/track_abund.zip') + zipfile_db = utils.get_test_data("track_abund/track_abund.zip") zipidx = ZipFileLinearIndex.load(zipfile_db, use_manifest=use_manifest) ziplist_pre = LinearIndex(zipidx.signatures()) @@ -929,7 +959,7 @@ def test_zipfile_API_signatures_select_abund_true(use_manifest): def test_zipfile_API_signatures_select_abund_none(use_manifest): # find all abund=None (all signatures match, b/c no selection criteria) - zipfile_db = utils.get_test_data('track_abund/track_abund.zip') + zipfile_db = utils.get_test_data("track_abund/track_abund.zip") zipidx = ZipFileLinearIndex.load(zipfile_db, use_manifest=use_manifest) ziplist_pre = LinearIndex(zipidx.signatures()) @@ -945,14 +975,14 @@ def test_zipfile_API_signatures_select_abund_none(use_manifest): def test_zipfile_API_signatures_select_twice(use_manifest): # include dna-sig.noext - zipfile_db = utils.get_test_data('prot/all.zip') + zipfile_db = utils.get_test_data("prot/all.zip") zipidx = ZipFileLinearIndex.load(zipfile_db, use_manifest=use_manifest) ziplist_pre = LinearIndex(zipidx.signatures()) - ziplist_pre = ziplist_pre.select(moltype='DNA') + ziplist_pre = ziplist_pre.select(moltype="DNA") ziplist_pre = ziplist_pre.select(ksize=31) - zipidx = zipidx.select(moltype='DNA') + zipidx = zipidx.select(moltype="DNA") zipidx = zipidx.select(ksize=31) siglist = list(zipidx.signatures()) @@ -968,17 +998,17 @@ def test_zipfile_API_signatures_select_twice(use_manifest): def test_zipfile_API_save(): # ZipFileLinearIndex.save is not implemented. - zipfile_db = utils.get_test_data('prot/all.zip') + zipfile_db = utils.get_test_data("prot/all.zip") zipidx = ZipFileLinearIndex.load(zipfile_db) with pytest.raises(NotImplementedError): - zipidx.save('xxx') + zipidx.save("xxx") def test_zipfile_API_insert(): # ZipFileLinearIndex.insert is not implemented. 
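# ZipFileLinearIndex, sketched from the tests above: a read-only index over
# a .zip collection, loaded from a path and narrowed with select(); save()
# and insert() deliberately raise NotImplementedError. The path below is
# illustrative.
from sourmash.index import ZipFileLinearIndex

zipidx = ZipFileLinearIndex.load("prot/all.zip")
zipidx = zipidx.select(moltype="DNA", ksize=31)
for ss in zipidx.signatures():
    print(ss.name)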
- zipfile_db = utils.get_test_data('prot/all.zip') + zipfile_db = utils.get_test_data("prot/all.zip") zipidx = ZipFileLinearIndex.load(zipfile_db) @@ -989,7 +1019,7 @@ def test_zipfile_API_insert(): def test_zipfile_API_location(use_manifest): # test ZipFileLinearIndex.location property - zipfile_db = utils.get_test_data('prot/all.zip') + zipfile_db = utils.get_test_data("prot/all.zip") zipidx = ZipFileLinearIndex.load(zipfile_db, use_manifest=use_manifest) @@ -1000,9 +1030,8 @@ def test_zipfile_load_file_as_signatures(use_manifest): # make sure that ZipFileLinearIndex.signatures works, and is generator from types import GeneratorType - zipfile_db = utils.get_test_data('prot/all.zip') - sigs = sourmash_args.load_file_as_signatures(zipfile_db, - _use_manifest=use_manifest) + zipfile_db = utils.get_test_data("prot/all.zip") + sigs = sourmash_args.load_file_as_signatures(zipfile_db, _use_manifest=use_manifest) # it's fine if this needs to change, but for now I want to make # sure that this is a generator. @@ -1019,10 +1048,10 @@ def test_zipfile_load_file_as_signatures_traverse_yield_all(use_manifest): # test with --force, which loads all files from types import GeneratorType - zipfile_db = utils.get_test_data('prot/all.zip') - sigs = sourmash_args.load_file_as_signatures(zipfile_db, - yield_all_files=True, - _use_manifest=use_manifest) + zipfile_db = utils.get_test_data("prot/all.zip") + sigs = sourmash_args.load_file_as_signatures( + zipfile_db, yield_all_files=True, _use_manifest=use_manifest + ) # it's fine if this needs to change, but for now I want to make # sure that this is a generator. @@ -1036,21 +1065,21 @@ def test_zipfile_load_database_fail_if_not_zip(runtmp): # fail _load_database if not .zip c = runtmp - zipfile_db = utils.get_test_data('prot/all.zip') - badname = c.output('xyz.nada') + zipfile_db = utils.get_test_data("prot/all.zip") + badname = c.output("xyz.nada") shutil.copyfile(zipfile_db, badname) with pytest.raises(ValueError) as exc: - sigs = sourmash_args.load_file_as_signatures(badname) + sourmash_args.load_file_as_signatures(badname) - assert 'Error while reading signatures from' in str(exc.value) + assert "Error while reading signatures from" in str(exc.value) def test_multi_index_search(): # test MultiIndex.search - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47) @@ -1061,8 +1090,7 @@ def test_multi_index_search(): lidx3 = LinearIndex.load(sig63) # create MultiIndex with source location override - lidx = MultiIndex.load([lidx1, lidx2, lidx3], ['A', None, 'C'], - None) + lidx = MultiIndex.load([lidx1, lidx2, lidx3], ["A", None, "C"], None) lidx = lidx.select(ksize=31) # now, search for sig2 @@ -1070,7 +1098,7 @@ def test_multi_index_search(): print([s[1].name for s in sr]) assert len(sr) == 1 assert sr[0][1] == ss2 - assert sr[0][2] == 'A' # source override + assert sr[0][2] == "A" # source override # search for sig47 with lower threshold; search order not guaranteed. 
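# MultiIndex, sketched from this test: load() wraps child indexes with
# optional per-child source overrides (None keeps each signature's own
# location), and search results carry the override in the location slot.
from sourmash.index import MultiIndex

mi = MultiIndex.load([lidx1, lidx2, lidx3], ["A", None, "C"], None)
mi = mi.select(ksize=31)
sr = mi.search(ss2, threshold=1.0)
assert sr[0][2] == "A"  # overridden source location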
sr = lidx.search(ss47, threshold=0.1) @@ -1078,9 +1106,9 @@ def test_multi_index_search(): assert len(sr) == 2 sr.sort(key=lambda x: -x[0]) assert sr[0][1] == ss47 - assert sr[0][2] == sig47 # source was set to None, so no override + assert sr[0][2] == sig47 # source was set to None, so no override assert sr[1][1] == ss63 - assert sr[1][2] == 'C' # source override + assert sr[1][2] == "C" # source override # search for sig63 with lower threshold; search order not guaranteed. sr = lidx.search(ss63, threshold=0.1) @@ -1088,9 +1116,9 @@ def test_multi_index_search(): assert len(sr) == 2 sr.sort(key=lambda x: -x[0]) assert sr[0][1] == ss63 - assert sr[0][2] == 'C' # source override + assert sr[0][2] == "C" # source override assert sr[1][1] == ss47 - assert sr[1][2] == sig47 # source was set to None, so no override + assert sr[1][2] == sig47 # source was set to None, so no override # search for sig63 with high threshold => 1 match sr = lidx.search(ss63, threshold=0.8) @@ -1098,45 +1126,44 @@ def test_multi_index_search(): assert len(sr) == 1 sr.sort(key=lambda x: -x[0]) assert sr[0][1] == ss63 - assert sr[0][2] == 'C' # source override + assert sr[0][2] == "C" # source override def test_multi_index_gather(): # test MultiIndex.best_containment - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47) - ss63 = sourmash.load_one_signature(sig63) + sourmash.load_one_signature(sig63) lidx1 = LinearIndex.load(sig2) lidx2 = LinearIndex.load(sig47) lidx3 = LinearIndex.load(sig63) # create MultiIndex with source location override - lidx = MultiIndex.load([lidx1, lidx2, lidx3], ['A', None, 'C'], - None) + lidx = MultiIndex.load([lidx1, lidx2, lidx3], ["A", None, "C"], None) lidx = lidx.select(ksize=31) match = lidx.best_containment(ss2) assert match assert match.score == 1.0 - assert match.location == 'A' + assert match.location == "A" match = lidx.best_containment(ss47) assert match assert match.score == 1.0 assert match.signature == ss47 - assert match.location == sig47 # no source override + assert match.location == sig47 # no source override def test_multi_index_signatures(): # test MultiIndex.signatures - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47) @@ -1147,8 +1174,7 @@ def test_multi_index_signatures(): lidx3 = LinearIndex.load(sig63) # create MultiIndex with source location override - lidx = MultiIndex.load([lidx1, lidx2, lidx3], ['A', None, 'C'], - None) + lidx = MultiIndex.load([lidx1, lidx2, lidx3], ["A", None, "C"], None) lidx = lidx.select(ksize=31) siglist = list(lidx.signatures()) @@ -1168,13 +1194,13 @@ def test_multi_index_create_prepend(): # test MultiIndex constructor - location must be specified if # 'prepend_location is True with pytest.raises(ValueError): - mi = MultiIndex(None, None, prepend_location=True) + MultiIndex(None, None, prepend_location=True) def test_multi_index_load_from_directory(): # test MultiIndex loading from a directory. 
The full paths to the # signature files should be available via 'signatures_with_location()' - dirname = utils.get_test_data('prot/protein') + dirname = utils.get_test_data("prot/protein") mi = MultiIndex.load_from_directory(dirname, force=False) assert mi.location == dirname @@ -1183,10 +1209,12 @@ def test_multi_index_load_from_directory(): assert len(sigs) == 2 # check to make sure that full paths to expected sig files are returned - locs = [ x[1] for x in mi.signatures_with_location() ] + locs = [x[1] for x in mi.signatures_with_location()] - endings = ('GCA_001593925.1_ASM159392v1_protein.faa.gz.sig', - 'GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + endings = ( + "GCA_001593925.1_ASM159392v1_protein.faa.gz.sig", + "GCA_001593935.1_ASM159393v1_protein.faa.gz.sig", + ) for loc in locs: found = False for end in endings: @@ -1195,16 +1223,16 @@ def test_multi_index_load_from_directory(): assert found, f"could not find full filename in locations for {end}" # also check internal locations and parent value -- - assert mi.parent.endswith('prot/protein') + assert mi.parent.endswith("prot/protein") - ilocs = [ x[1] for x in mi._signatures_with_internal() ] + ilocs = [x[1] for x in mi._signatures_with_internal()] assert endings[0] in ilocs, ilocs assert endings[1] in ilocs, ilocs def test_multi_index_load_from_directory_2(): # only load .sig files, currently; not the databases under that directory. - dirname = utils.get_test_data('prot') + dirname = utils.get_test_data("prot") mi = MultiIndex.load_from_directory(dirname, force=False) sigs = list(mi.signatures()) @@ -1214,13 +1242,12 @@ def test_multi_index_load_from_directory_2(): def test_multi_index_load_from_directory_3_simple_bad_file(runtmp): # check that force=False fails properly when confronted with non-JSON # files. - c = runtmp - with open(runtmp.output('badsig.sig'), 'wt') as fp: - fp.write('bad content.') + with open(runtmp.output("badsig.sig"), "w") as fp: + fp.write("bad content.") with pytest.raises(ValueError): - mi = MultiIndex.load_from_directory(runtmp.location, force=False) + MultiIndex.load_from_directory(runtmp.location, force=False) def test_multi_index_load_from_directory_3(runtmp): @@ -1228,7 +1255,7 @@ def test_multi_index_load_from_directory_3(runtmp): # files that are legit sourmash files... c = runtmp - dirname = utils.get_test_data('prot') + dirname = utils.get_test_data("prot") count = 0 for root, dirs, files in os.walk(dirname): @@ -1240,7 +1267,7 @@ def test_multi_index_load_from_directory_3(runtmp): count += 1 with pytest.raises(ValueError): - mi = MultiIndex.load_from_directory(c.location, force=False) + MultiIndex.load_from_directory(c.location, force=False) def test_multi_index_load_from_directory_3_yield_all_true(runtmp): @@ -1248,7 +1275,7 @@ def test_multi_index_load_from_directory_3_yield_all_true(runtmp): # Note here that only .sig/.sig.gz files are loaded. c = runtmp - dirname = utils.get_test_data('prot') + dirname = utils.get_test_data("prot") count = 0 for root, dirs, files in os.walk(dirname): @@ -1269,7 +1296,7 @@ def test_multi_index_load_from_directory_3_yield_all_true_subdir(runtmp): # check that force works ok on subdirectories. # Note here that only .sig/.sig.gz files are loaded. 
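The load_from_directory tests above and the force variants below encode two rules: with force=False, any unparseable file under the tree raises ValueError, while force=True skips what it cannot load and only picks up .sig/.sig.gz files. In sketch form:

    from sourmash.index import MultiIndex
    import sourmash_tst_utils as utils

    dirname = utils.get_test_data("prot/protein")

    # strict: raises ValueError if a non-signature file is encountered
    mi = MultiIndex.load_from_directory(dirname, force=False)
    assert len(list(mi.signatures())) == 2  # per the test above

    # permissive: tolerates stray files, still loads only .sig/.sig.gz
    mi = MultiIndex.load_from_directory(dirname, force=True)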
c = runtmp - dirname = utils.get_test_data('prot') + dirname = utils.get_test_data("prot") target_dir = c.output("some_subdir") os.mkdir(target_dir) @@ -1285,7 +1312,7 @@ def test_multi_index_load_from_directory_3_yield_all_true_subdir(runtmp): mi = MultiIndex.load_from_directory(c.location, force=True) - locations = set([ row['internal_location'] for row in mi.manifest.rows ]) + locations = set([row["internal_location"] for row in mi.manifest.rows]) print(locations) sigs = list(mi.signatures()) @@ -1296,12 +1323,12 @@ def test_multi_index_load_from_directory_3_sig_gz(runtmp): # check that we find .sig.gz files, too c = runtmp - dirname = utils.get_test_data('prot') + dirname = utils.get_test_data("prot") count = 0 for root, dirs, files in os.walk(dirname): for name in files: - if not name.endswith('.sig'): # skip non .sig things + if not name.endswith(".sig"): # skip non .sig things continue print(f"at {name}") fullname = os.path.join(root, name) @@ -1321,26 +1348,25 @@ def test_multi_index_load_from_directory_3_check_traverse_fn(runtmp): # test the actual traverse function... eventually this test can be # removed, probably, as we consolidate functionality and test MultiIndex # better. - c = runtmp - dirname = utils.get_test_data('prot') + dirname = utils.get_test_data("prot") files = list(sourmash_args.traverse_find_sigs([dirname])) assert len(files) == 7, files files = list(sourmash_args.traverse_find_sigs([dirname], True)) - assert len(files) == 20, files # if this fails, check for extra files! + assert len(files) == 20, files # if this fails, check for extra files! def test_multi_index_load_from_directory_no_exist(): # raise ValueError on files that don't exist in load_from_directory - dirname = utils.get_test_data('does-not-exist') + dirname = utils.get_test_data("does-not-exist") with pytest.raises(ValueError): - mi = MultiIndex.load_from_directory(dirname, force=True) + MultiIndex.load_from_directory(dirname, force=True) def test_multi_index_load_from_file_path(): # test that MultiIndex.load_from_path works fine - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") mi = MultiIndex.load_from_path(sig2) assert len(mi) == 3 @@ -1349,29 +1375,29 @@ def test_multi_index_load_from_file_path(): def test_multi_index_load_from_file_path_no_exist(): # test that load_from_path fails on non-existent files - filename = utils.get_test_data('does-not-exist') + filename = utils.get_test_data("does-not-exist") with pytest.raises(ValueError): - mi = MultiIndex.load_from_directory(filename, force=True) + MultiIndex.load_from_directory(filename, force=True) def test_multi_index_load_from_pathlist_no_exist(): # test that load_from_pathlist fails on non-existent files - dirname = utils.get_test_data('does-not-exist') + dirname = utils.get_test_data("does-not-exist") with pytest.raises(ValueError): - mi = MultiIndex.load_from_pathlist(dirname) + MultiIndex.load_from_pathlist(dirname) def test_multi_index_load_from_pathlist_1(runtmp): # test functionality of MultiIndex.load_from_pathlist with .sig files c = runtmp - dirname = utils.get_test_data('prot') + dirname = utils.get_test_data("prot") files = list(sourmash_args.traverse_find_sigs([dirname])) assert len(files) == 7, files - file_list = c.output('filelist.txt') + file_list = c.output("filelist.txt") - with open(file_list, 'wt') as fp: + with open(file_list, "w") as fp: print("\n".join(files), file=fp) mi = MultiIndex.load_from_pathlist(file_list) @@ -1388,54 +1414,57 @@ def test_multi_index_load_from_pathlist_2(runtmp): # 
CTB note: if you create extra files under this directory, # it will fail :) c = runtmp - dirname = utils.get_test_data('prot') + dirname = utils.get_test_data("prot") files = list(sourmash_args.traverse_find_sigs([dirname], True)) - assert len(files) == 20, files # check there aren't extra files in here! + assert len(files) == 20, files # check there aren't extra files in here! - file_list = c.output('filelist.txt') + file_list = c.output("filelist.txt") - with open(file_list, 'wt') as fp: + with open(file_list, "w") as fp: print("\n".join(files), file=fp) with pytest.raises(ValueError) as exc: - mi = MultiIndex.load_from_pathlist(file_list) + MultiIndex.load_from_pathlist(file_list) print(str(exc)) - assert 'Error while reading signatures from' in str(exc) + assert "Error while reading signatures from" in str(exc) def test_multi_index_load_from_pathlist_3_zipfile(runtmp): # can we load zipfiles in a pathlist? yes please. c = runtmp - zipfile = utils.get_test_data('prot/all.zip') + zipfile = utils.get_test_data("prot/all.zip") - file_list = c.output('filelist.txt') + file_list = c.output("filelist.txt") - with open(file_list, 'wt') as fp: + with open(file_list, "w") as fp: print(zipfile, file=fp) mi = MultiIndex.load_from_pathlist(file_list) assert len(mi) == 8 + ## ## test a slightly outre version of JaccardSearch - this is a test of the ## JaccardSearch 'collect' protocol, in particular... ## + class JaccardSearchBestOnly_ButIgnore(JaccardSearch): "A class that ignores certain results, but still does all the pruning." + def __init__(self, ignore_list): super().__init__(SearchType.JACCARD, threshold=0.1) self.ignore_list = ignore_list # a collect function that _ignores_ things in the ignore_list def collect(self, score, match): - print('in collect; current threshold:', self.threshold) + print("in collect; current threshold:", self.threshold) for q in self.ignore_list: - print('ZZZ', match, match.similarity(q)) + print("ZZZ", match, match.similarity(q)) if match.similarity(q) == 1.0: - print('yes, found.') + print("yes, found.") return False # update threshold if not perfect match, which could help prune. @@ -1445,9 +1474,9 @@ def collect(self, score, match): def test_linear_index_gather_ignore(): # do we properly ignore exact matches in 'search' for LinearIndex? - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47, ksize=31) @@ -1460,7 +1489,7 @@ def test_linear_index_gather_ignore(): search_fn = JaccardSearchBestOnly_ButIgnore([ss47]) results = list(lidx.find(search_fn, ss47)) - results = [ sr.signature for sr in results ] + results = [sr.signature for sr in results] def is_found(ss, xx): for q in xx: @@ -1478,9 +1507,9 @@ def test_lca_index_gather_ignore(): # do we properly ignore exact matches in gather on an LCA DB? 
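JaccardSearchBestOnly_ButIgnore above is the interesting part of this hunk: the collect() hook lets a JaccardSearch subclass veto individual matches (by returning False) while still updating the threshold used for pruning. The gather-ignore tests that follow all use it the same way; a sketch against a LinearIndex, reusing the class defined above:

    import sourmash
    from sourmash.index import LinearIndex
    import sourmash_tst_utils as utils

    sig47 = utils.get_test_data("47.fa.sig")
    ss47 = sourmash.load_one_signature(sig47, ksize=31)

    lidx = LinearIndex.load(sig47)

    # ignore the exact match to ss47; with only sig47 loaded,
    # this should come back empty.
    search_fn = JaccardSearchBestOnly_ButIgnore([ss47])
    results = [sr.signature for sr in lidx.find(search_fn, ss47)]
    assert not results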
from sourmash.lca import LCA_Database - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47, ksize=31) @@ -1496,7 +1525,7 @@ def test_lca_index_gather_ignore(): search_fn = JaccardSearchBestOnly_ButIgnore([ss47]) results = list(db.find(search_fn, ss47)) - results = [ sr.signature for sr in results ] + results = [sr.signature for sr in results] def is_found(ss, xx): for q in xx: @@ -1512,9 +1541,9 @@ def is_found(ss, xx): def test_sbt_index_gather_ignore(): # do we properly ignore exact matches in gather on an SBT? - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47, ksize=31) @@ -1529,15 +1558,15 @@ def test_sbt_index_gather_ignore(): db.insert(ss63) # ...now search with something that should ignore sig47, the exact match. - print(f'\n** trying to ignore {ss47}') + print(f"\n** trying to ignore {ss47}") search_fn = JaccardSearchBestOnly_ButIgnore([ss47]) results = list(db.find(search_fn, ss47)) - results = [ sr.signature for sr in results ] + results = [sr.signature for sr in results] def is_found(ss, xx): for q in xx: - print('is found?', ss, ss.similarity(q)) + print("is found?", ss, ss.similarity(q)) if ss.similarity(q) == 1.0: return True return False @@ -1552,39 +1581,41 @@ def test_counter_gather_test_consume(): # (see test_index_protocol.py for generic CounterGather tests.) query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") match_mh_1 = query_mh.copy_and_clear() match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") match_mh_2 = query_mh.copy_and_clear() match_mh_2.add_many(range(7, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') + match_ss_2 = SourmashSignature(match_mh_2, name="match2") match_mh_3 = query_mh.copy_and_clear() match_mh_3.add_many(range(13, 17)) - match_ss_3 = SourmashSignature(match_mh_3, name='match3') + match_ss_3 = SourmashSignature(match_mh_3, name="match3") # load up the counter counter = CounterGather(query_ss) - counter.add(match_ss_1, location='loc a') - counter.add(match_ss_2, location='loc b') - counter.add(match_ss_3, location='loc c') + counter.add(match_ss_1, location="loc a") + counter.add(match_ss_2, location="loc b") + counter.add(match_ss_3, location="loc c") ### ok, dig into actual counts... 
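Before the round-by-round asserts below, it may help to see the whole peek/consume contract in one place. This is a sketch of the loop the rounds step through by hand; the query-shrinking step (to_mutable()/remove_many()) is an assumption about one reasonable way to update the query between rounds, not necessarily what the tests' helper does:

    import sourmash
    from sourmash import SourmashSignature
    from sourmash.index import CounterGather

    query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1)
    query_mh.add_many(range(0, 20))
    query_ss = SourmashSignature(query_mh, name="query")

    match_mh = query_mh.copy_and_clear()
    match_mh.add_many(range(0, 10))

    counter = CounterGather(query_ss)
    counter.add(SourmashSignature(match_mh, name="match1"), location="loc a")

    cur_query = query_ss.minhash
    while True:
        result = counter.peek(cur_query)
        if not result:                      # empty => gather is done
            break
        sr, intersect_mh = result
        print(sr.location, len(intersect_mh))
        counter.consume(intersect_mh)       # update internal counters
        cur_query = cur_query.to_mutable()
        cur_query.remove_many(intersect_mh.hashes)  # shrink the query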
import pprint + pprint.pprint(counter.counter) pprint.pprint(list(counter.signatures())) pprint.pprint(counter.locations) assert set(counter.signatures()) == set([match_ss_1, match_ss_2, match_ss_3]) - assert list(sorted(counter.locations.values())) == ['loc a', 'loc b', 'loc c'] + assert list(sorted(counter.locations.values())) == ["loc a", "loc b", "loc c"] pprint.pprint(counter.counter.most_common()) - assert list(counter.counter.most_common()) == \ - [('26d4943627b33c446f37be1f5baf8d46', 10), - ('f51cedec90ea666e0ebc11aa274eca61', 8), - ('f331f8279113d77e42ab8efca8f9cc17', 4)] + assert list(counter.counter.most_common()) == [ + ("26d4943627b33c446f37be1f5baf8d46", 10), + ("f51cedec90ea666e0ebc11aa274eca61", 8), + ("f331f8279113d77e42ab8efca8f9cc17", 4), + ] ## round 1 @@ -1595,12 +1626,13 @@ def test_counter_gather_test_consume(): assert cur_query == query_ss.minhash counter.consume(intersect_mh) - assert set(counter.signatures()) == set([ match_ss_1, match_ss_2, match_ss_3 ]) - assert list(sorted(counter.locations.values())) == ['loc a', 'loc b', 'loc c'] + assert set(counter.signatures()) == set([match_ss_1, match_ss_2, match_ss_3]) + assert list(sorted(counter.locations.values())) == ["loc a", "loc b", "loc c"] pprint.pprint(counter.counter.most_common()) - assert list(counter.counter.most_common()) == \ - [('f51cedec90ea666e0ebc11aa274eca61', 5), - ('f331f8279113d77e42ab8efca8f9cc17', 4)] + assert list(counter.counter.most_common()) == [ + ("f51cedec90ea666e0ebc11aa274eca61", 5), + ("f331f8279113d77e42ab8efca8f9cc17", 4), + ] ### round 2 @@ -1611,12 +1643,13 @@ def test_counter_gather_test_consume(): assert cur_query != query_ss.minhash counter.consume(intersect_mh) - assert set(counter.signatures()) == set([ match_ss_1, match_ss_2, match_ss_3 ]) - assert list(sorted(counter.locations.values())) == ['loc a', 'loc b', 'loc c'] + assert set(counter.signatures()) == set([match_ss_1, match_ss_2, match_ss_3]) + assert list(sorted(counter.locations.values())) == ["loc a", "loc b", "loc c"] pprint.pprint(counter.counter.most_common()) - assert list(counter.counter.most_common()) == \ - [('f331f8279113d77e42ab8efca8f9cc17', 2)] + assert list(counter.counter.most_common()) == [ + ("f331f8279113d77e42ab8efca8f9cc17", 2) + ] ## round 3 @@ -1627,8 +1660,8 @@ def test_counter_gather_test_consume(): assert cur_query != query_ss.minhash counter.consume(intersect_mh) - assert set(counter.signatures()) == set([ match_ss_1, match_ss_2, match_ss_3 ]) - assert list(sorted(counter.locations.values())) == ['loc a', 'loc b', 'loc c'] + assert set(counter.signatures()) == set([match_ss_1, match_ss_2, match_ss_3]) + assert list(sorted(counter.locations.values())) == ["loc a", "loc b", "loc c"] pprint.pprint(counter.counter.most_common()) assert list(counter.counter.most_common()) == [] @@ -1639,8 +1672,8 @@ def test_counter_gather_test_consume(): assert not results counter.consume(intersect_mh) - assert set(counter.signatures()) == set([ match_ss_1, match_ss_2, match_ss_3 ]) - assert list(sorted(counter.locations.values())) == ['loc a', 'loc b', 'loc c'] + assert set(counter.signatures()) == set([match_ss_1, match_ss_2, match_ss_3]) + assert list(sorted(counter.locations.values())) == ["loc a", "loc b", "loc c"] assert list(counter.counter.most_common()) == [] @@ -1649,28 +1682,28 @@ def test_counter_gather_identical_md5sum(): # check what happens with identical matches w/different names query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = 
SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") match_mh_1 = query_mh.copy_and_clear() match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") # same as match_mh_1 match_mh_2 = query_mh.copy_and_clear() match_mh_2.add_many(range(0, 10)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') + match_ss_2 = SourmashSignature(match_mh_2, name="match2") # identical md5sum assert match_ss_1.md5sum() == match_ss_2.md5sum() # load up the counter counter = CounterGather(query_ss) - counter.add(match_ss_1, location='loc a') - counter.add(match_ss_2, location='loc b') + counter.add(match_ss_1, location="loc a") + counter.add(match_ss_2, location="loc b") assert len(counter.siglist) == 1 stored_match = list(counter.siglist.values()).pop() - assert stored_match.name == 'match2' + assert stored_match.name == "match2" # CTB note: this behavior may be changed freely, as the protocol # tests simply specify that _one_ of the identical matches is # returned. See test_counter_gather_multiple_identical_matches. @@ -1678,9 +1711,9 @@ def test_counter_gather_identical_md5sum(): def test_lazy_index_1(): # test some basic features of LazyLinearIndex - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47) @@ -1735,14 +1768,14 @@ def minhash(self): lazy = LazyLinearIndex(lidx) lazy2 = lazy.select(ksize=31) with pytest.raises(ValueError) as e: - lazy3 = lazy2.select(ksize=21) + lazy2.select(ksize=21) assert str(e.value) == "cannot select on two different values for ksize" def test_lazy_index_4_bool(): # test some basic features of LazyLinearIndex - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) # test bool false/true @@ -1757,24 +1790,26 @@ def test_lazy_index_4_bool(): def test_lazy_index_wraps_multi_index_location(): # check that 'location' works fine when MultiIndex is wrapped by # LazyLinearIndex. 
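The LazyLinearIndex tests above pin down two behaviors: select() is recorded lazily and can be stacked, but re-selecting the same field with a different value raises ValueError. Distilled:

    import pytest
    import sourmash
    from sourmash.index import LinearIndex, LazyLinearIndex
    import sourmash_tst_utils as utils

    sig2 = utils.get_test_data("2.fa.sig")
    lazy = LazyLinearIndex(LinearIndex.load(sig2))

    lazy2 = lazy.select(ksize=31)  # fine: recorded, not applied yet

    with pytest.raises(ValueError):
        lazy2.select(ksize=21)     # conflicting re-selection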
- sigdir = utils.get_test_data('prot/protein/') - sigzip = utils.get_test_data('prot/protein.zip') - siglca = utils.get_test_data('prot/protein.lca.json.gz') - sigsbt = utils.get_test_data('prot/protein.sbt.zip') + sigdir = utils.get_test_data("prot/protein/") + sigzip = utils.get_test_data("prot/protein.zip") + siglca = utils.get_test_data("prot/protein.lca.json.gz") + sigsbt = utils.get_test_data("prot/protein.sbt.zip") db_paths = (sigdir, sigzip, siglca, sigsbt) - dbs = [ sourmash.load_file_as_index(db_path) for db_path in db_paths ] + dbs = [sourmash.load_file_as_index(db_path) for db_path in db_paths] mi = MultiIndex.load(dbs, db_paths, None) lazy = LazyLinearIndex(mi) - mi2 = mi.select(moltype='protein') - lazy2 = lazy.select(moltype='protein') + mi2 = mi.select(moltype="protein") + lazy2 = lazy.select(moltype="protein") - for (ss_tup, ss_lazy_tup) in zip(mi2.signatures_with_location(), - lazy2.signatures_with_location()): + for ss_tup, ss_lazy_tup in zip( + mi2.signatures_with_location(), lazy2.signatures_with_location() + ): assert ss_tup == ss_lazy_tup + def test_revindex_index_search(): # confirm that RevIndex works sig2 = utils.get_test_data("2.fa.sig") @@ -1848,9 +1883,9 @@ def test_revindex_gather(): def test_revindex_gather_ignore(): # check that RevIndex gather ignores things properly. - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47, ksize=31) @@ -1863,7 +1898,7 @@ def test_revindex_gather_ignore(): search_fn = JaccardSearchBestOnly_ButIgnore([ss47]) results = list(lidx.find(search_fn, ss47)) - results = [ ss.signature for ss in results ] + results = [ss.signature for ss in results] def is_found(ss, xx): for q in xx: @@ -1881,8 +1916,8 @@ def test_standalone_manifest_signatures(runtmp): # build a StandaloneManifestIndex and test 'signatures' method. ## first, build a manifest in memory using MultiIndex - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss47 = sourmash.load_one_signature(sig47) ss63 = sourmash.load_one_signature(sig63) @@ -1895,7 +1930,7 @@ def test_standalone_manifest_signatures(runtmp): ## got a manifest! 
ok, now test out StandaloneManifestIndex mm = StandaloneManifestIndex(mi.manifest, None) - siglist = [ ss for ss in mm.signatures() ] + siglist = [ss for ss in mm.signatures()] assert len(siglist) == 2 assert ss47 in siglist assert ss63 in siglist @@ -1905,11 +1940,11 @@ def test_standalone_manifest_signatures_prefix(runtmp): # try out 'prefix' for StandaloneManifestIndex ## first, build a manifest in memory using MultiIndex - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") - ss47 = sourmash.load_one_signature(sig47) - ss63 = sourmash.load_one_signature(sig63) + sourmash.load_one_signature(sig47) + sourmash.load_one_signature(sig63) lidx1 = LinearIndex.load(sig47) lidx2 = LinearIndex.load(sig63) @@ -1917,11 +1952,10 @@ def test_standalone_manifest_signatures_prefix(runtmp): # ok, now remove the abspath prefix from iloc for row in mi.manifest.rows: - row['internal_location'] = os.path.basename(row['internal_location']) + row["internal_location"] = os.path.basename(row["internal_location"]) ## this should succeed! - mm = StandaloneManifestIndex(mi.manifest, None, - prefix=utils.get_test_data('')) + mm = StandaloneManifestIndex(mi.manifest, None, prefix=utils.get_test_data("")) assert len(list(mm.signatures())) == 2 @@ -1930,25 +1964,24 @@ def test_standalone_manifest_signatures_prefix_fail(runtmp): # give StandaloneManifest the wrong prefix ## first, build a manifest in memory using MultiIndex - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") - ss47 = sourmash.load_one_signature(sig47) - ss63 = sourmash.load_one_signature(sig63) + sourmash.load_one_signature(sig47) + sourmash.load_one_signature(sig63) lidx1 = LinearIndex.load(sig47) lidx2 = LinearIndex.load(sig63) - print('XXX', lidx1.location) + print("XXX", lidx1.location) mi = MultiIndex.load([lidx1, lidx2], [sig47, sig63], "") # remove prefix from manifest for row in mi.manifest.rows: - row['internal_location'] = os.path.basename(row['internal_location']) + row["internal_location"] = os.path.basename(row["internal_location"]) ## got a manifest! 
ok, now test out StandaloneManifestIndex - mm = StandaloneManifestIndex(mi.manifest, None, - prefix=runtmp.output('foo')) + mm = StandaloneManifestIndex(mi.manifest, None, prefix=runtmp.output("foo")) # should fail with pytest.raises(ValueError) as exc: @@ -1960,37 +1993,37 @@ def test_standalone_manifest_signatures_prefix_fail(runtmp): def test_standalone_manifest_load_from_dir(runtmp): # test loading a mf with relative directory paths from test-data - mf = utils.get_test_data('scaled/mf.csv') + mf = utils.get_test_data("scaled/mf.csv") idx = sourmash.load_file_as_index(mf) siglist = list(idx.signatures()) assert len(siglist) == 15 - assert idx # should be 'True' + assert idx # should be 'True' assert len(idx) == 15 with pytest.raises(NotImplementedError): idx.insert() with pytest.raises(NotImplementedError): - idx.save('foo') + idx.save("foo") assert idx.location == mf def test_standalone_manifest_lazy_load(runtmp): # check that it's actually doing lazy loading - orig_sig47 = utils.get_test_data('47.fa.sig') - sig47 = runtmp.output('47.fa.sig') + orig_sig47 = utils.get_test_data("47.fa.sig") + sig47 = runtmp.output("47.fa.sig") # build an external manifest shutil.copyfile(orig_sig47, sig47) # this is an abspath to sig47 - runtmp.sourmash('sig', 'manifest', sig47, '-o', 'mf.csv') + runtmp.sourmash("sig", "manifest", sig47, "-o", "mf.csv") # should work to get signatures: - idx = StandaloneManifestIndex.load(runtmp.output('mf.csv')) + idx = StandaloneManifestIndex.load(runtmp.output("mf.csv")) siglist = list(idx.signatures()) assert len(siglist) == 1 @@ -2013,18 +2046,19 @@ def test_standalone_manifest_lazy_load(runtmp): def test_standalone_manifest_lazy_load_2_prefix(runtmp): # check that it's actually doing lazy loading; supply explicit prefix - orig_sig47 = utils.get_test_data('47.fa.sig') - sig47 = runtmp.output('47.fa.sig') + orig_sig47 = utils.get_test_data("47.fa.sig") + sig47 = runtmp.output("47.fa.sig") # build an external manifest # note, here use a relative path to 47.fa.sig; the manifest will contain # just '47.fa.sig' as the location shutil.copyfile(orig_sig47, sig47) - runtmp.sourmash('sig', 'manifest', '47.fa.sig', '-o', 'mf.csv') + runtmp.sourmash("sig", "manifest", "47.fa.sig", "-o", "mf.csv") # should work to get signatures: - idx = StandaloneManifestIndex.load(runtmp.output('mf.csv'), - prefix=runtmp.output('')) + idx = StandaloneManifestIndex.load( + runtmp.output("mf.csv"), prefix=runtmp.output("") + ) siglist = list(idx.signatures()) assert len(siglist) == 1 @@ -2047,68 +2081,68 @@ def test_standalone_manifest_lazy_load_2_prefix(runtmp): def test_standalone_manifest_search(runtmp): # test a straight up 'search' - query_sig = utils.get_test_data('scaled/genome-s12.fa.gz.sig') - mf = utils.get_test_data('scaled/mf.csv') + query_sig = utils.get_test_data("scaled/genome-s12.fa.gz.sig") + mf = utils.get_test_data("scaled/mf.csv") - runtmp.sourmash('search', query_sig, mf) + runtmp.sourmash("search", query_sig, mf) out = runtmp.last_result.out print(out) - assert '100.0% d84ef28f' in out + assert "100.0% d84ef28f" in out def test_standalone_manifest_prefetch_lazy(runtmp): # check that prefetch is actually doing lazy loading on manifest index. 
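The lazy-load tests above and the prefetch test just below share one recipe: generate a manifest with `sourmash sig manifest`, then point StandaloneManifestIndex.load() at it, using prefix= to rebase relative locations. A sketch of the recipe (mf.csv and some_dir/ are hypothetical placeholders):

    from sourmash.index import StandaloneManifestIndex

    # assumes mf.csv was written by `sourmash sig manifest <sig> -o mf.csv`
    # with locations relative to some_dir/
    idx = StandaloneManifestIndex.load("mf.csv", prefix="some_dir/")

    # signatures are only read from disk as this iterates
    for ss in idx.signatures():
        print(ss.name)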
- orig_sig47 = utils.get_test_data('47.fa.sig') - sig47 = runtmp.output('47.fa.sig') - orig_sig2 = utils.get_test_data('2.fa.sig') - sig2 = runtmp.output('2.fa.sig') - orig_sig63 = utils.get_test_data('63.fa.sig') - sig63 = runtmp.output('63.fa.sig') + orig_sig47 = utils.get_test_data("47.fa.sig") + sig47 = runtmp.output("47.fa.sig") + orig_sig2 = utils.get_test_data("2.fa.sig") + sig2 = runtmp.output("2.fa.sig") + orig_sig63 = utils.get_test_data("63.fa.sig") + sig63 = runtmp.output("63.fa.sig") shutil.copyfile(orig_sig47, sig47) - runtmp.sourmash('sig', 'manifest', sig47, '-o', 'mf1.csv') + runtmp.sourmash("sig", "manifest", sig47, "-o", "mf1.csv") shutil.copyfile(orig_sig2, sig2) - runtmp.sourmash('sig', 'manifest', sig2, '-o', 'mf2.csv') + runtmp.sourmash("sig", "manifest", sig2, "-o", "mf2.csv") shutil.copyfile(orig_sig63, sig63) - runtmp.sourmash('sig', 'manifest', sig63, '-o', 'mf3.csv') + runtmp.sourmash("sig", "manifest", sig63, "-o", "mf3.csv") # combine the manifests, manually for now... - mf1 = CollectionManifest.load_from_filename(runtmp.output('mf1.csv')) + mf1 = CollectionManifest.load_from_filename(runtmp.output("mf1.csv")) assert len(mf1) == 1 - mf2 = CollectionManifest.load_from_filename(runtmp.output('mf2.csv')) + mf2 = CollectionManifest.load_from_filename(runtmp.output("mf2.csv")) assert len(mf2) == 3 - mf3 = CollectionManifest.load_from_filename(runtmp.output('mf3.csv')) + mf3 = CollectionManifest.load_from_filename(runtmp.output("mf3.csv")) assert len(mf3) == 1 mf = mf1 + mf2 + mf3 assert len(mf) == 5 - mf.write_to_filename(runtmp.output('mf.csv')) + mf.write_to_filename(runtmp.output("mf.csv")) # ok! now, remove the last signature, 'sig63'. os.unlink(sig63) # ...but loading the manifest should still work. - idx = StandaloneManifestIndex.load(runtmp.output('mf.csv')) + idx = StandaloneManifestIndex.load(runtmp.output("mf.csv")) # double check - third load will fail. this relies on load order :shrug:. sig_iter = iter(idx.signatures()) ss = next(sig_iter) print(ss) - assert '47.fa' in ss.filename + assert "47.fa" in ss.filename for i in range(3): ss = next(sig_iter) print(i, ss) - assert '2.fa' in ss.filename + assert "2.fa" in ss.filename with pytest.raises(ValueError) as exc: ss = next(sig_iter) - assert 'Error while reading signatures from' in str(exc) - assert '63.fa.sig' in str(exc) + assert "Error while reading signatures from" in str(exc) + assert "63.fa.sig" in str(exc) # ok! now test prefetch... should get one match legit, to 47, # and then no matches to 2, and then error. 
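One detail in the prefetch test above that is easy to miss: CollectionManifest supports +, so per-file manifests can be concatenated and written back out as a single standalone manifest. A sketch, with the same hypothetical filenames as the test:

    from sourmash.manifest import CollectionManifest

    mf1 = CollectionManifest.load_from_filename("mf1.csv")
    mf2 = CollectionManifest.load_from_filename("mf2.csv")

    mf = mf1 + mf2                       # row-wise concatenation
    assert len(mf) == len(mf1) + len(mf2)
    mf.write_to_filename("mf.csv")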
@@ -2125,5 +2159,5 @@ def test_standalone_manifest_prefetch_lazy(runtmp): with pytest.raises(ValueError) as exc: sr = next(g) - assert 'Error while reading signatures from' in str(exc) - assert '63.fa.sig' in str(exc) + assert "Error while reading signatures from" in str(exc) + assert "63.fa.sig" in str(exc) diff --git a/tests/test_index_protocol.py b/tests/test_index_protocol.py index 4a6672408e..b843e9883d 100644 --- a/tests/test_index_protocol.py +++ b/tests/test_index_protocol.py @@ -8,27 +8,30 @@ import sourmash from sourmash import SourmashSignature -from sourmash.index import (LinearIndex, ZipFileLinearIndex, - LazyLinearIndex, MultiIndex, - StandaloneManifestIndex, - IndexSearchResult) +from sourmash.index import ( + LinearIndex, + ZipFileLinearIndex, + LazyLinearIndex, + MultiIndex, + StandaloneManifestIndex, + IndexSearchResult, +) from sourmash.index import CounterGather from sourmash.index.sqlite_index import SqliteIndex from sourmash.index.revindex import RevIndex from sourmash.sbt import SBT, GraphFactory from sourmash.manifest import CollectionManifest, BaseCollectionManifest from sourmash.lca.lca_db import LCA_Database, load_single_database -from sourmash.minhash import (flatten_and_intersect_scaled, - flatten_and_downsample_scaled) +from sourmash.minhash import flatten_and_intersect_scaled, flatten_and_downsample_scaled import sourmash_tst_utils as utils def _load_three_sigs(): # utility function - load & return these three sigs. - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47) @@ -55,7 +58,7 @@ def build_lazy_linear_index(runtmp): def build_sbt_index(runtmp): ss2, ss47, ss63 = _load_three_sigs() - + factory = GraphFactory(5, 100, 3) root = SBT(factory, d=2) @@ -68,7 +71,7 @@ def build_sbt_index(runtmp): def build_sbt_index_save_load(runtmp): root = build_sbt_index(runtmp) - out = runtmp.output('xyz.sbt.zip') + out = runtmp.output("xyz.sbt.zip") root.save(out) return sourmash.load_file_as_index(out) @@ -77,7 +80,7 @@ def build_sbt_index_save_load(runtmp): def build_zipfile_index(runtmp): from sourmash.save_load import SaveSignatures_ZipFile - location = runtmp.output('index.zip') + location = runtmp.output("index.zip") with SaveSignatures_ZipFile(location) as save_sigs: for ss in _load_three_sigs(): save_sigs.add(ss) @@ -95,9 +98,9 @@ def build_multi_index(runtmp): def build_standalone_manifest_index(runtmp): - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47) @@ -106,10 +109,10 @@ def build_standalone_manifest_index(runtmp): siglist = [(ss2, sig2), (ss47, sig47), (ss63, sig63)] rows = [] - rows.extend((CollectionManifest.make_manifest_row(ss, loc) for ss, loc in siglist )) + rows.extend((CollectionManifest.make_manifest_row(ss, loc) for ss, loc in siglist)) mf = CollectionManifest(rows) mf_filename = runtmp.output("mf.csv") - + mf.write_to_filename(mf_filename) idx = StandaloneManifestIndex.load(mf_filename) @@ -118,7 +121,7 @@ def build_standalone_manifest_index(runtmp): def 
build_lca_index(runtmp): siglist = _load_three_sigs() - db = LCA_Database(31, 1000, 'DNA') + db = LCA_Database(31, 1000, "DNA") for ss in siglist: db.insert(ss) @@ -127,14 +130,14 @@ def build_lca_index(runtmp): def build_lca_index_save_load(runtmp): db = build_lca_index(runtmp) - outfile = runtmp.output('db.lca.json') + outfile = runtmp.output("db.lca.json") db.save(outfile) return sourmash.load_file_as_index(outfile) def build_sqlite_index(runtmp): - filename = runtmp.output('idx.sqldb') + filename = runtmp.output("idx.sqldb") db = SqliteIndex.create(filename) siglist = _load_three_sigs() @@ -157,8 +160,8 @@ def build_revindex(runtmp): def build_lca_index_save_load_sql(runtmp): db = build_lca_index(runtmp) - outfile = runtmp.output('db.lca.json') - db.save(outfile, format='sql') + outfile = runtmp.output("db.lca.json") + db.save(outfile, format="sql") x = load_single_database(outfile) db_load = x[0] @@ -171,19 +174,22 @@ def build_lca_index_save_load_sql(runtmp): # building functions. # -@pytest.fixture(params=[build_linear_index, - build_lazy_linear_index, - build_sbt_index, - build_zipfile_index, - build_multi_index, - build_standalone_manifest_index, - build_lca_index, - build_sbt_index_save_load, - build_lca_index_save_load, - build_sqlite_index, - build_lca_index_save_load_sql, -# build_revindex, - ] + +@pytest.fixture( + params=[ + build_linear_index, + build_lazy_linear_index, + build_sbt_index, + build_zipfile_index, + build_multi_index, + build_standalone_manifest_index, + build_lca_index, + build_sbt_index_save_load, + build_lca_index_save_load, + build_sqlite_index, + build_lca_index_save_load_sql, + # build_revindex, + ] ) def index_obj(request, runtmp): build_fn = request.param @@ -271,7 +277,7 @@ def test_index_signatures(index_obj): assert len(siglist) == 3 # check md5sums, since 'in' doesn't always work - md5s = set(( ss.md5sum() for ss in siglist )) + md5s = set(ss.md5sum() for ss in siglist) assert ss2.md5sum() in md5s assert ss47.md5sum() in md5s assert ss63.md5sum() in md5s @@ -285,7 +291,7 @@ def test_index_signatures_with_location(index_obj): assert len(siglist) == 3 # check md5sums, since 'in' doesn't always work - md5s = set(( ss.md5sum() for ss, loc in siglist )) + md5s = set((ss.md5sum() for ss, loc in siglist)) assert ss2.md5sum() in md5s assert ss47.md5sum() in md5s assert ss63.md5sum() in md5s @@ -315,15 +321,22 @@ def test_index_manifest(index_obj): def test_index_select_basic(index_obj): # select does the basic thing ok - idx = index_obj.select(ksize=31, moltype='DNA', abund=False, - containment=True, scaled=1000, num=0, picklist=None) + idx = index_obj.select( + ksize=31, + moltype="DNA", + abund=False, + containment=True, + scaled=1000, + num=0, + picklist=None, + ) assert len(idx) == 3 siglist = list(idx.signatures()) assert len(siglist) == 3 # check md5sums, since 'in' doesn't always work - md5s = set(( ss.md5sum() for ss in siglist )) + md5s = set(ss.md5sum() for ss in siglist) ss2, ss47, ss63 = _load_three_sigs() assert ss2.md5sum() in md5s assert ss47.md5sum() in md5s @@ -477,6 +490,7 @@ class CounterGather_LinearIndex: Provides an (inefficient) CounterGather-style class, for protocol testing purposes. """ + def __init__(self, orig_query): "Constructor - take a SourmashSignature that is the original query." orig_query_mh = orig_query.minhash @@ -564,6 +578,7 @@ class CounterGather_LCA: based on LCA_Database. This is currently just for protocol and API testing purposes. 
""" + def __init__(self, query): from sourmash.lca.lca_db import LCA_Database @@ -572,8 +587,7 @@ def __init__(self, query): raise ValueError("must use scaled MinHash") self.orig_query_mh = query_mh - lca_db = LCA_Database(query_mh.ksize, query_mh.scaled, - query_mh.moltype) + lca_db = LCA_Database(query_mh.ksize, query_mh.scaled, query_mh.moltype) self.db = lca_db self.siglist = {} self.locations = {} @@ -598,8 +612,7 @@ def add(self, ss, *, location=None, require_overlap=True): def signatures(self): "Yield all signatures." - for ss in self.siglist.values(): - yield ss + yield from self.siglist.values() def downsample(self, scaled): "Track highest scaled across all possible matches." @@ -635,8 +648,7 @@ def peek(self, query_mh, *, threshold_bp=0): cont = result.score match = result.signature - intersect_mh = flatten_and_intersect_scaled(result.signature.minhash, - query_mh) + intersect_mh = flatten_and_intersect_scaled(result.signature.minhash, query_mh) md5 = result.signature.md5sum() location = self.locations[md5] @@ -648,10 +660,12 @@ def consume(self, intersect_mh): self.query_started = 1 -@pytest.fixture(params=[CounterGather, - CounterGather_LinearIndex, - CounterGather_LCA, - ] +@pytest.fixture( + params=[ + CounterGather, + CounterGather_LinearIndex, + CounterGather_LCA, + ] ) def counter_gather_constructor(request): build_fn = request.param @@ -664,19 +678,19 @@ def test_counter_get_signatures(counter_gather_constructor): # test .signatures() method query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") match_mh_1 = query_mh.copy_and_clear() match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") match_mh_2 = query_mh.copy_and_clear() match_mh_2.add_many(range(10, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') + match_ss_2 = SourmashSignature(match_mh_2, name="match2") match_mh_3 = query_mh.copy_and_clear() match_mh_3.add_many(range(15, 17)) - match_ss_3 = SourmashSignature(match_mh_3, name='match3') + match_ss_3 = SourmashSignature(match_mh_3, name="match3") counter = counter_gather_constructor(query_ss) counter.add(match_ss_1) @@ -720,19 +734,19 @@ def test_counter_gather_1(counter_gather_constructor): # generated via CounterGather query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") match_mh_1 = query_mh.copy_and_clear() match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") match_mh_2 = query_mh.copy_and_clear() match_mh_2.add_many(range(10, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') + match_ss_2 = SourmashSignature(match_mh_2, name="match2") match_mh_3 = query_mh.copy_and_clear() match_mh_3.add_many(range(15, 17)) - match_ss_3 = SourmashSignature(match_mh_3, name='match3') + match_ss_3 = SourmashSignature(match_mh_3, name="match3") # load up the counter counter = counter_gather_constructor(query_ss) @@ -742,9 +756,11 @@ def test_counter_gather_1(counter_gather_constructor): results = _consume_all(query_ss.minhash, counter) - expected = (['match1', 10], - ['match2', 5], - ['match3', 2],) + expected = ( + ["match1", 10], + ["match2", 5], + ["match3", 2], + ) assert 
len(results) == len(expected), results for (sr, size), (exp_name, exp_size) in zip(results, expected): @@ -762,19 +778,19 @@ def test_counter_gather_1_b(counter_gather_constructor): # larger. query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") match_mh_1 = query_mh.copy_and_clear() match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") match_mh_2 = query_mh.copy_and_clear() match_mh_2.add_many(range(7, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') + match_ss_2 = SourmashSignature(match_mh_2, name="match2") match_mh_3 = query_mh.copy_and_clear() match_mh_3.add_many(range(13, 17)) - match_ss_3 = SourmashSignature(match_mh_3, name='match3') + match_ss_3 = SourmashSignature(match_mh_3, name="match3") # load up the counter counter = counter_gather_constructor(query_ss) @@ -784,9 +800,11 @@ def test_counter_gather_1_b(counter_gather_constructor): results = _consume_all(query_ss.minhash, counter) - expected = (['match1', 10], - ['match2', 5], - ['match3', 2],) + expected = ( + ["match1", 10], + ["match2", 5], + ["match3", 2], + ) assert len(results) == len(expected), results for (sr, size), (exp_name, exp_size) in zip(results, expected): @@ -806,19 +824,19 @@ def test_counter_gather_1_c_with_threshold(counter_gather_constructor): query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") match_mh_1 = query_mh.copy_and_clear() match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") match_mh_2 = query_mh.copy_and_clear() match_mh_2.add_many(range(7, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') + match_ss_2 = SourmashSignature(match_mh_2, name="match2") match_mh_3 = query_mh.copy_and_clear() match_mh_3.add_many(range(13, 17)) - match_ss_3 = SourmashSignature(match_mh_3, name='match3') + match_ss_3 = SourmashSignature(match_mh_3, name="match3") # load up the counter counter = counter_gather_constructor(query_ss) @@ -826,11 +844,9 @@ def test_counter_gather_1_c_with_threshold(counter_gather_constructor): counter.add(match_ss_2) counter.add(match_ss_3) - results = _consume_all(query_ss.minhash, counter, - threshold_bp=3) + results = _consume_all(query_ss.minhash, counter, threshold_bp=3) - expected = (['match1', 10], - ['match2', 5]) + expected = (["match1", 10], ["match2", 5]) assert len(results) == len(expected), results for (sr, size), (exp_name, exp_size) in zip(results, expected): @@ -844,19 +860,19 @@ def test_counter_gather_1_d_diff_scaled(counter_gather_constructor): # test as above, but with different scaled. 
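The with-threshold variant above is worth a second look: with scaled=1 there is one hash per position, so threshold_bp translates directly into a minimum overlap in hashes, which is why threshold_bp=3 silently drops the 2-hash match. A self-contained illustration:

    import sourmash
    from sourmash import SourmashSignature
    from sourmash.index import CounterGather

    query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1)
    query_mh.add_many(range(0, 20))
    query_ss = SourmashSignature(query_mh, name="query")

    small_mh = query_mh.copy_and_clear()
    small_mh.add_many(range(0, 2))          # only 2 hashes of overlap

    counter = CounterGather(query_ss)
    counter.add(SourmashSignature(small_mh, name="small"))

    # scaled=1: threshold_bp=3 demands at least 3 overlapping hashes
    assert counter.peek(query_ss.minhash, threshold_bp=3) == []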
query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") match_mh_1 = query_mh.copy_and_clear().downsample(scaled=10) match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") match_mh_2 = query_mh.copy_and_clear().downsample(scaled=20) match_mh_2.add_many(range(7, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') + match_ss_2 = SourmashSignature(match_mh_2, name="match2") match_mh_3 = query_mh.copy_and_clear().downsample(scaled=30) match_mh_3.add_many(range(13, 17)) - match_ss_3 = SourmashSignature(match_mh_3, name='match3') + match_ss_3 = SourmashSignature(match_mh_3, name="match3") # load up the counter counter = counter_gather_constructor(query_ss) @@ -866,9 +882,11 @@ def test_counter_gather_1_d_diff_scaled(counter_gather_constructor): results = _consume_all(query_ss.minhash, counter) - expected = (['match1', 10], - ['match2', 5], - ['match3', 2],) + expected = ( + ["match1", 10], + ["match2", 5], + ["match3", 2], + ) assert len(results) == len(expected), results for (sr, size), (exp_name, exp_size) in zip(results, expected): @@ -885,18 +903,18 @@ def test_counter_gather_1_d_diff_scaled_query(counter_gather_constructor): match_mh_1 = query_mh.copy_and_clear().downsample(scaled=10) match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") match_mh_2 = query_mh.copy_and_clear().downsample(scaled=20) match_mh_2.add_many(range(7, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') + match_ss_2 = SourmashSignature(match_mh_2, name="match2") match_mh_3 = query_mh.copy_and_clear().downsample(scaled=30) match_mh_3.add_many(range(13, 17)) - match_ss_3 = SourmashSignature(match_mh_3, name='match3') + match_ss_3 = SourmashSignature(match_mh_3, name="match3") # downsample query now - - query_ss = SourmashSignature(query_mh.downsample(scaled=100), name='query') + query_ss = SourmashSignature(query_mh.downsample(scaled=100), name="query") # load up the counter counter = counter_gather_constructor(query_ss) @@ -906,9 +924,11 @@ def test_counter_gather_1_d_diff_scaled_query(counter_gather_constructor): results = _consume_all(query_ss.minhash, counter) - expected = (['match1', 10], - ['match2', 5], - ['match3', 2],) + expected = ( + ["match1", 10], + ["match2", 5], + ["match3", 2], + ) assert len(results) == len(expected), results for (sr, size), (exp_name, exp_size) in zip(results, expected): @@ -922,19 +942,19 @@ def test_counter_gather_1_e_abund_query(counter_gather_constructor): # test as above, but abund query query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1, track_abundance=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") match_mh_1 = query_mh.copy_and_clear().flatten() match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") match_mh_2 = query_mh.copy_and_clear().flatten() match_mh_2.add_many(range(7, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') + match_ss_2 = SourmashSignature(match_mh_2, name="match2") match_mh_3 = query_mh.copy_and_clear().flatten() match_mh_3.add_many(range(13, 17)) - match_ss_3 = SourmashSignature(match_mh_3, 
name='match3') + match_ss_3 = SourmashSignature(match_mh_3, name="match3") # load up the counter counter = counter_gather_constructor(query_ss) @@ -945,9 +965,11 @@ def test_counter_gather_1_e_abund_query(counter_gather_constructor): # must flatten before peek! results = _consume_all(query_ss.minhash.flatten(), counter) - expected = (['match1', 10], - ['match2', 5], - ['match3', 2],) + expected = ( + ["match1", 10], + ["match2", 5], + ["match3", 2], + ) assert len(results) == len(expected), results for (sr, size), (exp_name, exp_size) in zip(results, expected): @@ -961,19 +983,19 @@ def test_counter_gather_1_f_abund_match(counter_gather_constructor): # test as above, but abund query query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1, track_abundance=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh.flatten(), name='query') + query_ss = SourmashSignature(query_mh.flatten(), name="query") match_mh_1 = query_mh.copy_and_clear() match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") match_mh_2 = query_mh.copy_and_clear() match_mh_2.add_many(range(7, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') + match_ss_2 = SourmashSignature(match_mh_2, name="match2") match_mh_3 = query_mh.copy_and_clear() match_mh_3.add_many(range(13, 17)) - match_ss_3 = SourmashSignature(match_mh_3, name='match3') + match_ss_3 = SourmashSignature(match_mh_3, name="match3") # load up the counter counter = counter_gather_constructor(query_ss) @@ -984,9 +1006,11 @@ def test_counter_gather_1_f_abund_match(counter_gather_constructor): # must flatten before peek! results = _consume_all(query_ss.minhash.flatten(), counter) - expected = (['match1', 10], - ['match2', 5], - ['match3', 2],) + expected = ( + ["match1", 10], + ["match2", 5], + ["match3", 2], + ) assert len(results) == len(expected), results for (sr, size), (exp_name, exp_size) in zip(results, expected): @@ -999,13 +1023,14 @@ def test_counter_gather_1_f_abund_match(counter_gather_constructor): def test_counter_gather_2(counter_gather_constructor): # check basic set of gather results on semi-real data, # generated via CounterGather - testdata_combined = utils.get_test_data('gather/combined.sig') - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_combined = utils.get_test_data("gather/combined.sig") + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) query_ss = sourmash.load_one_signature(testdata_combined, ksize=21) - subject_sigs = [ (sourmash.load_one_signature(t, ksize=21), t) - for t in testdata_sigs ] + subject_sigs = [ + (sourmash.load_one_signature(t, ksize=21), t) for t in testdata_sigs + ] # load up the counter counter = counter_gather_constructor(query_ss) @@ -1014,18 +1039,20 @@ def test_counter_gather_2(counter_gather_constructor): results = _consume_all(query_ss.minhash, counter) - expected = (['NC_003198.1', 487], - ['NC_000853.1', 192], - ['NC_011978.1', 169], - ['NC_002163.1', 157], - ['NC_003197.2', 152], - ['NC_009486.1', 92], - ['NC_006905.1', 76], - ['NC_011080.1', 59], - ['NC_011274.1', 42], - ['NC_006511.1', 31], - ['NC_011294.1', 7], - ['NC_004631.1', 2]) + expected = ( + ["NC_003198.1", 487], + ["NC_000853.1", 192], + ["NC_011978.1", 169], + ["NC_002163.1", 157], + ["NC_003197.2", 152], + ["NC_009486.1", 92], + ["NC_006905.1", 76], + ["NC_011080.1", 59], + ["NC_011274.1", 42], + ["NC_006511.1", 31], + ["NC_011294.1", 7], + ["NC_004631.1", 2], 
+ ) assert len(results) == len(expected) for (sr, size), (exp_name, exp_size) in zip(results, expected): @@ -1040,11 +1067,11 @@ def test_counter_gather_exact_match(counter_gather_constructor): # query == match query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") # load up the counter; provide a location override, too. counter = counter_gather_constructor(query_ss) - counter.add(query_ss, location='somewhere over the rainbow') + counter.add(query_ss, location="somewhere over the rainbow") results = _consume_all(query_ss.minhash, counter) assert len(results) == 1 @@ -1052,14 +1079,14 @@ def test_counter_gather_exact_match(counter_gather_constructor): assert sr.score == 1.0 assert sr.signature == query_ss - assert sr.location == 'somewhere over the rainbow' + assert sr.location == "somewhere over the rainbow" def test_counter_gather_multiple_identical_matches(counter_gather_constructor): # test multiple identical matches being inserted, with only one return query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") # create counter... counter = counter_gather_constructor(query_ss) @@ -1068,7 +1095,7 @@ def test_counter_gather_multiple_identical_matches(counter_gather_constructor): match_mh = query_mh.copy_and_clear() match_mh.add_many(range(5, 15)) - for name in 'match1', 'match2', 'match3': + for name in "match1", "match2", "match3": match_ss = SourmashSignature(match_mh, name=name) counter.add(match_ss, location=name) @@ -1080,18 +1107,18 @@ def test_counter_gather_multiple_identical_matches(counter_gather_constructor): assert overlap_count == 10 # any one of the three is valid - assert sr.location in ('match1', 'match2', 'match3') + assert sr.location in ("match1", "match2", "match3") def test_counter_gather_add_after_peek(counter_gather_constructor): # cannot add after peek or consume query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") # load up the counter counter = counter_gather_constructor(query_ss) - counter.add(query_ss, location='somewhere over the rainbow') + counter.add(query_ss, location="somewhere over the rainbow") counter.peek(query_ss.minhash) @@ -1103,11 +1130,11 @@ def test_counter_gather_add_after_consume(counter_gather_constructor): # cannot add after peek or consume query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") # load up the counter counter = counter_gather_constructor(query_ss) - counter.add(query_ss, location='somewhere over the rainbow') + counter.add(query_ss, location="somewhere over the rainbow") counter.consume(query_ss.minhash) @@ -1119,11 +1146,11 @@ def test_counter_gather_consume_empty_intersect(counter_gather_constructor): # check that consume works fine when there is an empty signature. 
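The two guard-rail tests above pin down the CounterGather state machine: once peek() or consume() has been called, further add() calls must raise ValueError. Distilled:

    import pytest
    import sourmash
    from sourmash import SourmashSignature
    from sourmash.index import CounterGather

    query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1)
    query_mh.add_many(range(0, 20))
    query_ss = SourmashSignature(query_mh, name="query")

    counter = CounterGather(query_ss)
    counter.add(query_ss)

    counter.peek(query_ss.minhash)          # gather has now started...
    with pytest.raises(ValueError):
        counter.add(query_ss)               # ...so this is too late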
query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") # load up the counter counter = counter_gather_constructor(query_ss) - counter.add(query_ss, location='somewhere over the rainbow') + counter.add(query_ss, location="somewhere over the rainbow") # nothing really happens here :laugh:, just making sure there's no error counter.consume(query_ss.minhash.copy_and_clear()) @@ -1132,11 +1159,11 @@ def test_counter_gather_consume_empty_intersect(counter_gather_constructor): def test_counter_gather_empty_initial_query(counter_gather_constructor): # check empty initial query query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") match_mh_1 = query_mh.copy_and_clear() match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") # load up the counter counter = counter_gather_constructor(query_ss) @@ -1149,7 +1176,7 @@ def test_counter_gather_num_query(counter_gather_constructor): # check num query query_mh = sourmash.MinHash(n=500, ksize=31) query_mh.add_many(range(0, 10)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") with pytest.raises(ValueError): counter_gather_constructor(query_ss) @@ -1159,11 +1186,11 @@ def test_counter_gather_empty_cur_query(counter_gather_constructor): # test empty cur query query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") # load up the counter counter = counter_gather_constructor(query_ss) - counter.add(query_ss, location='somewhere over the rainbow') + counter.add(query_ss, location="somewhere over the rainbow") cur_query_mh = query_ss.minhash.copy_and_clear() results = _consume_all(cur_query_mh, counter) @@ -1174,27 +1201,27 @@ def test_counter_gather_add_num_matchy(counter_gather_constructor): # test add num query query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") match_mh = sourmash.MinHash(n=500, ksize=31) match_mh.add_many(range(0, 20)) - match_ss = SourmashSignature(match_mh, name='query') + match_ss = SourmashSignature(match_mh, name="query") # load up the counter counter = counter_gather_constructor(query_ss) with pytest.raises(ValueError): - counter.add(match_ss, location='somewhere over the rainbow') + counter.add(match_ss, location="somewhere over the rainbow") def test_counter_gather_bad_cur_query(counter_gather_constructor): # test cur query that is not subset of original query query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") # load up the counter counter = counter_gather_constructor(query_ss) - counter.add(query_ss, location='somewhere over the rainbow') + counter.add(query_ss, location="somewhere over the rainbow") cur_query_mh = query_ss.minhash.copy_and_clear() cur_query_mh.add_many(range(20, 30)) @@ -1206,11 +1233,11 @@ def test_counter_gather_add_no_overlap(counter_gather_constructor): # check adding match with no overlap 
w/query query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 10)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") match_mh_1 = query_mh.copy_and_clear() match_mh_1.add_many(range(10, 20)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") # load up the counter counter = counter_gather_constructor(query_ss) @@ -1224,18 +1251,18 @@ def test_counter_gather_big_threshold(counter_gather_constructor): # check 'peek' with a huge threshold query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") match_mh_1 = query_mh.copy_and_clear() match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') + match_ss_1 = SourmashSignature(match_mh_1, name="match1") # load up the counter counter = counter_gather_constructor(query_ss) counter.add(match_ss_1) # impossible threshold: - threshold_bp=30*query_ss.minhash.scaled + threshold_bp = 30 * query_ss.minhash.scaled results = counter.peek(query_ss.minhash, threshold_bp=threshold_bp) assert results == [] @@ -1243,7 +1270,7 @@ def test_counter_gather_big_threshold(counter_gather_constructor): def test_counter_gather_empty_counter(counter_gather_constructor): # check empty counter query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) - query_ss = SourmashSignature(query_mh, name='query') + query_ss = SourmashSignature(query_mh, name="query") # empty counter! counter = counter_gather_constructor(query_ss) diff --git a/tests/test_jaccard.py b/tests/test_jaccard.py index ce0846a3ae..87093ee194 100644 --- a/tests/test_jaccard.py +++ b/tests/test_jaccard.py @@ -72,10 +72,10 @@ def test_dna_mh(track_abundance): e1 = MinHash(n=5, ksize=4, track_abundance=track_abundance) e2 = MinHash(n=5, ksize=4, track_abundance=track_abundance) - seq = 'ATGGCAGTGACGATGCCAG' + seq = "ATGGCAGTGACGATGCCAG" e1.add_sequence(seq) for i in range(len(seq) - 3): - e2.add_kmer(seq[i:i + 4]) + e2.add_kmer(seq[i : i + 4]) assert e1.hashes.keys() == e2.hashes.keys() print(e1.hashes.keys()) @@ -84,19 +84,17 @@ def test_dna_mh(track_abundance): def test_protein_mh(track_abundance): - e1 = MinHash(n=5, ksize=2, is_protein=True, - track_abundance=track_abundance) - e2 = MinHash(n=5, ksize=2, is_protein=True, - track_abundance=track_abundance) + e1 = MinHash(n=5, ksize=2, is_protein=True, track_abundance=track_abundance) + e2 = MinHash(n=5, ksize=2, is_protein=True, track_abundance=track_abundance) # ok, so this is confusing, but: we are adding _DNA_ kmers here, # and translating. so, add_sequence and add_kmer actually both add # 6-mers. 
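As the comment above explains, a protein MinHash with ksize=2 consumes DNA 6-mers (three nucleotides per amino acid), so add_sequence() and add_kmer() agree on 6-character windows. Distilled from the test that follows:

    from sourmash import MinHash

    e1 = MinHash(n=5, ksize=2, is_protein=True)
    e2 = MinHash(n=5, ksize=2, is_protein=True)

    seq = "ATGGCAGTGACGATGCCG"
    e1.add_sequence(seq)                 # translates, hashes 2-aa k-mers
    for i in range(len(seq) - 5):
        e2.add_kmer(seq[i : i + 6])      # same 6-nt windows, added directly

    assert e1.hashes.keys() == e2.hashes.keys()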
- seq = 'ATGGCAGTGACGATGCCG' + seq = "ATGGCAGTGACGATGCCG" e1.add_sequence(seq) for i in range(len(seq) - 5): - kmer = seq[i:i + 6] + kmer = seq[i : i + 6] e2.add_kmer(kmer) assert e1.hashes.keys() == e2.hashes.keys() @@ -107,10 +105,9 @@ def test_pickle(track_abundance): import pickle from io import BytesIO - e1 = MinHash(n=5, ksize=6, is_protein=False, - track_abundance=track_abundance) + e1 = MinHash(n=5, ksize=6, is_protein=False, track_abundance=track_abundance) - seq = 'ATGGCAGTGACGATGCCG' + seq = "ATGGCAGTGACGATGCCG" e1.add_sequence(seq) e1.add_sequence(seq) @@ -131,8 +128,7 @@ def test_pickle(track_abundance): def test_bad_construct_1(track_abundance): try: - e1 = MinHash(ksize=6, is_protein=False, - track_abundance=track_abundance) + MinHash(ksize=6, is_protein=False, track_abundance=track_abundance) assert 0, "require n in constructor" except TypeError: pass @@ -140,8 +136,7 @@ def test_bad_construct_1(track_abundance): def test_bad_construct_2(track_abundance): try: - e1 = MinHash(n=100, is_protein=False, - track_abundance=track_abundance) + MinHash(n=100, is_protein=False, track_abundance=track_abundance) assert 0, "require ksize in constructor" except TypeError: pass @@ -175,15 +170,16 @@ def test_abund_similarity_zero(): #### + def test_jaccard_on_real_data(): from sourmash.signature import load_signatures - afile = 'n10000/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz' + afile = "n10000/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz" a = utils.get_test_data(afile) sig1 = list(load_signatures(a))[0] mh1 = sig1.minhash - bfile = 'n10000/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz' + bfile = "n10000/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz" b = utils.get_test_data(bfile) sig2 = list(load_signatures(b))[0] mh2 = sig2.minhash @@ -210,12 +206,12 @@ def test_jaccard_on_real_data(): def test_scaled_on_real_data(): from sourmash.signature import load_signatures - afile = 'scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz' + afile = "scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz" a = utils.get_test_data(afile) sig1 = list(load_signatures(a))[0] mh1 = sig1.minhash - bfile = 'scaled100/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz' + bfile = "scaled100/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz" b = utils.get_test_data(bfile) sig2 = list(load_signatures(b))[0] mh2 = sig2.minhash @@ -243,12 +239,12 @@ def test_scaled_on_real_data(): def test_scaled_on_real_data_2(): from sourmash.signature import load_signatures - afile = 'scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz' + afile = "scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz" a = utils.get_test_data(afile) sig1 = list(load_signatures(a))[0] mh1 = sig1.minhash - bfile = 'scaled100/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz' + bfile = "scaled100/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz" b = utils.get_test_data(bfile) sig2 = list(load_signatures(b))[0] mh2 = sig2.minhash @@ -276,12 +272,12 @@ def test_scaled_on_real_data_2(): def test_downsample_scaled_with_num(): from sourmash.signature import load_signatures - afile = 'scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz' + afile = "scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz" a = utils.get_test_data(afile) sig1 = list(load_signatures(a))[0] mh1 = sig1.minhash with pytest.raises(ValueError) as exc: - mh = mh1.downsample(num=500) + mh1.downsample(num=500) - assert 'cannot downsample a scaled MinHash using num' in str(exc.value) + assert "cannot downsample a scaled MinHash using num" in str(exc.value) diff 
--git a/tests/test_lca.py b/tests/test_lca.py index 46b1d9716d..7db105628e 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -21,8 +21,7 @@ def test_api_create_search(): # create a database and then search for result. - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) assert len(lca_db) == 0 @@ -44,18 +43,16 @@ def test_api_create_search(): def test_api_find_picklist_select(): # does 'find' respect picklists? - sig47 = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) - sig63 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'), - ksize=31) + sig47 = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + sig63 = sourmash.load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) lca_db.insert(sig47) lca_db.insert(sig63) # construct a picklist... - picklist = SignaturePicklist('md5prefix8') - picklist.init(['09a08691']) + picklist = SignaturePicklist("md5prefix8") + picklist.init(["09a08691"]) # run a 'find' with sig63, should find 47 and 63 both. search_obj = make_jaccard_search_query(do_containment=True, threshold=0.0) @@ -72,24 +69,22 @@ def test_api_find_picklist_select(): # and check that it is the expected one! ss = results[0].signature assert ss.minhash.ksize == 31 - assert ss.md5sum().startswith('09a08691c') + assert ss.md5sum().startswith("09a08691c") def test_api_find_picklist_select_exclude(): # does 'find' respect picklists? - sig47 = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) - sig63 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'), - ksize=31) + sig47 = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + sig63 = sourmash.load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) lca_db.insert(sig47) lca_db.insert(sig63) # construct a picklist... - picklist = SignaturePicklist('md5prefix8', pickstyle= PickStyle.EXCLUDE) - picklist.init(['09a08691']) + picklist = SignaturePicklist("md5prefix8", pickstyle=PickStyle.EXCLUDE) + picklist.init(["09a08691"]) # run a 'find' with sig63, should find 47 and 63 both. search_obj = make_jaccard_search_query(do_containment=True, threshold=0.0) @@ -106,13 +101,12 @@ def test_api_find_picklist_select_exclude(): # and check that it is the expected one! ss = results[0].signature assert ss.minhash.ksize == 31 - assert ss.md5sum().startswith('38729c637') + assert ss.md5sum().startswith("38729c637") def test_api_create_insert(): # test some internal implementation stuff: create & then insert a sig. 
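The test_jaccard.py changes above are purely cosmetic (quote style, slice spacing, dropping unused bindings); the behavior they pin down can be reproduced standalone (assumes sourmash is installed):

    from sourmash import MinHash

    e1 = MinHash(n=5, ksize=4)
    e2 = MinHash(n=5, ksize=4)

    seq = "ATGGCAGTGACGATGCCAG"
    e1.add_sequence(seq)
    for i in range(len(seq) - 3):
        e2.add_kmer(seq[i : i + 4])  # ruff formats the slice with spaces

    # hand-rolled k-mer windows and add_sequence() hash identically
    assert e1.hashes.keys() == e2.hashes.keys()
    print(e1.jaccard(e2))  # 1.0

    # and, as test_downsample_scaled_with_num asserts, a scaled MinHash
    # cannot be downsampled with num=:
    mh = MinHash(n=0, ksize=31, scaled=1000)
    mh.add_many(range(0, 50))
    try:
        mh.downsample(num=500)
    except ValueError as exc:
        print(exc)  # cannot downsample a scaled MinHash using num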
- ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) lca_db.insert(ss) @@ -131,16 +125,15 @@ def test_api_create_insert(): for vv in lca_db._hashval_to_idx.values(): set_of_values.update(vv) assert len(set_of_values) == 1 - assert set_of_values == { 0 } + assert set_of_values == {0} - assert not lca_db._idx_to_lid # no lineage added - assert not lca_db._lid_to_lineage # no lineage added + assert not lca_db._idx_to_lid # no lineage added + assert not lca_db._lid_to_lineage # no lineage added def test_api_create_insert_bad_ksize(): # can we insert a ksize=21 signature into a ksize=31 DB? hopefully not. - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=21, scaled=1000) with pytest.raises(ValueError): @@ -149,17 +142,15 @@ def test_api_create_insert_bad_ksize(): def test_api_create_insert_bad_ident(): # can we insert a signature with no/empty ident? - ss1 = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) - ss2 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'), - ksize=31) + ss1 = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + ss2 = sourmash.load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) ss1 = ss1.to_mutable() ss2 = ss2.to_mutable() - ss1.name = '' - ss1.filename = '' - ss2.name = '' - ss2.filename = '' + ss1.name = "" + ss1.filename = "" + ss2.name = "" + ss2.filename = "" lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) lca_db.insert(ss1) @@ -171,8 +162,7 @@ def test_api_create_insert_bad_ident(): def test_api_create_insert_bad_scaled(): # can we insert a scaled=1000 signature into a scaled=500 DB? # hopefully not. - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) assert ss.minhash.scaled == 1000 lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=500) @@ -183,11 +173,10 @@ def test_api_create_insert_bad_scaled(): def test_api_create_insert_bad_moltype(): # can we insert a DNAsignature into a protein DB? # hopefully not. - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) - assert ss.minhash.moltype == 'DNA' + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + assert ss.minhash.moltype == "DNA" - lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=500, moltype='protein') + lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=500, moltype="protein") with pytest.raises(ValueError): lca_db.insert(ss) @@ -195,13 +184,12 @@ def test_api_create_insert_bad_moltype(): def test_api_create_insert_ident(): # test some internal implementation stuff: signature inserted with # different ident than name. 
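The LCA_Database API exercised in these tests can be sketched end to end. A rough sketch only: the .sig paths are placeholders for the 47.fa.sig/63.fa.sig test data, and the two import paths are assumptions, not confirmed by this patch:

    import sourmash
    import sourmash.lca
    from sourmash.lca.lca_utils import LineagePair  # import path assumed
    from sourmash.picklist import SignaturePicklist, PickStyle  # import path assumed

    ss47 = sourmash.load_one_signature("47.fa.sig", ksize=31)  # placeholder path
    ss63 = sourmash.load_one_signature("63.fa.sig", ksize=31)  # placeholder path

    db = sourmash.lca.LCA_Database(ksize=31, scaled=1000)
    db.insert(ss47, ident="foo")
    # lineages are ordered tuples of (rank, name) pairs:
    db.insert(
        ss63,
        ident="bar",
        lineage=(LineagePair("rank1", "name1"), LineagePair("rank2", "name2")),
    )

    # md5prefix8 picklists match the first 8 hex digits of a signature md5;
    # PickStyle.EXCLUDE inverts the selection.
    keep = SignaturePicklist("md5prefix8")
    keep.init(["09a08691"])
    db = db.select(picklist=keep)
    print(len(list(db.signatures())))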
- ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) - lca_db.insert(ss, ident='foo') + lca_db.insert(ss, ident="foo") - ident = 'foo' + ident = "foo" assert len(lca_db._ident_to_name) == 1 assert ident in lca_db._ident_to_name assert lca_db._ident_to_name[ident] == ss.name @@ -215,27 +203,25 @@ def test_api_create_insert_ident(): for vv in lca_db._hashval_to_idx.values(): set_of_values.update(vv) assert len(set_of_values) == 1 - assert set_of_values == { 0 } + assert set_of_values == {0} - assert not lca_db._idx_to_lid # no lineage added - assert not lca_db._lid_to_lineage # no lineage added + assert not lca_db._idx_to_lid # no lineage added + assert not lca_db._lid_to_lineage # no lineage added assert not lca_db._lineage_to_lid assert not lca_db._lid_to_idx def test_api_create_insert_two(): # check internal details if multiple signatures are inserted. - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) - ss2 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + ss2 = sourmash.load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) - lca_db.insert(ss, ident='foo') - lca_db.insert(ss2, ident='bar') + lca_db.insert(ss, ident="foo") + lca_db.insert(ss2, ident="bar") - ident = 'foo' - ident2 = 'bar' + ident = "foo" + ident2 = "bar" assert len(lca_db._ident_to_name) == 2 assert ident in lca_db._ident_to_name assert ident2 in lca_db._ident_to_name @@ -258,22 +244,20 @@ def test_api_create_insert_two(): for vv in lca_db._hashval_to_idx.values(): set_of_values.update(vv) assert len(set_of_values) == 2 - assert set_of_values == { 0, 1 } + assert set_of_values == {0, 1} - assert not lca_db._idx_to_lid # no lineage added - assert not lca_db._lid_to_lineage # no lineage added + assert not lca_db._idx_to_lid # no lineage added + assert not lca_db._lid_to_lineage # no lineage added assert not lca_db._lineage_to_lid assert not lca_db._lid_to_idx def test_api_create_insert_w_lineage(): # test some internal implementation stuff - insert signature w/lineage - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) - lineage = ((LineagePair('rank1', 'name1'), - LineagePair('rank2', 'name2'))) + lineage = (LineagePair("rank1", "name1"), LineagePair("rank2", "name2")) lca_db.insert(ss, lineage=lineage) @@ -293,14 +277,14 @@ def test_api_create_insert_w_lineage(): for vv in lca_db._hashval_to_idx.values(): set_of_values.update(vv) assert len(set_of_values) == 1 - assert set_of_values == { 0 } + assert set_of_values == {0} # check lineage stuff assert len(lca_db._idx_to_lid) == 1 assert lca_db._idx_to_lid[0] == 0 assert len(lca_db._lid_to_lineage) == 1 assert lca_db._lid_to_lineage[0] == lineage - assert lca_db._lid_to_idx[0] == { 0 } + assert lca_db._lid_to_idx[0] == {0} assert len(lca_db._lineage_to_lid) == 1 assert lca_db._lineage_to_lid[lineage] == 0 @@ -308,12 +292,10 @@ def test_api_create_insert_w_lineage(): def test_api_create_insert_w_bad_lineage(): # test some internal implementation stuff - insert signature w/bad lineage - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), 
- ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) - lineage = ([LineagePair('rank1', 'name1'), - LineagePair('rank2', 'name2')],) + lineage = ([LineagePair("rank1", "name1"), LineagePair("rank2", "name2")],) with pytest.raises(ValueError): lca_db.insert(ss, lineage=lineage) @@ -321,11 +303,10 @@ def test_api_create_insert_w_bad_lineage(): def test_api_create_insert_w_bad_lineage_2(): # test some internal implementation stuff - insert signature w/bad lineage - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) - lineage = 1 # something non-iterable... + lineage = 1 # something non-iterable... with pytest.raises(ValueError): lca_db.insert(ss, lineage=lineage) @@ -333,8 +314,7 @@ def test_api_create_insert_w_bad_lineage_2(): def test_api_create_gather(): # create a database, and then run gather on it. - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) lca_db.insert(ss) @@ -348,10 +328,8 @@ def test_api_create_gather(): def test_api_add_genome_lineage(): # LCA_Databases can store/retrieve arbitrary lineages/taxonomies. - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) - lineage = ((LineagePair('rank1', 'name1'), - (LineagePair('rank2', 'name2')))) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + lineage = (LineagePair("rank1", "name1"), (LineagePair("rank2", "name2"))) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) lca_db.insert(ss, lineage=lineage) @@ -366,26 +344,24 @@ def test_api_add_genome_lineage(): def test_api_insert_update(): # check that cached parts of LCA_Database are updated when a new # signature is inserted. - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) - ss2 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + ss2 = sourmash.load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) lca_db.insert(ss) - all_mh = [ x.minhash for x in lca_db.signatures() ] + all_mh = [x.minhash for x in lca_db.signatures()] assert ss.minhash in all_mh # see decorator @cached_property - assert hasattr(lca_db, '_cache') + assert hasattr(lca_db, "_cache") assert lca_db._cache # inserting a signature should delete the cache lca_db.insert(ss2) - assert not hasattr(lca_db, '_cache') + assert not hasattr(lca_db, "_cache") # check that it's rebuilt etc. etc. - all_mh = [ x.minhash for x in lca_db.signatures() ] + all_mh = [x.minhash for x in lca_db.signatures()] assert ss.minhash in all_mh assert ss2.minhash in all_mh @@ -393,8 +369,7 @@ def test_api_insert_update(): def test_api_insert_retrieve_check_name(): # check that signatures retrieved from LCA_Database objects have the # right name. 
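test_api_insert_update above pokes at sourmash's own cached_property decorator, which parks computed values in a `_cache` dict that insert() deletes. The same invalidation idea, modeled with the stdlib decorator (a simplified sketch, not sourmash's actual implementation):

    from functools import cached_property

    class TinyDB:
        def __init__(self):
            self._sigs = []

        @cached_property
        def signatures(self):
            # stands in for an expensive-to-build view of the database
            return tuple(self._sigs)

        def insert(self, sig):
            self._sigs.append(sig)
            # drop the cached value so the next access rebuilds it
            self.__dict__.pop("signatures", None)

    db = TinyDB()
    db.insert("47")
    assert db.signatures == ("47",)
    db.insert("63")  # invalidates the cache, like lca_db.insert() above
    assert db.signatures == ("47", "63")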
- ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) lca_db.insert(ss) @@ -408,10 +383,8 @@ def test_api_insert_retrieve_check_name(): def test_api_create_insert_two_then_scale(): # construct database, THEN downsample - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) - ss2 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + ss2 = sourmash.load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) lca_db.insert(ss) @@ -431,10 +404,8 @@ def test_api_create_insert_two_then_scale(): def test_api_create_insert_two_then_scale_then_add(): # construct database, THEN downsample, then add another - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) - ss2 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + ss2 = sourmash.load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) lca_db.insert(ss) @@ -460,10 +431,8 @@ def test_api_create_insert_two_then_scale_then_add(): def test_api_create_insert_scale_two(): # downsample while constructing database - ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), - ksize=31) - ss2 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'), - ksize=31) + ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + ss2 = sourmash.load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) # downsample to 5000 while inserting: lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=5000) @@ -483,7 +452,7 @@ def test_api_create_insert_scale_two(): def test_load_single_db(): - filename = utils.get_test_data('lca/delmont-1.lca.json') + filename = utils.get_test_data("lca/delmont-1.lca.json") db, ksize, scaled = lca_utils.load_single_database(filename) print(db) @@ -494,9 +463,9 @@ def test_load_single_db(): def test_load_single_db_empty(runtmp): # test load_single_database on an empty file; should raise ValueError - empty = runtmp.output('empty.lca.json') + empty = runtmp.output("empty.lca.json") - with open(empty, "wt") as fp: + with open(empty, "w"): pass with pytest.raises(ValueError) as exc: @@ -506,8 +475,8 @@ def test_load_single_db_empty(runtmp): def test_databases(): - filename1 = utils.get_test_data('lca/delmont-1.lca.json') - filename2 = utils.get_test_data('lca/delmont-2.lca.json') + filename1 = utils.get_test_data("lca/delmont-1.lca.json") + filename2 = utils.get_test_data("lca/delmont-2.lca.json") dblist, ksize, scaled = lca_utils.load_databases([filename1, filename2]) print(dblist) @@ -518,7 +487,7 @@ def test_databases(): def test_databases_load_fail_on_no_JSON(): - filename1 = utils.get_test_data('prot/protein.zip') + filename1 = utils.get_test_data("prot/protein.zip") with pytest.raises(ValueError) as exc: dblist, ksize, scaled = lca_utils.load_databases([filename1]) @@ -528,36 +497,37 @@ def test_databases_load_fail_on_no_JSON(): def test_databases_load_fail_on_dir(): - filename1 = utils.get_test_data('lca') + filename1 = utils.get_test_data("lca") with pytest.raises(ValueError) as exc: dblist, ksize, scaled = lca_utils.load_databases([filename1]) err = 
str(exc.value) print(err) assert f"'{filename1}' is not a file and cannot be loaded as an LCA database" in err - assert not 'found 0 matches total;' in err + assert "found 0 matches total;" not in err def test_databases_load_fail_on_not_exist(): - filename1 = utils.get_test_data('does-not-exist') + filename1 = utils.get_test_data("does-not-exist") with pytest.raises(ValueError) as exc: dblist, ksize, scaled = lca_utils.load_databases([filename1]) err = str(exc.value) print(err) assert f"'{filename1}' is not a file and cannot be loaded as an LCA database" in err - assert not 'found 0 matches total;' in err + assert "found 0 matches total;" not in err + def test_db_repr(): - filename = utils.get_test_data('lca/delmont-1.lca.json') + filename = utils.get_test_data("lca/delmont-1.lca.json") db, ksize, scaled = lca_utils.load_single_database(filename) - assert repr(db) == "LCA_Database('{}')".format(filename) + assert repr(db) == f"LCA_Database('{filename}')" def test_lca_index_signatures_method(): # test 'signatures' method from base class Index - filename = utils.get_test_data('lca/47+63.lca.json') + filename = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(filename) siglist = list(db.signatures()) @@ -567,13 +537,13 @@ def test_lca_index_signatures_method(): def test_lca_index_select(): # test 'select' method from Index base class. - filename = utils.get_test_data('lca/47+63.lca.json') + filename = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(filename) xx = db.select(ksize=31) assert xx == db - xx = db.select(moltype='DNA') + xx = db.select(moltype="DNA") assert xx == db xx = db.select(abund=False) @@ -583,7 +553,7 @@ def test_lca_index_select(): db.select(ksize=21) with pytest.raises(ValueError): - db.select(moltype='protein') + db.select(moltype="protein") with pytest.raises(ValueError): db.select(abund=True) @@ -592,12 +562,12 @@ def test_lca_index_select(): def test_lca_index_select_picklist(): # test 'select' method from Index base class with a picklist. - filename = utils.get_test_data('lca/47+63.lca.json') + filename = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(filename) # construct a picklist... - picklist = SignaturePicklist('md5prefix8') - picklist.init(['50a92740']) + picklist = SignaturePicklist("md5prefix8") + picklist.init(["50a92740"]) xx = db.select(picklist=picklist) assert xx == db @@ -605,7 +575,7 @@ def test_lca_index_select_picklist(): siglist = list(db.signatures()) assert len(siglist) == 1 ss = siglist[0] - assert ss.md5sum().startswith('50a92740') + assert ss.md5sum().startswith("50a92740") assert ss.minhash.ksize == 31 @@ -613,14 +583,14 @@ def test_lca_index_find_picklist_check_overlap(): # make sure 'find' works for picklists that exclude relevant signatures # (bug #1638) - query_fn = utils.get_test_data('47.fa.sig') + query_fn = utils.get_test_data("47.fa.sig") query_sig = sourmash.load_one_signature(query_fn, ksize=31) - db_fn = utils.get_test_data('lca/47+63.lca.json') + db_fn = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(db_fn) # construct a picklist... 
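The select() contract these tests pin down: compatible constraints hand the database back, incompatible ones raise ValueError. A sketch, assuming sourmash is installed; the .lca.json path is a placeholder for the 47+63 test database:

    from sourmash.lca import lca_utils

    db, ksize, scaled = lca_utils.load_single_database("47+63.lca.json")  # placeholder

    assert db.select(ksize=31) == db       # matching constraint -> same db back
    assert db.select(moltype="DNA") == db

    try:
        db.select(ksize=21)                # mismatch -> ValueError
    except ValueError:
        print("no ksize=21 sketches in this database")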
- picklist = SignaturePicklist('ident') - picklist.init(['NC_009665.1']) + picklist = SignaturePicklist("ident") + picklist.init(["NC_009665.1"]) xx = db.select(picklist=picklist) assert xx == db @@ -632,12 +602,12 @@ def test_lca_index_find_picklist_check_overlap(): def test_lca_index_select_picklist_exclude(): # test 'select' method from Index base class with a picklist. - filename = utils.get_test_data('lca/47+63.lca.json') + filename = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(filename) # construct a picklist... - picklist = SignaturePicklist('md5prefix8', pickstyle=PickStyle.EXCLUDE) - picklist.init(['50a92740']) + picklist = SignaturePicklist("md5prefix8", pickstyle=PickStyle.EXCLUDE) + picklist.init(["50a92740"]) xx = db.select(picklist=picklist) assert xx == db @@ -645,19 +615,19 @@ def test_lca_index_select_picklist_exclude(): siglist = list(db.signatures()) assert len(siglist) == 1 ss = siglist[0] - assert ss.md5sum().startswith('e88dc390') + assert ss.md5sum().startswith("e88dc390") assert ss.minhash.ksize == 31 def test_lca_index_select_picklist_twice(): # test 'select' method from Index base class with a picklist. - filename = utils.get_test_data('lca/47+63.lca.json') + filename = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(filename) # construct a picklist... - picklist = SignaturePicklist('md5prefix8') - picklist.init(['50a92740']) + picklist = SignaturePicklist("md5prefix8") + picklist.init(["50a92740"]) xx = db.select(picklist=picklist) assert xx == db @@ -668,13 +638,12 @@ def test_lca_index_select_picklist_twice(): assert "we do not (yet) support multiple picklists for LCA databases" in str(exc) - def test_search_db_scaled_gt_sig_scaled(): - dbfile = utils.get_test_data('lca/47+63.lca.json') + dbfile = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(dbfile) - sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig')) + sig = sourmash.load_one_signature(utils.get_test_data("47.fa.sig")) - results = db.search(sig, threshold=.01, ignore_abundance=True) + results = db.search(sig, threshold=0.01, ignore_abundance=True) match_sig = results[0][1] minhash = sig.minhash.downsample(scaled=10000) @@ -682,28 +651,28 @@ def test_search_db_scaled_gt_sig_scaled(): def test_search_db_scaled_lt_sig_scaled(): - dbfile = utils.get_test_data('lca/47+63.lca.json') + dbfile = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(dbfile) - sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig')) + sig = sourmash.load_one_signature(utils.get_test_data("47.fa.sig")) sig = sig.to_mutable() sig.minhash = sig.minhash.downsample(scaled=100000) - results = db.search(sig, threshold=.01, ignore_abundance=True) + results = db.search(sig, threshold=0.01, ignore_abundance=True) print(results) assert results[0].score == 1.0 match = results[0].signature - orig_sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig')) + orig_sig = sourmash.load_one_signature(utils.get_test_data("47.fa.sig")) assert orig_sig.minhash.jaccard(match.minhash, downsample=True) == 1.0 def test_gather_db_scaled_gt_sig_scaled(): - dbfile = utils.get_test_data('lca/47+63.lca.json') + dbfile = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(dbfile) - sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig')) + sig = 
sourmash.load_one_signature(utils.get_test_data("47.fa.sig")) - result = db.best_containment(sig, threshold=.01, ignore_abundance=True) + result = db.best_containment(sig, threshold=0.01, ignore_abundance=True) match_sig = result[1] minhash = sig.minhash.downsample(scaled=10000) @@ -711,12 +680,12 @@ def test_gather_db_scaled_gt_sig_scaled(): def test_gather_db_scaled_lt_sig_scaled(): - dbfile = utils.get_test_data('lca/47+63.lca.json') + dbfile = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(dbfile) - sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig')) + sig = sourmash.load_one_signature(utils.get_test_data("47.fa.sig")) sig_minhash = sig.minhash.downsample(scaled=100000) - result = db.best_containment(sig, threshold=.01, ignore_abundance=True) + result = db.best_containment(sig, threshold=0.01, ignore_abundance=True) match_sig = result[1] minhash = match_sig.minhash.downsample(scaled=100000) @@ -724,7 +693,7 @@ def test_gather_db_scaled_lt_sig_scaled(): def test_db_lineage_to_lid(): - dbfile = utils.get_test_data('lca/47+63.lca.json') + dbfile = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(dbfile) d = db._lineage_to_lid @@ -735,15 +704,15 @@ def test_db_lineage_to_lid(): print(items) lin1 = items[0][0][-1] - assert lin1.rank == 'strain' - assert lin1.name == 'Shewanella baltica OS185' + assert lin1.rank == "strain" + assert lin1.name == "Shewanella baltica OS185" lin1 = items[1][0][-1] - assert lin1.rank == 'strain' - assert lin1.name == 'Shewanella baltica OS223' + assert lin1.rank == "strain" + assert lin1.name == "Shewanella baltica OS223" def test_db_lid_to_idx(): - dbfile = utils.get_test_data('lca/47+63.lca.json') + dbfile = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(dbfile) d = db._lid_to_idx @@ -756,7 +725,7 @@ def test_db_lid_to_idx(): def test_db_idx_to_ident(): - dbfile = utils.get_test_data('lca/47+63.lca.json') + dbfile = utils.get_test_data("lca/47+63.lca.json") db, ksize, scaled = lca_utils.load_single_database(dbfile) d = db._idx_to_ident @@ -765,23 +734,23 @@ def test_db_idx_to_ident(): assert len(items) == 2 print(items) - assert items == [(32, 'NC_009665'), (48, 'NC_011663')] + assert items == [(32, "NC_009665"), (48, "NC_011663")] ## command line tests def test_run_sourmash_lca(): - status, out, err = utils.runscript('sourmash', ['lca'], fail_ok=True) - assert status != 0 # no args provided, ok ;) + status, out, err = utils.runscript("sourmash", ["lca"], fail_ok=True) + assert status != 0 # no args provided, ok ;) def test_basic_index(runtmp, lca_db_format): - taxcsv = utils.get_test_data('lca/delmont-1.csv') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca/delmont-1.csv") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}") - cmd = ['lca', 'index', taxcsv, 'delmont-1', input_sig, '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, "delmont-1", input_sig, "-F", lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -790,38 +759,50 @@ def test_basic_index(runtmp, lca_db_format): assert os.path.exists(lca_db), lca_db - assert 'Building LCA database with ksize=31 scaled=10000 moltype=DNA' in runtmp.last_result.err - assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err - assert "** assuming column 
'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err - assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in runtmp.last_result.err + assert ( + "Building LCA database with ksize=31 scaled=10000 moltype=DNA" + in runtmp.last_result.err + ) + assert ( + "** assuming column 'MAGs' is identifiers in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "** assuming column 'Domain' is superkingdom in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "1 identifiers used out of 1 distinct identifiers in spreadsheet." + in runtmp.last_result.err + ) def test_basic_index_twice(runtmp, lca_db_format): # run 'lca index' twice. - taxcsv = utils.get_test_data('lca/delmont-1.csv') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca/delmont-1.csv") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + runtmp.output(f"delmont-1.lca.{lca_db_format}") - cmd = ['lca', 'index', taxcsv, 'delmont-1', input_sig, '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, "delmont-1", input_sig, "-F", lca_db_format] runtmp.sourmash(*cmd) with pytest.raises(SourmashCommandFailed): - cmd = ['lca', 'index', taxcsv, 'delmont-1', input_sig, '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, "delmont-1", input_sig, "-F", lca_db_format] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'already exists. Not overwriting.' in runtmp.last_result.err + assert "already exists. Not overwriting." in runtmp.last_result.err def test_basic_index_bad_spreadsheet(runtmp, lca_db_format): - taxcsv = utils.get_test_data('lca/bad-spreadsheet.csv') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca/bad-spreadsheet.csv") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}") - cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, lca_db, input_sig, "-F", lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -830,79 +811,112 @@ def test_basic_index_bad_spreadsheet(runtmp, lca_db_format): assert os.path.exists(lca_db), lca_db - assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err - assert "** assuming column 'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err - assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in runtmp.last_result.err + assert ( + "** assuming column 'MAGs' is identifiers in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "** assuming column 'Domain' is superkingdom in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "1 identifiers used out of 1 distinct identifiers in spreadsheet." 
+ in runtmp.last_result.err + ) def test_basic_index_broken_spreadsheet(runtmp, lca_db_format): # duplicate identifiers in this spreadsheet - taxcsv = utils.get_test_data('lca/bad-spreadsheet-2.csv') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca/bad-spreadsheet-2.csv") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}") - cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, lca_db, input_sig, "-F", lca_db_format] with pytest.raises(SourmashCommandFailed): runtmp.sourmash(*cmd) assert runtmp.last_result.status != 0 - assert "multiple lineages for identifier TARA_ASE_MAG_00031" in runtmp.last_result.err + assert ( + "multiple lineages for identifier TARA_ASE_MAG_00031" in runtmp.last_result.err + ) def test_basic_index_too_many_strains_too_few_species(runtmp, lca_db_format): # explicit test for #841, where 'n_species' wasn't getting counted # if lineage was at strain level resolution. - taxcsv = utils.get_test_data('lca/podar-lineage.csv') - input_sig = utils.get_test_data('47.fa.sig') - lca_db = runtmp.output(f'out.lca.{lca_db_format}') - - cmd = ['lca', 'index', taxcsv, lca_db, input_sig, - '-C', '3', '--split-identifiers', '-F', lca_db_format] + taxcsv = utils.get_test_data("lca/podar-lineage.csv") + input_sig = utils.get_test_data("47.fa.sig") + lca_db = runtmp.output(f"out.lca.{lca_db_format}") + + cmd = [ + "lca", + "index", + taxcsv, + lca_db, + input_sig, + "-C", + "3", + "--split-identifiers", + "-F", + lca_db_format, + ] runtmp.sourmash(*cmd) - assert not 'error: fewer than 20% of lineages' in runtmp.last_result.err + assert "error: fewer than 20% of lineages" not in runtmp.last_result.err assert runtmp.last_result.status == 0 def test_basic_index_too_few_species(runtmp, lca_db_format): # spreadsheets with too few species should be flagged, unless -f specified - taxcsv = utils.get_test_data('lca/tully-genome-sigs.classify.csv') + taxcsv = utils.get_test_data("lca/tully-genome-sigs.classify.csv") # (these don't really matter, should break on load spreadsheet) - input_sig = utils.get_test_data('47.fa.sig') - lca_db = runtmp.output(f'out.lca.{lca_db_format}') + input_sig = utils.get_test_data("47.fa.sig") + lca_db = runtmp.output(f"out.lca.{lca_db_format}") - cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-C', '3', - '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, lca_db, input_sig, "-C", "3", "-F", lca_db_format] with pytest.raises(SourmashCommandFailed): runtmp.sourmash(*cmd) - assert not '"ERROR: fewer than 20% of lineages have species-level resolution' in runtmp.last_result.err + assert ( + '"ERROR: fewer than 20% of lineages have species-level resolution' + not in runtmp.last_result.err + ) assert runtmp.last_result.status != 0 def test_basic_index_require_taxonomy(runtmp, lca_db_format): # no taxonomy in here - taxcsv = utils.get_test_data('lca/bad-spreadsheet-3.csv') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') - - cmd = ['lca', 'index', '--require-taxonomy', taxcsv, lca_db, input_sig, - '-F', lca_db_format] + taxcsv = utils.get_test_data("lca/bad-spreadsheet-3.csv") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}") + + cmd = [ + "lca", + "index", + "--require-taxonomy", + taxcsv, + 
lca_db, + input_sig, + "-F", + lca_db_format, + ] with pytest.raises(SourmashCommandFailed): runtmp.sourmash(*cmd) assert runtmp.last_result.status != 0 - assert "ERROR: no hash values found - are there any signatures?" in runtmp.last_result.err + assert ( + "ERROR: no hash values found - are there any signatures?" + in runtmp.last_result.err + ) def test_basic_index_column_start(runtmp, lca_db_format): - taxcsv = utils.get_test_data('lca/delmont-3.csv') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca/delmont-3.csv") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}") - cmd = ['lca', 'index', '-C', '3', taxcsv, lca_db, input_sig, - '-F', lca_db_format] + cmd = ["lca", "index", "-C", "3", taxcsv, lca_db, input_sig, "-F", lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -911,49 +925,71 @@ def test_basic_index_column_start(runtmp, lca_db_format): assert os.path.exists(lca_db) - assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err - assert "** assuming column 'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err - assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in runtmp.last_result.err + assert ( + "** assuming column 'MAGs' is identifiers in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "** assuming column 'Domain' is superkingdom in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "1 identifiers used out of 1 distinct identifiers in spreadsheet." + in runtmp.last_result.err + ) def test_index_empty_sketch_name(runtmp, lca_db_format): c = runtmp # create two signatures with empty 'name' attributes - cmd = ['sketch', 'dna', utils.get_test_data('genome-s12.fa.gz'), - utils.get_test_data('genome-s11.fa.gz')] + cmd = [ + "sketch", + "dna", + utils.get_test_data("genome-s12.fa.gz"), + utils.get_test_data("genome-s11.fa.gz"), + ] c.run_sourmash(*cmd) - sig1 = c.output('genome-s11.fa.gz.sig') + sig1 = c.output("genome-s11.fa.gz.sig") assert os.path.exists(sig1) - sig2 = c.output('genome-s12.fa.gz.sig') + sig2 = c.output("genome-s12.fa.gz.sig") assert os.path.exists(sig2) - outfile = f'zzz.lca.{lca_db_format}' + outfile = f"zzz.lca.{lca_db_format}" # can we insert them both? 
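Most of the churn in these CLI tests is ruff wrapping long lines: command lists are exploded one element per line, and long `assert ... in ...` expressions are parenthesized so the operator can move to its own line without backslash continuations. The transformation in isolation:

    err = "** assuming column 'MAGs' is identifiers in spreadsheet\n"

    # before: one long line
    assert "** assuming column 'MAGs' is identifiers in spreadsheet" in err

    # after: the same expression, wrapped in parentheses by the formatter
    assert (
        "** assuming column 'MAGs' is identifiers in spreadsheet"
        in err
    )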
- taxcsv = utils.get_test_data('lca/delmont-1.csv') - cmd = ['lca', 'index', taxcsv, outfile, sig1, sig2, '-F', lca_db_format] + taxcsv = utils.get_test_data("lca/delmont-1.csv") + cmd = ["lca", "index", taxcsv, outfile, sig1, sig2, "-F", lca_db_format] c.run_sourmash(*cmd) assert os.path.exists(c.output(outfile)) print(c.last_result.out) print(c.last_result.err) - assert 'WARNING: no lineage provided for 2 sig' in c.last_result.err + assert "WARNING: no lineage provided for 2 sig" in c.last_result.err def test_basic_index_and_classify_with_tsv_and_gz(runtmp, lca_db_format): - taxcsv = utils.get_test_data('lca/delmont-1.tsv') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') + taxcsv = utils.get_test_data("lca/delmont-1.tsv") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") - if lca_db_format == 'json': - lca_db = runtmp.output(f'delmont-1.lca.json.gz') + if lca_db_format == "json": + lca_db = runtmp.output("delmont-1.lca.json.gz") else: - lca_db = runtmp.output(f'delmont-1.lca.sql') - - cmd = ['lca', 'index', '--tabs', '--no-header', taxcsv, lca_db, input_sig, - '-F', lca_db_format] + lca_db = runtmp.output("delmont-1.lca.sql") + + cmd = [ + "lca", + "index", + "--tabs", + "--no-header", + taxcsv, + lca_db, + input_sig, + "-F", + lca_db_format, + ] runtmp.sourmash(*cmd) print(cmd) @@ -962,27 +998,36 @@ def test_basic_index_and_classify_with_tsv_and_gz(runtmp, lca_db_format): assert os.path.exists(lca_db) - assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in runtmp.last_result.err + assert ( + "1 identifiers used out of 1 distinct identifiers in spreadsheet." + in runtmp.last_result.err + ) - cmd = ['lca', 'classify', '--db', lca_db, '--query', input_sig] + cmd = ["lca", "classify", "--db", lca_db, "--query", input_sig] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in runtmp.last_result.out - assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii' in runtmp.last_result.out - assert 'classified 1 signatures total' in runtmp.last_result.err - assert 'loaded 1 LCA databases' in runtmp.last_result.err + assert ( + "ID,status,superkingdom,phylum,class,order,family,genus,species" + in runtmp.last_result.out + ) + assert ( + "TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii" + in runtmp.last_result.out + ) + assert "classified 1 signatures total" in runtmp.last_result.err + assert "loaded 1 LCA databases" in runtmp.last_result.err def test_basic_index_and_classify(runtmp, lca_db_format): - taxcsv = utils.get_test_data('lca/delmont-1.csv') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca/delmont-1.csv") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}") - cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, lca_db, input_sig, "-F", lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -991,31 +1036,55 @@ def test_basic_index_and_classify(runtmp, lca_db_format): assert os.path.exists(lca_db) - assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err - assert "** assuming column 'Domain' is 
superkingdom in spreadsheet" in runtmp.last_result.err - assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in runtmp.last_result.err - - cmd = ['lca', 'classify', '--db', lca_db, '--query', input_sig] + assert ( + "** assuming column 'MAGs' is identifiers in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "** assuming column 'Domain' is superkingdom in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "1 identifiers used out of 1 distinct identifiers in spreadsheet." + in runtmp.last_result.err + ) + + cmd = ["lca", "classify", "--db", lca_db, "--query", input_sig] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in runtmp.last_result.out - assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii' in runtmp.last_result.out - assert 'classified 1 signatures total' in runtmp.last_result.err - assert 'loaded 1 LCA databases' in runtmp.last_result.err + assert ( + "ID,status,superkingdom,phylum,class,order,family,genus,species" + in runtmp.last_result.out + ) + assert ( + "TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii" + in runtmp.last_result.out + ) + assert "classified 1 signatures total" in runtmp.last_result.err + assert "loaded 1 LCA databases" in runtmp.last_result.err def test_basic_index_and_classify_dup_lineage(runtmp, lca_db_format): - taxcsv = utils.get_test_data('lca/tara-delmont-SuppTable3.csv') - input_sig1 = utils.get_test_data('lca/TARA_ASE_MAG_00007.sig') - input_sig2 = utils.get_test_data('lca/TARA_ANW_MAG_00005.sig') - lca_db = runtmp.output(f'delmont-dup.lca.{lca_db_format}') - - cmd = ['lca', 'index', taxcsv, lca_db, input_sig1, input_sig2, - '-F', lca_db_format, '-f'] + taxcsv = utils.get_test_data("lca/tara-delmont-SuppTable3.csv") + input_sig1 = utils.get_test_data("lca/TARA_ASE_MAG_00007.sig") + input_sig2 = utils.get_test_data("lca/TARA_ANW_MAG_00005.sig") + lca_db = runtmp.output(f"delmont-dup.lca.{lca_db_format}") + + cmd = [ + "lca", + "index", + taxcsv, + lca_db, + input_sig1, + input_sig2, + "-F", + lca_db_format, + "-f", + ] runtmp.sourmash(*cmd) print(cmd) @@ -1024,35 +1093,41 @@ def test_basic_index_and_classify_dup_lineage(runtmp, lca_db_format): assert os.path.exists(lca_db) - cmd = ['lca', 'classify', '--db', lca_db, '--query', input_sig1] + cmd = ["lca", "classify", "--db", lca_db, "--query", input_sig1] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'TARA_ASE_MAG_00007,found,Bacteria,Proteobacteria,Gammaproteobacteria,,,,,' in runtmp.last_result.out + assert ( + "TARA_ASE_MAG_00007,found,Bacteria,Proteobacteria,Gammaproteobacteria,,,,," + in runtmp.last_result.out + ) - cmd = ['lca', 'classify', '--db', lca_db, '--query', input_sig2] + cmd = ["lca", "classify", "--db", lca_db, "--query", input_sig2] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'TARA_ANW_MAG_00005,found,Bacteria,Proteobacteria,Gammaproteobacteria,,,,,' in runtmp.last_result.out + assert ( + "TARA_ANW_MAG_00005,found,Bacteria,Proteobacteria,Gammaproteobacteria,,,,," + in runtmp.last_result.out + ) def test_index_traverse(runtmp, lca_db_format): - taxcsv = utils.get_test_data('lca/delmont-1.csv') - input_sig = 
utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca/delmont-1.csv") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}") - in_dir = runtmp.output('sigs') + in_dir = runtmp.output("sigs") os.mkdir(in_dir) - shutil.copyfile(input_sig, os.path.join(in_dir, 'q.sig')) + shutil.copyfile(input_sig, os.path.join(in_dir, "q.sig")) - cmd = ['lca', 'index', taxcsv, lca_db, in_dir, '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, lca_db, in_dir, "-F", lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -1061,26 +1136,35 @@ def test_index_traverse(runtmp, lca_db_format): assert os.path.exists(lca_db) - assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err - assert "** assuming column 'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err - assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in runtmp.last_result.err - assert 'WARNING: 1 duplicate signatures.' not in runtmp.last_result.err + assert ( + "** assuming column 'MAGs' is identifiers in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "** assuming column 'Domain' is superkingdom in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "1 identifiers used out of 1 distinct identifiers in spreadsheet." + in runtmp.last_result.err + ) + assert "WARNING: 1 duplicate signatures." not in runtmp.last_result.err def test_index_traverse_force(runtmp, lca_db_format): c = runtmp # test the use of --force to load all files, not just .sig - taxcsv = utils.get_test_data('lca/delmont-1.csv') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = c.output(f'delmont-1.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca/delmont-1.csv") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + lca_db = c.output(f"delmont-1.lca.{lca_db_format}") - in_dir = c.output('sigs') + in_dir = c.output("sigs") os.mkdir(in_dir) # name signature .txt instead of .sig: - shutil.copyfile(input_sig, os.path.join(in_dir, 'q.txt')) + shutil.copyfile(input_sig, os.path.join(in_dir, "q.txt")) # use --force - cmd = ['lca', 'index', taxcsv, lca_db, in_dir, '-f', '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, lca_db, in_dir, "-f", "-F", lca_db_format] c.run_sourmash(*cmd) out = c.last_result.out @@ -1092,22 +1176,31 @@ def test_index_traverse_force(runtmp, lca_db_format): assert "** assuming column 'MAGs' is identifiers in spreadsheet" in err assert "** assuming column 'Domain' is superkingdom in spreadsheet" in err - assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in err - assert 'WARNING: 1 duplicate signatures.' not in err + assert "1 identifiers used out of 1 distinct identifiers in spreadsheet." in err + assert "WARNING: 1 duplicate signatures." 
not in err def test_index_from_file_cmdline_sig(runtmp, lca_db_format): c = runtmp - taxcsv = utils.get_test_data('lca/delmont-1.csv') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = c.output(f'delmont-1.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca/delmont-1.csv") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + lca_db = c.output(f"delmont-1.lca.{lca_db_format}") - file_list = c.output('sigs.list') - with open(file_list, 'wt') as fp: + file_list = c.output("sigs.list") + with open(file_list, "w") as fp: print(input_sig, file=fp) - cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '--from-file', file_list, - '-F', lca_db_format] + cmd = [ + "lca", + "index", + taxcsv, + lca_db, + input_sig, + "--from-file", + file_list, + "-F", + lca_db_format, + ] c.run_sourmash(*cmd) out = c.last_result.out @@ -1119,23 +1212,31 @@ def test_index_from_file_cmdline_sig(runtmp, lca_db_format): assert "** assuming column 'MAGs' is identifiers in spreadsheet" in err assert "** assuming column 'Domain' is superkingdom in spreadsheet" in err - assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in err - assert 'WARNING: 1 duplicate signatures.' in err + assert "1 identifiers used out of 1 distinct identifiers in spreadsheet." in err + assert "WARNING: 1 duplicate signatures." in err def test_index_from_file(runtmp, lca_db_format): c = runtmp - taxcsv = utils.get_test_data('lca/delmont-1.csv') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = c.output(f'delmont-1.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca/delmont-1.csv") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + lca_db = c.output(f"delmont-1.lca.{lca_db_format}") - file_list = c.output('sigs.list') - with open(file_list, 'wt') as fp: + file_list = c.output("sigs.list") + with open(file_list, "w") as fp: print(input_sig, file=fp) - cmd = ['lca', 'index', taxcsv, lca_db, '--from-file', file_list, - '-F', lca_db_format] + cmd = [ + "lca", + "index", + taxcsv, + lca_db, + "--from-file", + file_list, + "-F", + lca_db_format, + ] c.run_sourmash(*cmd) out = c.last_result.out @@ -1147,33 +1248,41 @@ def test_index_from_file(runtmp, lca_db_format): assert "** assuming column 'MAGs' is identifiers in spreadsheet" in err assert "** assuming column 'Domain' is superkingdom in spreadsheet" in err - assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in err + assert "1 identifiers used out of 1 distinct identifiers in spreadsheet." in err def test_index_fail_on_num(runtmp, lca_db_format): c = runtmp # lca index should yield a decent error message when attempted on 'num' - sigfile = utils.get_test_data('num/63.fa.sig') - taxcsv = utils.get_test_data('lca/podar-lineage.csv') + sigfile = utils.get_test_data("num/63.fa.sig") + taxcsv = utils.get_test_data("lca/podar-lineage.csv") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('lca', 'index', taxcsv, f'xxx.lca.{lca_db_format}', sigfile, - '-C', '3', '-F', lca_db_format) + c.run_sourmash( + "lca", + "index", + taxcsv, + f"xxx.lca.{lca_db_format}", + sigfile, + "-C", + "3", + "-F", + lca_db_format, + ) err = c.last_result.err print(err) - assert 'ERROR: cannot insert signature ' in err - assert 'ERROR: cannot downsample signature; is it a scaled signature?' in err + assert "ERROR: cannot insert signature " in err + assert "ERROR: cannot downsample signature; is it a scaled signature?" 
in err def test_index_traverse_real_spreadsheet_no_report(runtmp, lca_db_format): - taxcsv = utils.get_test_data('lca/tara-delmont-SuppTable3.csv') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') + taxcsv = utils.get_test_data("lca/tara-delmont-SuppTable3.csv") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}") - cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-f', - '-F', lca_db_format] + cmd = ["lca", "index", taxcsv, lca_db, input_sig, "-f", "-F", lca_db_format] runtmp.sourmash(*cmd) print(cmd) @@ -1182,22 +1291,44 @@ def test_index_traverse_real_spreadsheet_no_report(runtmp, lca_db_format): assert os.path.exists(lca_db) - assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err - assert "** assuming column 'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err - assert '1 identifiers used out of 957 distinct identifiers in spreadsheet.' in runtmp.last_result.err - assert 'WARNING: no signatures for 956 spreadsheet rows.' in runtmp.last_result.err - assert 'WARNING: 105 unused lineages.' in runtmp.last_result.err - assert '(You can use --report to generate a detailed report.)' in runtmp.last_result.err + assert ( + "** assuming column 'MAGs' is identifiers in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "** assuming column 'Domain' is superkingdom in spreadsheet" + in runtmp.last_result.err + ) + assert ( + "1 identifiers used out of 957 distinct identifiers in spreadsheet." + in runtmp.last_result.err + ) + assert "WARNING: no signatures for 956 spreadsheet rows." in runtmp.last_result.err + assert "WARNING: 105 unused lineages." in runtmp.last_result.err + assert ( + "(You can use --report to generate a detailed report.)" + in runtmp.last_result.err + ) def test_index_traverse_real_spreadsheet_report(runtmp, lca_db_format): - taxcsv = utils.get_test_data('lca/tara-delmont-SuppTable3.csv') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}') - report_loc = runtmp.output('report.txt') - - cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '--report', - report_loc, '-f', '-F', lca_db_format] + taxcsv = utils.get_test_data("lca/tara-delmont-SuppTable3.csv") + input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}") + report_loc = runtmp.output("report.txt") + + cmd = [ + "lca", + "index", + taxcsv, + lca_db, + input_sig, + "--report", + report_loc, + "-f", + "-F", + lca_db_format, + ] runtmp.sourmash(*cmd) print(cmd) @@ -1206,148 +1337,191 @@ def test_index_traverse_real_spreadsheet_report(runtmp, lca_db_format): assert os.path.exists(lca_db) - assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err - assert "** assuming column 'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err - assert '1 identifiers used out of 957 distinct identifiers in spreadsheet.' in runtmp.last_result.err - assert 'WARNING: no signatures for 956 spreadsheet rows.' in runtmp.last_result.err - assert 'WARNING: 105 unused lineages.' 
in runtmp.last_result.err
-    assert '(You can use --report to generate a detailed report.)' not in runtmp.last_result.err
+    assert (
+        "** assuming column 'MAGs' is identifiers in spreadsheet"
+        in runtmp.last_result.err
+    )
+    assert (
+        "** assuming column 'Domain' is superkingdom in spreadsheet"
+        in runtmp.last_result.err
+    )
+    assert (
+        "1 identifiers used out of 957 distinct identifiers in spreadsheet."
+        in runtmp.last_result.err
+    )
+    assert "WARNING: no signatures for 956 spreadsheet rows." in runtmp.last_result.err
+    assert "WARNING: 105 unused lineages." in runtmp.last_result.err
+    assert (
+        "(You can use --report to generate a detailed report.)"
+        not in runtmp.last_result.err
+    )
 
     assert os.path.exists(report_loc)
 
 
 def test_single_classify(runtmp):
     # run a basic 'classify', check output.
-    db1 = utils.get_test_data('lca/delmont-1.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
 
-    cmd = ['lca', 'classify', '--db', db1, '--query', input_sig]
+    cmd = ["lca", "classify", "--db", db1, "--query", input_sig]
     runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii' in runtmp.last_result.out
-    assert 'classified 1 signatures total' in runtmp.last_result.err
-    assert 'loaded 1 LCA databases' in runtmp.last_result.err
+    assert (
+        "TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii"
+        in runtmp.last_result.out
+    )
+    assert "classified 1 signatures total" in runtmp.last_result.err
+    assert "loaded 1 LCA databases" in runtmp.last_result.err
 
 
 def test_single_classify_zip_query(runtmp):
     # run 'classify' with a query in a zipfile
-    db1 = utils.get_test_data('lca/delmont-1.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
 
     query_ss = sourmash.load_one_signature(input_sig, ksize=31)
-    query_zipfile = runtmp.output('query.zip')
+    query_zipfile = runtmp.output("query.zip")
     with sourmash_args.SaveSignaturesToLocation(query_zipfile) as save_sig:
         save_sig.add(query_ss)
 
-    cmd = ['lca', 'classify', '--db', db1, '--query', query_zipfile]
+    cmd = ["lca", "classify", "--db", db1, "--query", query_zipfile]
     runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii' in runtmp.last_result.out
-    assert 'classified 1 signatures total' in runtmp.last_result.err
-    assert 'loaded 1 LCA databases' in runtmp.last_result.err
+    assert (
+        "TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii"
+        in runtmp.last_result.out
+    )
+    assert "classified 1 signatures total" in runtmp.last_result.err
+    assert "loaded 1 LCA databases" in runtmp.last_result.err
 
 
 def test_single_classify_to_output(runtmp):
-    db1 = utils.get_test_data(f'lca/delmont-1.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-
-    cmd = ['lca', 'classify', '--db', db1, '--query', input_sig,
-           '-o', runtmp.output('outfile.txt')]
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+
+    cmd = [
+        "lca",
+        "classify",
+        "--db",
+        db1,
+        "--query",
+        input_sig,
+        "-o",
+        runtmp.output("outfile.txt"),
+    ]
     runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    with open(runtmp.output('outfile.txt'), 'rt') as fp:
+    with open(runtmp.output("outfile.txt")) as fp:
         outdata = fp.read()
 
-    assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii' in outdata
-    assert 'classified 1 signatures total' in runtmp.last_result.err
-    assert 'loaded 1 LCA databases' in runtmp.last_result.err
+    assert (
+        "TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii"
+        in outdata
+    )
+    assert "classified 1 signatures total" in runtmp.last_result.err
+    assert "loaded 1 LCA databases" in runtmp.last_result.err
 
 
 def test_single_classify_to_output_no_name(runtmp):
-    db1 = utils.get_test_data(f'lca/delmont-1.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
 
     ss = sourmash.load_one_signature(input_sig, ksize=31)
 
-    outsig_filename = runtmp.output('q.sig')
-    with open(outsig_filename, 'wt') as fp:
+    outsig_filename = runtmp.output("q.sig")
+    with open(outsig_filename, "w") as fp:
         # remove name from signature here --
-        new_sig = sourmash.SourmashSignature(ss.minhash, filename='xyz')
+        new_sig = sourmash.SourmashSignature(ss.minhash, filename="xyz")
         sourmash.save_signatures([new_sig], fp)
 
-    cmd = ['lca', 'classify', '--db', db1, '--query', outsig_filename,
-           '-o', runtmp.output('outfile.txt')]
+    cmd = [
+        "lca",
+        "classify",
+        "--db",
+        db1,
+        "--query",
+        outsig_filename,
+        "-o",
+        runtmp.output("outfile.txt"),
+    ]
     runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    with open(runtmp.output('outfile.txt'), 'rt') as fp:
+    with open(runtmp.output("outfile.txt")) as fp:
         outdata = fp.read()
 
     print((outdata,))
-    assert 'xyz,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii' in outdata
-    assert 'classified 1 signatures total' in runtmp.last_result.err
-    assert 'loaded 1 LCA databases' in runtmp.last_result.err
+    assert (
+        "xyz,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii"
+        in outdata
+    )
+    assert "classified 1 signatures total" in runtmp.last_result.err
+    assert "loaded 1 LCA databases" in runtmp.last_result.err
 
 
 def test_single_classify_empty(runtmp):
-    db1 = utils.get_test_data(f'lca/both.lca.json')
-    input_sig = utils.get_test_data('GCF_000005845.2_ASM584v2_genomic.fna.gz.sig')
+    db1 = utils.get_test_data("lca/both.lca.json")
+    input_sig = utils.get_test_data("GCF_000005845.2_ASM584v2_genomic.fna.gz.sig")
 
-    cmd = ['lca', 'classify', '--db', db1, '--query', input_sig]
+    cmd = ["lca", "classify", "--db", db1, "--query", input_sig]
     runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    assert 'GCF_000005845,nomatch,,,,,,,,' in runtmp.last_result.out
-    assert 'classified 1 signatures total' in runtmp.last_result.err
-    assert 'loaded 1 LCA databases' in runtmp.last_result.err
+    assert "GCF_000005845,nomatch,,,,,,,," in runtmp.last_result.out
+    assert "classified 1 signatures total" in runtmp.last_result.err
+    assert "loaded 1 LCA databases" in runtmp.last_result.err
 
 
 def test_single_classify_traverse(runtmp):
-    db1 = utils.get_test_data(f'lca/delmont-1.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    in_dir = runtmp.output('sigs')
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    in_dir = runtmp.output("sigs")
     os.mkdir(in_dir)
-    shutil.copyfile(input_sig, os.path.join(in_dir, 'q.sig'))
+    shutil.copyfile(input_sig, os.path.join(in_dir, "q.sig"))
 
-    cmd = ['lca', 'classify', '--db', db1, '--query', input_sig]
+    cmd = ["lca", "classify", "--db", db1, "--query", input_sig]
     runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii' in runtmp.last_result.out
-    assert 'classified 1 signatures total' in runtmp.last_result.err
-    assert 'loaded 1 LCA databases' in runtmp.last_result.err
+    assert (
+        "TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii"
+        in runtmp.last_result.out
+    )
+    assert "classified 1 signatures total" in runtmp.last_result.err
+    assert "loaded 1 LCA databases" in runtmp.last_result.err
 
 
 def test_multi_query_classify_traverse(runtmp):
     # both.lca.json is built from both dir and dir2
-    db1 = utils.get_test_data(f'lca/both.lca.json')
-    dir1 = utils.get_test_data('lca/dir1')
-    dir2 = utils.get_test_data('lca/dir2')
+    db1 = utils.get_test_data("lca/both.lca.json")
+    dir1 = utils.get_test_data("lca/dir1")
+    dir2 = utils.get_test_data("lca/dir2")
 
-    cmd = ['lca', 'classify', '--db', db1, '--query', dir1, dir2]
+    cmd = ["lca", "classify", "--db", db1, "--query", dir1, dir2]
     runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    with open(utils.get_test_data('lca/classify-by-both.csv')) as fp:
+    with open(utils.get_test_data("lca/classify-by-both.csv")) as fp:
         fp_lines = fp.readlines()
         out_lines = runtmp.last_result.out.splitlines()
@@ -1362,22 +1536,22 @@ def test_multi_query_classify_traverse(runtmp):
 @utils.in_tempdir
 def test_multi_query_classify_query_from_file(c):
     # both.lca.json is built from both dir and dir2
-    db1 = utils.get_test_data('lca/both.lca.json')
-    dir1_glob = utils.get_test_data('lca/dir1/*.sig')
+    db1 = utils.get_test_data("lca/both.lca.json")
+    dir1_glob = utils.get_test_data("lca/dir1/*.sig")
     dir1_files = glob.glob(dir1_glob)
-    dir2_glob = utils.get_test_data('lca/dir2/*.sig')
+    dir2_glob = utils.get_test_data("lca/dir2/*.sig")
     dir2_files = glob.glob(dir2_glob)
 
-    file_list = c.output('file.list')
-    with open(file_list, 'wt') as fp:
+    file_list = c.output("file.list")
+    with open(file_list, "w") as fp:
         print("\n".join(dir1_files), file=fp)
         print("\n".join(dir2_files), file=fp)
 
-    cmd = ['lca', 'classify', '--db', db1, '--query-from-file', file_list]
+    cmd = ["lca", "classify", "--db", db1, "--query-from-file", file_list]
     c.run_sourmash(*cmd)
     out = c.last_result.out
 
-    with open(utils.get_test_data('lca/classify-by-both.csv')) as fp:
+    with open(utils.get_test_data("lca/classify-by-both.csv")) as fp:
         fp_lines = fp.readlines()
         out_lines = out.splitlines()
@@ -1392,23 +1566,31 @@ def test_multi_query_classify_query_from_file(c):
 @utils.in_tempdir
 def test_multi_query_classify_query_from_file_and_query(c):
     # both.lca.json is built from both dir and dir2
-    db1 = utils.get_test_data(f'lca/both.lca.json')
-    dir1_glob = utils.get_test_data('lca/dir1/*.sig')
+    db1 = utils.get_test_data("lca/both.lca.json")
+    dir1_glob = utils.get_test_data("lca/dir1/*.sig")
     dir1_files = glob.glob(dir1_glob)
-    dir2_glob = utils.get_test_data('lca/dir2/*.sig')
+    dir2_glob = utils.get_test_data("lca/dir2/*.sig")
     dir2_files = glob.glob(dir2_glob)
 
-    file_list = c.output('file.list')
-    with open(file_list, 'wt') as fp:
-        print("\n".join(dir1_files[1:]), file=fp) # leave off first one
+    file_list = c.output("file.list")
+    with open(file_list, "w") as fp:
+        print("\n".join(dir1_files[1:]), file=fp)  # leave off first one
         print("\n".join(dir2_files), file=fp)
 
-    cmd = ['lca', 'classify', '--db', db1, '--query', dir1_files[0],
-           '--query-from-file', file_list]
+    cmd = [
+        "lca",
+        "classify",
+        "--db",
+        db1,
+        "--query",
+        dir1_files[0],
+        "--query-from-file",
+        file_list,
+    ]
     c.run_sourmash(*cmd)
     out = c.last_result.out
 
-    with open(utils.get_test_data('lca/classify-by-both.csv'), 'rt') as fp:
+    with open(utils.get_test_data("lca/classify-by-both.csv")) as fp:
         fp_lines = fp.readlines()
         out_lines = out.splitlines()
@@ -1422,19 +1604,19 @@ def test_multi_query_classify_query_from_file_and_query(c):
 def test_multi_db_multi_query_classify_traverse(runtmp):
     # two halves of both.lca.json, see above test.
-    db1 = utils.get_test_data(f'lca/dir1.lca.json')
-    db2 = utils.get_test_data(f'lca/dir2.lca.json')
-    dir1 = utils.get_test_data('lca/dir1')
-    dir2 = utils.get_test_data('lca/dir2')
+    db1 = utils.get_test_data("lca/dir1.lca.json")
+    db2 = utils.get_test_data("lca/dir2.lca.json")
+    dir1 = utils.get_test_data("lca/dir1")
+    dir2 = utils.get_test_data("lca/dir2")
 
-    cmd = ['lca', 'classify', '--db', db1, db2, '--query', dir1, dir2]
+    cmd = ["lca", "classify", "--db", db1, db2, "--query", dir1, dir2]
     runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    with open(utils.get_test_data('lca/classify-by-both.csv'), 'rt') as fp:
+    with open(utils.get_test_data("lca/classify-by-both.csv")) as fp:
         fp_lines = fp.readlines()
         out_lines = runtmp.last_result.out.splitlines()
@@ -1447,11 +1629,11 @@ def test_multi_db_multi_query_classify_traverse(runtmp):
 def test_unassigned_internal_index_and_classify(runtmp, lca_db_format):
-    taxcsv = utils.get_test_data('lca/delmont-4.csv')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca/delmont-4.csv")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}")
 
-    cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-F', lca_db_format]
+    cmd = ["lca", "index", taxcsv, lca_db, input_sig, "-F", lca_db_format]
     runtmp.sourmash(*cmd)
 
     print(cmd)
@@ -1460,29 +1642,44 @@ def test_unassigned_internal_index_and_classify(runtmp, lca_db_format):
 
     assert os.path.exists(lca_db)
 
-    assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err
-    assert "** assuming column 'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err
-    assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in runtmp.last_result.err
-
-    cmd = ['lca', 'classify', '--db', lca_db, '--query', input_sig]
+    assert (
+        "** assuming column 'MAGs' is identifiers in spreadsheet"
+        in runtmp.last_result.err
+    )
+    assert (
+        "** assuming column 'Domain' is superkingdom in spreadsheet"
+        in runtmp.last_result.err
+    )
+    assert (
+        "1 identifiers used out of 1 distinct identifiers in spreadsheet."
+        in runtmp.last_result.err
+    )
+
+    cmd = ["lca", "classify", "--db", lca_db, "--query", input_sig]
     runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in runtmp.last_result.out
-    assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,unassigned,Alteromonadaceae,unassigned,Alteromonas_macleodii' in runtmp.last_result.out
-    assert 'classified 1 signatures total' in runtmp.last_result.err
-    assert 'loaded 1 LCA databases' in runtmp.last_result.err
+    assert (
+        "ID,status,superkingdom,phylum,class,order,family,genus,species"
+        in runtmp.last_result.out
+    )
+    assert (
+        "TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,unassigned,Alteromonadaceae,unassigned,Alteromonas_macleodii"
+        in runtmp.last_result.out
+    )
+    assert "classified 1 signatures total" in runtmp.last_result.err
+    assert "loaded 1 LCA databases" in runtmp.last_result.err
 
 
 def test_unassigned_last_index_and_classify(runtmp, lca_db_format):
-    taxcsv = utils.get_test_data('lca/delmont-5.csv')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca/delmont-5.csv")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}")
 
-    cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-F', lca_db_format]
+    cmd = ["lca", "index", taxcsv, lca_db, input_sig, "-F", lca_db_format]
     runtmp.sourmash(*cmd)
 
     print(cmd)
@@ -1491,31 +1688,45 @@ def test_unassigned_last_index_and_classify(runtmp, lca_db_format):
 
     assert os.path.exists(lca_db)
 
-    assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err
-    assert "** assuming column 'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err
-    assert '1 identifiers used out of 1 distinct identifiers in spreadsheet.' in runtmp.last_result.err
-
-    cmd = ['lca', 'classify', '--db', lca_db, '--query', input_sig]
+    assert (
+        "** assuming column 'MAGs' is identifiers in spreadsheet"
+        in runtmp.last_result.err
+    )
+    assert (
+        "** assuming column 'Domain' is superkingdom in spreadsheet"
+        in runtmp.last_result.err
+    )
+    assert (
+        "1 identifiers used out of 1 distinct identifiers in spreadsheet."
+        in runtmp.last_result.err
+    )
+
+    cmd = ["lca", "classify", "--db", lca_db, "--query", input_sig]
    runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in runtmp.last_result.out
-    assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,,,\r\n' in runtmp.last_result.out
-    assert 'classified 1 signatures total' in runtmp.last_result.err
-    assert 'loaded 1 LCA databases' in runtmp.last_result.err
+    assert (
+        "ID,status,superkingdom,phylum,class,order,family,genus,species"
+        in runtmp.last_result.out
+    )
+    assert (
+        "TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,,,\r\n"
+        in runtmp.last_result.out
+    )
+    assert "classified 1 signatures total" in runtmp.last_result.err
+    assert "loaded 1 LCA databases" in runtmp.last_result.err
 
 
 def test_index_and_classify_internal_unassigned_multi(runtmp, lca_db_format):
-    taxcsv = utils.get_test_data('lca/delmont-6.csv')
-    input_sig1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    input_sig2 = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig')
-    lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca/delmont-6.csv")
+    input_sig1 = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    input_sig2 = utils.get_test_data("lca/TARA_PSW_MAG_00136.sig")
+    lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}")
 
-    cmd = ['lca', 'index', taxcsv, lca_db, input_sig1, input_sig2,
-           '-F', lca_db_format]
+    cmd = ["lca", "index", taxcsv, lca_db, input_sig1, input_sig2, "-F", lca_db_format]
     runtmp.sourmash(*cmd)
 
     print(cmd)
@@ -1524,35 +1735,56 @@ def test_index_and_classify_internal_unassigned_multi(runtmp, lca_db_format):
 
     assert os.path.exists(lca_db)
 
-    assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err
-    assert "** assuming column 'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err
-    assert '2 identifiers used out of 2 distinct identifiers in spreadsheet.' in runtmp.last_result.err
+    assert (
+        "** assuming column 'MAGs' is identifiers in spreadsheet"
+        in runtmp.last_result.err
+    )
+    assert (
+        "** assuming column 'Domain' is superkingdom in spreadsheet"
+        in runtmp.last_result.err
+    )
+    assert (
+        "2 identifiers used out of 2 distinct identifiers in spreadsheet."
+        in runtmp.last_result.err
+    )
 
     # classify input_sig1
-    cmd = ['lca', 'classify', '--db', lca_db, '--query', input_sig1]
+    cmd = ["lca", "classify", "--db", lca_db, "--query", input_sig1]
     runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in runtmp.last_result.out
-    assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,unassigned,unassigned,Alteromonadaceae,,,\r\n' in runtmp.last_result.out
-    assert 'classified 1 signatures total' in runtmp.last_result.err
-    assert 'loaded 1 LCA databases' in runtmp.last_result.err
+    assert (
+        "ID,status,superkingdom,phylum,class,order,family,genus,species"
+        in runtmp.last_result.out
+    )
+    assert (
+        "TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,unassigned,unassigned,Alteromonadaceae,,,\r\n"
+        in runtmp.last_result.out
+    )
+    assert "classified 1 signatures total" in runtmp.last_result.err
+    assert "loaded 1 LCA databases" in runtmp.last_result.err
 
     # classify input_sig2
-    cmd = ['lca', 'classify', '--db', lca_db, '--query', input_sig2]
+    cmd = ["lca", "classify", "--db", lca_db, "--query", input_sig2]
     runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in runtmp.last_result.out
-    assert 'TARA_PSW_MAG_00136,found,Eukaryota,Chlorophyta,Prasinophyceae,unassigned,unassigned,Ostreococcus,,\r\n' in runtmp.last_result.out
-    assert 'classified 1 signatures total' in runtmp.last_result.err
-    assert 'loaded 1 LCA databases' in runtmp.last_result.err
+    assert (
+        "ID,status,superkingdom,phylum,class,order,family,genus,species"
+        in runtmp.last_result.out
+    )
+    assert (
+        "TARA_PSW_MAG_00136,found,Eukaryota,Chlorophyta,Prasinophyceae,unassigned,unassigned,Ostreococcus,,\r\n"
+        in runtmp.last_result.out
+    )
+    assert "classified 1 signatures total" in runtmp.last_result.err
+    assert "loaded 1 LCA databases" in runtmp.last_result.err
 
 
 def test_classify_majority_vote_1(runtmp, lca_db_format):
@@ -1560,13 +1792,14 @@ def test_classify_majority_vote_1(runtmp, lca_db_format):
     c = runtmp
 
     # build database
-    taxcsv = utils.get_test_data('lca/delmont-6.csv')
-    input_sig1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    input_sig2 = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig')
-    lca_db = c.output(f'delmont-1.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca/delmont-6.csv")
+    input_sig1 = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    input_sig2 = utils.get_test_data("lca/TARA_PSW_MAG_00136.sig")
+    lca_db = c.output(f"delmont-1.lca.{lca_db_format}")
 
-    c.run_sourmash('lca', 'index', taxcsv, lca_db, input_sig1, input_sig2,
-                   '-F', lca_db_format)
+    c.run_sourmash(
+        "lca", "index", taxcsv, lca_db, input_sig1, input_sig2, "-F", lca_db_format
+    )
 
     print(c.last_command)
     print(c.last_result.out)
@@ -1574,26 +1807,46 @@
 
     assert os.path.exists(lca_db)
 
-    assert "** assuming column 'MAGs' is identifiers in spreadsheet" in c.last_result.err
-    assert "** assuming column 'Domain' is superkingdom in spreadsheet" in c.last_result.err
-    assert '2 identifiers used out of 2 distinct identifiers in spreadsheet.' in c.last_result.err
+    assert (
+        "** assuming column 'MAGs' is identifiers in spreadsheet" in c.last_result.err
+    )
+    assert (
+        "** assuming column 'Domain' is superkingdom in spreadsheet"
+        in c.last_result.err
+    )
+    assert (
+        "2 identifiers used out of 2 distinct identifiers in spreadsheet."
+        in c.last_result.err
+    )
 
     # merge input_sig1 and input_sig2
-    c.run_sourmash('signature', 'merge', input_sig1, input_sig2, '-k', '31', '--flatten', '-o', 'sig1and2.sig')
-    sig1and2 = c.output('sig1and2.sig')
+    c.run_sourmash(
+        "signature",
+        "merge",
+        input_sig1,
+        input_sig2,
+        "-k",
+        "31",
+        "--flatten",
+        "-o",
+        "sig1and2.sig",
+    )
+    sig1and2 = c.output("sig1and2.sig")
 
     # lca classify should yield no results
-    c.run_sourmash('lca', 'classify', '--db', lca_db, '--query', sig1and2)
+    c.run_sourmash("lca", "classify", "--db", lca_db, "--query", sig1and2)
 
     print(c.last_command)
     print(c.last_result.out)
     print(c.last_result.err)
 
-    assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in c.last_result.out
-    assert 'disagree,,,,,,,,' in c.last_result.out
-    assert 'classified 1 signatures total' in c.last_result.err
-    assert 'loaded 1 LCA databases' in c.last_result.err
-
+    assert (
+        "ID,status,superkingdom,phylum,class,order,family,genus,species"
+        in c.last_result.out
+    )
+    assert "disagree,,,,,,,," in c.last_result.out
+    assert "classified 1 signatures total" in c.last_result.err
+    assert "loaded 1 LCA databases" in c.last_result.err
 
 
 def test_classify_majority_vote_2(runtmp, lca_db_format):
@@ -1603,13 +1856,14 @@
     c = runtmp
 
     # build database
-    taxcsv = utils.get_test_data('lca/delmont-6.csv')
-    input_sig1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    input_sig2 = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig')
-    lca_db = c.output(f'delmont-1.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca/delmont-6.csv")
+    input_sig1 = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    input_sig2 = utils.get_test_data("lca/TARA_PSW_MAG_00136.sig")
+    lca_db = c.output(f"delmont-1.lca.{lca_db_format}")
 
-    c.run_sourmash('lca', 'index', taxcsv, lca_db, input_sig1, input_sig2,
-                   '-F', lca_db_format)
+    c.run_sourmash(
+        "lca", "index", taxcsv, lca_db, input_sig1, input_sig2, "-F", lca_db_format
+    )
 
     print(c.last_command)
     print(c.last_result.out)
@@ -1617,25 +1871,49 @@
 
     assert os.path.exists(lca_db)
 
-    assert "** assuming column 'MAGs' is identifiers in spreadsheet" in c.last_result.err
-    assert "** assuming column 'Domain' is superkingdom in spreadsheet" in c.last_result.err
-    assert '2 identifiers used out of 2 distinct identifiers in spreadsheet.' in c.last_result.err
+    assert (
+        "** assuming column 'MAGs' is identifiers in spreadsheet" in c.last_result.err
+    )
+    assert (
+        "** assuming column 'Domain' is superkingdom in spreadsheet"
+        in c.last_result.err
+    )
+    assert (
+        "2 identifiers used out of 2 distinct identifiers in spreadsheet."
+        in c.last_result.err
+    )
 
     # merge input_sig1 and input_sig2
-    c.run_sourmash('signature', 'merge', input_sig1, input_sig2, '-k', '31', '--flatten', '-o', 'sig1and2.sig')
-    sig1and2 = c.output('sig1and2.sig')
+    c.run_sourmash(
+        "signature",
+        "merge",
+        input_sig1,
+        input_sig2,
+        "-k",
+        "31",
+        "--flatten",
+        "-o",
+        "sig1and2.sig",
+    )
+    sig1and2 = c.output("sig1and2.sig")
 
     # majority vote classify
-    c.run_sourmash('lca', 'classify', '--db', lca_db, '--query', sig1and2, '--majority')
+    c.run_sourmash("lca", "classify", "--db", lca_db, "--query", sig1and2, "--majority")
 
     print(c.last_command)
     print(c.last_result.out)
    print(c.last_result.err)
 
-    assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in c.last_result.out
-    assert 'found,Eukaryota,Chlorophyta,Prasinophyceae,unassigned,unassigned,Ostreococcus' in c.last_result.out
-    assert 'classified 1 signatures total' in c.last_result.err
-    assert 'loaded 1 LCA databases' in c.last_result.err
+    assert (
+        "ID,status,superkingdom,phylum,class,order,family,genus,species"
+        in c.last_result.out
+    )
+    assert (
+        "found,Eukaryota,Chlorophyta,Prasinophyceae,unassigned,unassigned,Ostreococcus"
+        in c.last_result.out
+    )
+    assert "classified 1 signatures total" in c.last_result.err
+    assert "loaded 1 LCA databases" in c.last_result.err
 
 
 def test_classify_majority_vote_3(runtmp, lca_db_format):
@@ -1643,13 +1921,14 @@
     c = runtmp
 
     # build database
-    taxcsv = utils.get_test_data('lca/delmont-6.csv')
-    input_sig1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    input_sig2 = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig')
-    lca_db = c.output(f'delmont-1.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca/delmont-6.csv")
+    input_sig1 = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    input_sig2 = utils.get_test_data("lca/TARA_PSW_MAG_00136.sig")
+    lca_db = c.output(f"delmont-1.lca.{lca_db_format}")
 
-    c.run_sourmash('lca', 'index', taxcsv, lca_db, input_sig1, input_sig2,
-                   '-F', lca_db_format)
+    c.run_sourmash(
+        "lca", "index", taxcsv, lca_db, input_sig1, input_sig2, "-F", lca_db_format
+    )
 
     print(c.last_command)
     print(c.last_result.out)
@@ -1657,51 +1936,70 @@
 
     assert os.path.exists(lca_db)
 
-    assert "** assuming column 'MAGs' is identifiers in spreadsheet" in c.last_result.err
-    assert "** assuming column 'Domain' is superkingdom in spreadsheet" in c.last_result.err
-    assert '2 identifiers used out of 2 distinct identifiers in spreadsheet.' in c.last_result.err
+    assert (
+        "** assuming column 'MAGs' is identifiers in spreadsheet" in c.last_result.err
+    )
+    assert (
+        "** assuming column 'Domain' is superkingdom in spreadsheet"
+        in c.last_result.err
+    )
+    assert (
+        "2 identifiers used out of 2 distinct identifiers in spreadsheet."
+        in c.last_result.err
+    )
 
     # obtain testdata '47.fa.sig'
-    testdata1 = utils.get_test_data('47.fa.sig')
+    testdata1 = utils.get_test_data("47.fa.sig")
 
     # majority vote classify
-    c.run_sourmash('lca', 'classify', '--db', lca_db, '--query', testdata1, '--majority')
+    c.run_sourmash(
+        "lca", "classify", "--db", lca_db, "--query", testdata1, "--majority"
+    )
 
     print(c.last_command)
     print(c.last_result.out)
     print(c.last_result.err)
 
-    assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in c.last_result.out
-    assert 'nomatch,,,,,,,,' in c.last_result.out
-    assert 'classified 1 signatures total' in c.last_result.err
-    assert 'loaded 1 LCA databases' in c.last_result.err
+    assert (
+        "ID,status,superkingdom,phylum,class,order,family,genus,species"
+        in c.last_result.out
+    )
+    assert "nomatch,,,,,,,," in c.last_result.out
+    assert "classified 1 signatures total" in c.last_result.err
+    assert "loaded 1 LCA databases" in c.last_result.err
 
 
 def test_multi_db_classify(runtmp):
-    db1 = utils.get_test_data(f'lca/delmont-1.lca.json')
-    db2 = utils.get_test_data('lca/delmont-2.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    db2 = utils.get_test_data("lca/delmont-2.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
 
-    cmd = ['lca', 'classify', '--db', db1, db2, '--query', input_sig]
+    cmd = ["lca", "classify", "--db", db1, db2, "--query", input_sig]
     runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    assert 'ID,status,superkingdom,phylum,class,order,family,genus,species' in runtmp.last_result.out
-    assert 'TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,,,,' in runtmp.last_result.out
-    assert 'classified 1 signatures total' in runtmp.last_result.err
-    assert 'loaded 2 LCA databases' in runtmp.last_result.err
+    assert (
+        "ID,status,superkingdom,phylum,class,order,family,genus,species"
+        in runtmp.last_result.out
+    )
+    assert (
+        "TARA_ASE_MAG_00031,found,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,,,,"
+        in runtmp.last_result.out
+    )
+    assert "classified 1 signatures total" in runtmp.last_result.err
+    assert "loaded 2 LCA databases" in runtmp.last_result.err
 
 
 def test_classify_unknown_hashes(runtmp, lca_db_format):
-    taxcsv = utils.get_test_data('lca-root/tax.csv')
-    input_sig1 = utils.get_test_data('lca-root/TARA_MED_MAG_00029.fa.sig')
-    input_sig2 = utils.get_test_data('lca-root/TOBG_MED-875.fna.gz.sig')
-    lca_db = runtmp.output(f'lca-root.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca-root/tax.csv")
+    input_sig1 = utils.get_test_data("lca-root/TARA_MED_MAG_00029.fa.sig")
+    input_sig2 = utils.get_test_data("lca-root/TOBG_MED-875.fna.gz.sig")
+    lca_db = runtmp.output(f"lca-root.lca.{lca_db_format}")
 
-    cmd = ['lca', 'index', taxcsv, lca_db, input_sig2, '-F', lca_db_format]
+    cmd = ["lca", "index", taxcsv, lca_db, input_sig2, "-F", lca_db_format]
     runtmp.sourmash(*cmd)
 
     print(cmd)
@@ -1710,59 +2008,81 @@
 
     assert os.path.exists(lca_db)
 
-    assert '1 identifiers used out of 2 distinct identifiers in spreadsheet.' in runtmp.last_result.err
+    assert (
+        "1 identifiers used out of 2 distinct identifiers in spreadsheet."
+        in runtmp.last_result.err
+    )
 
-    cmd = ['lca', 'classify', '--db', lca_db, '--query', input_sig1]
+    cmd = ["lca", "classify", "--db", lca_db, "--query", input_sig1]
     runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    assert '(root)' not in runtmp.last_result.out
-    assert 'TARA_MED_MAG_00029,found,Archaea,Euryarcheoata,unassigned,unassigned,novelFamily_I' in runtmp.last_result.out
+    assert "(root)" not in runtmp.last_result.out
+    assert (
+        "TARA_MED_MAG_00029,found,Archaea,Euryarcheoata,unassigned,unassigned,novelFamily_I"
+        in runtmp.last_result.out
+    )
 
 
 def test_single_summarize(runtmp):
-    db1 = utils.get_test_data('lca/delmont-1.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
 
-    cmd = ['lca', 'summarize', '--db', db1, '--query', input_sig]
+    cmd = ["lca", "summarize", "--db", db1, "--query", input_sig]
     runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    assert 'loaded 1 signatures from 1 files total.' in runtmp.last_result.err
-    assert '100.0% 200 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales' in runtmp.last_result.out
+    assert "loaded 1 signatures from 1 files total." in runtmp.last_result.err
+    assert (
+        "100.0% 200 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales"
+        in runtmp.last_result.out
+    )
 
 
 def test_single_summarize_singleton(runtmp):
-    db1 = utils.get_test_data('lca/delmont-1.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-
-    cmd = ['lca', 'summarize', '--db', db1, '--query', input_sig,]
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+
+    cmd = [
+        "lca",
+        "summarize",
+        "--db",
+        db1,
+        "--query",
+        input_sig,
+    ]
     runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    assert 'loaded 1 signatures from 1 files total.' in runtmp.last_result.err
-    assert '100.0% 200 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales' in runtmp.last_result.out
-    assert 'test-data/lca/TARA_ASE_MAG_00031.sig:5b438c6c TARA_ASE_MAG_00031' in runtmp.last_result.out
+    assert "loaded 1 signatures from 1 files total." in runtmp.last_result.err
+    assert (
+        "100.0% 200 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales"
+        in runtmp.last_result.out
+    )
+    assert (
+        "test-data/lca/TARA_ASE_MAG_00031.sig:5b438c6c TARA_ASE_MAG_00031"
+        in runtmp.last_result.out
+    )
 
 
 @utils.in_tempdir
 def test_single_summarize_traverse(c):
-    db1 = utils.get_test_data('lca/delmont-1.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    in_dir = c.output('sigs')
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    in_dir = c.output("sigs")
     os.mkdir(in_dir)
-    shutil.copyfile(input_sig, os.path.join(in_dir, 'q.sig'))
+    shutil.copyfile(input_sig, os.path.join(in_dir, "q.sig"))
 
-    cmd = ['lca', 'summarize', '--db', db1, '--query', in_dir]
+    cmd = ["lca", "summarize", "--db", db1, "--query", in_dir]
     c.run_sourmash(*cmd)
 
     out = c.last_result.out
@@ -1770,18 +2090,22 @@
     err = c.last_result.err
     print(err)
 
-    assert 'loaded 1 signatures from 1 files total.' in err
-    assert '100.0% 200 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales' in out
+    assert "loaded 1 signatures from 1 files total." in err
+    assert (
+        "100.0% 200 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales"
+        in out
+    )
+
 
 @utils.in_tempdir
 def test_single_summarize_singleton_traverse(c):
-    db1 = utils.get_test_data('lca/delmont-1.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    in_dir = c.output('sigs')
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    in_dir = c.output("sigs")
    os.mkdir(in_dir)
-    shutil.copyfile(input_sig, os.path.join(in_dir, 'q.sig'))
+    shutil.copyfile(input_sig, os.path.join(in_dir, "q.sig"))
 
-    cmd = ['lca', 'summarize', '--db', db1, '--query', in_dir]
+    cmd = ["lca", "summarize", "--db", db1, "--query", in_dir]
     c.run_sourmash(*cmd)
 
     out = c.last_result.out
@@ -1789,63 +2113,89 @@
     err = c.last_result.err
     print(err)
 
-    assert 'loaded 1 signatures from 1 files total.' in err
-    assert '100.0% 200 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales' in out
-    assert 'q.sig:5b438c6c TARA_ASE_MAG_00031' in out
+    assert "loaded 1 signatures from 1 files total." in err
+    assert (
+        "100.0% 200 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales"
+        in out
+    )
+    assert "q.sig:5b438c6c TARA_ASE_MAG_00031" in out
 
 
 def test_single_summarize_to_output(runtmp):
-    db1 = utils.get_test_data('lca/delmont-1.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    in_dir = runtmp.output('sigs')
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    in_dir = runtmp.output("sigs")
     os.mkdir(in_dir)
-    shutil.copyfile(input_sig, os.path.join(in_dir, 'q.sig'))
-
-    cmd = ['lca', 'summarize', '--db', db1, '--query', input_sig,
-           '-o', runtmp.output('output.txt')]
+    shutil.copyfile(input_sig, os.path.join(in_dir, "q.sig"))
+
+    cmd = [
+        "lca",
+        "summarize",
+        "--db",
+        db1,
+        "--query",
+        input_sig,
+        "-o",
+        runtmp.output("output.txt"),
+    ]
     runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    with open(runtmp.output('output.txt'), 'rt') as fp:
+    with open(runtmp.output("output.txt")) as fp:
        outdata = fp.read()
 
-    assert 'loaded 1 signatures from 1 files total.' in runtmp.last_result.err
-    assert '200,Bacteria,Proteobacteria,Gammaproteobacteria' in outdata
-
+    assert "loaded 1 signatures from 1 files total." in runtmp.last_result.err
+    assert "200,Bacteria,Proteobacteria,Gammaproteobacteria" in outdata
 
 
 def test_single_summarize_to_output_check_filename(runtmp):
-    db1 = utils.get_test_data('lca/delmont-1.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    in_dir = runtmp.output('sigs')
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    in_dir = runtmp.output("sigs")
     os.mkdir(in_dir)
-    shutil.copyfile(input_sig, os.path.join(in_dir, 'q.sig'))
-
-    cmd = ['lca', 'summarize', '--db', db1, '--query', os.path.join(in_dir, 'q.sig'),
-           '-o', runtmp.output('output.txt')]
+    shutil.copyfile(input_sig, os.path.join(in_dir, "q.sig"))
+
+    cmd = [
+        "lca",
+        "summarize",
+        "--db",
+        db1,
+        "--query",
+        os.path.join(in_dir, "q.sig"),
+        "-o",
+        runtmp.output("output.txt"),
+    ]
    runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    outdata = Path(runtmp.output('output.txt')).read_text()
-
-    assert 'loaded 1 signatures from 1 files total.' in runtmp.last_result.err
-    assert 'count,superkingdom,phylum,class,order,family,genus,species,strain,filename,sig_name,sig_md5,total_counts\n' in outdata
-    assert '200,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii,,'+os.path.join(in_dir, 'q.sig')+',TARA_ASE_MAG_00031,5b438c6c858cdaf9e9b05a207fa3f9f0,200.0\n' in outdata
+    outdata = Path(runtmp.output("output.txt")).read_text()
+
+    assert "loaded 1 signatures from 1 files total." in runtmp.last_result.err
+    assert (
+        "count,superkingdom,phylum,class,order,family,genus,species,strain,filename,sig_name,sig_md5,total_counts\n"
+        in outdata
+    )
+    assert (
+        "200,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii,,"
+        + os.path.join(in_dir, "q.sig")
+        + ",TARA_ASE_MAG_00031,5b438c6c858cdaf9e9b05a207fa3f9f0,200.0\n"
        in outdata
+    )
     print(outdata)
 
 
 def test_summarize_unknown_hashes_to_output_check_total_counts(runtmp, lca_db_format):
-    taxcsv = utils.get_test_data('lca-root/tax.csv')
-    input_sig1 = utils.get_test_data('lca-root/TARA_MED_MAG_00029.fa.sig')
-    input_sig2 = utils.get_test_data('lca-root/TOBG_MED-875.fna.gz.sig')
-    lca_db = runtmp.output(f'lca-root.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca-root/tax.csv")
+    input_sig1 = utils.get_test_data("lca-root/TARA_MED_MAG_00029.fa.sig")
+    input_sig2 = utils.get_test_data("lca-root/TOBG_MED-875.fna.gz.sig")
+    lca_db = runtmp.output(f"lca-root.lca.{lca_db_format}")
 
-    cmd = ['lca', 'index', taxcsv, lca_db, input_sig2, '-F', lca_db_format]
+    cmd = ["lca", "index", taxcsv, lca_db, input_sig2, "-F", lca_db_format]
     runtmp.sourmash(*cmd)
 
     print(cmd)
@@ -1854,78 +2204,89 @@ def test_summarize_unknown_hashes_to_output_check_total_counts(runtmp, lca_db_fo
 
     assert os.path.exists(lca_db)
 
-    assert '1 identifiers used out of 2 distinct identifiers in spreadsheet.' in runtmp.last_result.err
+    assert (
+        "1 identifiers used out of 2 distinct identifiers in spreadsheet."
+        in runtmp.last_result.err
+    )
 
-    cmd = ['lca', 'summarize', '--db', lca_db, '--query', input_sig1,
-           '-o', 'out.csv']
+    cmd = ["lca", "summarize", "--db", lca_db, "--query", input_sig1, "-o", "out.csv"]
    runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    assert '(root)' not in runtmp.last_result.out
-    assert '11.5% 27 Archaea;Euryarcheoata;unassigned;unassigned;novelFamily_I' in runtmp.last_result.out
+    assert "(root)" not in runtmp.last_result.out
+    assert (
+        "11.5% 27 Archaea;Euryarcheoata;unassigned;unassigned;novelFamily_I"
+        in runtmp.last_result.out
+    )
 
-    with open(runtmp.output('out.csv'), newline="") as fp:
+    with open(runtmp.output("out.csv"), newline="") as fp:
         r = csv.DictReader(fp)
         rows = list(r)
 
-        pairs = [ (row['count'], row['total_counts']) for row in rows ]
-        pairs = [ (float(x), float(y)) for x, y in pairs ]
+        pairs = [(row["count"], row["total_counts"]) for row in rows]
+        pairs = [(float(x), float(y)) for x, y in pairs]
         pairs = set(pairs)
 
-        assert pairs == { (27.0, 234.0) }
+        assert pairs == {(27.0, 234.0)}
 
 
 def test_single_summarize_scaled(runtmp):
-    db1 = utils.get_test_data('lca/delmont-1.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    in_dir = runtmp.output('sigs')
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    in_dir = runtmp.output("sigs")
     os.mkdir(in_dir)
-    shutil.copyfile(input_sig, os.path.join(in_dir, 'q.sig'))
+    shutil.copyfile(input_sig, os.path.join(in_dir, "q.sig"))
 
-    cmd = ['lca', 'summarize', '--db', db1, '--query', input_sig,
-           '--scaled', '100000']
+    cmd = ["lca", "summarize", "--db", db1, "--query", input_sig, "--scaled", "100000"]
     runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    assert 'loaded 1 signatures from 1 files total.' in runtmp.last_result.err
-    assert '100.0% 27 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales'
+    assert "loaded 1 signatures from 1 files total." in runtmp.last_result.err
+    assert "100.0% 27 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales"
 
 
 def test_single_summarize_scaled_zip_query(runtmp):
     # check zipfile as query
-    db1 = utils.get_test_data('lca/delmont-1.lca.json')
-    input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
+    db1 = utils.get_test_data("lca/delmont-1.lca.json")
+    input_sig = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
 
     query_ss = sourmash.load_one_signature(input_sig, ksize=31)
-    query_zipfile = runtmp.output('query.zip')
+    query_zipfile = runtmp.output("query.zip")
     with sourmash_args.SaveSignaturesToLocation(query_zipfile) as save_sig:
         save_sig.add(query_ss)
 
-    cmd = ['lca', 'summarize', '--db', db1, '--query', query_zipfile,
-           '--scaled', '100000']
+    cmd = [
+        "lca",
+        "summarize",
+        "--db",
+        db1,
+        "--query",
+        query_zipfile,
+        "--scaled",
+        "100000",
+    ]
     runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    assert 'loaded 1 signatures from 1 files total.' in runtmp.last_result.err
-    assert '100.0% 27 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales'
+    assert "loaded 1 signatures from 1 files total." in runtmp.last_result.err
+    assert "100.0% 27 Bacteria;Proteobacteria;Gammaproteobacteria;Alteromonadales"
 
 
 def test_multi_summarize_with_unassigned_singleton(runtmp, lca_db_format):
-    taxcsv = utils.get_test_data('lca/delmont-6.csv')
-    input_sig1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    input_sig2 = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig')
-    lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca/delmont-6.csv")
+    input_sig1 = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    input_sig2 = utils.get_test_data("lca/TARA_PSW_MAG_00136.sig")
+    lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}")
 
-    cmd = ['lca', 'index', taxcsv, lca_db, input_sig1, input_sig2,
-           '-F', lca_db_format]
+    cmd = ["lca", "index", taxcsv, lca_db, input_sig1, input_sig2, "-F", lca_db_format]
     runtmp.sourmash(*cmd)
 
     print(cmd)
@@ -1934,21 +2295,39 @@ def test_multi_summarize_with_unassigned_singleton(runtmp, lca_db_format):
 
     assert os.path.exists(lca_db)
 
-    assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err
-    assert "** assuming column 'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err
-    assert '2 identifiers used out of 2 distinct identifiers in spreadsheet.' in runtmp.last_result.err
-
-    cmd = ['lca', 'summarize', '--db', lca_db, '--query', input_sig1,
-           input_sig2, '--ignore-abundance']
+    assert (
+        "** assuming column 'MAGs' is identifiers in spreadsheet"
+        in runtmp.last_result.err
+    )
+    assert (
+        "** assuming column 'Domain' is superkingdom in spreadsheet"
+        in runtmp.last_result.err
+    )
+    assert (
+        "2 identifiers used out of 2 distinct identifiers in spreadsheet."
+        in runtmp.last_result.err
+    )
+
+    cmd = [
+        "lca",
+        "summarize",
+        "--db",
+        lca_db,
+        "--query",
+        input_sig1,
+        input_sig2,
+        "--ignore-abundance",
+    ]
    runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    assert 'loaded 2 signatures from 2 files total.' in runtmp.last_result.err
+    assert "loaded 2 signatures from 2 files total." in runtmp.last_result.err
 
     out_lines = runtmp.last_result.out.splitlines()
+
     def remove_line_startswith(x, check=None):
         for line in out_lines:
             if line.startswith(x):
@@ -1957,32 +2336,45 @@ def remove_line_startswith(x, check=None):
                     # make sure the check value is in there
                     assert check in line
                 return line
-        assert 0, "couldn't find {}".format(x)
+        assert 0, f"couldn't find {x}"
 
     # note, proportions/percentages are now per-file
-    remove_line_startswith('100.0% 200 Bacteria ', 'TARA_ASE_MAG_00031.sig:5b438c6c')
-    remove_line_startswith('100.0% 200 Bacteria;Proteobacteria;unassigned;unassigned ')
-    remove_line_startswith('100.0% 1231 Eukaryota;Chlorophyta ')
-    remove_line_startswith('100.0% 1231 Eukaryota ', 'TARA_PSW_MAG_00136.sig:db50b713')
-    remove_line_startswith('100.0% 200 Bacteria;Proteobacteria ')
-    remove_line_startswith('100.0% 200 Bacteria;Proteobacteria;unassigned ')
-    remove_line_startswith('100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae ')
-    remove_line_startswith('100.0% 200 Bacteria;Proteobacteria;unassigned;unassigned;Alteromonadaceae ')
-    remove_line_startswith('100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae;unassigned;unassigned ')
-    remove_line_startswith('100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae;unassigned ')
-    remove_line_startswith('100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae;unassigned;unassigned;Ostreococcus ')
+    remove_line_startswith(
+        "100.0% 200 Bacteria ", "TARA_ASE_MAG_00031.sig:5b438c6c"
+    )
+    remove_line_startswith(
+        "100.0% 200 Bacteria;Proteobacteria;unassigned;unassigned "
+    )
+    remove_line_startswith("100.0% 1231 Eukaryota;Chlorophyta ")
+    remove_line_startswith(
+        "100.0% 1231 Eukaryota ", "TARA_PSW_MAG_00136.sig:db50b713"
+    )
+    remove_line_startswith("100.0% 200 Bacteria;Proteobacteria ")
+    remove_line_startswith("100.0% 200 Bacteria;Proteobacteria;unassigned ")
+    remove_line_startswith("100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae ")
+    remove_line_startswith(
+        "100.0% 200 Bacteria;Proteobacteria;unassigned;unassigned;Alteromonadaceae "
+    )
+    remove_line_startswith(
+        "100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae;unassigned;unassigned "
+    )
+    remove_line_startswith(
+        "100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae;unassigned "
+    )
+    remove_line_startswith(
+        "100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae;unassigned;unassigned;Ostreococcus "
+    )
     assert not out_lines
 
 
 def test_multi_summarize_with_zip_unassigned_singleton(runtmp, lca_db_format):
     # test summarize on multiple queries, in a zipfile.
-    taxcsv = utils.get_test_data('lca/delmont-6.csv')
-    input_sig1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-    input_sig2 = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig')
-    lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca/delmont-6.csv")
+    input_sig1 = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
+    input_sig2 = utils.get_test_data("lca/TARA_PSW_MAG_00136.sig")
+    lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}")
 
-    cmd = ['lca', 'index', taxcsv, lca_db, input_sig1, input_sig2,
-           '-F', lca_db_format]
+    cmd = ["lca", "index", taxcsv, lca_db, input_sig1, input_sig2, "-F", lca_db_format]
    runtmp.sourmash(*cmd)
 
     print(cmd)
@@ -1991,31 +2383,48 @@ def test_multi_summarize_with_zip_unassigned_singleton(runtmp, lca_db_format):
 
     assert os.path.exists(lca_db)
 
-    assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err
-    assert "** assuming column 'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err
-    assert '2 identifiers used out of 2 distinct identifiers in spreadsheet.' in runtmp.last_result.err
-
-    query_zipfile = runtmp.output('query.zip')
+    assert (
+        "** assuming column 'MAGs' is identifiers in spreadsheet"
+        in runtmp.last_result.err
+    )
+    assert (
+        "** assuming column 'Domain' is superkingdom in spreadsheet"
+        in runtmp.last_result.err
+    )
+    assert (
+        "2 identifiers used out of 2 distinct identifiers in spreadsheet."
+        in runtmp.last_result.err
+    )
+
+    query_zipfile = runtmp.output("query.zip")
     with sourmash_args.SaveSignaturesToLocation(query_zipfile) as save_sig:
-        input_sig1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
+        input_sig1 = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig")
         sig1 = sourmash.load_one_signature(input_sig1, ksize=31)
-        input_sig2 = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig')
+        input_sig2 = utils.get_test_data("lca/TARA_PSW_MAG_00136.sig")
         sig2 = sourmash.load_one_signature(input_sig2, ksize=31)
 
         save_sig.add(sig1)
         save_sig.add(sig2)
 
-    cmd = ['lca', 'summarize', '--db', lca_db, '--query', 'query.zip',
-           '--ignore-abundance']
+    cmd = [
+        "lca",
+        "summarize",
+        "--db",
+        lca_db,
+        "--query",
+        "query.zip",
+        "--ignore-abundance",
+    ]
    runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    assert 'loaded 2 signatures from 1 files total.' in runtmp.last_result.err
+    assert "loaded 2 signatures from 1 files total." in runtmp.last_result.err
 
     out_lines = runtmp.last_result.out.splitlines()
+
     def remove_line_startswith(x, check=None):
         for line in out_lines:
             if line.startswith(x):
@@ -2024,31 +2433,40 @@ def remove_line_startswith(x, check=None):
                     # make sure the check value is in there
                     assert check in line
                 return line
-        assert 0, "couldn't find {}".format(x)
+        assert 0, f"couldn't find {x}"

     # note, proportions/percentages are now per-file
-    remove_line_startswith('100.0% 200 Bacteria ', ':5b438c6c')
-    remove_line_startswith('100.0% 200 Bacteria;Proteobacteria;unassigned;unassigned ')
-    remove_line_startswith('100.0% 1231 Eukaryota;Chlorophyta ')
-    remove_line_startswith('100.0% 1231 Eukaryota ', ':db50b713')
-    remove_line_startswith('100.0% 200 Bacteria;Proteobacteria ')
-    remove_line_startswith('100.0% 200 Bacteria;Proteobacteria;unassigned ')
-    remove_line_startswith('100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae ')
-    remove_line_startswith('100.0% 200 Bacteria;Proteobacteria;unassigned;unassigned;Alteromonadaceae ')
-    remove_line_startswith('100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae;unassigned;unassigned ')
-    remove_line_startswith('100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae;unassigned ')
-    remove_line_startswith('100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae;unassigned;unassigned;Ostreococcus ')
+    remove_line_startswith("100.0% 200 Bacteria ", ":5b438c6c")
+    remove_line_startswith(
+        "100.0% 200 Bacteria;Proteobacteria;unassigned;unassigned "
+    )
+    remove_line_startswith("100.0% 1231 Eukaryota;Chlorophyta ")
+    remove_line_startswith("100.0% 1231 Eukaryota ", ":db50b713")
+    remove_line_startswith("100.0% 200 Bacteria;Proteobacteria ")
+    remove_line_startswith("100.0% 200 Bacteria;Proteobacteria;unassigned ")
+    remove_line_startswith("100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae ")
+    remove_line_startswith(
+        "100.0% 200 Bacteria;Proteobacteria;unassigned;unassigned;Alteromonadaceae "
+    )
+    remove_line_startswith(
+        "100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae;unassigned;unassigned "
+    )
+    remove_line_startswith(
+        "100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae;unassigned "
+    )
+    remove_line_startswith(
+        "100.0% 1231 Eukaryota;Chlorophyta;Prasinophyceae;unassigned;unassigned;Ostreococcus "
+    )
    assert not out_lines
 
 
 def test_summarize_to_root(runtmp, lca_db_format):
-    taxcsv = utils.get_test_data('lca-root/tax.csv')
-    input_sig1 = utils.get_test_data('lca-root/TARA_MED_MAG_00029.fa.sig')
-    input_sig2 = utils.get_test_data('lca-root/TOBG_MED-875.fna.gz.sig')
-    lca_db = runtmp.output(f'lca-root.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca-root/tax.csv")
+    input_sig1 = utils.get_test_data("lca-root/TARA_MED_MAG_00029.fa.sig")
+    input_sig2 = utils.get_test_data("lca-root/TOBG_MED-875.fna.gz.sig")
+    lca_db = runtmp.output(f"lca-root.lca.{lca_db_format}")
 
-    cmd = ['lca', 'index', taxcsv, lca_db, input_sig1, input_sig2,
-           '-F', lca_db_format]
+    cmd = ["lca", "index", taxcsv, lca_db, input_sig1, input_sig2, "-F", lca_db_format]
    runtmp.sourmash(*cmd)
 
     print(cmd)
@@ -2057,27 +2475,37 @@
 
     assert os.path.exists(lca_db)
 
-    assert '2 identifiers used out of 2 distinct identifiers in spreadsheet.' in runtmp.last_result.err
-
-    cmd = ['lca', 'summarize', '--db', lca_db, '--query', input_sig2,
-           '--ignore-abundance']
+    assert (
+        "2 identifiers used out of 2 distinct identifiers in spreadsheet."
+        in runtmp.last_result.err
+    )
+
+    cmd = [
+        "lca",
+        "summarize",
+        "--db",
+        lca_db,
+        "--query",
+        input_sig2,
+        "--ignore-abundance",
+    ]
    runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    assert '78.6% 99 Archaea' in runtmp.last_result.out
-    assert '21.4% 27 (root)' in runtmp.last_result.out
+    assert "78.6% 99 Archaea" in runtmp.last_result.out
+    assert "21.4% 27 (root)" in runtmp.last_result.out
 
 
 def test_summarize_unknown_hashes(runtmp, lca_db_format):
-    taxcsv = utils.get_test_data('lca-root/tax.csv')
-    input_sig1 = utils.get_test_data('lca-root/TARA_MED_MAG_00029.fa.sig')
-    input_sig2 = utils.get_test_data('lca-root/TOBG_MED-875.fna.gz.sig')
-    lca_db = runtmp.output(f'lca-root.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca-root/tax.csv")
+    input_sig1 = utils.get_test_data("lca-root/TARA_MED_MAG_00029.fa.sig")
+    input_sig2 = utils.get_test_data("lca-root/TOBG_MED-875.fna.gz.sig")
+    lca_db = runtmp.output(f"lca-root.lca.{lca_db_format}")
 
-    cmd = ['lca', 'index', taxcsv, lca_db, input_sig2, '-F', lca_db_format]
+    cmd = ["lca", "index", taxcsv, lca_db, input_sig2, "-F", lca_db_format]
    runtmp.sourmash(*cmd)
 
     print(cmd)
@@ -2086,27 +2514,32 @@
 
     assert os.path.exists(lca_db)
 
-    assert '1 identifiers used out of 2 distinct identifiers in spreadsheet.' in runtmp.last_result.err
+    assert (
+        "1 identifiers used out of 2 distinct identifiers in spreadsheet."
+        in runtmp.last_result.err
+    )
 
-    cmd = ['lca', 'summarize', '--db', lca_db, '--query', input_sig1]
+    cmd = ["lca", "summarize", "--db", lca_db, "--query", input_sig1]
    runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    assert '(root)' not in runtmp.last_result.out
-    assert '11.5% 27 Archaea;Euryarcheoata;unassigned;unassigned;novelFamily_I' in runtmp.last_result.out
+    assert "(root)" not in runtmp.last_result.out
+    assert (
+        "11.5% 27 Archaea;Euryarcheoata;unassigned;unassigned;novelFamily_I"
+        in runtmp.last_result.out
+    )
 
 
 def test_summarize_to_root_abund(runtmp, lca_db_format):
-    taxcsv = utils.get_test_data('lca-root/tax.csv')
-    input_sig1 = utils.get_test_data('lca-root/TARA_MED_MAG_00029.fa.sig')
-    input_sig2 = utils.get_test_data('lca-root/TOBG_MED-875.fna.gz.sig')
-    lca_db = runtmp.output(f'lca-root.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca-root/tax.csv")
+    input_sig1 = utils.get_test_data("lca-root/TARA_MED_MAG_00029.fa.sig")
+    input_sig2 = utils.get_test_data("lca-root/TOBG_MED-875.fna.gz.sig")
+    lca_db = runtmp.output(f"lca-root.lca.{lca_db_format}")
 
-    cmd = ['lca', 'index', taxcsv, lca_db, input_sig1, input_sig2,
-           '-F', lca_db_format]
+    cmd = ["lca", "index", taxcsv, lca_db, input_sig1, input_sig2, "-F", lca_db_format]
    runtmp.sourmash(*cmd)
 
     print(cmd)
@@ -2115,26 +2548,29 @@
 
     assert os.path.exists(lca_db)
 
-    assert '2 identifiers used out of 2 distinct identifiers in spreadsheet.' in runtmp.last_result.err
+    assert (
+        "2 identifiers used out of 2 distinct identifiers in spreadsheet."
+        in runtmp.last_result.err
+    )
 
-    cmd = ['lca', 'summarize', '--db', lca_db, '--query', input_sig2]
+    cmd = ["lca", "summarize", "--db", lca_db, "--query", input_sig2]
    runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    assert '78.9% 101 Archaea' in runtmp.last_result.out
-    assert '21.1% 27 (root)' in runtmp.last_result.out
+    assert "78.9% 101 Archaea" in runtmp.last_result.out
+    assert "21.1% 27 (root)" in runtmp.last_result.out
 
 
 def test_summarize_unknown_hashes_abund(runtmp, lca_db_format):
-    taxcsv = utils.get_test_data('lca-root/tax.csv')
-    input_sig1 = utils.get_test_data('lca-root/TARA_MED_MAG_00029.fa.sig')
-    input_sig2 = utils.get_test_data('lca-root/TOBG_MED-875.fna.gz.sig')
-    lca_db = runtmp.output(f'lca-root.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca-root/tax.csv")
+    input_sig1 = utils.get_test_data("lca-root/TARA_MED_MAG_00029.fa.sig")
+    input_sig2 = utils.get_test_data("lca-root/TOBG_MED-875.fna.gz.sig")
+    lca_db = runtmp.output(f"lca-root.lca.{lca_db_format}")
 
-    cmd = ['lca', 'index', taxcsv, lca_db, input_sig2, '-F', lca_db_format]
+    cmd = ["lca", "index", taxcsv, lca_db, input_sig2, "-F", lca_db_format]
    runtmp.sourmash(*cmd)
 
     print(cmd)
@@ -2143,61 +2579,73 @@
 
     assert os.path.exists(lca_db)
 
-    assert '1 identifiers used out of 2 distinct identifiers in spreadsheet.' in runtmp.last_result.err
+    assert (
+        "1 identifiers used out of 2 distinct identifiers in spreadsheet."
+        in runtmp.last_result.err
+    )
 
-    cmd = ['lca', 'summarize', '--db', lca_db, '--query', input_sig1]
+    cmd = ["lca", "summarize", "--db", lca_db, "--query", input_sig1]
    runtmp.sourmash(*cmd)
 
     print(cmd)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    assert '(root)' not in runtmp.last_result.out
-    assert '11.5% 27 Archaea;Euryarcheoata;unassigned;unassigned;novelFamily_I' in runtmp.last_result.out
+    assert "(root)" not in runtmp.last_result.out
+    assert (
+        "11.5% 27 Archaea;Euryarcheoata;unassigned;unassigned;novelFamily_I"
+        in runtmp.last_result.out
+    )
 
 
 @utils.in_thisdir
 def test_summarize_abund_hmp(c):
     # test lca summarize --with-abundance on some real data
-    queryfile = utils.get_test_data('hmp-sigs/G36354.sig.gz')
-    dbname = utils.get_test_data('hmp-sigs/G36354-matches.lca.json.gz')
+    queryfile = utils.get_test_data("hmp-sigs/G36354.sig.gz")
+    dbname = utils.get_test_data("hmp-sigs/G36354-matches.lca.json.gz")
 
-    c.run_sourmash('lca', 'summarize', '--db', dbname, '--query', queryfile)
+    c.run_sourmash("lca", "summarize", "--db", dbname, "--query", queryfile)
 
-    assert '32.1% 1080 p__Firmicutes;c__Bacilli;o__Lactobacillales' in c.last_result.out
+    assert (
+        "32.1% 1080 p__Firmicutes;c__Bacilli;o__Lactobacillales" in c.last_result.out
+    )
 
 
 @utils.in_thisdir
 def test_summarize_abund_fake_no_abund(c):
     # test lca summarize on some known/fake data; see docs for explanation.
-    queryfile = utils.get_test_data('fake-abund/query.sig.gz')
-    dbname = utils.get_test_data('fake-abund/matches.lca.json.gz')
+    queryfile = utils.get_test_data("fake-abund/query.sig.gz")
+    dbname = utils.get_test_data("fake-abund/matches.lca.json.gz")
 
-    c.run_sourmash('lca', 'summarize', '--db', dbname, '--query', queryfile,
-                   '--ignore-abundance')
+    c.run_sourmash(
+        "lca", "summarize", "--db", dbname, "--query", queryfile, "--ignore-abundance"
+    )
 
-    assert 'NOTE: discarding abundances in query, since --ignore-abundance' in c.last_result.err
-    assert '79.6% 550 Bacteria' in c.last_result.out
-    assert '20.4% 141 Archaea' in c.last_result.out
+    assert (
+        "NOTE: discarding abundances in query, since --ignore-abundance"
+        in c.last_result.err
+    )
+    assert "79.6% 550 Bacteria" in c.last_result.out
+    assert "20.4% 141 Archaea" in c.last_result.out
 
 
 @utils.in_thisdir
 def test_summarize_abund_fake_yes_abund(c):
     # test lca summarize abundance weighting on some known/fake data
-    queryfile = utils.get_test_data('fake-abund/query.sig.gz')
-    dbname = utils.get_test_data('fake-abund/matches.lca.json.gz')
+    queryfile = utils.get_test_data("fake-abund/query.sig.gz")
+    dbname = utils.get_test_data("fake-abund/matches.lca.json.gz")
 
-    c.run_sourmash('lca', 'summarize', '--db', dbname, '--query', queryfile)
+    c.run_sourmash("lca", "summarize", "--db", dbname, "--query", queryfile)
 
-    assert '43.2% 563 Bacteria' in c.last_result.out
-    assert '56.8% 740 Archaea' in c.last_result.out
+    assert "43.2% 563 Bacteria" in c.last_result.out
+    assert "56.8% 740 Archaea" in c.last_result.out
 
 
 def test_rankinfo_on_multi(runtmp):
-    db1 = utils.get_test_data('lca/dir1.lca.json')
-    db2 = utils.get_test_data('lca/dir2.lca.json')
+    db1 = utils.get_test_data("lca/dir1.lca.json")
+    db2 = utils.get_test_data("lca/dir2.lca.json")
 
-    cmd = ['lca', 'rankinfo', db1, db2]
+    cmd = ["lca", "rankinfo", db1, db2]
    runtmp.sourmash(*cmd)
 
     print(cmd)
@@ -2205,22 +2653,22 @@
     print(runtmp.last_result.err)
 
     lines = runtmp.last_result.out.splitlines()
-    lines.remove('superkingdom: 0 (0.0%)')
-    lines.remove('phylum: 464 (12.8%)')
-    lines.remove('class: 533 (14.7%)')
-    lines.remove('order: 1050 (29.0%)')
-    lines.remove('family: 695 (19.2%)')
-    lines.remove('genus: 681 (18.8%)')
-    lines.remove('species: 200 (5.5%)')
-    lines.remove('strain: 0 (0.0%)')
+    lines.remove("superkingdom: 0 (0.0%)")
+    lines.remove("phylum: 464 (12.8%)")
+    lines.remove("class: 533 (14.7%)")
+    lines.remove("order: 1050 (29.0%)")
+    lines.remove("family: 695 (19.2%)")
+    lines.remove("genus: 681 (18.8%)")
+    lines.remove("species: 200 (5.5%)")
+    lines.remove("strain: 0 (0.0%)")
 
     assert not lines
 
 
 def test_rankinfo_on_single(runtmp):
-    db1 = utils.get_test_data('lca/both.lca.json')
+    db1 = utils.get_test_data("lca/both.lca.json")
 
-    cmd = ['lca', 'rankinfo', db1]
+    cmd = ["lca", "rankinfo", db1]
    runtmp.sourmash(*cmd)
 
     print(cmd)
@@ -2228,46 +2676,55 @@
     print(runtmp.last_result.err)
 
     lines = runtmp.last_result.out.splitlines()
-    lines.remove('superkingdom: 0 (0.0%)')
-    lines.remove('phylum: 464 (12.8%)')
-    lines.remove('class: 533 (14.7%)')
-    lines.remove('order: 1050 (29.0%)')
-    lines.remove('family: 695 (19.2%)')
-    lines.remove('genus: 681 (18.8%)')
-    lines.remove('species: 200 (5.5%)')
-    lines.remove('strain: 0 (0.0%)')
+    lines.remove("superkingdom: 0 (0.0%)")
+    lines.remove("phylum: 464 (12.8%)")
+    lines.remove("class: 533 (14.7%)")
+    lines.remove("order: 1050 (29.0%)")
+    lines.remove("family: 695 (19.2%)")
+    lines.remove("genus: 681 (18.8%)")
+    lines.remove("species: 200 (5.5%)")
+    lines.remove("strain: 0 (0.0%)")
 
     assert not lines
 
 
 def test_rankinfo_no_tax(runtmp, lca_db_format):
     # note: TARA_PSW_MAG_00136 is _not_ in delmont-1.csv.
-    taxcsv = utils.get_test_data('lca/delmont-1.csv')
-    input_sig = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig')
-    lca_db = runtmp.output(f'delmont-1.lca.{lca_db_format}')
+    taxcsv = utils.get_test_data("lca/delmont-1.csv")
+    input_sig = utils.get_test_data("lca/TARA_PSW_MAG_00136.sig")
+    lca_db = runtmp.output(f"delmont-1.lca.{lca_db_format}")
 
-    cmd = ['lca', 'index', taxcsv, lca_db, input_sig, '-F', lca_db_format]
+    cmd = ["lca", "index", taxcsv, lca_db, input_sig, "-F", lca_db_format]
    runtmp.sourmash(*cmd)
 
-    print('cmd:', cmd)
-    print('out:', runtmp.last_result.out)
-    print('err:', runtmp.last_result.err)
+    print("cmd:", cmd)
+    print("out:", runtmp.last_result.out)
+    print("err:", runtmp.last_result.err)
 
     assert os.path.exists(lca_db)
 
-    assert "** assuming column 'MAGs' is identifiers in spreadsheet" in runtmp.last_result.err
-    assert "** assuming column 'Domain' is superkingdom in spreadsheet" in runtmp.last_result.err
-    assert '0 identifiers used out of 1 distinct identifiers in spreadsheet.' in runtmp.last_result.err
-
-    cmd = ['lca', 'rankinfo', lca_db]
+    assert (
+        "** assuming column 'MAGs' is identifiers in spreadsheet"
+        in runtmp.last_result.err
+    )
+    assert (
+        "** assuming column 'Domain' is superkingdom in spreadsheet"
+        in runtmp.last_result.err
+    )
+    assert (
+        "0 identifiers used out of 1 distinct identifiers in spreadsheet."
+        in runtmp.last_result.err
+    )
+
+    cmd = ["lca", "rankinfo", lca_db]
    runtmp.sourmash(*cmd)
 
 
 def test_rankinfo_with_min(runtmp):
-    db1 = utils.get_test_data('lca/dir1.lca.json')
-    db2 = utils.get_test_data('lca/dir2.lca.json')
+    db1 = utils.get_test_data("lca/dir1.lca.json")
+    db2 = utils.get_test_data("lca/dir2.lca.json")
 
-    cmd = ['lca', 'rankinfo', db1, db2, '--minimum-num', '1']
+    cmd = ["lca", "rankinfo", db1, db2, "--minimum-num", "1"]
    runtmp.sourmash(*cmd)
 
     print(cmd)
@@ -2275,23 +2732,23 @@
     print(runtmp.last_result.err)
 
     lines = runtmp.last_result.out.splitlines()
-    lines.remove('superkingdom: 0 (0.0%)')
-    lines.remove('phylum: 464 (12.8%)')
-    lines.remove('class: 533 (14.7%)')
-    lines.remove('order: 1050 (29.0%)')
-    lines.remove('family: 695 (19.2%)')
-    lines.remove('genus: 681 (18.8%)')
-    lines.remove('species: 200 (5.5%)')
-    lines.remove('strain: 0 (0.0%)')
+    lines.remove("superkingdom: 0 (0.0%)")
+    lines.remove("phylum: 464 (12.8%)")
+    lines.remove("class: 533 (14.7%)")
+    lines.remove("order: 1050 (29.0%)")
+    lines.remove("family: 695 (19.2%)")
+    lines.remove("genus: 681 (18.8%)")
+    lines.remove("species: 200 (5.5%)")
+    lines.remove("strain: 0 (0.0%)")
 
     assert not lines
 
 
 def test_rankinfo_with_min_2(runtmp):
-    db1 = utils.get_test_data('lca/dir1.lca.json')
-    db2 = utils.get_test_data('lca/dir2.lca.json')
+    db1 = utils.get_test_data("lca/dir1.lca.json")
+    db2 = utils.get_test_data("lca/dir2.lca.json")
 
-    cmd = ['lca', 'rankinfo', db1, db2, '--minimum-num', '2']
+    cmd = ["lca", "rankinfo", db1, db2, "--minimum-num", "2"]
    runtmp.sourmash(*cmd)
 
     print(cmd)
@@ -2302,126 +2759,186 @@
 
 
 def test_compare_csv(runtmp):
-    a = utils.get_test_data('lca/classify-by-both.csv')
-    b = utils.get_test_data('lca/tara-delmont-SuppTable3.csv')
+    a = utils.get_test_data("lca/classify-by-both.csv")
+    b = utils.get_test_data("lca/tara-delmont-SuppTable3.csv")
 
-    cmd = ['lca', 
'compare_csv', a, b, '-f'] + cmd = ["lca", "compare_csv", a, b, "-f"] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'loaded 106 distinct lineages, 957 rows' in runtmp.last_result.err - assert 'missing 937 assignments in classify spreadsheet.' in runtmp.last_result.err - assert '20 total assignments, 0 differ between spreadsheets.' in runtmp.last_result.err + assert "loaded 106 distinct lineages, 957 rows" in runtmp.last_result.err + assert "missing 937 assignments in classify spreadsheet." in runtmp.last_result.err + assert ( + "20 total assignments, 0 differ between spreadsheets." in runtmp.last_result.err + ) def test_compare_csv_real(runtmp): - a = utils.get_test_data('lca/tully-genome-sigs.classify.csv') - b = utils.get_test_data('lca/tully-query.delmont-db.sigs.classify.csv') + a = utils.get_test_data("lca/tully-genome-sigs.classify.csv") + b = utils.get_test_data("lca/tully-query.delmont-db.sigs.classify.csv") - cmd = ['lca', 'compare_csv', a, b, '--start-column=3', '-f'] + cmd = ["lca", "compare_csv", a, b, "--start-column=3", "-f"] runtmp.sourmash(*cmd) print(cmd) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'loaded 87 distinct lineages, 2631 rows' in runtmp.last_result.err - assert 'missing 71 assignments in classify spreadsheet.' in runtmp.last_result.err - assert 'missing 1380 assignments in custom spreadsheet.' in runtmp.last_result.err - assert '(these will not be evaluated any further)' in runtmp.last_result.err - assert '987 total assignments, 889 differ between spreadsheets.' in runtmp.last_result.err - assert '296 are compatible (one lineage is ancestor of another.' in runtmp.last_result.err - assert '593 are incompatible (there is a disagreement in the trees).' in runtmp.last_result.err - assert '164 incompatible at rank superkingdom' in runtmp.last_result.err - assert '255 incompatible at rank phylum' in runtmp.last_result.err - assert '107 incompatible at rank class' in runtmp.last_result.err - assert '54 incompatible at rank order' in runtmp.last_result.err - assert '13 incompatible at rank family' in runtmp.last_result.err - assert '0 incompatible at rank genus' in runtmp.last_result.err - assert '0 incompatible at rank species' in runtmp.last_result.err + assert "loaded 87 distinct lineages, 2631 rows" in runtmp.last_result.err + assert "missing 71 assignments in classify spreadsheet." in runtmp.last_result.err + assert "missing 1380 assignments in custom spreadsheet." in runtmp.last_result.err + assert "(these will not be evaluated any further)" in runtmp.last_result.err + assert ( + "987 total assignments, 889 differ between spreadsheets." + in runtmp.last_result.err + ) + assert ( + "296 are compatible (one lineage is ancestor of another." + in runtmp.last_result.err + ) + assert ( + "593 are incompatible (there is a disagreement in the trees)." 
+ in runtmp.last_result.err + ) + assert "164 incompatible at rank superkingdom" in runtmp.last_result.err + assert "255 incompatible at rank phylum" in runtmp.last_result.err + assert "107 incompatible at rank class" in runtmp.last_result.err + assert "54 incompatible at rank order" in runtmp.last_result.err + assert "13 incompatible at rank family" in runtmp.last_result.err + assert "0 incompatible at rank genus" in runtmp.last_result.err + assert "0 incompatible at rank species" in runtmp.last_result.err def test_incompat_lca_db_ksize_2_fail(runtmp, lca_db_format): # test on gather - create a database with ksize of 25 => fail # because of incompatibility. c = runtmp - testdata1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.fa.gz') - c.run_sourmash('sketch', 'dna', '-p', 'k=25,scaled=1000', testdata1, - '-o', 'test_db.sig') + testdata1 = utils.get_test_data("lca/TARA_ASE_MAG_00031.fa.gz") + c.run_sourmash( + "sketch", "dna", "-p", "k=25,scaled=1000", testdata1, "-o", "test_db.sig" + ) print(c) - c.run_sourmash('lca', 'index', utils.get_test_data('lca/delmont-1.csv',), - f'test.lca.{lca_db_format}', 'test_db.sig', - '-k', '25', '--scaled', '10000', - '-F', lca_db_format) + c.run_sourmash( + "lca", + "index", + utils.get_test_data( + "lca/delmont-1.csv", + ), + f"test.lca.{lca_db_format}", + "test_db.sig", + "-k", + "25", + "--scaled", + "10000", + "-F", + lca_db_format, + ) print(c) # this should fail: the LCA database has ksize 25, and the query sig has # no compatible ksizes. - with pytest.raises(SourmashCommandFailed) as e: - c.run_sourmash('gather', utils.get_test_data('lca/TARA_ASE_MAG_00031.sig'), f'test.lca.{lca_db_format}') + with pytest.raises(SourmashCommandFailed): + c.run_sourmash( + "gather", + utils.get_test_data("lca/TARA_ASE_MAG_00031.sig"), + f"test.lca.{lca_db_format}", + ) err = c.last_result.err print(err) - if lca_db_format == 'sql': + if lca_db_format == "sql": assert "no compatible signatures found in 'test.lca.sql'" in err else: assert "ERROR: cannot use 'test.lca.json' for this query." in err - assert "ksize on this database is 25; this is different from requested ksize of 31" + assert ( + "ksize on this database is 25; this is different from requested ksize of 31" + ) def test_incompat_lca_db_ksize_2_nofail(runtmp, lca_db_format): # test on gather - create a database with ksize of 25, no fail # because of --no-fail-on-empty-databases c = runtmp - testdata1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.fa.gz') - c.run_sourmash('sketch', 'dna', '-p', 'k=25,scaled=1000', testdata1, - '-o', 'test_db.sig') + testdata1 = utils.get_test_data("lca/TARA_ASE_MAG_00031.fa.gz") + c.run_sourmash( + "sketch", "dna", "-p", "k=25,scaled=1000", testdata1, "-o", "test_db.sig" + ) print(c) - c.run_sourmash('lca', 'index', utils.get_test_data('lca/delmont-1.csv',), - f'test.lca.{lca_db_format}', 'test_db.sig', - '-k', '25', '--scaled', '10000', - '-F', lca_db_format) + c.run_sourmash( + "lca", + "index", + utils.get_test_data( + "lca/delmont-1.csv", + ), + f"test.lca.{lca_db_format}", + "test_db.sig", + "-k", + "25", + "--scaled", + "10000", + "-F", + lca_db_format, + ) print(c) # this should not fail despite mismatched ksize, b/c of --no-fail flag. 
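Note: the reformatting makes an existing test smell easier to spot here. The final `assert` in `test_incompat_lca_db_ksize_2_fail` above (and its twin at the end of `test_incompat_lca_db_ksize_2_nofail` below) is a bare string literal, so it is always truthy and never inspects `err`. Assuming the message really is emitted to stderr (not verified in this PR), the check these tests presumably meant is:

```python
assert (
    "ksize on this database is 25; this is different from requested ksize of 31"
    in err
)
```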
- c.run_sourmash('gather', utils.get_test_data('lca/TARA_ASE_MAG_00031.sig'), f'test.lca.{lca_db_format}', '--no-fail-on-empty-database') + c.run_sourmash( + "gather", + utils.get_test_data("lca/TARA_ASE_MAG_00031.sig"), + f"test.lca.{lca_db_format}", + "--no-fail-on-empty-database", + ) err = c.last_result.err print(err) - if lca_db_format == 'sql': + if lca_db_format == "sql": assert "no compatible signatures found in 'test.lca.sql'" in err else: assert "ERROR: cannot use 'test.lca.json' for this query." in err - assert "ksize on this database is 25; this is different from requested ksize of 31" + assert ( + "ksize on this database is 25; this is different from requested ksize of 31" + ) def test_lca_index_empty(runtmp, lca_db_format): c = runtmp # test lca index with an empty taxonomy CSV, followed by a load & gather. - sig2file = utils.get_test_data('2.fa.sig') - sig47file = utils.get_test_data('47.fa.sig') - sig63file = utils.get_test_data('63.fa.sig') + sig2file = utils.get_test_data("2.fa.sig") + sig47file = utils.get_test_data("47.fa.sig") + sig63file = utils.get_test_data("63.fa.sig") sig63 = load_one_signature(sig63file, ksize=31) # create an empty spreadsheet - with open(c.output('empty.csv'), 'wt') as fp: - fp.write('accession,superkingdom,phylum,class,order,family,genus,species,strain') + with open(c.output("empty.csv"), "w") as fp: + fp.write( + "accession,superkingdom,phylum,class,order,family,genus,species,strain" + ) # index! - c.run_sourmash('lca', 'index', 'empty.csv', 'xxx', - sig2file, sig47file, sig63file, '--scaled', '1000', - '-F', lca_db_format) + c.run_sourmash( + "lca", + "index", + "empty.csv", + "xxx", + sig2file, + sig47file, + sig63file, + "--scaled", + "1000", + "-F", + lca_db_format, + ) # can we load and search? - lca_db_filename = c.output(f'xxx.lca.{lca_db_format}') + lca_db_filename = c.output(f"xxx.lca.{lca_db_format}") db, ksize, scaled = lca_utils.load_single_database(lca_db_filename) result = db.best_containment(sig63) @@ -2434,9 +2951,9 @@ def test_lca_index_empty(runtmp, lca_db_format): def test_lca_gather_threshold_1(): # test gather() method, in some detail; see same tests for sbt. - sig2file = utils.get_test_data('2.fa.sig') - sig47file = utils.get_test_data('47.fa.sig') - sig63file = utils.get_test_data('63.fa.sig') + sig2file = utils.get_test_data("2.fa.sig") + sig47file = utils.get_test_data("47.fa.sig") + sig63file = utils.get_test_data("63.fa.sig") sig2 = load_one_signature(sig2file, ksize=31) sig47 = load_one_signature(sig47file, ksize=31) @@ -2468,7 +2985,7 @@ def test_lca_gather_threshold_1(): containment, match_sig, name = result assert containment == 1.0 assert match_sig.minhash == sig2.minhash - assert name == None + assert name is None # check with a threshold -> should be no results. with pytest.raises(ValueError): @@ -2485,7 +3002,7 @@ def test_lca_gather_threshold_1(): containment, match_sig, name = result assert containment == 1.0 assert match_sig.minhash == sig2.minhash - assert name == None + assert name is None # check with a too-high threshold -> should be no results. with pytest.raises(ValueError): @@ -2494,9 +3011,9 @@ def test_lca_gather_threshold_1(): def test_lca_gather_threshold_5(): # test gather() method, in some detail; see same tests for sbt. 
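For readers skimming the threshold tests in this hunk: `best_containment()` returns a single `(containment, match_signature, name)` result, and `threshold_bp` sets the minimum estimated overlap in base pairs — roughly `threshold_bp / scaled` hashes must be shared. A self-contained sketch of the pattern being exercised (hash values and the `"tiny"` name are made up; with `scaled=1000`, `threshold_bp=5000` asks for ~5 overlapping hashes):

```python
from sourmash import MinHash, SourmashSignature
from sourmash.lca.lca_db import LCA_Database

# five hashes in a scaled MinHash => ~5000 bp of estimated overlap
mh = MinHash(n=0, ksize=31, scaled=1000)
for hashval in (10, 20, 30, 40, 50):
    mh.add_hash(hashval)

db = LCA_Database(ksize=31, scaled=1000)
db.insert(SourmashSignature(mh, name="tiny"))

# the query shares all five hashes, so a 5000 bp threshold is meetable
containment, match_sig, name = db.best_containment(
    SourmashSignature(mh), threshold_bp=5000
)
assert containment == 1.0
```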
- sig2file = utils.get_test_data('2.fa.sig') - sig47file = utils.get_test_data('47.fa.sig') - sig63file = utils.get_test_data('63.fa.sig') + sig2file = utils.get_test_data("2.fa.sig") + sig47file = utils.get_test_data("47.fa.sig") + sig63file = utils.get_test_data("63.fa.sig") sig2 = load_one_signature(sig2file, ksize=31) sig47 = load_one_signature(sig47file, ksize=31) @@ -2528,7 +3045,7 @@ def test_lca_gather_threshold_5(): containment, match_sig, name = result assert containment == 1.0 assert match_sig.minhash == sig2.minhash - assert name == None + assert name is None # now, check with a threshold_bp that should be meet-able. result = db.best_containment(SourmashSignature(new_mh), threshold_bp=5000) @@ -2536,13 +3053,13 @@ def test_lca_gather_threshold_5(): containment, match_sig, name = result assert containment == 1.0 assert match_sig.minhash == sig2.minhash - assert name == None + assert name is None def test_gather_multiple_return(): - sig2file = utils.get_test_data('2.fa.sig') - sig47file = utils.get_test_data('47.fa.sig') - sig63file = utils.get_test_data('63.fa.sig') + sig2file = utils.get_test_data("2.fa.sig") + sig47file = utils.get_test_data("47.fa.sig") + sig63file = utils.get_test_data("63.fa.sig") sig2 = load_one_signature(sig2file, ksize=31) sig47 = load_one_signature(sig47file, ksize=31) @@ -2564,18 +3081,22 @@ def test_gather_multiple_return(): def test_lca_db_protein_build(): # test programmatic creation of LCA database with protein sigs in it - sigfile1 = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + sigfile1 = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) sig1 = sourmash.load_one_signature(sigfile1) sig2 = sourmash.load_one_signature(sigfile2) - db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype='protein') + db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype="protein") assert db.insert(sig1) assert db.insert(sig2) # check reconstruction -- - mh_list = [ x.minhash for x in db.signatures() ] + mh_list = [x.minhash for x in db.signatures()] assert len(mh_list) == 2 assert sig1.minhash in mh_list assert sig2.minhash in mh_list @@ -2591,28 +3112,32 @@ def test_lca_db_protein_build(): @utils.in_tempdir def test_lca_db_protein_save_load(c): # test save/load of programmatically created db with protein sigs in it - sigfile1 = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + sigfile1 = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) sig1 = sourmash.load_one_signature(sigfile1) sig2 = sourmash.load_one_signature(sigfile2) - db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype='protein') + db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype="protein") assert db.insert(sig1) assert db.insert(sig2) - db.save(c.output('xxx.lca.json')) + db.save(c.output("xxx.lca.json")) del db - x = sourmash.lca.lca_db.load_single_database(c.output('xxx.lca.json')) + x = sourmash.lca.lca_db.load_single_database(c.output("xxx.lca.json")) db2 = x[0] - assert db2.moltype == 'protein' + assert db2.moltype == "protein" # 
check reconstruction -- - mh_list = [ x.minhash for x in db2.signatures() ] + mh_list = [x.minhash for x in db2.signatures()] assert len(mh_list) == 2 - print('XXX', mh_list[0].ksize) - print('YYY', sig1.minhash.ksize) + print("XXX", mh_list[0].ksize) + print("YYY", sig1.minhash.ksize) assert sig1.minhash in mh_list assert sig2.minhash in mh_list @@ -2628,26 +3153,45 @@ def test_lca_db_protein_command_index(runtmp, lca_db_format): # test command-line creation of LCA database with protein sigs c = runtmp - sigfile1 = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') - lineages = utils.get_test_data('prot/gtdb-subset-lineages.csv') - - db_out = c.output(f'protein.lca.{lca_db_format}') - - c.run_sourmash('lca', 'index', lineages, db_out, sigfile1, sigfile2, - '-C', '2', '--split-identifiers', '--require-taxonomy', - '--scaled', '100', '-k', '19', '--protein', - '-F', lca_db_format) + sigfile1 = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) + lineages = utils.get_test_data("prot/gtdb-subset-lineages.csv") + + db_out = c.output(f"protein.lca.{lca_db_format}") + + c.run_sourmash( + "lca", + "index", + lineages, + db_out, + sigfile1, + sigfile2, + "-C", + "2", + "--split-identifiers", + "--require-taxonomy", + "--scaled", + "100", + "-k", + "19", + "--protein", + "-F", + lca_db_format, + ) x = sourmash.lca.lca_db.load_single_database(db_out) db2 = x[0] - assert db2.moltype == 'protein' + assert db2.moltype == "protein" sig1 = sourmash.load_one_signature(sigfile1) sig2 = sourmash.load_one_signature(sigfile2) # check reconstruction -- - mh_list = [ x.minhash for x in db2.signatures() ] + mh_list = [x.minhash for x in db2.signatures()] assert len(mh_list) == 2 assert sig1.minhash in mh_list assert sig2.minhash in mh_list @@ -2664,31 +3208,37 @@ def test_lca_db_protein_command_index(runtmp, lca_db_format): def test_lca_db_protein_command_search(c): # test command-line search/gather of LCA database with protein sigs # (LCA database created as above) - sigfile1 = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db_out = utils.get_test_data('prot/protein.lca.json.gz') + sigfile1 = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + db_out = utils.get_test_data("prot/protein.lca.json.gz") - c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches' in c.last_result.out + c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0") + assert "2 matches" in c.last_result.out - c.run_sourmash('gather', sigfile1, db_out) - assert 'found 1 matches total' in c.last_result.out - assert 'the recovered matches hit 100.0% of the query' in c.last_result.out + c.run_sourmash("gather", sigfile1, db_out) + assert "found 1 matches total" in c.last_result.out + assert "the recovered matches hit 100.0% of the query" in c.last_result.out def test_lca_db_hp_build(): # test programmatic creation of LCA database with hp sigs in it - sigfile1 = utils.get_test_data('prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/hp/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + sigfile1 = utils.get_test_data( + "prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + 
"prot/hp/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) sig1 = sourmash.load_one_signature(sigfile1) sig2 = sourmash.load_one_signature(sigfile2) - db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype='hp') + db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype="hp") assert db.insert(sig1) assert db.insert(sig2) # check reconstruction -- - mh_list = [ x.minhash for x in db.signatures() ] + mh_list = [x.minhash for x in db.signatures()] assert len(mh_list) == 2 assert sig1.minhash in mh_list assert sig2.minhash in mh_list @@ -2704,25 +3254,29 @@ def test_lca_db_hp_build(): @utils.in_tempdir def test_lca_db_hp_save_load(c): # test save/load of programmatically created db with hp sigs in it - sigfile1 = utils.get_test_data('prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/hp/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + sigfile1 = utils.get_test_data( + "prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/hp/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) sig1 = sourmash.load_one_signature(sigfile1) sig2 = sourmash.load_one_signature(sigfile2) - db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype='hp') + db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype="hp") assert db.insert(sig1) assert db.insert(sig2) - db.save(c.output('xxx.lca.json')) + db.save(c.output("xxx.lca.json")) del db - x = sourmash.lca.lca_db.load_single_database(c.output('xxx.lca.json')) + x = sourmash.lca.lca_db.load_single_database(c.output("xxx.lca.json")) db2 = x[0] - assert db2.moltype == 'hp' + assert db2.moltype == "hp" # check reconstruction -- - mh_list = [ x.minhash for x in db2.signatures() ] + mh_list = [x.minhash for x in db2.signatures()] assert len(mh_list) == 2 assert sig1.minhash in mh_list assert sig2.minhash in mh_list @@ -2739,26 +3293,45 @@ def test_lca_db_hp_command_index(runtmp, lca_db_format): # test command-line creation of LCA database with hp sigs c = runtmp - sigfile1 = utils.get_test_data('prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/hp/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') - lineages = utils.get_test_data('prot/gtdb-subset-lineages.csv') - - db_out = c.output(f'hp.lca.{lca_db_format}') - - c.run_sourmash('lca', 'index', lineages, db_out, sigfile1, sigfile2, - '-C', '2', '--split-identifiers', '--require-taxonomy', - '--scaled', '100', '-k', '19', '--hp', - '-F', lca_db_format) + sigfile1 = utils.get_test_data( + "prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/hp/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) + lineages = utils.get_test_data("prot/gtdb-subset-lineages.csv") + + db_out = c.output(f"hp.lca.{lca_db_format}") + + c.run_sourmash( + "lca", + "index", + lineages, + db_out, + sigfile1, + sigfile2, + "-C", + "2", + "--split-identifiers", + "--require-taxonomy", + "--scaled", + "100", + "-k", + "19", + "--hp", + "-F", + lca_db_format, + ) x = sourmash.lca.lca_db.load_single_database(db_out) db2 = x[0] - assert db2.moltype == 'hp' + assert db2.moltype == "hp" sig1 = sourmash.load_one_signature(sigfile1) sig2 = sourmash.load_one_signature(sigfile2) # check reconstruction -- - mh_list = [ x.minhash for x in db2.signatures() ] + mh_list = [x.minhash for x in db2.signatures()] assert len(mh_list) == 2 assert sig1.minhash in mh_list assert sig2.minhash in mh_list @@ -2775,31 +3348,37 @@ def test_lca_db_hp_command_index(runtmp, 
lca_db_format): def test_lca_db_hp_command_search(c): # test command-line search/gather of LCA database with hp sigs # (LCA database created as above) - sigfile1 = utils.get_test_data('prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db_out = utils.get_test_data('prot/hp.lca.json.gz') + sigfile1 = utils.get_test_data( + "prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + db_out = utils.get_test_data("prot/hp.lca.json.gz") - c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches' in c.last_result.out + c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0") + assert "2 matches" in c.last_result.out - c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0') - assert 'found 1 matches total' in c.last_result.out - assert 'the recovered matches hit 100.0% of the query' in c.last_result.out + c.run_sourmash("gather", sigfile1, db_out, "--threshold", "0.0") + assert "found 1 matches total" in c.last_result.out + assert "the recovered matches hit 100.0% of the query" in c.last_result.out def test_lca_db_dayhoff_build(): # test programmatic creation of LCA database with dayhoff sigs in it - sigfile1 = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + sigfile1 = utils.get_test_data( + "prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) sig1 = sourmash.load_one_signature(sigfile1) sig2 = sourmash.load_one_signature(sigfile2) - db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype='dayhoff') + db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype="dayhoff") assert db.insert(sig1) assert db.insert(sig2) # check reconstruction -- - mh_list = [ x.minhash for x in db.signatures() ] + mh_list = [x.minhash for x in db.signatures()] assert len(mh_list) == 2 assert sig1.minhash in mh_list assert sig2.minhash in mh_list @@ -2815,25 +3394,29 @@ def test_lca_db_dayhoff_build(): @utils.in_tempdir def test_lca_db_dayhoff_save_load(c): # test save/load of programmatically created db with dayhoff sigs in it - sigfile1 = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + sigfile1 = utils.get_test_data( + "prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) sig1 = sourmash.load_one_signature(sigfile1) sig2 = sourmash.load_one_signature(sigfile2) - db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype='dayhoff') + db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype="dayhoff") assert db.insert(sig1) assert db.insert(sig2) - db.save(c.output('xxx.lca.json')) + db.save(c.output("xxx.lca.json")) del db - x = sourmash.lca.lca_db.load_single_database(c.output('xxx.lca.json')) + x = sourmash.lca.lca_db.load_single_database(c.output("xxx.lca.json")) db2 = x[0] - assert db2.moltype == 'dayhoff' + assert db2.moltype == "dayhoff" # check reconstruction -- - mh_list = [ x.minhash for x in db2.signatures() ] + mh_list = [x.minhash for x in db2.signatures()] assert len(mh_list) == 2 assert sig1.minhash in mh_list assert sig2.minhash in mh_list @@ -2850,26 +3433,45 @@ def test_lca_db_dayhoff_command_index(runtmp, lca_db_format): # test command-line 
creation of LCA database with dayhoff sigs c = runtmp - sigfile1 = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') - lineages = utils.get_test_data('prot/gtdb-subset-lineages.csv') - - db_out = c.output(f'dayhoff.lca.{lca_db_format}') - - c.run_sourmash('lca', 'index', lineages, db_out, sigfile1, sigfile2, - '-C', '2', '--split-identifiers', '--require-taxonomy', - '--scaled', '100', '-k', '19', '--dayhoff', - '-F', lca_db_format) + sigfile1 = utils.get_test_data( + "prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) + lineages = utils.get_test_data("prot/gtdb-subset-lineages.csv") + + db_out = c.output(f"dayhoff.lca.{lca_db_format}") + + c.run_sourmash( + "lca", + "index", + lineages, + db_out, + sigfile1, + sigfile2, + "-C", + "2", + "--split-identifiers", + "--require-taxonomy", + "--scaled", + "100", + "-k", + "19", + "--dayhoff", + "-F", + lca_db_format, + ) x = sourmash.lca.lca_db.load_single_database(db_out) db2 = x[0] - assert db2.moltype == 'dayhoff' + assert db2.moltype == "dayhoff" sig1 = sourmash.load_one_signature(sigfile1) sig2 = sourmash.load_one_signature(sigfile2) # check reconstruction -- - mh_list = [ x.minhash for x in db2.signatures() ] + mh_list = [x.minhash for x in db2.signatures()] assert len(mh_list) == 2 assert sig1.minhash in mh_list assert sig2.minhash in mh_list @@ -2886,29 +3488,43 @@ def test_lca_db_dayhoff_command_index(runtmp, lca_db_format): def test_lca_db_dayhoff_command_search(c): # test command-line search/gather of LCA database with dayhoff sigs # (LCA database created as above) - sigfile1 = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db_out = utils.get_test_data('prot/dayhoff.lca.json.gz') + sigfile1 = utils.get_test_data( + "prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + db_out = utils.get_test_data("prot/dayhoff.lca.json.gz") - c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches' in c.last_result.out + c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0") + assert "2 matches" in c.last_result.out - c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0') - assert 'found 1 matches total' in c.last_result.out - assert 'the recovered matches hit 100.0% of the query' in c.last_result.out + c.run_sourmash("gather", sigfile1, db_out, "--threshold", "0.0") + assert "found 1 matches total" in c.last_result.out + assert "the recovered matches hit 100.0% of the query" in c.last_result.out def test_lca_index_with_picklist(runtmp, lca_db_format): - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - outdb = runtmp.output(f'gcf.lca.{lca_db_format}') - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + outdb = runtmp.output(f"gcf.lca.{lca_db_format}") + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") # create an empty spreadsheet - with open(runtmp.output('empty.csv'), 'wt') as fp: - fp.write('accession,superkingdom,phylum,class,order,family,genus,species,strain') - - runtmp.sourmash('lca', 'index', 'empty.csv', outdb, *gcf_sigs, - '-k', '21', '--picklist', f"{picklist}:md5:md5", - '-F', lca_db_format) + with open(runtmp.output("empty.csv"), "w") as fp: + fp.write( + 
"accession,superkingdom,phylum,class,order,family,genus,species,strain" + ) + + runtmp.sourmash( + "lca", + "index", + "empty.csv", + outdb, + *gcf_sigs, + "-k", + "21", + "--picklist", + f"{picklist}:md5:md5", + "-F", + lca_db_format, + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -2923,21 +3539,33 @@ def test_lca_index_with_picklist(runtmp, lca_db_format): siglist = list(sourmash.load_file_as_signatures(outdb)) assert len(siglist) == 3 for ss in siglist: - assert 'Thermotoga' in ss.name + assert "Thermotoga" in ss.name def test_lca_index_with_picklist_exclude(runtmp, lca_db_format): - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - outdb = runtmp.output(f'gcf.lca.{lca_db_format}') - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + outdb = runtmp.output(f"gcf.lca.{lca_db_format}") + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") # create an empty spreadsheet - with open(runtmp.output('empty.csv'), 'wt') as fp: - fp.write('accession,superkingdom,phylum,class,order,family,genus,species,strain') - - runtmp.sourmash('lca', 'index', 'empty.csv', outdb, *gcf_sigs, - '-k', '21', '--picklist', f"{picklist}:md5:md5:exclude", - '-F', lca_db_format) + with open(runtmp.output("empty.csv"), "w") as fp: + fp.write( + "accession,superkingdom,phylum,class,order,family,genus,species,strain" + ) + + runtmp.sourmash( + "lca", + "index", + "empty.csv", + outdb, + *gcf_sigs, + "-k", + "21", + "--picklist", + f"{picklist}:md5:md5:exclude", + "-F", + lca_db_format, + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -2948,21 +3576,24 @@ def test_lca_index_with_picklist_exclude(runtmp, lca_db_format): siglist = list(sourmash.load_file_as_signatures(outdb)) assert len(siglist) == 9 for ss in siglist: - assert 'Thermotoga' not in ss.name + assert "Thermotoga" not in ss.name def test_lca_index_select_with_picklist(runtmp, lca_db_format): # check what happens with picklists after index - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - outdb = runtmp.output(f'gcf.lca.{lca_db_format}') - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + outdb = runtmp.output(f"gcf.lca.{lca_db_format}") + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") # create an empty spreadsheet - with open(runtmp.output('empty.csv'), 'wt') as fp: - fp.write('accession,superkingdom,phylum,class,order,family,genus,species,strain') + with open(runtmp.output("empty.csv"), "w") as fp: + fp.write( + "accession,superkingdom,phylum,class,order,family,genus,species,strain" + ) - runtmp.sourmash('lca', 'index', 'empty.csv', outdb, *gcf_sigs, - '-k', '21', '-F', lca_db_format) + runtmp.sourmash( + "lca", "index", "empty.csv", outdb, *gcf_sigs, "-k", "21", "-F", lca_db_format + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -2979,21 +3610,24 @@ def test_lca_index_select_with_picklist(runtmp, lca_db_format): siglist = list(idx.signatures()) assert len(siglist) == 3 for ss in siglist: - assert 'Thermotoga' in ss.name + assert "Thermotoga" in ss.name def test_lca_index_select_with_picklist_exclude(runtmp, lca_db_format): # check what happens with picklists after index - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - outdb = runtmp.output(f'gcf.lca.{lca_db_format}') - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + gcf_sigs = 
glob.glob(utils.get_test_data("gather/GCF*.sig")) + outdb = runtmp.output(f"gcf.lca.{lca_db_format}") + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") # create an empty spreadsheet - with open(runtmp.output('empty.csv'), 'wt') as fp: - fp.write('accession,superkingdom,phylum,class,order,family,genus,species,strain') + with open(runtmp.output("empty.csv"), "w") as fp: + fp.write( + "accession,superkingdom,phylum,class,order,family,genus,species,strain" + ) - runtmp.sourmash('lca', 'index', 'empty.csv', outdb, *gcf_sigs, - '-k', '21', '-F', lca_db_format) + runtmp.sourmash( + "lca", "index", "empty.csv", outdb, *gcf_sigs, "-k", "21", "-F", lca_db_format + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -3009,7 +3643,7 @@ def test_lca_index_select_with_picklist_exclude(runtmp, lca_db_format): siglist = list(idx.signatures()) assert len(siglist) == 9 for ss in siglist: - assert 'Thermotoga' not in ss.name + assert "Thermotoga" not in ss.name def test_lca_jaccard_ordering(): @@ -3028,10 +3662,10 @@ def test_lca_jaccard_ordering(): def _intersect(x, y): return x.intersection_and_union_size(y)[0] - print('a intersect b:', _intersect(a, b)) - print('a intersect c:', _intersect(a, c)) - print('a jaccard b:', a.jaccard(b)) - print('a jaccard c:', a.jaccard(c)) + print("a intersect b:", _intersect(a, b)) + print("a intersect c:", _intersect(a, c)) + print("a jaccard b:", a.jaccard(b)) + print("a jaccard c:", a.jaccard(c)) assert _intersect(a, b) > _intersect(a, c) assert a.jaccard(b) < a.jaccard(c) @@ -3040,9 +3674,9 @@ def _intersect(x, y): assert a.jaccard(c) > 0.15 # now - make signatures, try out :) - ss_a = sourmash.SourmashSignature(a, name='A') - ss_b = sourmash.SourmashSignature(b, name='B') - ss_c = sourmash.SourmashSignature(c, name='C') + ss_a = sourmash.SourmashSignature(a, name="A") + ss_b = sourmash.SourmashSignature(b, name="B") + ss_c = sourmash.SourmashSignature(c, name="C") db = sourmash.lca.LCA_Database(ksize=31, scaled=2) db.insert(ss_a) @@ -3060,17 +3694,21 @@ def _intersect(x, y): def test_lca_db_protein_save_twice(runtmp, lca_db_format): # test save twice - sigfile1 = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + sigfile1 = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) sig1 = sourmash.load_one_signature(sigfile1) sig2 = sourmash.load_one_signature(sigfile2) - db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype='protein') + db = sourmash.lca.LCA_Database(ksize=19, scaled=100, moltype="protein") assert db.insert(sig1) assert db.insert(sig2) - db.save(runtmp.output('xxx'), format=lca_db_format) + db.save(runtmp.output("xxx"), format=lca_db_format) with pytest.raises(ValueError): - db.save(runtmp.output('xxx'), format=lca_db_format) + db.save(runtmp.output("xxx"), format=lca_db_format) diff --git a/tests/test_lca_db_protocol.py b/tests/test_lca_db_protocol.py index a3fc57b085..eb2f76fe07 100644 --- a/tests/test_lca_db_protocol.py +++ b/tests/test_lca_db_protocol.py @@ -7,26 +7,30 @@ import sourmash from sourmash.tax.tax_utils import MultiLineageDB -from sourmash.lca.lca_db import (LCA_Database, load_single_database) +from sourmash.lca.lca_db import LCA_Database, load_single_database def build_inmem_lca_db(runtmp): # test in-memory LCA_Database - sigfile1 = 
utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + sigfile1 = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) ss1 = sourmash.load_one_signature(sigfile1) ss2 = sourmash.load_one_signature(sigfile2) - lineages_file = utils.get_test_data('prot/gtdb-subset-lineages.csv') + lineages_file = utils.get_test_data("prot/gtdb-subset-lineages.csv") lineages = MultiLineageDB.load([lineages_file]) - db = LCA_Database(ksize=19, scaled=100, moltype='protein') + db = LCA_Database(ksize=19, scaled=100, moltype="protein") - ident1 = ss1.name.split(' ')[0].split('.')[0] + ident1 = ss1.name.split(" ")[0].split(".")[0] assert lineages[ident1] db.insert(ss1, ident=ident1, lineage=lineages[ident1]) - ident2 = ss2.name.split(' ')[0].split('.')[0] + ident2 = ss2.name.split(" ")[0].split(".")[0] assert lineages[ident2] db.insert(ss2, ident=ident2, lineage=lineages[ident2]) @@ -36,9 +40,9 @@ def build_inmem_lca_db(runtmp): def build_json_lca_db(runtmp): # test saved/loaded JSON database db = build_inmem_lca_db(runtmp) - db_out = runtmp.output('protein.lca.json') + db_out = runtmp.output("protein.lca.json") - db.save(db_out, format='json') + db.save(db_out, format="json") x = load_single_database(db_out) db_load = x[0] @@ -49,9 +53,9 @@ def build_json_lca_db(runtmp): def build_sql_lca_db(runtmp): # test saved/loaded SQL database db = build_inmem_lca_db(runtmp) - db_out = runtmp.output('protein.lca.json') + db_out = runtmp.output("protein.lca.json") - db.save(db_out, format='sql') + db.save(db_out, format="sql") x = load_single_database(db_out) db_load = x[0] @@ -59,9 +63,7 @@ def build_sql_lca_db(runtmp): return db_load -@pytest.fixture(params=[build_inmem_lca_db, - build_json_lca_db, - build_sql_lca_db]) +@pytest.fixture(params=[build_inmem_lca_db, build_json_lca_db, build_sql_lca_db]) def lca_db_obj(request, runtmp): build_fn = request.param @@ -77,16 +79,18 @@ def test_get_lineage_assignments(lca_db_obj): x = [] for tup in lineage: - if tup[0] != 'strain' or tup[1]: # ignore empty strain + if tup[0] != "strain" or tup[1]: # ignore empty strain x.append((tup[0], tup[1])) - assert x == [('superkingdom', 'd__Archaea'), - ('phylum', 'p__Crenarchaeota'), - ('class', 'c__Bathyarchaeia'), - ('order', 'o__B26-1'), - ('family', 'f__B26-1'), - ('genus', 'g__B26-1'), - ('species', 's__B26-1 sp001593925'),] + assert x == [ + ("superkingdom", "d__Archaea"), + ("phylum", "p__Crenarchaeota"), + ("class", "c__Bathyarchaeia"), + ("order", "o__B26-1"), + ("family", "f__B26-1"), + ("genus", "g__B26-1"), + ("species", "s__B26-1 sp001593925"), + ] def test_hashvals(lca_db_obj): @@ -102,7 +106,7 @@ def test_get_identifiers_for_hashval(lca_db_obj): assert len(idents) == 1 ident = idents[0] - assert ident == 'GCA_001593925' + assert ident == "GCA_001593925" def test_get_identifiers_for_hashval_2(lca_db_obj): @@ -111,15 +115,15 @@ def test_get_identifiers_for_hashval_2(lca_db_obj): for hashval in lca_db_obj.hashvals: idents = lca_db_obj.get_identifiers_for_hashval(hashval) - #idents = list(idents) + # idents = list(idents) all_idents.update(idents) all_idents = list(all_idents) print(all_idents) assert len(all_idents) == 2 - assert 'GCA_001593925' in all_idents - assert 'GCA_001593935' in all_idents + assert "GCA_001593925" in all_idents + assert "GCA_001593935" in 
all_idents def test_downsample_scaled(lca_db_obj): diff --git a/tests/test_lca_functions.py b/tests/test_lca_functions.py index 0674df80df..9add0df47f 100644 --- a/tests/test_lca_functions.py +++ b/tests/test_lca_functions.py @@ -4,14 +4,21 @@ import pytest from sourmash.lca import lca_utils -from sourmash.lca.lca_utils import (LineagePair, build_tree, find_lca, - taxlist, count_lca_for_assignments, - zip_lineage, display_lineage, - make_lineage, is_lineage_match, - pop_to_rank) - - -class FakeLCA_Database(object): +from sourmash.lca.lca_utils import ( + LineagePair, + build_tree, + find_lca, + taxlist, + count_lca_for_assignments, + zip_lineage, + display_lineage, + make_lineage, + is_lineage_match, + pop_to_rank, +) + + +class FakeLCA_Database: def __init__(self): self._assignments = {} @@ -26,139 +33,194 @@ def get_lineage_assignments(self, hashval): def test_taxlist_1(): - assert list(taxlist()) == ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain'] + assert list(taxlist()) == [ + "superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + "strain", + ] def test_taxlist_2(): - assert list(taxlist(include_strain=False)) == ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'] + assert list(taxlist(include_strain=False)) == [ + "superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + ] def test_zip_lineage_1(): - x = [ LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b') ] - assert zip_lineage(x) == ['a', 'b', '', '', '', '', '', ''] + x = [LineagePair("superkingdom", "a"), LineagePair("phylum", "b")] + assert zip_lineage(x) == ["a", "b", "", "", "", "", "", ""] def test_zip_lineage_2(): - x = [ LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b') ] - assert zip_lineage(x, truncate_empty=True) == ['a', 'b'] + x = [LineagePair("superkingdom", "a"), LineagePair("phylum", "b")] + assert zip_lineage(x, truncate_empty=True) == ["a", "b"] def test_zip_lineage_3(): - x = [ LineagePair('superkingdom', 'a'), LineagePair(None, ''), LineagePair('class', 'c') ] - assert zip_lineage(x) == ['a', '', 'c', '', '', '', '', ''] + x = [ + LineagePair("superkingdom", "a"), + LineagePair(None, ""), + LineagePair("class", "c"), + ] + assert zip_lineage(x) == ["a", "", "c", "", "", "", "", ""] def test_zip_lineage_3_truncate(): - x = [ LineagePair('superkingdom', 'a'), LineagePair(None, ''), LineagePair('class', 'c') ] - assert zip_lineage(x, truncate_empty=True) == ['a', '', 'c'] + x = [ + LineagePair("superkingdom", "a"), + LineagePair(None, ""), + LineagePair("class", "c"), + ] + assert zip_lineage(x, truncate_empty=True) == ["a", "", "c"] def test_zip_lineage_4(): - x = [ LineagePair('superkingdom', 'a'), LineagePair('class', 'c') ] + x = [LineagePair("superkingdom", "a"), LineagePair("class", "c")] with pytest.raises(ValueError) as e: zip_lineage(x) - assert 'incomplete lineage at phylum - is class instead' in str(e.value) + assert "incomplete lineage at phylum - is class instead" in str(e.value) def test_display_lineage_1(): - x = [ LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b') ] + x = [LineagePair("superkingdom", "a"), LineagePair("phylum", "b")] assert display_lineage(x) == "a;b", display_lineage(x) def test_display_lineage_2(): - x = [ LineagePair('superkingdom', 'a'), LineagePair(None, ''), LineagePair('class', 'c') ] + x = [ + LineagePair("superkingdom", "a"), + LineagePair(None, ""), + LineagePair("class", "c"), + ] assert display_lineage(x) == "a;;c", 
display_lineage(x) def test_build_tree(): - tree = build_tree([[LineagePair('rank1', 'name1'), - LineagePair('rank2', 'name2')]]) - assert tree == { LineagePair('rank1', 'name1'): - { LineagePair('rank2', 'name2') : {}} } + tree = build_tree([[LineagePair("rank1", "name1"), LineagePair("rank2", "name2")]]) + assert tree == {LineagePair("rank1", "name1"): {LineagePair("rank2", "name2"): {}}} def test_build_tree_2(): - tree = build_tree([[LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2a')], - [LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2b')], - ]) + tree = build_tree( + [ + [LineagePair("rank1", "name1"), LineagePair("rank2", "name2a")], + [LineagePair("rank1", "name1"), LineagePair("rank2", "name2b")], + ] + ) - assert tree == { LineagePair('rank1', 'name1'): { LineagePair('rank2', 'name2a') : {}, - LineagePair('rank2', 'name2b') : {}} } + assert tree == { + LineagePair("rank1", "name1"): { + LineagePair("rank2", "name2a"): {}, + LineagePair("rank2", "name2b"): {}, + } + } -def test_build_tree_3(): # empty 'rank2' name - tree = build_tree([[LineagePair('rank1', 'name1'), LineagePair('rank2', '')]]) - assert tree == { LineagePair('rank1', 'name1'): {} } +def test_build_tree_3(): # empty 'rank2' name + tree = build_tree([[LineagePair("rank1", "name1"), LineagePair("rank2", "")]]) + assert tree == {LineagePair("rank1", "name1"): {}} def test_build_tree_4(): - tree = build_tree([[LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2a')], - ]) - - tree = build_tree([[LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2b')], - ], tree) + tree = build_tree( + [ + [LineagePair("rank1", "name1"), LineagePair("rank2", "name2a")], + ] + ) + + tree = build_tree( + [ + [LineagePair("rank1", "name1"), LineagePair("rank2", "name2b")], + ], + tree, + ) + + assert tree == { + LineagePair("rank1", "name1"): { + LineagePair("rank2", "name2a"): {}, + LineagePair("rank2", "name2b"): {}, + } + } - assert tree == { LineagePair('rank1', 'name1'): { LineagePair('rank2', 'name2a') : {}, - LineagePair('rank2', 'name2b') : {}} } def test_build_tree_5(): with pytest.raises(ValueError): - tree = build_tree([]) + build_tree([]) def test_find_lca(): - tree = build_tree([[LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2')]]) + tree = build_tree([[LineagePair("rank1", "name1"), LineagePair("rank2", "name2")]]) lca = find_lca(tree) - assert lca == ((LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2'),), 0) + assert lca == ( + ( + LineagePair("rank1", "name1"), + LineagePair("rank2", "name2"), + ), + 0, + ) def test_find_lca_2(): - tree = build_tree([[LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2a')], - [LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2b')], - ]) + tree = build_tree( + [ + [LineagePair("rank1", "name1"), LineagePair("rank2", "name2a")], + [LineagePair("rank1", "name1"), LineagePair("rank2", "name2b")], + ] + ) lca = find_lca(tree) - assert lca == ((LineagePair('rank1', 'name1'),), 2) + assert lca == ((LineagePair("rank1", "name1"),), 2) def test_find_lca_3(): - lin1 = lca_utils.make_lineage('a;b;c') - lin2 = lca_utils.make_lineage('a;b') + lin1 = lca_utils.make_lineage("a;b;c") + lin2 = lca_utils.make_lineage("a;b") tree = build_tree([lin1, lin2]) lca, reason = find_lca(tree) - assert lca == lin1 # find most specific leaf node + assert lca == lin1 # find most specific leaf node def test_gather_assignments_1(): # test basic mechanics of gather_assignments function hashval = 12345678 - lin = lca_utils.make_lineage('a;b;c') + lin = 
lca_utils.make_lineage("a;b;c") db = FakeLCA_Database() - db._set_lineage_assignment(hashval, set([ lin ])) + db._set_lineage_assignment(hashval, set([lin])) assignments = lca_utils.gather_assignments([hashval], [db]) print(assignments) - assert assignments[hashval] == set([ lin ]) + assert assignments[hashval] == set([lin]) def test_gather_assignments_2(): # test basic mechanics of gather_assignments function with two lineages hashval = 12345678 - lin = lca_utils.make_lineage('a;b;c') - lin2 = lca_utils.make_lineage('a;b;d') + lin = lca_utils.make_lineage("a;b;c") + lin2 = lca_utils.make_lineage("a;b;d") db = FakeLCA_Database() - db._set_lineage_assignment(hashval, set([ lin, lin2 ])) + db._set_lineage_assignment(hashval, set([lin, lin2])) assignments = lca_utils.gather_assignments([hashval], [db]) print(assignments) - assert assignments[hashval] == set([ lin, lin2 ]) + assert assignments[hashval] == set([lin, lin2]) def test_gather_assignments_3(): @@ -166,27 +228,27 @@ def test_gather_assignments_3(): # and two hashvals hashval = 12345678 hashval2 = 87654321 - lin = lca_utils.make_lineage('a;b;c') - lin2 = lca_utils.make_lineage('a;b;d') + lin = lca_utils.make_lineage("a;b;c") + lin2 = lca_utils.make_lineage("a;b;d") db = FakeLCA_Database() - db._set_lineage_assignment(hashval, set([ lin, lin2 ])) - db._set_lineage_assignment(hashval2, set([ lin ])) + db._set_lineage_assignment(hashval, set([lin, lin2])) + db._set_lineage_assignment(hashval2, set([lin])) assignments = lca_utils.gather_assignments([hashval, hashval2], [db]) print(assignments) - assert assignments[hashval] == set([ lin, lin2 ]) - assert assignments[hashval2] == set([ lin ]) + assert assignments[hashval] == set([lin, lin2]) + assert assignments[hashval2] == set([lin]) def test_count_lca_for_assignments_1(): # test basic mechanics of gather_assignments function hashval = 12345678 - lin = lca_utils.make_lineage('a;b;c') + lin = lca_utils.make_lineage("a;b;c") db = FakeLCA_Database() - db._set_lineage_assignment(hashval, set([ lin ])) + db._set_lineage_assignment(hashval, set([lin])) assignments = lca_utils.gather_assignments([hashval], [db]) counts = count_lca_for_assignments(assignments) @@ -199,11 +261,11 @@ def test_count_lca_for_assignments_1(): def test_count_lca_for_assignments_2(): # test basic mechanics of gather_assignments function with two lineages hashval = 12345678 - lin = lca_utils.make_lineage('a;b;c') - lin2 = lca_utils.make_lineage('a;b;d') + lin = lca_utils.make_lineage("a;b;c") + lin2 = lca_utils.make_lineage("a;b;d") db = FakeLCA_Database() - db._set_lineage_assignment(hashval, set([ lin, lin2 ])) + db._set_lineage_assignment(hashval, set([lin, lin2])) assignments = lca_utils.gather_assignments([hashval], [db]) counts = count_lca_for_assignments(assignments) @@ -213,7 +275,7 @@ def test_count_lca_for_assignments_2(): assert counts[lin2] == 0 assert len(counts) == 1 - lca_lin = lca_utils.make_lineage('a;b') + lca_lin = lca_utils.make_lineage("a;b") assert counts[lca_lin] == 1 @@ -222,12 +284,12 @@ def test_count_lca_for_assignments_3(): # and two hashvals hashval = 12345678 hashval2 = 87654321 - lin = lca_utils.make_lineage('a;b;c') - lin2 = lca_utils.make_lineage('a;b;d') + lin = lca_utils.make_lineage("a;b;c") + lin2 = lca_utils.make_lineage("a;b;d") db = FakeLCA_Database() - db._set_lineage_assignment(hashval, set([ lin, lin2 ])) - db._set_lineage_assignment(hashval2, set([ lin ])) + db._set_lineage_assignment(hashval, set([lin, lin2])) + db._set_lineage_assignment(hashval2, set([lin])) assignments = 
lca_utils.gather_assignments([hashval, hashval2], [db]) counts = count_lca_for_assignments(assignments) @@ -237,20 +299,20 @@ def test_count_lca_for_assignments_3(): assert counts[lin] == 1 assert counts[lin2] == 0 - lca_lin = lca_utils.make_lineage('a;b') + lca_lin = lca_utils.make_lineage("a;b") assert counts[lca_lin] == 1 def test_count_lca_for_assignments_abund_1(): # test basic mechanics of gather_assignments function hashval = 12345678 - hashval_counts = dict() + hashval_counts = {} hashval_counts[hashval] = 3 - lin = lca_utils.make_lineage('a;b;c') + lin = lca_utils.make_lineage("a;b;c") db = FakeLCA_Database() - db._set_lineage_assignment(hashval, set([ lin ])) + db._set_lineage_assignment(hashval, set([lin])) assignments = lca_utils.gather_assignments(hashval_counts.keys(), [db]) counts = count_lca_for_assignments(assignments, hashval_counts) @@ -263,14 +325,14 @@ def test_count_lca_for_assignments_abund_1(): def test_count_lca_for_assignments_abund_2(): # test basic mechanics of gather_assignments function with two lineages hashval = 12345678 - hashval_counts = dict() + hashval_counts = {} hashval_counts[hashval] = 3 - lin = lca_utils.make_lineage('a;b;c') - lin2 = lca_utils.make_lineage('a;b;d') + lin = lca_utils.make_lineage("a;b;c") + lin2 = lca_utils.make_lineage("a;b;d") db = FakeLCA_Database() - db._set_lineage_assignment(hashval, set([ lin, lin2 ])) + db._set_lineage_assignment(hashval, set([lin, lin2])) assignments = lca_utils.gather_assignments(hashval_counts, [db]) counts = count_lca_for_assignments(assignments, hashval_counts) @@ -280,8 +342,8 @@ def test_count_lca_for_assignments_abund_2(): assert counts[lin2] == 0 assert len(counts) == 1 - lca_lin = lca_utils.make_lineage('a;b') - assert counts[lca_lin] == 3 # yes! + lca_lin = lca_utils.make_lineage("a;b") + assert counts[lca_lin] == 3 # yes! def test_count_lca_for_assignments_abund_3(): @@ -289,27 +351,28 @@ def test_count_lca_for_assignments_abund_3(): # and two hashvals hashval = 12345678 hashval2 = 87654321 - hashval_counts = dict() + hashval_counts = {} hashval_counts[hashval] = 2 hashval_counts[hashval2] = 5 - lin = lca_utils.make_lineage('a;b;c') - lin2 = lca_utils.make_lineage('a;b;d') + lin = lca_utils.make_lineage("a;b;c") + lin2 = lca_utils.make_lineage("a;b;d") db = FakeLCA_Database() - db._set_lineage_assignment(hashval, set([ lin, lin2 ])) - db._set_lineage_assignment(hashval2, set([ lin ])) + db._set_lineage_assignment(hashval, set([lin, lin2])) + db._set_lineage_assignment(hashval2, set([lin])) assignments = lca_utils.gather_assignments(hashval_counts, [db]) counts = count_lca_for_assignments(assignments, hashval_counts) print(counts) assert len(counts) == 2 - assert counts[lin] == 5 # makes sense - assert counts[lin2] == 0 # makes sense + assert counts[lin] == 5 # makes sense + assert counts[lin2] == 0 # makes sense + + lca_lin = lca_utils.make_lineage("a;b") + assert counts[lca_lin] == 2 # yes! - lca_lin = lca_utils.make_lineage('a;b') - assert counts[lca_lin] == 2 # yes! 
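Taken together, these abundance-weighted tests pin down a simple rule: for each hashval, a single assigned lineage is counted as-is, multiple assignments collapse to their LCA, and each hashval contributes its abundance from `hashval_counts` rather than 1. A plausible pure-Python reading of that rule, inferred from the assertions (the real `count_lca_for_assignments` implementation may differ; `build_tree` and `find_lca` are the helpers imported at the top of this module):

```python
from collections import Counter

def count_lca_sketch(assignments, hashval_counts):
    # assignments: {hashval: set of lineages}; weight each hashval by abundance
    counts = Counter()
    for hashval, lineages in assignments.items():
        tree = build_tree(lineages)
        lca, _reason = find_lca(tree)  # single lineage -> its leaf; several -> their LCA
        counts[lca] += hashval_counts[hashval]
    return counts
```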
def test_count_lca_for_assignments_abund_4(): # test basic mechanics of gather_assignments function with three lineages @@ -317,112 +380,113 @@ def test_count_lca_for_assignments_abund_4(): hashval = 12345678 hashval2 = 87654321 hashval3 = 34567891 - hashval_counts = dict() + hashval_counts = {} hashval_counts[hashval] = 2 hashval_counts[hashval2] = 5 hashval_counts[hashval3] = 3 - lin = lca_utils.make_lineage('a;b;c') - lin2 = lca_utils.make_lineage('a;b;d') - lin3 = lca_utils.make_lineage('a;b;d;e') + lin = lca_utils.make_lineage("a;b;c") + lin2 = lca_utils.make_lineage("a;b;d") + lin3 = lca_utils.make_lineage("a;b;d;e") db = FakeLCA_Database() - db._set_lineage_assignment(hashval, set([ lin, lin2 ])) # lca: a;b - db._set_lineage_assignment(hashval2, set([ lin ])) # lca: a;b;c - db._set_lineage_assignment(hashval3, set([ lin2, lin3 ])) # a;b;d;e + db._set_lineage_assignment(hashval, set([lin, lin2])) # lca: a;b + db._set_lineage_assignment(hashval2, set([lin])) # lca: a;b;c + db._set_lineage_assignment(hashval3, set([lin2, lin3])) # a;b;d;e assignments = lca_utils.gather_assignments(hashval_counts, [db]) counts = count_lca_for_assignments(assignments, hashval_counts) print(counts) assert len(counts) == 3 - assert counts[lin] == 5 # makes sense b/c hashval2 - assert counts[lin2] == 0 # a;b;d (lin2) + a;b;d;e (lin3) -->a;b;d;e (lin3) only - assert counts[lin3] == 3 # hashval3 + assert counts[lin] == 5 # makes sense b/c hashval2 + assert counts[lin2] == 0 # a;b;d (lin2) + a;b;d;e (lin3) -->a;b;d;e (lin3) only + assert counts[lin3] == 3 # hashval3 + + lca_lin = lca_utils.make_lineage("a;b") + assert counts[lca_lin] == 2 # yes, b/c hashval - lca_lin = lca_utils.make_lineage('a;b') - assert counts[lca_lin] == 2 # yes, b/c hashval def test_count_lca_for_assignments_abund_5(): # test basic mechanics of gather_assignments function with two lineages # and two hashvals when linages match but one has lower taxo detail hashval = 12345678 hashval2 = 87654321 - hashval_counts = dict() + hashval_counts = {} hashval_counts[hashval] = 2 hashval_counts[hashval2] = 5 - lin = lca_utils.make_lineage('a;b;d') - lin2 = lca_utils.make_lineage('a;b;d;e') + lin = lca_utils.make_lineage("a;b;d") + lin2 = lca_utils.make_lineage("a;b;d;e") db = FakeLCA_Database() - db._set_lineage_assignment(hashval, set([ lin, lin2 ])) - db._set_lineage_assignment(hashval2, set([ lin ])) + db._set_lineage_assignment(hashval, set([lin, lin2])) + db._set_lineage_assignment(hashval2, set([lin])) assignments = lca_utils.gather_assignments(hashval_counts, [db]) counts = count_lca_for_assignments(assignments, hashval_counts) print(counts) assert len(counts) == 2 - assert counts[lin] == 5 # makes sense - assert counts[lin2] == 2 # lin+lin2 yield just lin2 + assert counts[lin] == 5 # makes sense + assert counts[lin2] == 2 # lin+lin2 yield just lin2 def test_is_lineage_match_1(): # basic behavior: match at order and above, but not at family or below. 
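A quick gloss on the semantics pinned down by the `is_lineage_match` tests in this hunk: two lineages "match at rank r" when they agree at r and at every rank above it. A hypothetical pure-Python equivalent (illustrative only — not the sourmash implementation; it assumes `LineagePair` exposes a `.rank` field, as the namedtuple usage elsewhere in these tests suggests):

```python
def is_lineage_match_sketch(lin1, lin2, rank):
    # walk both lineages from superkingdom downward, pair by pair
    for pair1, pair2 in zip(lin1, lin2):
        if pair1 != pair2:
            return False       # disagreement at or above `rank`
        if pair1.rank == rank:
            return True        # agreed all the way down to `rank`
    return False  # one lineage (or both) ends before reaching `rank`
```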
- lin1 = make_lineage('d__a;p__b;c__c;o__d;f__e') - lin2 = make_lineage('d__a;p__b;c__c;o__d;f__f') + lin1 = make_lineage("d__a;p__b;c__c;o__d;f__e") + lin2 = make_lineage("d__a;p__b;c__c;o__d;f__f") - assert is_lineage_match(lin1, lin2, 'superkingdom') - assert is_lineage_match(lin1, lin2, 'phylum') - assert is_lineage_match(lin1, lin2, 'class') - assert is_lineage_match(lin1, lin2, 'order') - assert not is_lineage_match(lin1, lin2, 'family') - assert not is_lineage_match(lin1, lin2, 'genus') - assert not is_lineage_match(lin1, lin2, 'species') + assert is_lineage_match(lin1, lin2, "superkingdom") + assert is_lineage_match(lin1, lin2, "phylum") + assert is_lineage_match(lin1, lin2, "class") + assert is_lineage_match(lin1, lin2, "order") + assert not is_lineage_match(lin1, lin2, "family") + assert not is_lineage_match(lin1, lin2, "genus") + assert not is_lineage_match(lin1, lin2, "species") def test_is_lineage_match_2(): # match at family, and above, levels; no genus or species to match - lin1 = make_lineage('d__a;p__b;c__c;o__d;f__f') - lin2 = make_lineage('d__a;p__b;c__c;o__d;f__f') + lin1 = make_lineage("d__a;p__b;c__c;o__d;f__f") + lin2 = make_lineage("d__a;p__b;c__c;o__d;f__f") - assert is_lineage_match(lin1, lin2, 'superkingdom') - assert is_lineage_match(lin1, lin2, 'phylum') - assert is_lineage_match(lin1, lin2, 'class') - assert is_lineage_match(lin1, lin2, 'order') - assert is_lineage_match(lin1, lin2, 'family') - assert not is_lineage_match(lin1, lin2, 'genus') - assert not is_lineage_match(lin1, lin2, 'species') + assert is_lineage_match(lin1, lin2, "superkingdom") + assert is_lineage_match(lin1, lin2, "phylum") + assert is_lineage_match(lin1, lin2, "class") + assert is_lineage_match(lin1, lin2, "order") + assert is_lineage_match(lin1, lin2, "family") + assert not is_lineage_match(lin1, lin2, "genus") + assert not is_lineage_match(lin1, lin2, "species") def test_is_lineage_match_3(): # one lineage is empty - lin1 = make_lineage('') - lin2 = make_lineage('d__a;p__b;c__c;o__d;f__f') + lin1 = make_lineage("") + lin2 = make_lineage("d__a;p__b;c__c;o__d;f__f") - assert not is_lineage_match(lin1, lin2, 'superkingdom') - assert not is_lineage_match(lin1, lin2, 'family') - assert not is_lineage_match(lin1, lin2, 'order') - assert not is_lineage_match(lin1, lin2, 'class') - assert not is_lineage_match(lin1, lin2, 'phylum') - assert not is_lineage_match(lin1, lin2, 'genus') - assert not is_lineage_match(lin1, lin2, 'species') + assert not is_lineage_match(lin1, lin2, "superkingdom") + assert not is_lineage_match(lin1, lin2, "family") + assert not is_lineage_match(lin1, lin2, "order") + assert not is_lineage_match(lin1, lin2, "class") + assert not is_lineage_match(lin1, lin2, "phylum") + assert not is_lineage_match(lin1, lin2, "genus") + assert not is_lineage_match(lin1, lin2, "species") def test_pop_to_rank_1(): # basic behavior - pop to order? - lin1 = make_lineage('d__a;p__b;c__c;o__d') - lin2 = make_lineage('d__a;p__b;c__c;o__d;f__f') + lin1 = make_lineage("d__a;p__b;c__c;o__d") + lin2 = make_lineage("d__a;p__b;c__c;o__d;f__f") print(lin1) - print(pop_to_rank(lin2, 'order')) - assert pop_to_rank(lin2, 'order') == lin1 + print(pop_to_rank(lin2, "order")) + assert pop_to_rank(lin2, "order") == lin1 def test_pop_to_rank_2(): # what if we're already above rank? 
- lin2 = make_lineage('d__a;p__b;c__c;o__d;f__f') + lin2 = make_lineage("d__a;p__b;c__c;o__d;f__f") - print(pop_to_rank(lin2, 'species')) - assert pop_to_rank(lin2, 'species') == lin2 + print(pop_to_rank(lin2, "species")) + assert pop_to_rank(lin2, "species") == lin2 diff --git a/tests/test_manifest.py b/tests/test_manifest.py index 074d72d705..138ae0f829 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -12,13 +12,13 @@ def test_generate_manifest(): # test basic manifest-generating functionality. - protzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") loader = sourmash.load_file_as_index(protzip) rows = [] siglist = [] - for (sig, loc) in loader._signatures_with_internal(): + for sig, loc in loader._signatures_with_internal(): row = index.CollectionManifest.make_manifest_row(sig, loc) rows.append(row) siglist.append(sig) @@ -28,9 +28,9 @@ def test_generate_manifest(): assert len(manifest) == len(rows) assert len(manifest) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list for sig in siglist: assert sig in manifest @@ -38,13 +38,13 @@ def test_generate_manifest(): def test_manifest_operations(): # test basic manifest operations - += - protzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") loader = sourmash.load_file_as_index(protzip) rows = [] siglist = [] - for (sig, loc) in loader._signatures_with_internal(): + for sig, loc in loader._signatures_with_internal(): row = index.CollectionManifest.make_manifest_row(sig, loc) rows.append(row) siglist.append(sig) @@ -53,24 +53,24 @@ def test_manifest_operations(): manifest2 = index.CollectionManifest(rows) manifest += manifest2 - assert len(manifest) == 2*len(rows) + assert len(manifest) == 2 * len(rows) assert len(manifest) == 4 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list def test_manifest_operations_fail(): # should not be able to add a manifest to itself - not only makes # no sense, but it means you're modifying a generator in place, sometimes. 
- protzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") loader = sourmash.load_file_as_index(protzip) rows = [] siglist = [] - for (sig, loc) in loader._signatures_with_internal(): + for sig, loc in loader._signatures_with_internal(): row = index.CollectionManifest.make_manifest_row(sig, loc) rows.append(row) siglist.append(sig) @@ -82,13 +82,13 @@ def test_manifest_operations_fail(): def test_manifest_to_picklist(): # test manifest/picklist interaction basics - protzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") loader = sourmash.load_file_as_index(protzip) rows = [] siglist = [] - for (sig, loc) in loader._signatures_with_internal(): + for sig, loc in loader._signatures_with_internal(): row = index.CollectionManifest.make_manifest_row(sig, loc) rows.append(row) siglist.append(sig) @@ -103,7 +103,7 @@ def test_manifest_to_picklist(): def test_manifest_compare(): # test saving and loading manifests - protzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") loader = sourmash.load_file_as_index(protzip) manifest = loader.manifest @@ -124,7 +124,7 @@ def test_manifest_compare(): # not equal / diff values rows = list(manifest.rows) rows[0] = dict(rows[0]) - rows[0]['internal_location'] += '.foo' + rows[0]["internal_location"] += ".foo" short_mf = index.CollectionManifest(rows) assert short_mf != manifest @@ -132,13 +132,13 @@ def test_manifest_compare(): def test_save_load_manifest(): # test saving and loading manifests - protzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") loader = sourmash.load_file_as_index(protzip) rows = [] siglist = [] - for (sig, loc) in loader._signatures_with_internal(): + for sig, loc in loader._signatures_with_internal(): row = index.CollectionManifest.make_manifest_row(sig, loc) rows.append(row) siglist.append(sig) @@ -179,7 +179,7 @@ def test_save_load_manifest(): # not equal / diff values rows = list(manifest.rows) rows[0] = dict(rows[0]) - rows[0]['internal_location'] += '.foo' + rows[0]["internal_location"] += ".foo" short_mf = index.CollectionManifest(rows) assert short_mf != manifest @@ -189,8 +189,7 @@ def test_manifest_to_picklist_bug(runtmp): # this tests a fun combination of things that led to a bug. # tl;dr we only want to iterate once across a generator... # ref #2762 - c = runtmp - all_zip = utils.get_test_data('prot/all.zip') + all_zip = utils.get_test_data("prot/all.zip") idx = sourmash_args.load_file_as_index(all_zip) assert len(idx) == 8 @@ -201,7 +200,7 @@ def test_manifest_to_picklist_bug(runtmp): def filter_fn(row): # match? 
keep = False - if "09a0869" in row['md5']: + if "09a0869" in row["md5"]: keep = True return keep @@ -219,17 +218,17 @@ def filter_fn(row): def test_generate_manifest_iterate_once(): # we should only iterate across manifest rows once - protzip = utils.get_test_data('prot/protein.zip') + protzip = utils.get_test_data("prot/protein.zip") loader = sourmash.load_file_as_index(protzip) siglist = [] - for (sig, loc) in loader._signatures_with_internal(): + for sig, loc in loader._signatures_with_internal(): siglist.append(sig) # build generator function => will not allow iteration twice def genfn(): - for (sig, loc) in loader._signatures_with_internal(): + for sig, loc in loader._signatures_with_internal(): row = index.CollectionManifest.make_manifest_row(sig, loc) yield row @@ -238,9 +237,9 @@ def genfn(): assert len(manifest) == 2 assert len(manifest._md5_set) == 2 - md5_list = [ row['md5'] for row in manifest.rows ] - assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list - assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list + md5_list = [row["md5"] for row in manifest.rows] + assert "16869d2c8a1d29d1c8e56f5c561e585e" in md5_list + assert "120d311cc785cc9d0df9dc0646b2b857" in md5_list for sig in siglist: assert sig in manifest diff --git a/tests/test_manifest_protocol.py b/tests/test_manifest_protocol.py index 5b9ea003d5..d36e8a309c 100644 --- a/tests/test_manifest_protocol.py +++ b/tests/test_manifest_protocol.py @@ -13,7 +13,7 @@ def build_simple_manifest(runtmp): # load and return the manifest from prot/all.zip - filename = utils.get_test_data('prot/all.zip') + filename = utils.get_test_data("prot/all.zip") idx = sourmash.load_file_as_index(filename) mf = idx.manifest assert len(mf) == 8 @@ -22,29 +22,29 @@ def build_simple_manifest(runtmp): def build_sqlite_manifest(runtmp): # return the manifest from prot/all.zip - filename = utils.get_test_data('prot/all.zip') + filename = utils.get_test_data("prot/all.zip") idx = sourmash.load_file_as_index(filename) mf = idx.manifest # build sqlite manifest from this 'un - mfdb = runtmp.output('test.sqlmf') + mfdb = runtmp.output("test.sqlmf") return SqliteCollectionManifest.load_from_manifest(mf, dbfile=mfdb) - + def save_load_manifest(runtmp): # save/load the manifest from a CSV. 
mf = build_simple_manifest(runtmp) - mf_csv = runtmp.output('mf.csv') + mf_csv = runtmp.output("mf.csv") mf.write_to_filename(mf_csv) load_mf = CollectionManifest.load_from_filename(mf_csv) return load_mf - -@pytest.fixture(params=[build_simple_manifest, - save_load_manifest, - build_sqlite_manifest]) + +@pytest.fixture( + params=[build_simple_manifest, save_load_manifest, build_sqlite_manifest] +) def manifest_obj(request, runtmp): build_fn = request.param @@ -55,6 +55,7 @@ def manifest_obj(request, runtmp): ### generic CollectionManifest tests go here ### + def test_manifest_len(manifest_obj): # check that 'len' works assert len(manifest_obj) == 8 @@ -78,39 +79,38 @@ def test_manifest_bool(manifest_obj): def test_make_manifest_row(manifest_obj): # build a manifest row from a signature - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss = sourmash.load_one_signature(sig47) - row = manifest_obj.make_manifest_row(ss, 'foo', include_signature=False) - assert not 'signature' in row - assert row['internal_location'] == 'foo' - - assert row['md5'] == ss.md5sum() - assert row['md5short'] == ss.md5sum()[:8] - assert row['ksize'] == 31 - assert row['moltype'] == 'DNA' - assert row['num'] == 0 - assert row['scaled'] == 1000 - assert row['n_hashes'] == len(ss.minhash) - assert not row['with_abundance'] - assert row['name'] == ss.name - assert row['filename'] == ss.filename - - + row = manifest_obj.make_manifest_row(ss, "foo", include_signature=False) + assert "signature" not in row + assert row["internal_location"] == "foo" + + assert row["md5"] == ss.md5sum() + assert row["md5short"] == ss.md5sum()[:8] + assert row["ksize"] == 31 + assert row["moltype"] == "DNA" + assert row["num"] == 0 + assert row["scaled"] == 1000 + assert row["n_hashes"] == len(ss.minhash) + assert not row["with_abundance"] + assert row["name"] == ss.name + assert row["filename"] == ss.filename + + def test_manifest_create_manifest(manifest_obj): # test the 'create_manifest' method - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss = sourmash.load_one_signature(sig47) def yield_sigs(): - yield ss, 'fiz' + yield ss, "fiz" - new_mf = manifest_obj.create_manifest(yield_sigs(), - include_signature=False) + new_mf = manifest_obj.create_manifest(yield_sigs(), include_signature=False) assert len(new_mf) == 1 new_row = list(new_mf.rows)[0] - - row = manifest_obj.make_manifest_row(ss, 'fiz', include_signature=False) + + row = manifest_obj.make_manifest_row(ss, "fiz", include_signature=False) required_keys = BaseCollectionManifest.required_keys for k in required_keys: @@ -119,32 +119,37 @@ def yield_sigs(): def test_manifest_select_to_manifest(manifest_obj): # do some light testing of 'select_to_manifest' - new_mf = manifest_obj.select_to_manifest(moltype='DNA') + new_mf = manifest_obj.select_to_manifest(moltype="DNA") assert len(new_mf) == 2 def test_manifest_locations(manifest_obj): # check the 'locations' method - locs = set(['dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig', - 'dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig', - 'hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig', - 'hp/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig', - 'protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig', - 'protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig', - 'dna-sig.noext', - 'dna-sig.sig.gz'] - ) + locs = set( + [ + "dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig", + "dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig", + 
"hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig", + "hp/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig", + "protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig", + "protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig", + "dna-sig.noext", + "dna-sig.sig.gz", + ] + ) assert set(manifest_obj.locations()) == locs def test_manifest___contains__(manifest_obj): # check the 'in' operator - sigfile = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') + sigfile = utils.get_test_data( + "prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) ss = sourmash.load_one_signature(sigfile) assert ss in manifest_obj - sigfile2 = utils.get_test_data('2.fa.sig') + sigfile2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sigfile2, ksize=31) assert ss2 not in manifest_obj @@ -159,36 +164,37 @@ def test_manifest_to_picklist(manifest_obj): def test_manifest_filter_rows(manifest_obj): # test filter_rows - filter_fn = lambda x: 'OS223' in x['name'] + def filter_fn(x): + return "OS223" in x["name"] mf = manifest_obj.filter_rows(filter_fn) assert len(mf) == 1 row = list(mf.rows)[0] - assert row['name'] == 'NC_011663.1 Shewanella baltica OS223, complete genome' + assert row["name"] == "NC_011663.1 Shewanella baltica OS223, complete genome" def test_manifest_filter_cols(manifest_obj): # test filter_rows - col_filter_fn = lambda x: 'OS223' in x[0] + def col_filter_fn(x): + return "OS223" in x[0] - mf = manifest_obj.filter_on_columns(col_filter_fn, ['name']) + mf = manifest_obj.filter_on_columns(col_filter_fn, ["name"]) assert len(mf) == 1 row = list(mf.rows)[0] - assert row['name'] == 'NC_011663.1 Shewanella baltica OS223, complete genome' + assert row["name"] == "NC_011663.1 Shewanella baltica OS223, complete genome" def test_manifest_iadd(manifest_obj): # test the 'create_manifest' method - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss = sourmash.load_one_signature(sig47) def yield_sigs(): - yield ss, 'fiz' + yield ss, "fiz" - new_mf = manifest_obj.create_manifest(yield_sigs(), - include_signature=False) + new_mf = manifest_obj.create_manifest(yield_sigs(), include_signature=False) assert len(new_mf) == 1 new_mf += manifest_obj @@ -197,14 +203,13 @@ def yield_sigs(): def test_manifest_add(manifest_obj): # test the 'create_manifest' method - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss = sourmash.load_one_signature(sig47) def yield_sigs(): - yield ss, 'fiz' + yield ss, "fiz" - new_mf = manifest_obj.create_manifest(yield_sigs(), - include_signature=False) + new_mf = manifest_obj.create_manifest(yield_sigs(), include_signature=False) assert len(new_mf) == 1 new_mf2 = new_mf + manifest_obj diff --git a/tests/test_minhash.py b/tests/test_minhash.py index 05802c0bff..474f1e231a 100644 --- a/tests/test_minhash.py +++ b/tests/test_minhash.py @@ -49,7 +49,7 @@ hash_murmur, _get_scaled_for_max_hash, _get_max_hash_for_scaled, - translate_codon + translate_codon, ) from sourmash import signature @@ -79,18 +79,18 @@ def _kmers_from_all_coding_frames(sequence, ksize): for frame in (0, 1, 2): # get forward k-mers for start in range(0, len(sequence) - ksize + 1 - frame, 3): - kmer = sequence[start + frame:start + frame + ksize] + kmer = sequence[start + frame : start + frame + ksize] yield kmer # get rc k-mers for start in range(0, len(seqrc) - ksize + 1 - frame, 3): - kmer = seqrc[start + frame:start + frame + ksize] + kmer = seqrc[start + frame : start + frame + ksize] yield kmer 
def _hash_fwd_only(mh_translate, seq): "Return the first hashval only, for coding frame +1." - assert len(seq) == mh_translate.ksize*3 + assert len(seq) == mh_translate.ksize * 3 xx = mh_translate.seq_to_hashes(seq)[0] return xx @@ -98,12 +98,12 @@ def _hash_fwd_only(mh_translate, seq): def test_basic_dna(track_abundance): # verify that MHs of size 1 stay size 1, & act properly as bottom sketches. mh = MinHash(1, 4, track_abundance=track_abundance) - assert mh.moltype == 'DNA' + assert mh.moltype == "DNA" - mh.add_sequence('ATGC') + mh.add_sequence("ATGC") a = mh.hashes - mh.add_sequence('GCAT') # this will not get added; hash > ATGC + mh.add_sequence("GCAT") # this will not get added; hash > ATGC b = mh.hashes print(a, b) @@ -117,7 +117,7 @@ def test_div_zero(track_abundance): mh = MinHash(1, 4, track_abundance=track_abundance) mh2 = mh.copy_and_clear() - mh.add_sequence('ATGC') + mh.add_sequence("ATGC") assert mh.similarity(mh2) == 0 assert mh2.similarity(mh) == 0 @@ -127,7 +127,7 @@ def test_div_zero_contained(track_abundance): mh = MinHash(0, 4, scaled=1, track_abundance=track_abundance) mh2 = mh.copy_and_clear() - mh.add_sequence('ATGC') + mh.add_sequence("ATGC") assert mh.contained_by(mh2) == 0 assert mh2.contained_by(mh) == 0 @@ -137,8 +137,8 @@ def test_contained_requires_scaled(track_abundance): mh1 = MinHash(1, 4, track_abundance=track_abundance) mh2 = MinHash(0, 4, scaled=1, track_abundance=track_abundance) - mh1.add_sequence('ATGC') - mh2.add_sequence('ATGC') + mh1.add_sequence("ATGC") + mh2.add_sequence("ATGC") with pytest.raises(TypeError): mh2.contained_by(mh1) @@ -152,8 +152,8 @@ def test_contained_requires_scaled_2(track_abundance): mh1 = MinHash(1, 4, track_abundance=track_abundance) mh2 = MinHash(0, 4, scaled=1, track_abundance=track_abundance) - mh1.add_sequence('ATGC') - mh2.add_sequence('ATGC') + mh1.add_sequence("ATGC") + mh2.add_sequence("ATGC") with pytest.raises(TypeError): mh2.max_containment(mh1) @@ -167,8 +167,8 @@ def test_contained_requires_scaled_3(track_abundance): mh1 = MinHash(1, 4, track_abundance=track_abundance) mh2 = MinHash(0, 4, scaled=1, track_abundance=track_abundance) - mh1.add_sequence('ATGC') - mh2.add_sequence('ATGC') + mh1.add_sequence("ATGC") + mh2.add_sequence("ATGC") with pytest.raises(TypeError): mh2.avg_containment(mh1) @@ -179,36 +179,39 @@ def test_contained_requires_scaled_3(track_abundance): def test_bytes_dna(track_abundance): mh = MinHash(1, 4, track_abundance=track_abundance) - mh.add_sequence('ATGC') - mh.add_sequence(b'ATGC') - mh.add_sequence('ATGC') + mh.add_sequence("ATGC") + mh.add_sequence(b"ATGC") + mh.add_sequence("ATGC") a = mh.hashes - mh.add_sequence('GCAT') # this will not get added; hash > ATGC - mh.add_sequence(b'GCAT') # this will not get added; hash > ATGC - mh.add_sequence('GCAT') # this will not get added; hash > ATGC + mh.add_sequence("GCAT") # this will not get added; hash > ATGC + mh.add_sequence(b"GCAT") # this will not get added; hash > ATGC + mh.add_sequence("GCAT") # this will not get added; hash > ATGC b = mh.hashes print(a, b) assert list(a) == list(b) assert len(b) == 1 + def test_add_long_seqs_force(): # Test for (All kmers are invalid) - mh = sourmash.minhash.MinHash(n = 0, ksize=21, scaled =10, seed = 42) + mh = sourmash.minhash.MinHash(n=0, ksize=21, scaled=10, seed=42) seq = "ACGTN" * 100000 - hashes = mh.seq_to_hashes(seq, force = True) - assert(len(mh.hashes) == 0) + mh.seq_to_hashes(seq, force=True) + assert len(mh.hashes) == 0 def test_seq_to_hashes(track_abundance): - mh = 
sourmash.minhash.MinHash(n=0, ksize=21, scaled=1, track_abundance=track_abundance) + mh = sourmash.minhash.MinHash( + n=0, ksize=21, scaled=1, track_abundance=track_abundance + ) seq = "ATGAGAGACGATAGACAGATGACC" mh.add_sequence(seq) golden_hashes = mh.hashes - + # New seq to hashes without adding to the sketch new_hashes = mh.seq_to_hashes(seq) @@ -216,7 +219,14 @@ def test_seq_to_hashes(track_abundance): def test_seq_to_hashes_protein_1(track_abundance, dayhoff): - mh = MinHash(10, 2, is_protein=True, dayhoff=dayhoff, hp=False, track_abundance=track_abundance) + mh = MinHash( + 10, + 2, + is_protein=True, + dayhoff=dayhoff, + hp=False, + track_abundance=track_abundance, + ) prot_seq = "AGYYG" mh.add_protein(prot_seq) @@ -224,16 +234,19 @@ def test_seq_to_hashes_protein_1(track_abundance, dayhoff): golden_hashes = mh.hashes # New seq to hashes without adding to the sketch - new_hashes = mh.seq_to_hashes(prot_seq, is_protein = True) + new_hashes = mh.seq_to_hashes(prot_seq, is_protein=True) assert set(golden_hashes) == set(new_hashes) + def test_seq_to_hashes_protein_2(track_abundance): - mh = sourmash.minhash.MinHash(n=0, ksize=21, scaled=1, track_abundance=track_abundance) + mh = sourmash.minhash.MinHash( + n=0, ksize=21, scaled=1, track_abundance=track_abundance + ) seq = "ATGAGAGACGATAGACAGATGACC" with pytest.raises(ValueError): - mh.seq_to_hashes(seq, is_protein = True) + mh.seq_to_hashes(seq, is_protein=True) def test_seq_to_hashes_translated(track_abundance): @@ -252,7 +265,7 @@ def test_seq_to_hashes_translated(track_abundance): def test_seq_to_hashes_bad_kmers_as_zeroes_1(): mh = sourmash.minhash.MinHash(n=0, ksize=21, scaled=1) seq = "ATGAGAGACGATAGACAGATGACN" - + # New seq to hashes without adding to the sketch hashes = mh.seq_to_hashes(seq, force=True, bad_kmers_as_zeroes=True) @@ -262,54 +275,69 @@ def test_seq_to_hashes_bad_kmers_as_zeroes_1(): def test_seq_to_hashes_bad_kmers_as_zeroes_2(): mh = sourmash.minhash.MinHash(n=0, ksize=21, scaled=1) seq = "ATGAGAGACGATAGACAGATGACN" - + with pytest.raises(ValueError): - hashes = mh.seq_to_hashes(seq, bad_kmers_as_zeroes=True) + mh.seq_to_hashes(seq, bad_kmers_as_zeroes=True) def test_seq_to_hashes_translated_short(): - mh = MinHash(0, 2, is_protein=True, dayhoff=True, hp=False, scaled = 1) + mh = MinHash(0, 2, is_protein=True, dayhoff=True, hp=False, scaled=1) hashes = mh.seq_to_hashes("ACTGA") - assert(len(hashes) == 0) + assert len(hashes) == 0 def test_bytes_protein_dayhoff(track_abundance, dayhoff): # verify that we can hash protein/aa sequences - mh = MinHash(10, 2, is_protein=True, dayhoff=dayhoff, hp=False, - track_abundance=track_abundance) - - expected_moltype = 'protein' + mh = MinHash( + 10, + 2, + is_protein=True, + dayhoff=dayhoff, + hp=False, + track_abundance=track_abundance, + ) + + expected_moltype = "protein" if dayhoff: - expected_moltype = 'dayhoff' + expected_moltype = "dayhoff" assert mh.moltype == expected_moltype - mh.add_protein('AGYYG') - mh.add_protein('AGYYG') - mh.add_protein(b'AGYYG') + mh.add_protein("AGYYG") + mh.add_protein("AGYYG") + mh.add_protein(b"AGYYG") assert len(mh.hashes) == 4 def test_protein_dayhoff(track_abundance, dayhoff): # verify that we can hash protein/aa sequences - mh = MinHash(10, 2, is_protein=True, dayhoff=dayhoff, hp=False, track_abundance=track_abundance) - mh.add_protein('AGYYG') + mh = MinHash( + 10, + 2, + is_protein=True, + dayhoff=dayhoff, + hp=False, + track_abundance=track_abundance, + ) + mh.add_protein("AGYYG") assert len(mh.hashes) == 4 def 
test_bytes_protein_hp(track_abundance, hp): # verify that we can hash protein/aa sequences - mh = MinHash(10, 2, is_protein=True, dayhoff=False, hp=hp, track_abundance=track_abundance) - expected_moltype = 'protein' + mh = MinHash( + 10, 2, is_protein=True, dayhoff=False, hp=hp, track_abundance=track_abundance + ) + expected_moltype = "protein" if hp: - expected_moltype = 'hp' + expected_moltype = "hp" assert mh.moltype == expected_moltype - mh.add_protein('AGYYG') - mh.add_protein(u'AGYYG') - mh.add_protein(b'AGYYG') + mh.add_protein("AGYYG") + mh.add_protein("AGYYG") + mh.add_protein(b"AGYYG") if hp: assert len(mh.hashes) == 1 @@ -319,8 +347,10 @@ def test_bytes_protein_hp(track_abundance, hp): def test_protein_hp(track_abundance, hp): # verify that we can hash protein/aa sequences - mh = MinHash(10, 2, is_protein=True, dayhoff=False, hp=hp, track_abundance=track_abundance) - mh.add_protein('AGYYG') + mh = MinHash( + 10, 2, is_protein=True, dayhoff=False, hp=hp, track_abundance=track_abundance + ) + mh.add_protein("AGYYG") if hp: assert len(mh.hashes) == 1 @@ -330,8 +360,8 @@ def test_protein_hp(track_abundance, hp): def test_module_translate_codon(track_abundance): # Ensure that translation occurs properly - module level function tests - assert "S" == translate_codon('TCT') - assert "S" == translate_codon('TC') + assert "S" == translate_codon("TCT") + assert "S" == translate_codon("TC") assert "X" == translate_codon("T") with pytest.raises(ValueError): @@ -341,14 +371,15 @@ def test_module_translate_codon(track_abundance): def test_dayhoff(track_abundance): # verify that we can hash to dayhoff-encoded protein/aa sequences - mh_dayhoff = MinHash(10, 2, is_protein=True, - dayhoff=True, hp=False, track_abundance=track_abundance) - mh_dayhoff.add_sequence('ACTGAC') + mh_dayhoff = MinHash( + 10, 2, is_protein=True, dayhoff=True, hp=False, track_abundance=track_abundance + ) + mh_dayhoff.add_sequence("ACTGAC") assert len(mh_dayhoff.hashes) == 2 # verify that dayhoff-encoded hashes are different from protein/aa hashes mh_protein = MinHash(10, 2, is_protein=True, track_abundance=track_abundance) - mh_protein.add_sequence('ACTGAC') + mh_protein.add_sequence("ACTGAC") assert len(mh_protein.hashes) == 2 print(mh_protein.hashes) @@ -360,39 +391,40 @@ def test_dayhoff_2(track_abundance): mh = MinHash(0, 7, scaled=1, dayhoff=True, track_abundance=1) # first, check protein -> dayhoff hashes via minhash - mh.add_protein('CADHIFC') + mh.add_protein("CADHIFC") assert len(mh) == 1 hashval = list(mh.hashes)[0] - assert hashval == hash_murmur('abcdefa') + assert hashval == hash_murmur("abcdefa") # also check seq_to_hashes - hashes = list(mh.seq_to_hashes('CADHIFC', is_protein=True)) + hashes = list(mh.seq_to_hashes("CADHIFC", is_protein=True)) assert hashval == hashes[0] # do we handle stop codons properly? 
mh = mh.copy_and_clear() - mh.add_protein('CADHIF*') + mh.add_protein("CADHIF*") assert len(mh) == 1 hashval = list(mh.hashes)[0] - assert hashval == hash_murmur('abcdef*') + assert hashval == hash_murmur("abcdef*") # again, check seq_to_hashes - hashes = list(mh.seq_to_hashes('CADHIF*', is_protein=True)) + hashes = list(mh.seq_to_hashes("CADHIF*", is_protein=True)) assert hashval == hashes[0] def test_hp(track_abundance): # verify that we can hash to hp-encoded protein/aa sequences - mh_hp = MinHash(10, 2, is_protein=True, - dayhoff=False, hp=True, track_abundance=track_abundance) - assert mh_hp.moltype == 'hp' + mh_hp = MinHash( + 10, 2, is_protein=True, dayhoff=False, hp=True, track_abundance=track_abundance + ) + assert mh_hp.moltype == "hp" - mh_hp.add_sequence('ACTGAC') + mh_hp.add_sequence("ACTGAC") assert len(mh_hp.hashes) == 2 # verify that hp-encoded hashes are different from protein/aa hashes mh_protein = MinHash(10, 2, is_protein=True, track_abundance=track_abundance) - mh_protein.add_sequence('ACTGAC') + mh_protein.add_sequence("ACTGAC") assert len(mh_protein.hashes) == 2 assert mh_protein.hashes != mh_hp.hashes @@ -401,30 +433,30 @@ def test_hp(track_abundance): def test_hp_2(track_abundance): mh = MinHash(0, 3, scaled=1, hp=True, track_abundance=track_abundance) - mh.add_protein('ANA') + mh.add_protein("ANA") assert len(mh) == 1 hashval = list(mh.hashes)[0] - assert hashval == hash_murmur('hph') + assert hashval == hash_murmur("hph") # also check seq_to_hashes - hashes = list(mh.seq_to_hashes('ANA', is_protein=True)) + hashes = list(mh.seq_to_hashes("ANA", is_protein=True)) assert hashval == hashes[0] mh = mh.copy_and_clear() - mh.add_protein('AN*') + mh.add_protein("AN*") assert len(mh) == 1 hashval = list(mh.hashes)[0] - assert hashval == hash_murmur('hp*') + assert hashval == hash_murmur("hp*") # also check seq_to_hashes - hashes = list(mh.seq_to_hashes('AN*', is_protein=True)) + hashes = list(mh.seq_to_hashes("AN*", is_protein=True)) assert hashval == hashes[0] def test_protein_short(track_abundance): # verify that we can hash protein/aa sequences mh = MinHash(10, 9, is_protein=True, track_abundance=track_abundance) - mh.add_protein('AG') + mh.add_protein("AG") assert len(mh.hashes) == 0, mh.hashes @@ -436,14 +468,14 @@ def test_size_limit(track_abundance): mh.add_hash(20) mh.add_hash(30) assert list(sorted(mh.hashes)) == [10, 20, 30] - mh.add_hash(5) # -> should push 30 off end + mh.add_hash(5) # -> should push 30 off end assert list(sorted(mh.hashes)) == [5, 10, 20] def test_scaled(track_abundance): # test behavior with scaled scaled = _get_scaled_for_max_hash(35) - print('XX', scaled, _get_max_hash_for_scaled(scaled)) + print("XX", scaled, _get_max_hash_for_scaled(scaled)) mh = MinHash(0, 4, track_abundance=track_abundance, scaled=scaled) assert mh._max_hash == 35 @@ -461,11 +493,11 @@ def test_scaled(track_abundance): def test_no_scaled(track_abundance): # no 'scaled', num=0 - should fail with pytest.raises(ValueError): - mh = MinHash(0, 4, track_abundance=track_abundance) + MinHash(0, 4, track_abundance=track_abundance) def test_max_hash_conversion(): - SCALED=100000 + SCALED = 100000 max_hash = _get_max_hash_for_scaled(SCALED) new_scaled = _get_scaled_for_max_hash(max_hash) assert new_scaled == SCALED @@ -481,15 +513,15 @@ def test_max_hash_and_scaled_zero(): def test_max_hash_and_scaled_error(track_abundance): # test behavior when supplying both max_hash and scaled with pytest.raises(ValueError): - mh = MinHash(0, 4, track_abundance=track_abundance, max_hash=35, - 
scaled=5) + MinHash(0, 4, track_abundance=track_abundance, max_hash=35, scaled=5) def test_max_hash_cannot_limit(track_abundance): # make sure you can't set both n and scaled. with pytest.raises(ValueError): - mh = MinHash(2, 4, track_abundance=track_abundance, - scaled=_get_scaled_for_max_hash(1)) + MinHash( + 2, 4, track_abundance=track_abundance, scaled=_get_scaled_for_max_hash(1) + ) def test_no_downsample_scaled_if_n(track_abundance): @@ -498,13 +530,13 @@ def test_no_downsample_scaled_if_n(track_abundance): with pytest.raises(ValueError) as excinfo: mh.downsample(scaled=100000000) - assert 'cannot downsample a num MinHash using scaled' in str(excinfo.value) + assert "cannot downsample a num MinHash using scaled" in str(excinfo.value) def test_scaled_num_both(track_abundance): # make sure you can't set both max_n and scaled. with pytest.raises(ValueError): - mh = MinHash(2, 4, track_abundance=track_abundance, scaled=2) + MinHash(2, 4, track_abundance=track_abundance, scaled=2) def test_mh_jaccard_similarity(): @@ -514,7 +546,7 @@ def test_mh_jaccard_similarity(): a.add_many([1, 3, 5, 8]) b.add_many([1, 3, 5, 6, 8, 10]) - assert a.similarity(b) == 4. / 6. + assert a.similarity(b) == 4.0 / 6.0 def test_mh_similarity_downsample_jaccard_value(): @@ -526,10 +558,10 @@ def test_mh_similarity_downsample_jaccard_value(): b = MinHash(0, 20, scaled=scaled100, track_abundance=False) a.add_many([1, 3, 5, 8, 70]) - b.add_many([1, 3, 5, 6, 8, 10, 70 ]) + b.add_many([1, 3, 5, 6, 8, 10, 70]) # the hash=70 will be truncated by downsampling - assert a.similarity(b, downsample=True) == 4. / 6. + assert a.similarity(b, downsample=True) == 4.0 / 6.0 def test_mh_angular_similarity(): @@ -539,11 +571,11 @@ def test_mh_angular_similarity(): # are always positive (https://en.wikipedia.org/wiki/Cosine_similarity) a = MinHash(0, 20, scaled=scaled50, track_abundance=True) b = MinHash(0, 20, scaled=scaled50, track_abundance=True) - a.set_abundances({ 1:5, 3:3, 5:2, 8:2}) - b.set_abundances({ 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }) + a.set_abundances({1: 5, 3: 3, 5: 2, 8: 2}) + b.set_abundances({1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}) cos_sim = 0.9356 - angular_sim = 1 - 2*math.acos(cos_sim) / math.pi + angular_sim = 1 - 2 * math.acos(cos_sim) / math.pi assert round(angular_sim, 4) == 0.7703 assert round(a.similarity(b), 4) == round(angular_sim, 4) @@ -553,13 +585,13 @@ def test_mh_angular_similarity_2(): # check actual angular similarity for a second non-trivial case a = MinHash(0, 20, scaled=scaled100, track_abundance=True) b = MinHash(0, 20, scaled=scaled100, track_abundance=True) - a.set_abundances({ 1:5, 3:3, 5:2, 8:2, 70:70 }) - b.set_abundances({ 1:3, 3:2, 5:1, 6:1, 8:1, 10:1, 70:70 }) + a.set_abundances({1: 5, 3: 3, 5: 2, 8: 2, 70: 70}) + b.set_abundances({1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1, 70: 70}) assert round(a.similarity(b), 4) == 0.9728 # ignore_abundance => jaccard - assert a.similarity(b, ignore_abundance=True) == 5. / 7. 
+ assert a.similarity(b, ignore_abundance=True) == 5.0 / 7.0 def test_mh_similarity_downsample_angular_value(): @@ -570,8 +602,8 @@ def test_mh_similarity_downsample_angular_value(): # max_hash = 100 b = MinHash(0, 20, scaled=scaled100, track_abundance=True) - a.set_abundances({ 1:5, 3:3, 5:2, 8:2, 70:70 }) - b.set_abundances({ 1:3, 3:2, 5:1, 6:1, 8:1, 10:1, 70:70 }) + a.set_abundances({1: 5, 3: 3, 5: 2, 8: 2, 70: 70}) + b.set_abundances({1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1, 70: 70}) # the hash=70 will be truncated by downsampling sim = a.similarity(b, downsample=True) @@ -579,15 +611,16 @@ def test_mh_similarity_downsample_angular_value(): # with ignore_abundance, will be equal to jaccard jaccard = a.similarity(b, downsample=True, ignore_abundance=True) - assert jaccard == 4. / 6. + assert jaccard == 4.0 / 6.0 + def test_mh_angular_similarity_fail(): # raise TypeError if calling angular_similarity directly and # one or both sketches do not have abundance info a = MinHash(0, 20, scaled=scaled50, track_abundance=True) b = MinHash(0, 20, scaled=scaled50, track_abundance=False) - a_values = { 1:5, 3:3, 5:2, 8:2} - b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } + a_values = {1: 5, 3: 3, 5: 2, 8: 2} + b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} a.set_abundances(a_values) b.add_many(b_values.keys()) @@ -596,14 +629,20 @@ def test_mh_angular_similarity_fail(): with pytest.raises(TypeError) as exc: a.angular_similarity(b) print(str(exc)) - assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc) + assert ( + "Error: Angular (cosine) similarity requires both sketches to track hash abundance." + in str(exc) + ) # both sketches lack track abundance a = MinHash(0, 20, scaled=scaled50, track_abundance=False) a.add_many(a_values.keys()) with pytest.raises(TypeError) as exc: a.angular_similarity(b) print(str(exc)) - assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc) + assert ( + "Error: Angular (cosine) similarity requires both sketches to track hash abundance." 
+ in str(exc) + ) def test_mh_similarity_downsample_true(track_abundance): @@ -614,8 +653,8 @@ def test_mh_similarity_downsample_true(track_abundance): # max_hash = 100 b = MinHash(0, 20, scaled=scaled100, track_abundance=track_abundance) - a_values = { 1:5, 3:3, 5:2, 8:2} - b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } + a_values = {1: 5, 3: 3, 5: 2, 8: 2} + b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} if track_abundance: a.set_abundances(a_values) b.set_abundances(b_values) @@ -642,8 +681,8 @@ def test_mh_similarity_downsample_errors(track_abundance): # max_hash = 100 b = MinHash(0, 20, scaled=scaled100, track_abundance=track_abundance) - a_values = { 1:5, 3:3, 5:2, 8:2} - b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } + a_values = {1: 5, 3: 3, 5: 2, 8: 2} + b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} if track_abundance: a.set_abundances(a_values) b.set_abundances(b_values) @@ -653,20 +692,20 @@ def test_mh_similarity_downsample_errors(track_abundance): # error, incompatible max hash with pytest.raises(ValueError) as e: - a.similarity(b, ignore_abundance=True) # downsample=False - assert 'mismatch in scaled; comparison fail' in str(e.value) + a.similarity(b, ignore_abundance=True) # downsample=False + assert "mismatch in scaled; comparison fail" in str(e.value) with pytest.raises(ValueError) as e: a.similarity(b, ignore_abundance=False) # downsample=False - assert 'mismatch in scaled; comparison fail' in str(e.value) + assert "mismatch in scaled; comparison fail" in str(e.value) with pytest.raises(ValueError) as e: - b.similarity(a, ignore_abundance=True) # downsample=False - assert 'mismatch in scaled; comparison fail' in str(e.value) + b.similarity(a, ignore_abundance=True) # downsample=False + assert "mismatch in scaled; comparison fail" in str(e.value) with pytest.raises(ValueError) as e: b.similarity(a, ignore_abundance=False) # downsample=false - assert 'mismatch in scaled; comparison fail' in str(e.value) + assert "mismatch in scaled; comparison fail" in str(e.value) def test_basic_dna_bad(track_abundance): @@ -674,10 +713,10 @@ def test_basic_dna_bad(track_abundance): mh = MinHash(1, 4, track_abundance=track_abundance) with pytest.raises(ValueError) as e: - mh.add_sequence('ATGR') + mh.add_sequence("ATGR") print(e) - assert 'invalid DNA character in input k-mer: ATGR' in str(e.value) + assert "invalid DNA character in input k-mer: ATGR" in str(e.value) def test_basic_dna_bad_2(track_abundance): @@ -685,40 +724,40 @@ def test_basic_dna_bad_2(track_abundance): mh = MinHash(1, 6, track_abundance=track_abundance) with pytest.raises(ValueError): - mh.add_protein('YYYY') + mh.add_protein("YYYY") def test_basic_dna_bad_force(track_abundance): # test behavior on bad DNA; use 100 so multiple hashes get added. mh = MinHash(100, 4, track_abundance=track_abundance) assert len(mh.hashes) == 0 - mh.add_sequence('ATGN', True) # ambiguous kmer skipped. + mh.add_sequence("ATGN", True) # ambiguous kmer skipped. assert len(mh.hashes) == 0 - mh.add_sequence('AATGN', True) # but good k-mers still used. + mh.add_sequence("AATGN", True) # but good k-mers still used. 
assert len(mh.hashes) == 1 - mh.add_sequence('AATG', True) # checking that right kmer was added - assert len(mh.hashes) == 1 # (only 1 hash <- this is a dup) + mh.add_sequence("AATG", True) # checking that right kmer was added + assert len(mh.hashes) == 1 # (only 1 hash <- this is a dup) def test_basic_dna_bad_force_2(track_abundance): # test behavior on bad DNA mh = MinHash(100, 4, track_abundance=track_abundance) assert len(mh.hashes) == 0 - mh.add_sequence('AAGNCGG', True) # ambiguous kmers skipped. + mh.add_sequence("AAGNCGG", True) # ambiguous kmers skipped. assert len(mh.hashes) == 0 - mh.add_sequence('AATGNGCGG', True) # ambiguous kmers skipped. + mh.add_sequence("AATGNGCGG", True) # ambiguous kmers skipped. assert len(mh.hashes) == 2 - mh.add_sequence('AATG', True) # checking that right kmers were added - mh.add_sequence('GCGG', True) - assert len(mh.hashes) == 2 # (only 2 hashes should be there) + mh.add_sequence("AATG", True) # checking that right kmers were added + mh.add_sequence("GCGG", True) + assert len(mh.hashes) == 2 # (only 2 hashes should be there) def test_consume_lowercase(track_abundance): a = MinHash(20, 10, track_abundance=track_abundance) b = MinHash(20, 10, track_abundance=track_abundance) - a.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA'.lower()) - b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') + a.add_sequence("TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA".lower()) + b.add_sequence("TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA") assert round(a.similarity(b), 3) == 1.0 assert round(b.similarity(b), 3) == 1.0 @@ -730,8 +769,8 @@ def test_similarity_1(track_abundance): a = MinHash(20, 10, track_abundance=track_abundance) b = MinHash(20, 10, track_abundance=track_abundance) - a.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') - b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') + a.add_sequence("TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA") + b.add_sequence("TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA") assert round(a.similarity(b), 3) == 1.0 assert round(b.similarity(b), 3) == 1.0 @@ -739,14 +778,13 @@ def test_similarity_1(track_abundance): assert round(a.similarity(a), 3) == 1.0 # add same sequence again - b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') + b.add_sequence("TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA") assert round(a.similarity(b), 3) == 1.0 assert round(b.similarity(b), 3) == 1.0 assert round(b.similarity(a), 3) == 1.0 assert round(a.similarity(a), 3) == 1.0 - - b.add_sequence('GATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTT') + b.add_sequence("GATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTT") x = a.similarity(b) assert x >= 0.3, x @@ -777,7 +815,7 @@ def test_frozen_copy(track_abundance): def test_mh_copy(track_abundance): a = MinHash(20, 10, track_abundance=track_abundance) - a.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') + a.add_sequence("TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA") b = a.__copy__() assert round(b.similarity(a), 3) == 1.0 @@ -786,7 +824,7 @@ def test_mh_len(track_abundance): a = MinHash(20, 10, track_abundance=track_abundance) assert len(a) == 0 - a.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') + a.add_sequence("TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA") assert len(a) == 20 @@ -800,7 +838,7 @@ def test_mh_len_2(track_abundance): def test_mh_unsigned_long_long(track_abundance): a = MinHash(20, 10, track_abundance=track_abundance) - 
a.add_hash(9227159859419181011) # too big for a C long int. + a.add_hash(9227159859419181011) # too big for a C long int. assert 9227159859419181011 in a.hashes @@ -826,10 +864,20 @@ def test_mh_count_common_diff_protein(track_abundance): def test_mh_count_common_diff_maxhash(track_abundance): - a = MinHash(0, 5, is_protein=False, track_abundance=track_abundance, - scaled=_get_scaled_for_max_hash(1)) - b = MinHash(0, 5, is_protein=True, track_abundance=track_abundance, - scaled=_get_scaled_for_max_hash(2)) + a = MinHash( + 0, + 5, + is_protein=False, + track_abundance=track_abundance, + scaled=_get_scaled_for_max_hash(1), + ) + b = MinHash( + 0, + 5, + is_protein=True, + track_abundance=track_abundance, + scaled=_get_scaled_for_max_hash(2), + ) with pytest.raises(ValueError): a.count_common(b) @@ -991,6 +1039,7 @@ def test_mh_merge_check_length2(track_abundance): c.merge(b) assert len(c.hashes) == 3 + def test_mh_asymmetric_merge(track_abundance): # test merging two asymmetric (different size) MHs a = MinHash(20, 10, track_abundance=track_abundance) @@ -1055,13 +1104,13 @@ def test_mh_inplace_concat_asymmetric(track_abundance): try: d.similarity(a) except TypeError as exc: - assert 'must have same num' in str(exc) + assert "must have same num" in str(exc) a = a.downsample(num=d.num) if track_abundance: - assert round(d.similarity(a), 3) == 0.795 # see: d += a, above. + assert round(d.similarity(a), 3) == 0.795 # see: d += a, above. else: - assert d.similarity(a) == 1.0 # see: d += a, above. + assert d.similarity(a) == 1.0 # see: d += a, above. c = c.downsample(num=b.num) if track_abundance: @@ -1132,11 +1181,13 @@ def test_mh_similarity_diff_seed(track_abundance): def test_mh_compare_diff_max_hash(track_abundance): - a = MinHash(0, 5, track_abundance=track_abundance, - scaled=_get_max_hash_for_scaled(5)) + a = MinHash( + 0, 5, track_abundance=track_abundance, scaled=_get_max_hash_for_scaled(5) + ) - b = MinHash(0, 5, track_abundance=track_abundance, - scaled=_get_max_hash_for_scaled(10)) + b = MinHash( + 0, 5, track_abundance=track_abundance, scaled=_get_max_hash_for_scaled(10) + ) with pytest.raises(ValueError): a.similarity(b) @@ -1159,10 +1210,12 @@ def test_mh_concat_diff_ksize(track_abundance): def test_mh_concat_diff_max_hash(track_abundance): - a = MinHash(0, 5, track_abundance=track_abundance, - scaled=_get_max_hash_for_scaled(5)) - b = MinHash(0, 5, track_abundance=track_abundance, - scaled=_get_max_hash_for_scaled(10)) + a = MinHash( + 0, 5, track_abundance=track_abundance, scaled=_get_max_hash_for_scaled(5) + ) + b = MinHash( + 0, 5, track_abundance=track_abundance, scaled=_get_max_hash_for_scaled(10) + ) with pytest.raises(ValueError): a += b @@ -1178,7 +1231,7 @@ def test_mh_concat_diff_seed(track_abundance): def test_short_sequence(track_abundance): a = MinHash(20, 5, track_abundance=track_abundance) - a.add_sequence('GGGG') + a.add_sequence("GGGG") # adding a short sequence should fail silently assert len(a.hashes) == 0 @@ -1190,7 +1243,7 @@ def test_bytes_murmur(): x = hash_murmur(b"ACG") assert x == 1731421407650554201 - x = hash_murmur(u"ACG") + x = hash_murmur("ACG") assert x == 1731421407650554201 @@ -1214,11 +1267,11 @@ def test_murmur(): def test_abundance_simple(): a = MinHash(20, 5, is_protein=False, track_abundance=True) - a.add_sequence('AAAAA') + a.add_sequence("AAAAA") assert list(a.hashes) == [2110480117637990133] assert a.hashes == {2110480117637990133: 1} - a.add_sequence('AAAAA') + a.add_sequence("AAAAA") assert list(a.hashes) == [2110480117637990133] assert 
a.hashes == {2110480117637990133: 2} @@ -1269,15 +1322,15 @@ def test_abundance_simple_2(): a = MinHash(20, 5, is_protein=False, track_abundance=True) b = MinHash(20, 5, is_protein=False, track_abundance=True) - a.add_sequence('AAAAA') + a.add_sequence("AAAAA") assert list(a.hashes) == [2110480117637990133] assert a.hashes == {2110480117637990133: 1} - a.add_sequence('AAAAA') + a.add_sequence("AAAAA") assert list(a.hashes) == [2110480117637990133] assert a.hashes == {2110480117637990133: 2} - b.add_sequence('AAAAA') + b.add_sequence("AAAAA") assert a.count_common(b) == 1 @@ -1285,13 +1338,13 @@ def test_abundance_count_common(): a = MinHash(20, 5, is_protein=False, track_abundance=True) b = MinHash(20, 5, is_protein=False, track_abundance=False) - a.add_sequence('AAAAA') - a.add_sequence('AAAAA') + a.add_sequence("AAAAA") + a.add_sequence("AAAAA") assert list(a.hashes) == [2110480117637990133] assert a.hashes == {2110480117637990133: 2} - b.add_sequence('AAAAA') - b.add_sequence('GGGGG') + b.add_sequence("AAAAA") + b.add_sequence("GGGGG") assert a.count_common(b) == 1 assert a.count_common(b) == b.count_common(a) @@ -1302,8 +1355,8 @@ def test_abundance_similarity(): a = MinHash(20, 10, track_abundance=True) b = MinHash(20, 10, track_abundance=False) - a.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') - b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') + a.add_sequence("TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA") + b.add_sequence("TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA") assert round(a.similarity(b), 3) == 1.0 assert round(b.similarity(b), 3) == 1.0 @@ -1311,13 +1364,13 @@ def test_abundance_similarity(): assert round(a.similarity(a), 3) == 1.0 # add same sequence again - b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') + b.add_sequence("TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA") assert round(a.similarity(b), 3) == 1.0 assert round(b.similarity(b), 3) == 1.0 assert round(b.similarity(a), 3) == 1.0 assert round(a.similarity(a), 3) == 1.0 - b.add_sequence('GATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTT') + b.add_sequence("GATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTT") x = a.similarity(b) assert x >= 0.3, x @@ -1338,9 +1391,7 @@ def test_set_abundance(): def test_set_abundance_2(): datapath = utils.get_test_data("genome-s12.fa.gz.sig") - sig = sourmash.load_one_signature(datapath, - ksize=30, - select_moltype='dna') + sig = sourmash.load_one_signature(datapath, ksize=30, select_moltype="dna") new_mh = sig.minhash.copy_and_clear() mins = sig.minhash.hashes mins = {k: 1 for k in mins} @@ -1377,7 +1428,7 @@ def test_set_abundance_clear_3(): a.add_hash(10) assert a.hashes == {10: 1} - + a.set_abundances({20: 1, 30: 4}, clear=False) assert a.hashes == {10: 1, 20: 1, 30: 4} @@ -1387,32 +1438,34 @@ def test_set_abundance_clear_4(): # the abundances together a = MinHash(20, 5, is_protein=False, track_abundance=True) - a.set_abundances({20: 2, 10: 1}, clear=False) # should also sort the hashes + a.set_abundances({20: 2, 10: 1}, clear=False) # should also sort the hashes assert a.hashes == {10: 1, 20: 2} a.set_abundances({20: 1, 10: 2}, clear=False) assert a.hashes == {10: 3, 20: 3} + def test_clear_abundance_on_zero(): mh = sourmash.minhash.MinHash(n=0, ksize=31, scaled=1, track_abundance=True) - mh.set_abundances({ 1: 5, 2: 3, 3 : 5 }) - mh.set_abundances({ 1: 0 }, clear=False) + mh.set_abundances({1: 5, 2: 3, 3: 5}) + mh.set_abundances({1: 0}, clear=False) assert 1 not in dict(mh.hashes) assert 
dict(mh.hashes)[2] == 3 assert dict(mh.hashes)[3] == 5 assert len(mh) == 2 with pytest.raises(ValueError): - mh.set_abundances({ 2: -1 }) # Test on clear = True + mh.set_abundances({2: -1}) # Test on clear = True with pytest.raises(ValueError): - mh.set_abundances({ 2: -1 }, clear=False) - - assert len(mh) == 2 # Assert that nothing was affected + mh.set_abundances({2: -1}, clear=False) + + assert len(mh) == 2 # Assert that nothing was affected + def test_reset_abundance_initialized(): a = MinHash(1, 4, track_abundance=True) - a.add_sequence('ATGC') + a.add_sequence("ATGC") # If we had a minhash with abundances and drop it, this shouldn't fail. # Convert from Abundance to Regular MinHash @@ -1423,12 +1476,14 @@ def test_reset_abundance_initialized(): def test_set_abundance_initialized(): a = MinHash(1, 4, track_abundance=False) - a.add_sequence('ATGC') + a.add_sequence("ATGC") with pytest.raises(RuntimeError) as e: a.track_abundance = True - assert "Can only set track_abundance=True if the MinHash is empty" in e.value.args[0] + assert ( + "Can only set track_abundance=True if the MinHash is empty" in e.value.args[0] + ) def test_set_abundance_num(): @@ -1459,8 +1514,9 @@ def test_mh_copy_and_clear(track_abundance): def test_mh_copy_and_clear_with_max_hash(track_abundance): # test basic creation of new, empty MinHash w/max_hash param set - a = MinHash(0, 10, track_abundance=track_abundance, - scaled=_get_scaled_for_max_hash(20)) + a = MinHash( + 0, 10, track_abundance=track_abundance, scaled=_get_scaled_for_max_hash(20) + ) for i in range(0, 40, 2): a.add_hash(i) @@ -1484,8 +1540,13 @@ def test_scaled_property(track_abundance): def test_pickle_protein(track_abundance): # check that protein/etc ksize is handled properly during serialization. - a = MinHash(0, 10, track_abundance=track_abundance, is_protein=True, - scaled=_get_scaled_for_max_hash(20)) + a = MinHash( + 0, + 10, + track_abundance=track_abundance, + is_protein=True, + scaled=_get_scaled_for_max_hash(20), + ) for i in range(0, 40, 2): a.add_hash(i) @@ -1505,8 +1566,13 @@ def test_pickle_protein(track_abundance): def test_pickle_dayhoff(track_abundance): # check that dayhoff ksize is handled properly during serialization. - a = MinHash(0, 10, track_abundance=track_abundance, dayhoff=True, - scaled=_get_scaled_for_max_hash(20)) + a = MinHash( + 0, + 10, + track_abundance=track_abundance, + dayhoff=True, + scaled=_get_scaled_for_max_hash(20), + ) for i in range(0, 40, 2): a.add_hash(i) @@ -1526,8 +1592,13 @@ def test_pickle_dayhoff(track_abundance): def test_pickle_hp(track_abundance): # check that hp ksize is handled properly during serialization. 
- a = MinHash(0, 10, track_abundance=track_abundance, hp=True, - scaled=_get_scaled_for_max_hash(20)) + a = MinHash( + 0, + 10, + track_abundance=track_abundance, + hp=True, + scaled=_get_scaled_for_max_hash(20), + ) for i in range(0, 40, 2): a.add_hash(i) @@ -1546,8 +1617,9 @@ def test_pickle_hp(track_abundance): def test_pickle_max_hash(track_abundance): - a = MinHash(0, 10, track_abundance=track_abundance, - scaled=_get_scaled_for_max_hash(20)) + a = MinHash( + 0, 10, track_abundance=track_abundance, scaled=_get_scaled_for_max_hash(20) + ) for i in range(0, 40, 2): a.add_hash(i) @@ -1650,8 +1722,9 @@ def test_minhash_abund_merge_flat_2(): def test_distance_matrix(track_abundance): import numpy - siglist = [next(signature.load_signatures(utils.get_test_data(f))) - for f in utils.SIG_FILES] + siglist = [ + next(signature.load_signatures(utils.get_test_data(f))) for f in utils.SIG_FILES + ] D1 = numpy.zeros([len(siglist), len(siglist)]) D2 = numpy.zeros([len(siglist), len(siglist)]) @@ -1690,14 +1763,15 @@ def test_remove_many(track_abundance): assert len(a) == 33 assert all(c % 6 != 0 for c in a.hashes) + def test_remove_minhash(track_abundance): original_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) added_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) tested_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) original_mh.add_many(list(range(101))) - added_mh.add_many(list(range(101,201))) # contains original in it - tested_mh.add_many(list(range(201))) # original + added + added_mh.add_many(list(range(101, 201))) # contains original in it + tested_mh.add_many(list(range(201))) # original + added # Now we should expect tested_minhash == original_minhash # Note we are passing a MinHash object instead of an iterable object @@ -1718,7 +1792,7 @@ def test_add_many(track_abundance): b = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) a.add_many(list(range(0, 100, 2))) - a.add_many(list(range(0, 100, 2))) # => abundance = 2 + a.add_many(list(range(0, 100, 2))) # => abundance = 2 assert len(a) == 50 assert all(c % 2 == 0 for c in a.hashes) @@ -1733,8 +1807,7 @@ def test_add_many(track_abundance): def test_set_abundances_huge(): max_hash = 4000000 - a = MinHash(0, 10, track_abundance=True, - scaled=_get_scaled_for_max_hash(max_hash)) + a = MinHash(0, 10, track_abundance=True, scaled=_get_scaled_for_max_hash(max_hash)) hashes = list(range(max_hash)) abundances = itertools.repeat(2) @@ -1744,7 +1817,7 @@ def test_set_abundances_huge(): def test_try_change_hashes(track_abundance): a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) - b = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) + MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) a.add_many(list(range(0, 100, 2))) @@ -1846,7 +1919,10 @@ def test_inflate_error(): with pytest.raises(ValueError) as exc: mh = mh.inflate(mh2) - assert "inflate operates on a flat MinHash and takes a MinHash object with track_abundance=True" in str(exc.value) + assert ( + "inflate operates on a flat MinHash and takes a MinHash object with track_abundance=True" + in str(exc.value) + ) def test_inflate_not_a_subset(): @@ -1878,7 +1954,7 @@ def test_inflate_not_a_subset(): mh3 = mh.inflate(mh2) assert mh3.hashes[10] == 3 - assert 20 not in mh3.hashes # should intersect, in this case. + assert 20 not in mh3.hashes # should intersect, in this case. 
assert mh3.hashes[30] == 3 @@ -1887,14 +1963,14 @@ def test_add_kmer(track_abundance): mh1 = MinHash(0, 4, scaled=1, track_abundance=track_abundance) mh2 = MinHash(0, 4, scaled=1, track_abundance=track_abundance) - mh1.add_sequence('ATGCGTGC') + mh1.add_sequence("ATGCGTGC") a = mh1.hashes - mh2.add_kmer('ATGC') - mh2.add_kmer('TGCG') - mh2.add_kmer('GCGT') - mh2.add_kmer('CGTG') - mh2.add_kmer('GTGC') + mh2.add_kmer("ATGC") + mh2.add_kmer("TGCG") + mh2.add_kmer("GCGT") + mh2.add_kmer("CGTG") + mh2.add_kmer("GTGC") b = mh2.hashes assert set(a.items()) == set(b.items()) @@ -1905,7 +1981,7 @@ def test_add_kmer_too_long(track_abundance): mh1 = MinHash(0, 4, scaled=1, track_abundance=track_abundance) with pytest.raises(ValueError): - mh1.add_kmer('ATGCGTGC') + mh1.add_kmer("ATGCGTGC") def test_get_mins_deprecated(track_abundance): @@ -1961,9 +2037,14 @@ def test_downsample_scaled(track_abundance): # test downsample(scaled...) method mh = MinHash(0, 21, scaled=1, track_abundance=track_abundance) - mins = (1, 2, 3, - 9223372036854775808 + 1, 9223372036854775808 + 2, - 9223372036854775808 + 3) + mins = ( + 1, + 2, + 3, + 9223372036854775808 + 1, + 9223372036854775808 + 2, + 9223372036854775808 + 3, + ) mh.add_many(mins) assert len(mh) == 6 @@ -1978,7 +2059,7 @@ def test_downsample_scaled(track_abundance): def test_is_molecule_type_1(track_abundance): mh = MinHash(1, 21, track_abundance=track_abundance) - assert mh.moltype == 'DNA' + assert mh.moltype == "DNA" assert mh.is_dna assert not mh.is_protein assert not mh.hp @@ -1987,7 +2068,7 @@ def test_is_molecule_type_1(track_abundance): def test_is_molecule_type_2(track_abundance): mh = MinHash(1, 21, track_abundance=track_abundance, is_protein=True) - assert mh.moltype == 'protein' + assert mh.moltype == "protein" assert not mh.is_dna assert mh.is_protein assert not mh.hp @@ -1996,17 +2077,16 @@ def test_is_molecule_type_2(track_abundance): def test_is_molecule_type_3(track_abundance): mh = MinHash(1, 21, track_abundance=track_abundance, hp=True) - assert mh.moltype == 'hp' + assert mh.moltype == "hp" assert not mh.is_dna assert not mh.is_protein assert mh.hp assert not mh.dayhoff - def test_is_molecule_type_4(track_abundance): mh = MinHash(1, 21, track_abundance=track_abundance, dayhoff=True) - assert mh.moltype == 'dayhoff' + assert mh.moltype == "dayhoff" assert not mh.is_dna assert not mh.is_protein assert not mh.hp @@ -2021,7 +2101,7 @@ def test_addition_num_incompatible(): mh2.add_hash(1) with pytest.raises(TypeError) as exc: - mh3 = mh1 + mh2 + mh1 + mh2 assert "incompatible num values: self=10 other=20" in str(exc.value) @@ -2030,8 +2110,8 @@ def test_addition_abund(): mh1 = MinHash(10, 21, track_abundance=True) mh2 = MinHash(10, 21, track_abundance=True) - mh1.set_abundances({ 0: 1 }) - mh2.set_abundances({ 0: 3 }) + mh1.set_abundances({0: 1}) + mh2.set_abundances({0: 3}) mh3 = mh1 + mh2 hashcounts = mh3.hashes @@ -2057,8 +2137,8 @@ def test_iaddition_abund(): mh1 = MinHash(10, 21, track_abundance=True) mh2 = MinHash(10, 21, track_abundance=True) - mh1.set_abundances({ 0: 1 }) - mh2.set_abundances({ 0: 3 }) + mh1.set_abundances({0: 1}) + mh2.set_abundances({0: 3}) mh1 += mh2 hashcounts = mh1.hashes @@ -2093,10 +2173,11 @@ def test_intersection_1_num(): mh2.add_hash(2) mh3 = mh1.intersection(mh2) - print("mh.intersection INTERSECTION HASHES:",set(mh3.hashes)) + print("mh.intersection INTERSECTION HASHES:", set(mh3.hashes)) assert len(mh3) == 1 assert 0 in mh3.hashes + def test_and_operator(): mh1 = MinHash(20, 21) mh1.add_hash(5) @@ -2110,11 
+2191,14 @@ def test_and_operator(): mh3 = mh1.intersection(mh2) mh4 = mh1 & mh2 - print("\n Intersection hashes (mh3): ", mh3.hashes, "\n '&' hashes: (mh4)", mh4.hashes) + print( + "\n Intersection hashes (mh3): ", mh3.hashes, "\n '&' hashes: (mh4)", mh4.hashes + ) assert mh3 assert mh3 == mh4 + def test_intersection_2_scaled(): mh1 = MinHash(0, 21, scaled=1) mh2 = MinHash(0, 21, scaled=1) @@ -2136,7 +2220,7 @@ def test_intersection_3_abundance_error(): mh2 = MinHash(0, 21, scaled=1, track_abundance=True) with pytest.raises(TypeError) as exc: - mh3 = mh1.intersection(mh2) + mh1.intersection(mh2) assert str(exc.value) == "can only intersect flat MinHash objects" @@ -2147,7 +2231,7 @@ def test_intersection_4_incompatible_ksize(): mh2 = MinHash(500, 31) with pytest.raises(ValueError) as exc: - mh3 = mh1.intersection(mh2) + mh1.intersection(mh2) assert str(exc.value) == "different ksizes cannot be compared" @@ -2157,7 +2241,7 @@ def test_intersection_5_incompatible(): mh1 = MinHash(0, 21, scaled=1) with pytest.raises(TypeError) as exc: - mh3 = mh1.intersection(set()) + mh1.intersection(set()) assert str(exc.value) == "can only intersect MinHash objects" @@ -2189,6 +2273,7 @@ def test_intersection_6_full_num(): assert mh1.intersection_and_union_size(mh2) == (10, 20) + def test_intersection_7_full_scaled(): # intersection of two scaled objects is correct mh1 = MinHash(0, 21, scaled=100) @@ -2231,8 +2316,8 @@ def test_merge_abund(): mh1 = MinHash(10, 21, track_abundance=True) mh2 = MinHash(10, 21, track_abundance=True) - mh1.set_abundances({ 0: 1 }) - mh2.set_abundances({ 0: 3 }) + mh1.set_abundances({0: 1}) + mh2.set_abundances({0: 3}) ret = mh1.merge(mh2) assert ret is None @@ -2315,6 +2400,7 @@ def test_merge_scaled(): for k in mh2.hashes: assert k in mh3.hashes + def test_add_is_symmetric(): mh1 = MinHash(20, 21) mh1.add_hash(5) @@ -2324,10 +2410,11 @@ def test_add_is_symmetric(): mh3 = mh1 + mh2 mh4 = mh2 + mh1 print("\n mh3 EQUALS ", mh3.hashes, "\n mh4 EQUALS", mh4.hashes) - #if mh3 != 0, then it is "true", so it passes + # if mh3 != 0, then it is "true", so it passes assert mh3 assert mh3 == mh4 + def test_or_equals_add(): mh1 = MinHash(20, 21) mh1.add_hash(5) @@ -2340,6 +2427,7 @@ def test_or_equals_add(): assert mh3 assert mh3 == mh4 + def test_max_containment(): mh1 = MinHash(0, 21, scaled=1, track_abundance=False) mh2 = MinHash(0, 21, scaled=1, track_abundance=False) @@ -2347,10 +2435,10 @@ def test_max_containment(): mh1.add_many((1, 2, 3, 4)) mh2.add_many((1, 5)) - assert mh1.contained_by(mh2) == 1/4 - assert mh2.contained_by(mh1) == 1/2 - assert mh1.max_containment(mh2) == 1/2 - assert mh2.max_containment(mh1) == 1/2 + assert mh1.contained_by(mh2) == 1 / 4 + assert mh2.contained_by(mh1) == 1 / 2 + assert mh1.max_containment(mh2) == 1 / 2 + assert mh2.max_containment(mh1) == 1 / 2 def test_max_containment_empty(): @@ -2385,8 +2473,8 @@ def test_avg_containment(): mh1.add_many((1, 2, 3, 4)) mh2.add_many((1, 5)) - assert mh1.contained_by(mh2) == 1/4 - assert mh2.contained_by(mh1) == 1/2 + assert mh1.contained_by(mh2) == 1 / 4 + assert mh2.contained_by(mh1) == 1 / 2 assert mh1.avg_containment(mh2) == 0.375 assert mh2.avg_containment(mh1) == 0.375 @@ -2454,7 +2542,7 @@ def test_frozen_and_mutable_3(track_abundance): def test_dna_kmers(): # test seq_to_hashes for dna -> dna - mh = MinHash(0, ksize=31, scaled=1) # DNA + mh = MinHash(0, ksize=31, scaled=1) # DNA seq = 
"ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATGCCAGGCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCACCTGGTGGCGATGATTGAAAAAACCATTAGCGGCCAGGATGCTTTACCCAATATCAGCGATGCCGAACGTATTTTTGCCGAACTTTTGACGGGACTCGCCGCCGCCCAGCCGGGGTTCCCGCTGGCGCAATTGAAAACTTTCGTCGATCAGGAATTTGCCCAAATAAAACATGTCCTGCATGGCATTAGTTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTGATTTGCCGTGGCGAGAAAATGTCGATCGCCATTATGGCCGGCGTATTAGAAGCGCGCGGTCACAACGTTACTGTTATCGATCCGGTCGAAAAACTGCTGGCAGTGGGGCATTACCTCGAATCTACCGTCGATATTGCTGAGTCCACCCGCCGTATTGCGGCAAGCCGCATTCCGGCTGATCACATGGTGCTGAT" # first calculate seq to hashes @@ -2469,7 +2557,7 @@ def test_dna_kmers(): # k-mer by k-mer? for i in range(0, len(seq) - 31 + 1): # calculate each k-mer - kmer = seq[i:i+31] + kmer = seq[i : i + 31] # add to minhash obj single_mh = mh.copy_and_clear() @@ -2488,7 +2576,7 @@ def test_dna_kmers(): def test_dna_kmers_2(): # test kmers_and_hashes for dna -> dna - mh = MinHash(0, ksize=31, scaled=1) # DNA + mh = MinHash(0, ksize=31, scaled=1) # DNA seq = "ATGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATGCCAGGCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCACCTGGTGGCGATGATTGAAAAAACCATTAGCGGCCAGGATGCTTTACCCAATATCAGCGATGCCGAACGTATTTTTGCCGAACTTTTGACGGGACTCGCCGCCGCCCAGCCGGGGTTCCCGCTGGCGCAATTGAAAACTTTCGTCGATCAGGAATTTGCCCAAATAAAACATGTCCTGCATGGCATTAGTTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTGATTTGCCGTGGCGAGAAAATGTCGATCGCCATTATGGCCGGCGTATTAGAAGCGCGCGGTCACAACGTTACTGTTATCGATCCGGTCGAAAAACTGCTGGCAGTGGGGCATTACCTCGAATCTACCGTCGATATTGCTGAGTCCACCCGCCGTATTGCGGCAAGCCGCATTCCGGCTGATCACATGGTGCTGAT" # k-mer by k-mer? @@ -2504,7 +2592,7 @@ def test_dna_kmers_2(): def test_dna_kmers_3_bad_dna(): # test kmers_and_hashes for dna -> dna, with some bad k-mers in there - mh = MinHash(0, ksize=31, scaled=1) # DNA + mh = MinHash(0, ksize=31, scaled=1) # DNA seq = "NTGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATGCCAGGCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCACCTGGTGGCGATGATTGAAAAAACCATTAGCGGCCAGGATGCTTTACCCAATATCAGCGATGCCGAACGTATTTTTGCCGAACTTTTGACGGGACTCGCCGCCGCCCAGCCGGGGTTCCCGCTGGCGCAATTGAAAACTTTCGTCGATCAGGAATTTGCCCAAATAAAACATGTCCTGCATGGCATTAGTTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTGATTTGCCGTGGCGAGAAAATGTCGATCGCCATTATGGCCGGCGTATTAGAAGCGCGCGGTCACAACGTTACTGTTATCGATCCGGTCGAAAAACTGCTGGCAGTGGGGCATTACCTCGAATCTACCGTCGATATTGCTGAGTCCACCCGCCGTATTGCGGCAAGCCGCATTCCGGCTGATCACATGGTGCTGAT" with pytest.raises(ValueError) as exc: @@ -2515,7 +2603,7 @@ def test_dna_kmers_3_bad_dna(): def test_dna_kmers_4_bad_dna(): # test kmers_and_hashes for bad dna -> dna, using force - mh = MinHash(0, ksize=31, scaled=1) # DNA + mh = MinHash(0, ksize=31, scaled=1) # DNA seq = "NTGCGAGTGTTGAAGTTCGGCGGTACATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATGCCAGGCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCACCTGGTGGCGATGATTGAAAAAACCATTAGCGGCCAGGATGCTTTACCCAATATCAGCGATGCCGAACGTATTTTTGCCGAACTTTTGACGGGACTCGCCGCCGCCCAGCCGGGGTTCCCGCTGGCGCAATTGAAAACTTTCGTCGATCAGGAATTTGCCCAAATAAAACATGTCCTGCATGGCATTAGTTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTGATTTGCCGTGGCGAGAAAATGTCGATCGCCATTATGGCCGGCGTATTAGAAGCGCGCGGTCACAACGTTACTGTTATCGATCCGGTCGAAAAACTGCTGGCAGTGGGGCATTACCTCGAATCTACCGTCGATATTGCTGAGTCCACCCGCCGTATTGCGGCAAGCCGCATTCCGGCTGATCACATGGTGCTGAT" # k-mer by k-mer? @@ -2524,8 +2612,8 @@ def test_dna_kmers_4_bad_dna(): # add to minhash obj single_mh = mh.copy_and_clear() - if hashval == None: - assert kmer == seq[:31] # first k-mer is baaaaad. + if hashval is None: + assert kmer == seq[:31] # first k-mer is baaaaad. 
found_bad_kmer = True continue @@ -2555,7 +2643,7 @@ def test_protein_kmers(): # k-mer by k-mer? for i in range(0, len(seq) - 7 + 1): # calculate each k-mer - kmer = seq[i:i+7] + kmer = seq[i : i + 7] # add to minhash obj single_mh = mh.copy_and_clear() @@ -2605,7 +2693,7 @@ def test_dayhoff_kmers(): # k-mer by k-mer? for i in range(0, len(seq) - 7 + 1): # calculate each k-mer - kmer = seq[i:i+7] + kmer = seq[i : i + 7] # add to minhash obj single_mh = mh.copy_and_clear() @@ -2655,7 +2743,7 @@ def test_hp_kmers(): # k-mer by k-mer? for i in range(0, len(seq) - 7 + 1): # calculate each k-mer - kmer = seq[i:i+7] + kmer = seq[i : i + 7] # add to minhash obj single_mh = mh.copy_and_clear() @@ -2789,8 +2877,8 @@ def test_containment(track_abundance): mh2.add_many((1, 5)) mh2.add_many((1, 5)) - assert mh1.contained_by(mh2) == 1/4 - assert mh2.contained_by(mh1) == 1/2 + assert mh1.contained_by(mh2) == 1 / 4 + assert mh2.contained_by(mh1) == 1 / 2 def test_sum_abundances(track_abundance): @@ -2808,8 +2896,8 @@ def test_sum_abundances(track_abundance): assert mh1.sum_abundances == 6 assert mh2.sum_abundances == 6 else: - assert mh1.sum_abundances == None - assert mh2.sum_abundances == None + assert mh1.sum_abundances is None + assert mh2.sum_abundances is None def test_mean_abundance(track_abundance): @@ -2885,32 +2973,44 @@ def test_unique_dataset_hashes(track_abundance): def test_containment_ANI(): - f1 = utils.get_test_data('2.fa.sig') - f2 = utils.get_test_data('2+63.fa.sig') + f1 = utils.get_test_data("2.fa.sig") + f2 = utils.get_test_data("2+63.fa.sig") mh1 = sourmash.load_one_signature(f1, ksize=31).minhash mh2 = sourmash.load_one_signature(f2, ksize=31).minhash - m1_cont_m2 = mh1.containment_ani(mh2, estimate_ci =True) - m2_cont_m1 = mh2.containment_ani(mh1, estimate_ci =True) + m1_cont_m2 = mh1.containment_ani(mh2, estimate_ci=True) + m2_cont_m1 = mh2.containment_ani(mh1, estimate_ci=True) print("\nmh1 contained by mh2", m1_cont_m2) print("mh2 contained by mh1", m2_cont_m1) - assert (round(m1_cont_m2.ani,3), m1_cont_m2.ani_low, m1_cont_m2.ani_high) == (1.0, 1.0, 1.0) - assert (round(m2_cont_m1.ani,3), round(m2_cont_m1.ani_low,3), round(m2_cont_m1.ani_high,3)) == (0.966, 0.965, 0.967) - - m1_mc_m2 = mh1.max_containment_ani(mh2, estimate_ci =True) - m2_mc_m1 = mh2.max_containment_ani(mh1, estimate_ci =True) + assert (round(m1_cont_m2.ani, 3), m1_cont_m2.ani_low, m1_cont_m2.ani_high) == ( + 1.0, + 1.0, + 1.0, + ) + assert ( + round(m2_cont_m1.ani, 3), + round(m2_cont_m1.ani_low, 3), + round(m2_cont_m1.ani_high, 3), + ) == (0.966, 0.965, 0.967) + + m1_mc_m2 = mh1.max_containment_ani(mh2, estimate_ci=True) + m2_mc_m1 = mh2.max_containment_ani(mh1, estimate_ci=True) print("mh1 max containment", m1_mc_m2) print("mh2 max containment", m2_mc_m1) m1_mc_m2.size_is_inaccurate = False m2_mc_m1.size_is_inaccurate = False assert m1_mc_m2 == m2_mc_m1 - assert (round(m1_mc_m2.ani, 3), round(m1_mc_m2.ani_low, 3), round(m1_mc_m2.ani_high, 3)) == (1.0,1.0,1.0) + assert ( + round(m1_mc_m2.ani, 3), + round(m1_mc_m2.ani_low, 3), + round(m1_mc_m2.ani_high, 3), + ) == (1.0, 1.0, 1.0) def test_containment_ANI_precalc_containment(): - f1 = utils.get_test_data('47+63.fa.sig') - f2 = utils.get_test_data('2+63.fa.sig') + f1 = utils.get_test_data("47+63.fa.sig") + f2 = utils.get_test_data("2+63.fa.sig") mh1 = sourmash.load_one_signature(f1, ksize=31).minhash mh2 = sourmash.load_one_signature(f2, ksize=31).minhash # precalc containments and assert same results @@ -2918,27 +3018,37 @@ def 
test_containment_ANI_precalc_containment(): s2c = mh2.contained_by(mh1) mc = max(s1c, s2c) - assert mh1.containment_ani(mh2, estimate_ci=True) == mh1.containment_ani(mh2, containment=s1c, estimate_ci=True) - assert mh2.containment_ani(mh1) == mh2.containment_ani(mh1, containment=s2c) - assert mh1.max_containment_ani(mh2) == mh2.max_containment_ani(mh1) - assert mh1.max_containment_ani(mh2) == mh1.max_containment_ani(mh2, max_containment=mc) - assert mh1.max_containment_ani(mh2) == mh2.max_containment_ani(mh1, max_containment=mc) + assert mh1.containment_ani(mh2, estimate_ci=True) == mh1.containment_ani( + mh2, containment=s1c, estimate_ci=True + ) + assert mh2.containment_ani(mh1) == mh2.containment_ani(mh1, containment=s2c) + assert mh1.max_containment_ani(mh2) == mh2.max_containment_ani(mh1) + assert mh1.max_containment_ani(mh2) == mh1.max_containment_ani( + mh2, max_containment=mc + ) + assert mh1.max_containment_ani(mh2) == mh2.max_containment_ani( + mh1, max_containment=mc + ) def test_avg_containment_ani(): - f1 = utils.get_test_data('47+63.fa.sig') - f2 = utils.get_test_data('2+63.fa.sig') + f1 = utils.get_test_data("47+63.fa.sig") + f2 = utils.get_test_data("2+63.fa.sig") mh1 = sourmash.load_one_signature(f1, ksize=31).minhash mh2 = sourmash.load_one_signature(f2, ksize=31).minhash # check average_containment_ani ac_m1 = mh1.avg_containment_ani(mh2) ac_m2 = mh2.avg_containment_ani(mh1) - assert ac_m1 == ac_m2 == (mh1.containment_ani(mh2).ani + mh2.containment_ani(mh1).ani)/2 + assert ( + ac_m1 + == ac_m2 + == (mh1.containment_ani(mh2).ani + mh2.containment_ani(mh1).ani) / 2 + ) def test_containment_ANI_downsample(): - f2 = utils.get_test_data('2+63.fa.sig') - f3 = utils.get_test_data('47+63.fa.sig') + f2 = utils.get_test_data("2+63.fa.sig") + f3 = utils.get_test_data("47+63.fa.sig") mh2 = sourmash.load_one_signature(f2, ksize=31).minhash mh3 = sourmash.load_one_signature(f3, ksize=31).minhash # check that downsampling works properly @@ -2947,8 +3057,8 @@ def test_containment_ANI_downsample(): assert mh2.scaled != mh3.scaled ds_s3c = mh2.containment_ani(mh3, downsample=True) ds_s4c = mh3.containment_ani(mh2, downsample=True) - mc_w_ds_1 = mh2.max_containment_ani(mh3, downsample=True) - mc_w_ds_2 = mh3.max_containment_ani(mh2, downsample=True) + mc_w_ds_1 = mh2.max_containment_ani(mh3, downsample=True) + mc_w_ds_2 = mh3.max_containment_ani(mh2, downsample=True) print(ds_s3c) with pytest.raises(ValueError) as e: mh2.containment_ani(mh3) @@ -2962,19 +3072,19 @@ def test_containment_ANI_downsample(): assert mh2.scaled == mh3.scaled ds_s3c_manual = mh2.containment_ani(mh3) ds_s4c_manual = mh3.containment_ani(mh2) - ds_mc_manual = mh2.max_containment_ani(mh3) + ds_mc_manual = mh2.max_containment_ani(mh3) assert ds_s3c == ds_s3c_manual assert ds_s4c == ds_s4c_manual assert mc_w_ds_1 == mc_w_ds_2 == ds_mc_manual ac_m2 = mh2.avg_containment_ani(mh3) ac_m3 = mh3.avg_containment_ani(mh2) - assert ac_m2 == ac_m3 == (ds_s3c.ani + ds_s4c.ani)/2 + assert ac_m2 == ac_m3 == (ds_s3c.ani + ds_s4c.ani) / 2 def test_jaccard_ANI(): - f1 = utils.get_test_data('2.fa.sig') - f2 = utils.get_test_data('2+63.fa.sig') + f1 = utils.get_test_data("2.fa.sig") + f2 = utils.get_test_data("2+63.fa.sig") mh1 = sourmash.load_one_signature(f1, ksize=31).minhash mh2 = sourmash.load_one_signature(f2).minhash @@ -2984,12 +3094,16 @@ def test_jaccard_ANI(): m2_jani_m1 = mh2.jaccard_ani(mh1) assert m1_jani_m2 == m2_jani_m1 - assert (m1_jani_m2.ani, m1_jani_m2.p_nothing_in_common, m1_jani_m2.jaccard_error) == 
(0.9783711630110239, 0.0, 3.891666770716877e-07) + assert ( + m1_jani_m2.ani, + m1_jani_m2.p_nothing_in_common, + m1_jani_m2.jaccard_error, + ) == (0.9783711630110239, 0.0, 3.891666770716877e-07) def test_jaccard_ANI_untrustworthy(): - f1 = utils.get_test_data('2.fa.sig') - f2 = utils.get_test_data('2+63.fa.sig') + f1 = utils.get_test_data("2.fa.sig") + f2 = utils.get_test_data("2+63.fa.sig") mh1 = sourmash.load_one_signature(f1, ksize=31).minhash mh2 = sourmash.load_one_signature(f2).minhash @@ -3000,28 +3114,32 @@ def test_jaccard_ANI_untrustworthy(): # since size is inaccurate on 2.fa.sig, need to override to be able to get ani m1_jani_m2.size_is_inaccurate = False - assert m1_jani_m2.ani == None - assert m1_jani_m2.je_exceeds_threshold==True + assert m1_jani_m2.ani is None + assert m1_jani_m2.je_exceeds_threshold == True assert m1_jani_m2.je_threshold == 1e-7 def test_jaccard_ANI_precalc_jaccard(): - f1 = utils.get_test_data('2.fa.sig') - f2 = utils.get_test_data('2+63.fa.sig') + f1 = utils.get_test_data("2.fa.sig") + f2 = utils.get_test_data("2+63.fa.sig") mh1 = sourmash.load_one_signature(f1, ksize=31).minhash mh2 = sourmash.load_one_signature(f2).minhash # precalc jaccard and assert same result jaccard = mh1.jaccard(mh2) - print("\nJACCARD_ANI", mh1.jaccard_ani(mh2,jaccard=jaccard)) + print("\nJACCARD_ANI", mh1.jaccard_ani(mh2, jaccard=jaccard)) - assert mh1.jaccard_ani(mh2) == mh1.jaccard_ani(mh2, jaccard=jaccard) == mh2.jaccard_ani(mh1, jaccard=jaccard) + assert ( + mh1.jaccard_ani(mh2) + == mh1.jaccard_ani(mh2, jaccard=jaccard) + == mh2.jaccard_ani(mh1, jaccard=jaccard) + ) wrong_jaccard = jaccard - 0.1 assert mh1.jaccard_ani(mh2) != mh1.jaccard_ani(mh2, jaccard=wrong_jaccard) def test_jaccard_ANI_downsample(): - f1 = utils.get_test_data('2.fa.sig') - f2 = utils.get_test_data('2+63.fa.sig') + f1 = utils.get_test_data("2.fa.sig") + f2 = utils.get_test_data("2+63.fa.sig") mh1 = sourmash.load_one_signature(f1, ksize=31).minhash mh2 = sourmash.load_one_signature(f2).minhash @@ -3058,13 +3176,13 @@ def test_containment_ani_ci_tiny_testdata(): # from the formula ANI = c^(1/k) for c=3/4 and k=21 np.testing.assert_almost_equal(m2_cani_m1.ani, 0.986394259982259, decimal=3) m2_cani_m1.size_is_inaccurate = False - assert m2_cani_m1.ani_low == None - assert m2_cani_m1.ani_high == None + assert m2_cani_m1.ani_low is None + assert m2_cani_m1.ani_high is None def test_containment_num_fail(): - f1 = utils.get_test_data('num/47.fa.sig') - f2 = utils.get_test_data('num/63.fa.sig') + f1 = utils.get_test_data("num/47.fa.sig") + f2 = utils.get_test_data("num/63.fa.sig") mh1 = sourmash.load_one_signature(f1, ksize=31).minhash mh2 = sourmash.load_one_signature(f2, ksize=31).minhash @@ -3081,8 +3199,8 @@ def test_containment_num_fail(): def test_ANI_num_fail(): - f1 = utils.get_test_data('num/47.fa.sig') - f2 = utils.get_test_data('num/63.fa.sig') + f1 = utils.get_test_data("num/47.fa.sig") + f2 = utils.get_test_data("num/63.fa.sig") mh1 = sourmash.load_one_signature(f1, ksize=31).minhash mh2 = sourmash.load_one_signature(f2, ksize=31).minhash @@ -3091,7 +3209,7 @@ def test_ANI_num_fail(): print(str(exc)) assert "Error: can only calculate ANI for scaled MinHashes" in str(exc) with pytest.raises(TypeError) as exc: - mh2.containment_ani(mh1, estimate_ci =True) + mh2.containment_ani(mh1, estimate_ci=True) assert "Error: can only calculate ANI for scaled MinHashes" in str(exc) with pytest.raises(TypeError) as exc: mh1.max_containment_ani(mh2) @@ -3105,8 +3223,8 @@ def test_ANI_num_fail(): def 
test_minhash_set_size_estimate_is_accurate(): - f1 = utils.get_test_data('2.fa.sig') - f2 = utils.get_test_data('2+63.fa.sig') + f1 = utils.get_test_data("2.fa.sig") + f2 = utils.get_test_data("2+63.fa.sig") mh1 = sourmash.load_one_signature(f1, ksize=31).minhash mh2 = sourmash.load_one_signature(f2).minhash mh1_ds = mh1.downsample(scaled=100000) @@ -3126,22 +3244,31 @@ def test_minhash_set_size_estimate_is_accurate(): # check that relative error and confidence must be between 0 and 1 with pytest.raises(ValueError) as exc: mh2.size_is_accurate(relative_error=-1) - assert "Error: relative error and confidence values must be between 0 and 1." in str(exc) + assert ( + "Error: relative error and confidence values must be between 0 and 1." + in str(exc) + ) with pytest.raises(ValueError) as exc: mh2.size_is_accurate(confidence=-1) - assert "Error: relative error and confidence values must be between 0 and 1." in str(exc) + assert ( + "Error: relative error and confidence values must be between 0 and 1." + in str(exc) + ) with pytest.raises(ValueError) as exc: mh2.size_is_accurate(relative_error=-1, confidence=-1) - assert "Error: relative error and confidence values must be between 0 and 1." in str(exc) + assert ( + "Error: relative error and confidence values must be between 0 and 1." + in str(exc) + ) def test_minhash_ani_inaccurate_size_est(): # TODO: It's actually really tricky to get the set size to be inaccurate. Eg. For a scale factor of 10000, # you would need - f1 = utils.get_test_data('2.fa.sig') - f2 = utils.get_test_data('2+63.fa.sig') + f1 = utils.get_test_data("2.fa.sig") + f2 = utils.get_test_data("2+63.fa.sig") mh1 = sourmash.load_one_signature(f1, ksize=31).minhash mh2 = sourmash.load_one_signature(f2).minhash # downsample @@ -3160,12 +3287,12 @@ def test_minhash_ani_inaccurate_size_est(): m1_ca_m2_ds = mh1_ds.containment_ani(mh2_ds) print(m1_ca_m2_ds) - assert m1_ca_m2_ds.ani == None #0.987 + assert m1_ca_m2_ds.ani is None # 0.987 assert m1_ca_m2_ds.size_is_inaccurate == True def test_size_num_fail(): - f1 = utils.get_test_data('num/47.fa.sig') + f1 = utils.get_test_data("num/47.fa.sig") mh1 = sourmash.load_one_signature(f1, ksize=31).minhash with pytest.raises(TypeError) as exc: diff --git a/tests/test_nodegraph.py b/tests/test_nodegraph.py index 68283dd620..bc9e02754b 100644 --- a/tests/test_nodegraph.py +++ b/tests/test_nodegraph.py @@ -2,15 +2,19 @@ import pytest -from sourmash.nodegraph import Nodegraph, extract_nodegraph_info, calc_expected_collisions +from sourmash.nodegraph import ( + Nodegraph, + extract_nodegraph_info, + calc_expected_collisions, +) import sourmash_tst_utils as utils def test_nodegraph_to_khmer_basic(): - pytest.importorskip('khmer') + pytest.importorskip("khmer") - ng_file = utils.get_test_data('.sbt.v3/internal.0') + ng_file = utils.get_test_data(".sbt.v3/internal.0") sourmash_ng = Nodegraph.load(ng_file) khmer_sm_ng = sourmash_ng.to_khmer_nodegraph() @@ -19,7 +23,7 @@ def test_nodegraph_to_khmer_basic(): def test_nodegraph_khmer_compare(): - khmer = pytest.importorskip('khmer') + khmer = pytest.importorskip("khmer") khmer_ng = khmer.Nodegraph(3, 23, 6) khmer_ng.count("ACG") @@ -43,14 +47,14 @@ def test_nodegraph_khmer_compare(): def test_nodegraph_same_file(): - khmer = pytest.importorskip('khmer') + khmer = pytest.importorskip("khmer") try: load_nodegraph = khmer.load_nodegraph except AttributeError: load_nodegraph = khmer.Nodegraph.load - ng_file = utils.get_test_data('.sbt.v3/internal.0') - with open(ng_file, 'rb') as f: + ng_file = 
utils.get_test_data(".sbt.v3/internal.0") + with open(ng_file, "rb") as f: ng_data = f.read() sourmash_ng = Nodegraph.load(ng_file) @@ -85,7 +89,7 @@ def test_nodegraph_same_file(): def test_nodegraph_expected_collisions(): - ng_file = utils.get_test_data('.sbt.v3/internal.0') + ng_file = utils.get_test_data(".sbt.v3/internal.0") sourmash_ng = Nodegraph.load(ng_file) @@ -93,7 +97,7 @@ def test_nodegraph_expected_collisions(): def test_nodegraph_expected_collisions_error(): - ng_file = utils.get_test_data('.sbt.v3/internal.0') + ng_file = utils.get_test_data(".sbt.v3/internal.0") sourmash_ng = Nodegraph.load(ng_file) diff --git a/tests/test_np_utils.py b/tests/test_np_utils.py index 50aaa756f4..e23ca361a0 100644 --- a/tests/test_np_utils.py +++ b/tests/test_np_utils.py @@ -5,7 +5,6 @@ def test_memmap(): - e1 = sourmash.MinHash(n=1, ksize=20) sig1 = SourmashSignature(e1) diff --git a/tests/test_picklist.py b/tests/test_picklist.py index 73c8799689..682d6fb8af 100644 --- a/tests/test_picklist.py +++ b/tests/test_picklist.py @@ -14,23 +14,23 @@ def test_load_empty_picklist_fail(): - empty = utils.get_test_data('picklist/empty.csv') + empty = utils.get_test_data("picklist/empty.csv") - pl = SignaturePicklist('manifest', pickfile=empty) + pl = SignaturePicklist("manifest", pickfile=empty) with pytest.raises(ValueError): pl.load(allow_empty=False) def test_load_empty_picklist_allow(): - empty = utils.get_test_data('picklist/empty.csv') + empty = utils.get_test_data("picklist/empty.csv") - pl = SignaturePicklist('manifest', pickfile=empty) + pl = SignaturePicklist("manifest", pickfile=empty) pl.load(allow_empty=True) def test_dup_md5_picked(runtmp): # load a sig, duplicate, and see if a picklist gets the right one - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss = sourmash.load_file_as_signatures(sig47) sig = list(ss)[0] @@ -41,26 +41,26 @@ def test_dup_md5_picked(runtmp): print(ml.manifest.rows) assert len(ml.manifest) == 1 - mf_csv = runtmp.output('select.csv') + mf_csv = runtmp.output("select.csv") ml.manifest.write_to_filename(mf_csv) # now make an index to select against, with an identical signature # (but diff name) new_sig = sig.to_mutable() - new_sig.name = 'foo' + new_sig.name = "foo" xl = LinearIndex([sig, new_sig]) ml2 = MultiIndex.load([xl], [None], None) assert len(ml2) == 2 # create a picklist... 
- pl = SignaturePicklist('manifest', pickfile=mf_csv) + pl = SignaturePicklist("manifest", pickfile=mf_csv) print(pl.load()) - print('loaded:', len(pl.pickset)) + print("loaded:", len(pl.pickset)) # use in select ml3 = ml2.select(picklist=pl) - print('picked:', len(ml3)) + print("picked:", len(ml3)) assert len(pl.pickset) == len(ml3) @@ -68,7 +68,7 @@ def test_dup_md5_picked(runtmp): def test_dup_md5_picked_mf_to_picklist(runtmp): # load a sig, duplicate, and see if a picklist gets the right one # uses an in memory picklist - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss = sourmash.load_file_as_signatures(sig47) sig = list(ss)[0] @@ -84,7 +84,7 @@ def test_dup_md5_picked_mf_to_picklist(runtmp): # now make an index to select against, with an identical signature # (but diff name) new_sig = sig.to_mutable() - new_sig.name = 'foo' + new_sig.name = "foo" xl = LinearIndex([sig, new_sig]) ml2 = MultiIndex.load([xl], [None], None) @@ -92,7 +92,7 @@ def test_dup_md5_picked_mf_to_picklist(runtmp): # use picklist in select ml3 = ml2.select(picklist=pl) - print('picked:', len(ml3)) + print("picked:", len(ml3)) assert len(pl.pickset) == len(ml3) @@ -100,12 +100,12 @@ def test_dup_md5_picked_mf_to_picklist(runtmp): def test_dup_md5_picked_mf_to_picklist_sqlite(runtmp): # load a sig, duplicate, and see if a picklist gets the right one # use a sqlite db with its own to_picklist behavior. - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss = sourmash.load_file_as_signatures(sig47) sig = list(ss)[0] # save a manifest with one entry - xl = SqliteIndex.create(':memory:') + xl = SqliteIndex.create(":memory:") xl.insert(sig) print(xl.manifest.rows) @@ -116,7 +116,7 @@ def test_dup_md5_picked_mf_to_picklist_sqlite(runtmp): # now make an index to select against, with an identical signature # (but diff name) new_sig = sig.to_mutable() - new_sig.name = 'foo' + new_sig.name = "foo" xl = LinearIndex([sig, new_sig]) ml2 = MultiIndex.load([xl], [None], None) @@ -124,6 +124,6 @@ def test_dup_md5_picked_mf_to_picklist_sqlite(runtmp): # use picklist in select ml3 = ml2.select(picklist=pl) - print('picked:', len(ml3)) + print("picked:", len(ml3)) assert len(pl.pickset) == len(ml3) diff --git a/tests/test_plugin_framework.py b/tests/test_plugin_framework.py index 06156e4d85..1acb78bd6c 100644 --- a/tests/test_plugin_framework.py +++ b/tests/test_plugin_framework.py @@ -13,22 +13,23 @@ import sourmash_tst_utils as utils from sourmash import plugins from sourmash.index import LinearIndex -from sourmash.save_load import (Base_SaveSignaturesToLocation, - SaveSignaturesToLocation) +from sourmash.save_load import Base_SaveSignaturesToLocation, SaveSignaturesToLocation + + +_Dist = collections.namedtuple("_Dist", ["version"]) -_Dist = collections.namedtuple('_Dist', ['version']) class FakeEntryPoint: """ A class that stores a name and an object to be returned on 'load()'. Mocks the EntryPoint class used by importlib.metadata. """ - module = 'test_plugin_framework' - dist = _Dist('0.1') - group = 'groupfoo' - def __init__(self, name, load_obj, *, - error_on_import=None): + module = "test_plugin_framework" + dist = _Dist("0.1") + group = "groupfoo" + + def __init__(self, name, load_obj, *, error_on_import=None): self.name = name self.load_obj = load_obj self.error_on_import = error_on_import @@ -38,15 +39,17 @@ def load(self): raise self.error_on_import("as requested") return self.load_obj + # # Test basic features of the load_from plugin hook. 
# + class Test_EntryPointBasics_LoadFrom: def get_some_sigs(self, location, *args, **kwargs): - ss2 = utils.get_test_data('2.fa.sig') - ss47 = utils.get_test_data('47.fa.sig') - ss63 = utils.get_test_data('63.fa.sig') + ss2 = utils.get_test_data("2.fa.sig") + ss47 = utils.get_test_data("47.fa.sig") + ss63 = utils.get_test_data("63.fa.sig") sig2 = sourmash.load_one_signature(ss2, ksize=31) sig47 = sourmash.load_one_signature(ss47, ksize=31) @@ -55,12 +58,17 @@ def get_some_sigs(self, location, *args, **kwargs): lidx = LinearIndex([sig2, sig47, sig63], location) return lidx + get_some_sigs.priority = 1 - + def setup_method(self): self.saved_plugins = plugins._plugin_load_from - plugins._plugin_load_from = [FakeEntryPoint('test_load', self.get_some_sigs), - FakeEntryPoint('test_load', self.get_some_sigs, error_on_import=ModuleNotFoundError)] + plugins._plugin_load_from = [ + FakeEntryPoint("test_load", self.get_some_sigs), + FakeEntryPoint( + "test_load", self.get_some_sigs, error_on_import=ModuleNotFoundError + ), + ] def teardown_method(self): plugins._plugin_load_from = self.saved_plugins @@ -70,7 +78,7 @@ def test_load_1(self): assert len(ps) == 1 def test_load_2(self, runtmp): - fake_location = runtmp.output('passed-through location') + fake_location = runtmp.output("passed-through location") idx = sourmash.load_file_as_index(fake_location) print(idx, idx.location) @@ -80,9 +88,9 @@ def test_load_2(self, runtmp): class Test_EntryPoint_LoadFrom_Priority: def get_some_sigs(self, location, *args, **kwargs): - ss2 = utils.get_test_data('2.fa.sig') - ss47 = utils.get_test_data('47.fa.sig') - ss63 = utils.get_test_data('63.fa.sig') + ss2 = utils.get_test_data("2.fa.sig") + ss47 = utils.get_test_data("47.fa.sig") + ss63 = utils.get_test_data("63.fa.sig") sig2 = sourmash.load_one_signature(ss2, ksize=31) sig47 = sourmash.load_one_signature(ss47, ksize=31) @@ -91,39 +99,43 @@ def get_some_sigs(self, location, *args, **kwargs): lidx = LinearIndex([sig2, sig47, sig63], location) return lidx + get_some_sigs.priority = 5 def set_called_flag_1(self, location, *args, **kwargs): # high priority 1, raise ValueError - print('setting flag 1') + print("setting flag 1") self.was_called_flag_1 = True raise ValueError + set_called_flag_1.priority = 1 def set_called_flag_2(self, location, *args, **kwargs): # high priority 2, return None - print('setting flag 2') + print("setting flag 2") self.was_called_flag_2 = True return None + set_called_flag_2.priority = 2 def set_called_flag_3(self, location, *args, **kwargs): # lower priority 10, should not be called - print('setting flag 3') + print("setting flag 3") self.was_called_flag_3 = True return None + set_called_flag_3.priority = 10 def setup_method(self): self.saved_plugins = plugins._plugin_load_from plugins._plugin_load_from = [ - FakeEntryPoint('test_load', self.get_some_sigs), - FakeEntryPoint('test_load_2', self.set_called_flag_1), - FakeEntryPoint('test_load_3', self.set_called_flag_2), - FakeEntryPoint('test_load_4', self.set_called_flag_3) - ] + FakeEntryPoint("test_load", self.get_some_sigs), + FakeEntryPoint("test_load_2", self.set_called_flag_1), + FakeEntryPoint("test_load_3", self.set_called_flag_2), + FakeEntryPoint("test_load_4", self.set_called_flag_3), + ] self.was_called_flag_1 = False self.was_called_flag_2 = False self.was_called_flag_3 = False @@ -140,7 +152,7 @@ def test_load_1(self): assert not self.was_called_flag_3 def test_load_2(self, runtmp): - fake_location = runtmp.output('passed-through location') + fake_location = 
runtmp.output("passed-through location") idx = sourmash.load_file_as_index(fake_location) print(idx, idx.location) @@ -156,10 +168,12 @@ def test_load_2(self, runtmp): # Test basic features of the save_to plugin hook. # + class FakeSaveClass(Base_SaveSignaturesToLocation): """ A fake save class that just records what was sent to it. """ + priority = 50 def __init__(self, location): @@ -169,7 +183,7 @@ def __init__(self, location): @classmethod def matches(cls, location): if location: - return location.endswith('.this-is-a-test') + return location.endswith(".this-is-a-test") def add(self, ss): super().add(ss) @@ -184,8 +198,12 @@ class Test_EntryPointBasics_SaveTo: # test the basics def setup_method(self): self.saved_plugins = plugins._plugin_save_to - plugins._plugin_save_to = [FakeEntryPoint('test_save', FakeSaveClass), - FakeEntryPoint('test_save', FakeSaveClass, error_on_import=ModuleNotFoundError)] + plugins._plugin_save_to = [ + FakeEntryPoint("test_save", FakeSaveClass), + FakeEntryPoint( + "test_save", FakeSaveClass, error_on_import=ModuleNotFoundError + ), + ] def teardown_method(self): plugins._plugin_save_to = self.saved_plugins @@ -197,9 +215,9 @@ def test_save_1(self): def test_save_2(self, runtmp): # load some signatures to save - ss2 = utils.get_test_data('2.fa.sig') - ss47 = utils.get_test_data('47.fa.sig') - ss63 = utils.get_test_data('63.fa.sig') + ss2 = utils.get_test_data("2.fa.sig") + ss47 = utils.get_test_data("47.fa.sig") + ss63 = utils.get_test_data("63.fa.sig") sig2 = sourmash.load_one_signature(ss2, ksize=31) sig47 = sourmash.load_one_signature(ss47, ksize=31) @@ -207,7 +225,7 @@ def test_save_2(self, runtmp): # build a fake location that matches the FakeSaveClass # extension - fake_location = runtmp.output('out.this-is-a-test') + fake_location = runtmp.output("out.this-is-a-test") # this should use the plugin architecture to return an object # of type FakeSaveClass, with the three signatures in it. @@ -230,8 +248,8 @@ class Test_EntryPointPriority_SaveTo: def setup_method(self): self.saved_plugins = plugins._plugin_save_to plugins._plugin_save_to = [ - FakeEntryPoint('test_save', FakeSaveClass), - FakeEntryPoint('test_save2', FakeSaveClass_HighPriority), + FakeEntryPoint("test_save", FakeSaveClass), + FakeEntryPoint("test_save2", FakeSaveClass_HighPriority), ] def teardown_method(self): @@ -244,9 +262,9 @@ def test_save_1(self): def test_save_2(self, runtmp): # load some signatures to save - ss2 = utils.get_test_data('2.fa.sig') - ss47 = utils.get_test_data('47.fa.sig') - ss63 = utils.get_test_data('63.fa.sig') + ss2 = utils.get_test_data("2.fa.sig") + ss47 = utils.get_test_data("47.fa.sig") + ss63 = utils.get_test_data("63.fa.sig") sig2 = sourmash.load_one_signature(ss2, ksize=31) sig47 = sourmash.load_one_signature(ss47, ksize=31) @@ -254,7 +272,7 @@ def test_save_2(self, runtmp): # build a fake location that matches the FakeSaveClass # extension - fake_location = runtmp.output('out.this-is-a-test') + fake_location = runtmp.output("out.this-is-a-test") # this should use the plugin architecture to return an object # of type FakeSaveClass, with the three signatures in it. @@ -276,18 +294,20 @@ def test_save_2(self, runtmp): # Test basic features of the save_to plugin hook. # + class FakeCommandClass(plugins.CommandLinePlugin): """ A fake CLI class. 
""" - command = 'nifty' + + command = "nifty" description = "do somethin' nifty" def __init__(self, parser): super().__init__(parser) - parser.add_argument('arg1') - parser.add_argument('--other', action='store_true') - parser.add_argument('--do-fail', action='store_true') + parser.add_argument("arg1") + parser.add_argument("--other", action="store_true") + parser.add_argument("--do-fail", action="store_true") def main(self, args): super().main(args) @@ -305,8 +325,7 @@ def setup_method(self): _ = plugins.get_cli_script_plugins() self.saved_plugins = plugins._plugin_cli plugins._plugin_cli_once = False - plugins._plugin_cli = [FakeEntryPoint('test_command', - FakeCommandClass)] + plugins._plugin_cli = [FakeEntryPoint("test_command", FakeCommandClass)] def teardown_method(self): plugins._plugin_cli = self.saved_plugins @@ -316,17 +335,17 @@ def test_empty(self, runtmp): plugins._plugin_cli = [] with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts') + runtmp.sourmash("scripts") out = runtmp.last_result.out err = runtmp.last_result.err print(out) print(err) - assert '(No script plugins detected!)' in out + assert "(No script plugins detected!)" in out def test_cmd_0(self, runtmp): # test default output with some plugins with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts') + runtmp.sourmash("scripts") out = runtmp.last_result.out err = runtmp.last_result.err @@ -354,32 +373,32 @@ def test_cmd_2(self): def test_cmd_3(self, runtmp): # test ability to run 'nifty' ;) with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'nifty') + runtmp.sourmash("scripts", "nifty") out = runtmp.last_result.out err = runtmp.last_result.err print(out) print(err) - assert 'nifty: error: the following arguments are required: arg1' in err - assert 'usage: nifty [-h] [-q] [-d] [--other] [--do-fail] arg1' in err + assert "nifty: error: the following arguments are required: arg1" in err + assert "usage: nifty [-h] [-q] [-d] [--other] [--do-fail] arg1" in err def test_cmd_4(self, runtmp): # test basic argument parsing etc - runtmp.sourmash('scripts', 'nifty', '--other', 'some arg') + runtmp.sourmash("scripts", "nifty", "--other", "some arg") out = runtmp.last_result.out err = runtmp.last_result.err print(out) print(err) - assert 'other is True' in out - assert 'hello, world! argument is: some arg' in out + assert "other is True" in out + assert "hello, world! argument is: some arg" in out def test_cmd_5(self, runtmp): # test exit code passthru with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts', 'nifty', '--do-fail', 'some arg') + runtmp.sourmash("scripts", "nifty", "--do-fail", "some arg") status = runtmp.last_result.status out = runtmp.last_result.out @@ -388,22 +407,23 @@ def test_cmd_5(self, runtmp): print(err) print(status) - assert 'other is False' in out - assert 'hello, world! argument is: some arg' in out + assert "other is False" in out + assert "hello, world! argument is: some arg" in out class FakeCommandClass_Second(plugins.CommandLinePlugin): """ A fake CLI class. 
""" - command = 'more_nifty' + + command = "more_nifty" description = "do somethin' else nifty" def __init__(self, parser): super().__init__(parser) - parser.add_argument('arg1') - parser.add_argument('--other', action='store_true') - parser.add_argument('--do-fail', action='store_true') + parser.add_argument("arg1") + parser.add_argument("--other", action="store_true") + parser.add_argument("--do-fail", action="store_true") def main(self, args): super().main(args) @@ -419,6 +439,7 @@ class FakeCommandClass_Broken_1: """ A fake CLI class. """ + # command = 'more_nifty' # no command def __init__(self, parser): @@ -432,7 +453,8 @@ class FakeCommandClass_Broken_2: """ A fake CLI class. """ - command = 'broken' + + command = "broken" # no description def __init__(self, parser): @@ -448,18 +470,15 @@ def setup_method(self): _ = plugins.get_cli_script_plugins() self.saved_plugins = plugins._plugin_cli plugins._plugin_cli_once = False - plugins._plugin_cli = [FakeEntryPoint('test_command', - FakeCommandClass), - FakeEntryPoint('test_command2', - FakeCommandClass_Second), - FakeEntryPoint('test_command3', - FakeCommandClass_Broken_1), - FakeEntryPoint('test_command4', - FakeCommandClass_Broken_2), - FakeEntryPoint('error-on-import', - FakeCommandClass, - error_on_import=ModuleNotFoundError) - ] + plugins._plugin_cli = [ + FakeEntryPoint("test_command", FakeCommandClass), + FakeEntryPoint("test_command2", FakeCommandClass_Second), + FakeEntryPoint("test_command3", FakeCommandClass_Broken_1), + FakeEntryPoint("test_command4", FakeCommandClass_Broken_2), + FakeEntryPoint( + "error-on-import", FakeCommandClass, error_on_import=ModuleNotFoundError + ), + ] def teardown_method(self): plugins._plugin_cli = self.saved_plugins @@ -467,7 +486,7 @@ def teardown_method(self): def test_cmd_0(self, runtmp): # test default output for a few plugins with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash('scripts') + runtmp.sourmash("scripts") out = runtmp.last_result.out err = runtmp.last_result.err @@ -481,7 +500,7 @@ def test_cmd_0(self, runtmp): def test_cmd_1(self, runtmp): # test 'nifty' - runtmp.sourmash('scripts', 'nifty', 'some arg') + runtmp.sourmash("scripts", "nifty", "some arg") status = runtmp.last_result.status out = runtmp.last_result.out @@ -490,12 +509,12 @@ def test_cmd_1(self, runtmp): print(err) print(status) - assert 'other is False' in out - assert 'hello, world! argument is: some arg' in out + assert "other is False" in out + assert "hello, world! argument is: some arg" in out def test_cmd_2(self, runtmp): # test 'more_nifty' - runtmp.sourmash('scripts', 'more_nifty', 'some arg') + runtmp.sourmash("scripts", "more_nifty", "some arg") status = runtmp.last_result.status out = runtmp.last_result.out @@ -504,12 +523,12 @@ def test_cmd_2(self, runtmp): print(err) print(status) - assert 'other is False' in out - assert 'hello, world! argument is: some arg' in out + assert "other is False" in out + assert "hello, world! 
argument is: some arg" in out def test_sourmash_info(self, runtmp): # test 'sourmash info -v' => shows the plugins - runtmp.sourmash('info', '-v') + runtmp.sourmash("info", "-v") out = runtmp.last_result.out err = runtmp.last_result.err diff --git a/tests/test_prefetch.py b/tests/test_prefetch.py index 7ab2d2c1dd..44c6b4aac5 100644 --- a/tests/test_prefetch.py +++ b/tests/test_prefetch.py @@ -25,29 +25,40 @@ def test_prefetch_basic(runtmp, linear_gather): c = runtmp # test a basic prefetch - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - linear_gather) + c.run_sourmash("prefetch", "-k", "31", sig47, sig63, sig2, sig47, linear_gather) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "WARNING: no output(s) specified! Nothing will be saved from this prefetch!" in c.last_result.err + assert ( + "WARNING: no output(s) specified! Nothing will be saved from this prefetch!" + in c.last_result.err + ) assert "selecting specified query k=31" in c.last_result.err - assert "loaded query: NC_009665.1 Shewanella baltica... (k=31, DNA)" in c.last_result.err - assert "query sketch has scaled=1000; will be dynamically downsampled as needed" in c.last_result.err + assert ( + "loaded query: NC_009665.1 Shewanella baltica... (k=31, DNA)" + in c.last_result.err + ) + assert ( + "query sketch has scaled=1000; will be dynamically downsampled as needed" + in c.last_result.err + ) err = c.last_result.err assert "loaded 5 total signatures from 3 locations." in err assert "after selecting signatures compatible with search, 3 remain." in err assert "total of 2 matching signatures." in c.last_result.err - assert "of 5177 distinct query hashes, 5177 were found in matches above threshold." in c.last_result.err + assert ( + "of 5177 distinct query hashes, 5177 were found in matches above threshold." + in c.last_result.err + ) assert "a total of 0 query hashes remain unmatched." in c.last_result.err @@ -55,15 +66,18 @@ def test_prefetch_select_query_ksize(runtmp, linear_gather): # test prefetch where query and subject db both have multiple ksizes c = runtmp - ss = utils.get_test_data('GCF_000005845.2_ASM584v2_genomic.fna.gz.sig') + ss = utils.get_test_data("GCF_000005845.2_ASM584v2_genomic.fna.gz.sig") - c.run_sourmash('prefetch', ss, ss, linear_gather) + c.run_sourmash("prefetch", ss, ss, linear_gather) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'of 4476 distinct query hashes, 4476 were found in matches above threshold.' in c.last_result.err + assert ( + "of 4476 distinct query hashes, 4476 were found in matches above threshold." 
+ in c.last_result.err + ) def test_prefetch_subject_scaled_is_larger(runtmp, linear_gather): @@ -71,26 +85,39 @@ def test_prefetch_subject_scaled_is_larger(runtmp, linear_gather): c = runtmp # make a query sketch with scaled=1000 - fa = utils.get_test_data('genome-s10.fa.gz') - c.run_sourmash('sketch', 'dna', fa, '-o', 'query.sig') - assert os.path.exists(runtmp.output('query.sig')) + fa = utils.get_test_data("genome-s10.fa.gz") + c.run_sourmash("sketch", "dna", fa, "-o", "query.sig") + assert os.path.exists(runtmp.output("query.sig")) # this has a scaled of 10000, from same genome: - against1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig') - against2 = utils.get_test_data('scaled/all.sbt.zip') - against3 = utils.get_test_data('scaled/all.lca.json') + against1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig") + against2 = utils.get_test_data("scaled/all.sbt.zip") + against3 = utils.get_test_data("scaled/all.lca.json") # run against large scaled, then small (self) - c.run_sourmash('prefetch', 'query.sig', against1, against2, against3, - 'query.sig', linear_gather) + c.run_sourmash( + "prefetch", + "query.sig", + against1, + against2, + against3, + "query.sig", + linear_gather, + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'total of 8 matching signatures.' in c.last_result.err - assert 'of 48 distinct query hashes, 48 were found in matches above threshold.' in c.last_result.err - assert 'final scaled value (max across query and all matches) is 10000' in c.last_result.err + assert "total of 8 matching signatures." in c.last_result.err + assert ( + "of 48 distinct query hashes, 48 were found in matches above threshold." + in c.last_result.err + ) + assert ( + "final scaled value (max across query and all matches) is 10000" + in c.last_result.err + ) def test_prefetch_subject_scaled_is_larger_outsigs(runtmp, linear_gather): @@ -98,30 +125,45 @@ def test_prefetch_subject_scaled_is_larger_outsigs(runtmp, linear_gather): c = runtmp # make a query sketch with scaled=1000 - fa = utils.get_test_data('genome-s10.fa.gz') - c.run_sourmash('sketch', 'dna', fa, '-o', 'query.sig') - assert os.path.exists(runtmp.output('query.sig')) + fa = utils.get_test_data("genome-s10.fa.gz") + c.run_sourmash("sketch", "dna", fa, "-o", "query.sig") + assert os.path.exists(runtmp.output("query.sig")) # this has a scaled of 10000, from same genome: - against1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig') - against2 = utils.get_test_data('scaled/all.sbt.zip') - against3 = utils.get_test_data('scaled/all.lca.json') + against1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig") + against2 = utils.get_test_data("scaled/all.sbt.zip") + against3 = utils.get_test_data("scaled/all.lca.json") # run against large scaled, then small (self) - c.run_sourmash('prefetch', 'query.sig', against1, against2, against3, - 'query.sig', linear_gather, '--save-matches', 'matches.sig') + c.run_sourmash( + "prefetch", + "query.sig", + against1, + against2, + against3, + "query.sig", + linear_gather, + "--save-matches", + "matches.sig", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'total of 8 matching signatures.' in c.last_result.err - assert 'of 48 distinct query hashes, 48 were found in matches above threshold.' in c.last_result.err - assert 'final scaled value (max across query and all matches) is 10000' in c.last_result.err + assert "total of 8 matching signatures." 
in c.last_result.err + assert ( + "of 48 distinct query hashes, 48 were found in matches above threshold." + in c.last_result.err + ) + assert ( + "final scaled value (max across query and all matches) is 10000" + in c.last_result.err + ) # make sure non-downsampled sketches were saved. - matches = sourmash.load_file_as_signatures(runtmp.output('matches.sig')) - scaled_vals = set([ match.minhash.scaled for match in matches ]) + matches = sourmash.load_file_as_signatures(runtmp.output("matches.sig")) + scaled_vals = set([match.minhash.scaled for match in matches]) assert 1000 in scaled_vals assert 10000 in scaled_vals assert len(scaled_vals) == 2 @@ -131,25 +173,36 @@ def test_prefetch_query_abund(runtmp, linear_gather): c = runtmp # test a basic prefetch w/abund query - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - linear_gather) + c.run_sourmash("prefetch", "-k", "31", sig47, sig63, sig2, sig47, linear_gather) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "WARNING: no output(s) specified! Nothing will be saved from this prefetch!" in c.last_result.err + assert ( + "WARNING: no output(s) specified! Nothing will be saved from this prefetch!" + in c.last_result.err + ) assert "selecting specified query k=31" in c.last_result.err - assert "loaded query: NC_009665.1 Shewanella baltica... (k=31, DNA)" in c.last_result.err - assert "query sketch has scaled=1000; will be dynamically downsampled as needed" in c.last_result.err + assert ( + "loaded query: NC_009665.1 Shewanella baltica... (k=31, DNA)" + in c.last_result.err + ) + assert ( + "query sketch has scaled=1000; will be dynamically downsampled as needed" + in c.last_result.err + ) assert "total of 2 matching signatures." in c.last_result.err - assert "of 5177 distinct query hashes, 5177 were found in matches above threshold." in c.last_result.err + assert ( + "of 5177 distinct query hashes, 5177 were found in matches above threshold." + in c.last_result.err + ) assert "a total of 0 query hashes remain unmatched." in c.last_result.err @@ -157,25 +210,36 @@ def test_prefetch_subj_abund(runtmp, linear_gather): c = runtmp # test a basic prefetch w/abund signature. - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - linear_gather) + c.run_sourmash("prefetch", "-k", "31", sig47, sig63, sig2, sig47, linear_gather) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "WARNING: no output(s) specified! Nothing will be saved from this prefetch!" in c.last_result.err + assert ( + "WARNING: no output(s) specified! Nothing will be saved from this prefetch!" + in c.last_result.err + ) assert "selecting specified query k=31" in c.last_result.err - assert "loaded query: NC_009665.1 Shewanella baltica... 
(k=31, DNA)" in c.last_result.err - assert "query sketch has scaled=1000; will be dynamically downsampled as needed" in c.last_result.err + assert ( + "loaded query: NC_009665.1 Shewanella baltica... (k=31, DNA)" + in c.last_result.err + ) + assert ( + "query sketch has scaled=1000; will be dynamically downsampled as needed" + in c.last_result.err + ) assert "total of 2 matching signatures." in c.last_result.err - assert "of 5177 distinct query hashes, 5177 were found in matches above threshold." in c.last_result.err + assert ( + "of 5177 distinct query hashes, 5177 were found in matches above threshold." + in c.last_result.err + ) assert "a total of 0 query hashes remain unmatched." in c.last_result.err @@ -183,14 +247,15 @@ def test_prefetch_csv_out(runtmp, linear_gather): c = runtmp # test a basic prefetch, with CSV output - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") - csvout = c.output('out.csv') + csvout = c.output("out.csv") - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - '-o', csvout, linear_gather) + c.run_sourmash( + "prefetch", "-k", "31", sig47, sig63, sig2, sig47, "-o", csvout, linear_gather + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -199,25 +264,26 @@ def test_prefetch_csv_out(runtmp, linear_gather): assert os.path.exists(csvout) expected_intersect_bp = [2529000, 5177000] - with open(csvout, 'rt', newline="") as fp: + with open(csvout, newline="") as fp: r = csv.DictReader(fp) - for (row, expected) in zip(r, expected_intersect_bp): + for row, expected in zip(r, expected_intersect_bp): print(row) - assert int(row['intersect_bp']) == expected + assert int(row["intersect_bp"]) == expected def test_prefetch_csv_gz_out(runtmp, linear_gather): c = runtmp # test a basic prefetch, with CSV output to a .gz file - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") - csvout = c.output('out.csv.gz') + csvout = c.output("out.csv.gz") - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - '-o', csvout, linear_gather) + c.run_sourmash( + "prefetch", "-k", "31", sig47, sig63, sig2, sig47, "-o", csvout, linear_gather + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -226,25 +292,35 @@ def test_prefetch_csv_gz_out(runtmp, linear_gather): assert os.path.exists(csvout) expected_intersect_bp = [2529000, 5177000] - with gzip.open(csvout, 'rt', newline="") as fp: + with gzip.open(csvout, "rt", newline="") as fp: r = csv.DictReader(fp) - for (row, expected) in zip(r, expected_intersect_bp): + for row, expected in zip(r, expected_intersect_bp): print(row) - assert int(row['intersect_bp']) == expected + assert int(row["intersect_bp"]) == expected def test_prefetch_matches(runtmp, linear_gather): c = runtmp # test a basic prefetch, with --save-matches - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - - matches_out = c.output('matches.sig') - - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - '--save-matches', matches_out, linear_gather) + sig2 = utils.get_test_data("2.fa.sig") + sig47 = 
utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + + matches_out = c.output("matches.sig") + + c.run_sourmash( + "prefetch", + "-k", + "31", + sig47, + sig63, + sig2, + sig47, + "--save-matches", + matches_out, + linear_gather, + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -255,7 +331,7 @@ def test_prefetch_matches(runtmp, linear_gather): sigs = sourmash.load_file_as_index(matches_out) expected_matches = [sig63, sig47] - for (match, expected) in zip(sigs.signatures(), expected_matches): + for match, expected in zip(sigs.signatures(), expected_matches): ss = sourmash.load_one_signature(expected, ksize=31) assert match == ss @@ -264,16 +340,26 @@ def test_prefetch_matches_to_dir(runtmp, linear_gather): c = runtmp # test a basic prefetch, with --save-matches to a directory - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss63 = sourmash.load_one_signature(sig63) ss47 = sourmash.load_one_signature(sig47) - matches_out = c.output('matches_dir/') - - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - '--save-matches', matches_out, linear_gather) + matches_out = c.output("matches_dir/") + + c.run_sourmash( + "prefetch", + "-k", + "31", + sig47, + sig63, + sig2, + sig47, + "--save-matches", + matches_out, + linear_gather, + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -296,16 +382,26 @@ def test_prefetch_matches_to_sig_gz(runtmp, linear_gather): import gzip # test a basic prefetch, with --save-matches to a sig.gz file - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss63 = sourmash.load_one_signature(sig63) ss47 = sourmash.load_one_signature(sig47) - matches_out = c.output('matches.sig.gz') - - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - '--save-matches', matches_out, linear_gather) + matches_out = c.output("matches.sig.gz") + + c.run_sourmash( + "prefetch", + "-k", + "31", + sig47, + sig63, + sig2, + sig47, + "--save-matches", + matches_out, + linear_gather, + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -332,16 +428,26 @@ def test_prefetch_matches_to_zip(runtmp, linear_gather): # test a basic prefetch, with --save-matches to a zipfile import zipfile - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss63 = sourmash.load_one_signature(sig63) ss47 = sourmash.load_one_signature(sig47) - matches_out = c.output('matches.zip') - - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - '--save-matches', matches_out, linear_gather) + matches_out = c.output("matches.zip") + + c.run_sourmash( + "prefetch", + "-k", + "31", + sig47, + sig63, + sig2, + sig47, + "--save-matches", + matches_out, + linear_gather, + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -367,14 +473,22 @@ def test_prefetch_matching_hashes(runtmp, linear_gather): c = runtmp # test a basic prefetch, with 
--save-matching-hashes
-    sig2 = utils.get_test_data('2.fa.sig')
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
-
-    matches_out = c.output('matches.sig')
-
-    c.run_sourmash('prefetch', '-k', '31', sig47, sig63,
-                   '--save-matching-hashes', matches_out, linear_gather)
+    utils.get_test_data("2.fa.sig")
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig63 = utils.get_test_data("63.fa.sig")
+
+    matches_out = c.output("matches.sig")
+
+    c.run_sourmash(
+        "prefetch",
+        "-k",
+        "31",
+        sig47,
+        sig63,
+        "--save-matching-hashes",
+        matches_out,
+        linear_gather,
+    )
     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

@@ -390,7 +504,7 @@ def test_prefetch_matching_hashes(runtmp, linear_gather):
     intersect.add_many(matches)

     ss = sourmash.load_one_signature(matches_out)
-    assert ss.name.endswith('-known')
+    assert ss.name.endswith("-known")
     assert ss.minhash == intersect


@@ -398,14 +512,23 @@ def test_prefetch_nomatch_hashes(runtmp, linear_gather):
     c = runtmp

     # test a basic prefetch, with --save-unmatched-hashes
-    sig2 = utils.get_test_data('2.fa.sig')
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
-
-    nomatch_out = c.output('unmatched_hashes.sig')
-
-    c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2,
-                   '--save-unmatched-hashes', nomatch_out, linear_gather)
+    sig2 = utils.get_test_data("2.fa.sig")
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig63 = utils.get_test_data("63.fa.sig")
+
+    nomatch_out = c.output("unmatched_hashes.sig")
+
+    c.run_sourmash(
+        "prefetch",
+        "-k",
+        "31",
+        sig47,
+        sig63,
+        sig2,
+        "--save-unmatched-hashes",
+        nomatch_out,
+        linear_gather,
+    )
     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

@@ -420,7 +543,7 @@ def test_prefetch_nomatch_hashes(runtmp, linear_gather):
     remain.remove_many(ss63.minhash.hashes)

     ss = sourmash.load_one_signature(nomatch_out)
-    assert ss.name.endswith('-unknown')
+    assert ss.name.endswith("-unknown")
     assert ss.minhash == remain


@@ -428,12 +551,11 @@ def test_prefetch_no_num_query(runtmp, linear_gather):
     c = runtmp

     # can't do prefetch with num signatures for query
-    sig47 = utils.get_test_data('num/47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
+    sig47 = utils.get_test_data("num/47.fa.sig")
+    sig63 = utils.get_test_data("63.fa.sig")

     with pytest.raises(SourmashCommandFailed):
-        c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig47,
-                       linear_gather)
+        c.run_sourmash("prefetch", "-k", "31", sig47, sig63, sig47, linear_gather)

     print(c.last_result.status)
     print(c.last_result.out)

@@ -446,50 +568,66 @@ def test_prefetch_no_num_subj(runtmp, linear_gather):
     c = runtmp

     # can't do prefetch with num signatures for the subject; no matches!
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig63 = utils.get_test_data('num/63.fa.sig')
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig63 = utils.get_test_data("num/63.fa.sig")

     with pytest.raises(SourmashCommandFailed):
-        c.run_sourmash('prefetch', '-k', '31', sig47, sig63, linear_gather)
+        c.run_sourmash("prefetch", "-k", "31", sig47, sig63, linear_gather)

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status != 0
-    assert "ERROR in prefetch: after picklists and patterns, no signatures to search!?" in c.last_result.err
+    assert (
+        "ERROR in prefetch: after picklists and patterns, no signatures to search!?"
+ in c.last_result.err + ) def test_prefetch_db_fromfile(runtmp, linear_gather): c = runtmp # test a basic prefetch - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") - from_file = c.output('from-list.txt') + from_file = c.output("from-list.txt") - with open(from_file, 'wt') as fp: + with open(from_file, "w") as fp: print(sig63, file=fp) print(sig2, file=fp) print(sig47, file=fp) - c.run_sourmash('prefetch', '-k', '31', sig47, linear_gather, - '--db-from-file', from_file) + c.run_sourmash( + "prefetch", "-k", "31", sig47, linear_gather, "--db-from-file", from_file + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "WARNING: no output(s) specified! Nothing will be saved from this prefetch!" in c.last_result.err + assert ( + "WARNING: no output(s) specified! Nothing will be saved from this prefetch!" + in c.last_result.err + ) assert "selecting specified query k=31" in c.last_result.err - assert "loaded query: NC_009665.1 Shewanella baltica... (k=31, DNA)" in c.last_result.err - assert "query sketch has scaled=1000; will be dynamically downsampled as needed" in c.last_result.err + assert ( + "loaded query: NC_009665.1 Shewanella baltica... (k=31, DNA)" + in c.last_result.err + ) + assert ( + "query sketch has scaled=1000; will be dynamically downsampled as needed" + in c.last_result.err + ) assert "total of 2 matching signatures." in c.last_result.err - assert "of 5177 distinct query hashes, 5177 were found in matches above threshold." in c.last_result.err + assert ( + "of 5177 distinct query hashes, 5177 were found in matches above threshold." + in c.last_result.err + ) assert "a total of 0 query hashes remain unmatched." 
in c.last_result.err @@ -497,10 +635,10 @@ def test_prefetch_no_db(runtmp, linear_gather): c = runtmp # test a basic prefetch with no databases/signatures - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") with pytest.raises(SourmashCommandFailed): - c.run_sourmash('prefetch', '-k', '31', sig47, linear_gather) + c.run_sourmash("prefetch", "-k", "31", sig47, linear_gather) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -512,13 +650,23 @@ def test_prefetch_no_db(runtmp, linear_gather): def test_prefetch_check_scaled_bounds_negative(runtmp, linear_gather): c = runtmp - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - '--scaled', '-5', linear_gather) + c.run_sourmash( + "prefetch", + "-k", + "31", + sig47, + sig63, + sig2, + sig47, + "--scaled", + "-5", + linear_gather, + ) assert "ERROR: scaled value must be positive" in str(exc.value) @@ -526,41 +674,75 @@ def test_prefetch_check_scaled_bounds_negative(runtmp, linear_gather): def test_prefetch_check_scaled_bounds_less_than_minimum(runtmp, linear_gather): c = runtmp - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - '--scaled', '50', linear_gather) - - assert "WARNING: scaled value should be >= 100. Continuing anyway." in str(exc.value) + c.run_sourmash( + "prefetch", + "-k", + "31", + sig47, + sig63, + sig2, + sig47, + "--scaled", + "50", + linear_gather, + ) + + assert "WARNING: scaled value should be >= 100. Continuing anyway." in str( + exc.value + ) def test_prefetch_check_scaled_bounds_more_than_maximum(runtmp, linear_gather): c = runtmp - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - '--scaled', '1e9', linear_gather) - - assert "WARNING: scaled value should be <= 1e6. Continuing anyway." in str(exc.value) + c.run_sourmash( + "prefetch", + "-k", + "31", + sig47, + sig63, + sig2, + sig47, + "--scaled", + "1e9", + linear_gather, + ) + + assert "WARNING: scaled value should be <= 1e6. Continuing anyway." 
in str(
+        exc.value
+    )


 def test_prefetch_downsample_scaled(runtmp, linear_gather):
     c = runtmp

     # test --scaled
-    sig2 = utils.get_test_data('2.fa.sig')
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
-
-    c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47,
-                   '--scaled', '1e5', linear_gather)
+    sig2 = utils.get_test_data("2.fa.sig")
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig63 = utils.get_test_data("63.fa.sig")
+
+    c.run_sourmash(
+        "prefetch",
+        "-k",
+        "31",
+        sig47,
+        sig63,
+        sig2,
+        sig47,
+        "--scaled",
+        "1e5",
+        linear_gather,
+    )
     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

@@ -569,21 +751,19 @@ def test_prefetch_downsample_scaled(runtmp, linear_gather):

     assert "downsampling query from scaled=1000 to 10000" in c.last_result.err

-
-
 def test_prefetch_downsample_multiple(runtmp, linear_gather):
     # test multiple different downsamplings in prefetch code
-    query_sig = utils.get_test_data('GCF_000006945.2-s500.sig')
+    query_sig = utils.get_test_data("GCF_000006945.2-s500.sig")

     # load in the hashes and split them into four bins, randomly.
     ss = sourmash.load_one_signature(query_sig)
     hashes = list(ss.minhash.hashes)

-    random.seed(a=1) # fix seed so test is reproducible
+    random.seed(a=1)  # fix seed so test is reproducible
     random.shuffle(hashes)

     # split into 4 bins:
-    mh_bins = [ ss.minhash.copy_and_clear() for i in range(4) ]
+    mh_bins = [ss.minhash.copy_and_clear() for i in range(4)]
     for i, hashval in enumerate(hashes):
         mh_bins[i % 4].add_hash(hashval)

@@ -602,25 +782,38 @@ def test_prefetch_downsample_multiple(runtmp, linear_gather):
         gathersigs.append(f"bin{i}.sig")

-    runtmp.sourmash('prefetch', linear_gather, query_sig, *gathersigs)
+    runtmp.sourmash("prefetch", linear_gather, query_sig, *gathersigs)

     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

-    assert "final scaled value (max across query and all matches) is 1000" in runtmp.last_result.err
+    assert (
+        "final scaled value (max across query and all matches) is 1000"
+        in runtmp.last_result.err
+    )


 def test_prefetch_empty(runtmp, linear_gather):
     c = runtmp

     # test --scaled
-    sig2 = utils.get_test_data('2.fa.sig')
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
+    sig2 = utils.get_test_data("2.fa.sig")
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig63 = utils.get_test_data("63.fa.sig")

     with pytest.raises(SourmashCommandFailed):
-        c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47,
-                       '--scaled', '1e9', linear_gather)
+        c.run_sourmash(
+            "prefetch",
+            "-k",
+            "31",
+            sig47,
+            sig63,
+            sig2,
+            sig47,
+            "--scaled",
+            "1e9",
+            linear_gather,
+        )
     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

@@ -633,13 +826,13 @@ def test_prefetch_basic_many_sigs(runtmp, linear_gather):
     c = runtmp

     # test what happens with many (and duplicate) signatures
-    sig2 = utils.get_test_data('2.fa.sig')
-    sig47 = utils.get_test_data('47.fa.sig')
-    sig63 = utils.get_test_data('63.fa.sig')
+    sig2 = utils.get_test_data("2.fa.sig")
+    sig47 = utils.get_test_data("47.fa.sig")
+    sig63 = utils.get_test_data("63.fa.sig")

     manysigs = [sig63, sig2, sig47] * 5

-    c.run_sourmash('prefetch', '-k', '31', sig47, *manysigs, linear_gather)
+    c.run_sourmash("prefetch", "-k", "31", sig47, *manysigs, linear_gather)
     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
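The downsampling tests above (test_prefetch_downsample_scaled / _multiple) exercise prefetch's resolution rule: sketches are compared at the largest, i.e. coarsest, scaled value across the query and all matches. A minimal sketch of that rule via sourmash's Python API; the file names here are placeholders, not test data:

    import sourmash

    # load a query and a match that may carry different scaled values
    query = sourmash.load_one_signature("query.sig", ksize=31)
    match = sourmash.load_one_signature("match.sig", ksize=31)

    # prefetch-style rule: downsample both sketches to the max scaled seen,
    # then compare at that common resolution
    common = max(query.minhash.scaled, match.minhash.scaled)
    q_mh = query.minhash.downsample(scaled=common)
    m_mh = match.minhash.downsample(scaled=common)
    print(q_mh.contained_by(m_mh))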
assert "total of 10 matching signatures so far." in c.last_result.err assert "total of 10 matching signatures." in c.last_result.err - assert "of 5177 distinct query hashes, 5177 were found in matches above threshold." in c.last_result.err + assert ( + "of 5177 distinct query hashes, 5177 were found in matches above threshold." + in c.last_result.err + ) assert "a total of 0 query hashes remain unmatched." in c.last_result.err def test_prefetch_with_picklist(runtmp): # test 'sourmash prefetch' with picklists - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") - runtmp.sourmash('prefetch', metag_sig, *gcf_sigs, - '--picklist', f"{picklist}:md5:md5") + runtmp.sourmash( + "prefetch", metag_sig, *gcf_sigs, "--picklist", f"{picklist}:md5:md5" + ) err = runtmp.last_result.err print(err) @@ -670,18 +867,22 @@ def test_prefetch_with_picklist(runtmp): print(out) assert "total of 3 matching signatures." in err - assert "of 1466 distinct query hashes, 453 were found in matches above threshold." in err + assert ( + "of 1466 distinct query hashes, 453 were found in matches above threshold." + in err + ) assert "a total of 1013 query hashes remain unmatched." in err def test_prefetch_with_picklist_exclude(runtmp): # test 'sourmash prefetch' with picklists, exclude - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") - runtmp.sourmash('prefetch', metag_sig, *gcf_sigs, - '--picklist', f"{picklist}:md5:md5:exclude") + runtmp.sourmash( + "prefetch", metag_sig, *gcf_sigs, "--picklist", f"{picklist}:md5:md5:exclude" + ) err = runtmp.last_result.err print(err) @@ -692,17 +893,19 @@ def test_prefetch_with_picklist_exclude(runtmp): print(out) assert "total of 9 matching signatures." in err - assert "of 1466 distinct query hashes, 1013 were found in matches above threshold." in err + assert ( + "of 1466 distinct query hashes, 1013 were found in matches above threshold." + in err + ) assert "a total of 453 query hashes remain unmatched." in err def test_prefetch_with_pattern_include(runtmp): # test 'sourmash prefetch' with --include-db-pattern - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") - runtmp.sourmash('prefetch', metag_sig, *gcf_sigs, - '--include', 'thermotoga') + runtmp.sourmash("prefetch", metag_sig, *gcf_sigs, "--include", "thermotoga") err = runtmp.last_result.err print(err) @@ -711,17 +914,19 @@ def test_prefetch_with_pattern_include(runtmp): print(out) assert "total of 3 matching signatures." in err - assert "of 1466 distinct query hashes, 453 were found in matches above threshold." in err + assert ( + "of 1466 distinct query hashes, 453 were found in matches above threshold." + in err + ) assert "a total of 1013 query hashes remain unmatched." 
in err def test_prefetch_with_pattern_exclude(runtmp): # test 'sourmash prefetch' with --exclude-db-pattern - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") - runtmp.sourmash('prefetch', metag_sig, *gcf_sigs, - '--exclude', 'thermotoga') + runtmp.sourmash("prefetch", metag_sig, *gcf_sigs, "--exclude", "thermotoga") err = runtmp.last_result.err print(err) @@ -730,27 +935,37 @@ def test_prefetch_with_pattern_exclude(runtmp): print(out) assert "total of 9 matching signatures." in err - assert "of 1466 distinct query hashes, 1013 were found in matches above threshold." in err + assert ( + "of 1466 distinct query hashes, 1013 were found in matches above threshold." + in err + ) assert "a total of 453 query hashes remain unmatched." in err def test_prefetch_output_with_abundance(runtmp, prefetch_gather, linear_gather): c = runtmp - query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig') - against = utils.get_test_data('gather-abund/genome-s10.fa.gz.sig') - - c.run_sourmash('prefetch', linear_gather, query, against, - '--save-matching-hashes', c.output('match-hash.sig'), - '--save-unmatched-hashes', c.output('nomatch-hash.sig')) + query = utils.get_test_data("gather-abund/reads-s10x10-s11.sig") + against = utils.get_test_data("gather-abund/genome-s10.fa.gz.sig") + + c.run_sourmash( + "prefetch", + linear_gather, + query, + against, + "--save-matching-hashes", + c.output("match-hash.sig"), + "--save-unmatched-hashes", + c.output("nomatch-hash.sig"), + ) print(c.last_result.out) - assert os.path.exists(c.output('match-hash.sig')) - ss = list(sourmash.load_file_as_signatures(c.output('match-hash.sig')))[0] + assert os.path.exists(c.output("match-hash.sig")) + ss = list(sourmash.load_file_as_signatures(c.output("match-hash.sig")))[0] assert ss.minhash.track_abundance - assert os.path.exists(c.output('nomatch-hash.sig')) - ss = list(sourmash.load_file_as_signatures(c.output('nomatch-hash.sig')))[0] + assert os.path.exists(c.output("nomatch-hash.sig")) + ss = list(sourmash.load_file_as_signatures(c.output("nomatch-hash.sig")))[0] assert ss.minhash.track_abundance @@ -758,14 +973,15 @@ def test_prefetch_ani_csv_out(runtmp, linear_gather): c = runtmp # test a basic prefetch, with CSV output - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") - csvout = c.output('out.csv') + csvout = c.output("out.csv") - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - '-o', csvout, linear_gather) + c.run_sourmash( + "prefetch", "-k", "31", sig47, sig63, sig2, sig47, "-o", csvout, linear_gather + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -774,37 +990,56 @@ def test_prefetch_ani_csv_out(runtmp, linear_gather): assert os.path.exists(csvout) prefetch_result_names = PrefetchResult.prefetch_write_cols - exp1 = {'q_ani': '0.9771552502238963','m_ani': '0.9767860811200507', - 'ac_ani': '0.9769706656719734','mc_ani': '0.9771552502238963', - 'pfn': 'False'} - exp2 = {'q_ani': '1.0','m_ani': '1.0', - 'ac_ani': '1.0','mc_ani': '1.0', - 'pfn': 'False'} + exp1 = { + "q_ani": "0.9771552502238963", + "m_ani": "0.9767860811200507", + "ac_ani": "0.9769706656719734", + "mc_ani": 
"0.9771552502238963", + "pfn": "False", + } + exp2 = { + "q_ani": "1.0", + "m_ani": "1.0", + "ac_ani": "1.0", + "mc_ani": "1.0", + "pfn": "False", + } expected_ani_vals = [exp1, exp2] - with open(csvout, 'rt', newline="") as fp: + with open(csvout, newline="") as fp: r = csv.DictReader(fp) - for (row, expected) in zip(r, expected_ani_vals): + for row, expected in zip(r, expected_ani_vals): print(row) assert prefetch_result_names == list(row.keys()) - assert approx_eq(row['query_containment_ani'], expected['q_ani']) - assert approx_eq(row['match_containment_ani'], expected['m_ani']) - assert approx_eq(row['max_containment_ani'], expected['mc_ani']) - assert approx_eq(row['average_containment_ani'], expected['ac_ani']) - assert row['potential_false_negative'] == expected['pfn'] + assert approx_eq(row["query_containment_ani"], expected["q_ani"]) + assert approx_eq(row["match_containment_ani"], expected["m_ani"]) + assert approx_eq(row["max_containment_ani"], expected["mc_ani"]) + assert approx_eq(row["average_containment_ani"], expected["ac_ani"]) + assert row["potential_false_negative"] == expected["pfn"] def test_prefetch_ani_csv_out_estimate_ci(runtmp, linear_gather): c = runtmp # test a basic prefetch, with CSV output - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') - - csvout = c.output('out.csv') - - c.run_sourmash('prefetch', '-k', '31', sig47, sig63, sig2, sig47, - '-o', csvout, linear_gather, '--estimate-ani-ci') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") + + csvout = c.output("out.csv") + + c.run_sourmash( + "prefetch", + "-k", + "31", + sig47, + sig63, + sig2, + sig47, + "-o", + csvout, + linear_gather, + "--estimate-ani-ci", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -813,54 +1048,64 @@ def test_prefetch_ani_csv_out_estimate_ci(runtmp, linear_gather): assert os.path.exists(csvout) prefetch_result_names_ci = PrefetchResult.prefetch_write_cols_ci - exp1 = {'q_ani': '0.9771552502238963','m_ani': '0.9767860811200507', - 'q_ani_low': "0.9762537506990911", 'q_ani_high': "0.9780336875157754", - 'm_ani_low': "0.9758801604653301", "m_ani_high": "0.9776692390768575", - 'ac_ani': '0.9769706656719734','mc_ani': '0.9771552502238963', - 'pfn': 'False'} - exp2 = {'q_ani': '1.0','m_ani': '1.0', - 'q_ani_low': "1.0", 'q_ani_high': "1.0", - 'm_ani_low': "1.0", "m_ani_high": "1.0", - 'ac_ani': '1.0','mc_ani': '1.0', - 'pfn': 'False'} + exp1 = { + "q_ani": "0.9771552502238963", + "m_ani": "0.9767860811200507", + "q_ani_low": "0.9762537506990911", + "q_ani_high": "0.9780336875157754", + "m_ani_low": "0.9758801604653301", + "m_ani_high": "0.9776692390768575", + "ac_ani": "0.9769706656719734", + "mc_ani": "0.9771552502238963", + "pfn": "False", + } + exp2 = { + "q_ani": "1.0", + "m_ani": "1.0", + "q_ani_low": "1.0", + "q_ani_high": "1.0", + "m_ani_low": "1.0", + "m_ani_high": "1.0", + "ac_ani": "1.0", + "mc_ani": "1.0", + "pfn": "False", + } expected_ani_vals = [exp1, exp2] - with open(csvout, 'rt', newline="") as fp: + with open(csvout, newline="") as fp: r = csv.DictReader(fp) - for (row, expected) in zip(r, expected_ani_vals): + for row, expected in zip(r, expected_ani_vals): print(row) assert prefetch_result_names_ci == list(row.keys()) - assert approx_eq(row['query_containment_ani'],expected['q_ani']) - assert approx_eq(row['query_containment_ani_low'], expected['q_ani_low']) - assert 
approx_eq(row['query_containment_ani_high'], expected['q_ani_high']) - assert approx_eq(row['match_containment_ani'], expected['m_ani']) - assert approx_eq(row['match_containment_ani_low'], expected['m_ani_low']) - assert approx_eq(row['match_containment_ani_high'], expected['m_ani_high']) - assert approx_eq(row['max_containment_ani'], expected['mc_ani']) - assert approx_eq(row['average_containment_ani'], expected['ac_ani']) - assert row['potential_false_negative'] == expected['pfn'] + assert approx_eq(row["query_containment_ani"], expected["q_ani"]) + assert approx_eq(row["query_containment_ani_low"], expected["q_ani_low"]) + assert approx_eq(row["query_containment_ani_high"], expected["q_ani_high"]) + assert approx_eq(row["match_containment_ani"], expected["m_ani"]) + assert approx_eq(row["match_containment_ani_low"], expected["m_ani_low"]) + assert approx_eq(row["match_containment_ani_high"], expected["m_ani_high"]) + assert approx_eq(row["max_containment_ani"], expected["mc_ani"]) + assert approx_eq(row["average_containment_ani"], expected["ac_ani"]) + assert row["potential_false_negative"] == expected["pfn"] def test_prefetch_ani_containment_asymmetry(runtmp): # test contained_by asymmetries, viz #2215 - query_sig = utils.get_test_data('47.fa.sig') - merged_sig = utils.get_test_data('47-63-merge.sig') + query_sig = utils.get_test_data("47.fa.sig") + merged_sig = utils.get_test_data("47-63-merge.sig") - runtmp.sourmash('prefetch', query_sig, merged_sig, '-o', - 'query-in-merged.csv') - runtmp.sourmash('prefetch', merged_sig, query_sig, '-o', - 'merged-in-query.csv') + runtmp.sourmash("prefetch", query_sig, merged_sig, "-o", "query-in-merged.csv") + runtmp.sourmash("prefetch", merged_sig, query_sig, "-o", "merged-in-query.csv") - with sourmash_args.FileInputCSV(runtmp.output('query-in-merged.csv')) as r: + with sourmash_args.FileInputCSV(runtmp.output("query-in-merged.csv")) as r: query_in_merged = list(r)[0] - with sourmash_args.FileInputCSV(runtmp.output('merged-in-query.csv')) as r: + with sourmash_args.FileInputCSV(runtmp.output("merged-in-query.csv")) as r: merged_in_query = list(r)[0] - assert query_in_merged['query_containment_ani'] == '1.0' - assert query_in_merged['match_containment_ani'] == '0.9865155060423993' - assert query_in_merged['average_containment_ani'] == '0.9932577530211997' + assert query_in_merged["query_containment_ani"] == "1.0" + assert query_in_merged["match_containment_ani"] == "0.9865155060423993" + assert query_in_merged["average_containment_ani"] == "0.9932577530211997" - assert merged_in_query['match_containment_ani'] == '1.0' - assert merged_in_query['query_containment_ani'] == '0.9865155060423993' - assert merged_in_query['average_containment_ani'] == '0.9932577530211997' + assert merged_in_query["match_containment_ani"] == "1.0" + assert merged_in_query["query_containment_ani"] == "0.9865155060423993" + assert merged_in_query["average_containment_ani"] == "0.9932577530211997" diff --git a/tests/test_sbt.py b/tests/test_sbt.py index a66d0c634e..cfc71d43dd 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -6,13 +6,11 @@ import pytest import sourmash -from sourmash import (load_one_signature, SourmashSignature, - load_file_as_signatures) +from sourmash import load_one_signature, SourmashSignature, load_file_as_signatures from sourmash.exceptions import IndexNotSupported from sourmash.sbt import SBT, GraphFactory, Leaf, Node -from sourmash.sbtmh import (SigLeaf, load_sbt_index) -from sourmash.sbt_storage import (FSStorage, RedisStorage, - 
IPFSStorage, ZipStorage) +from sourmash.sbtmh import SigLeaf, load_sbt_index +from sourmash.sbt_storage import FSStorage, RedisStorage, IPFSStorage, ZipStorage from sourmash.search import make_jaccard_search_query from sourmash.picklist import SignaturePicklist, PickStyle @@ -24,29 +22,29 @@ def test_simple(runtmp, n_children): root = SBT(factory, d=n_children) leaf1 = Leaf("a", factory()) - leaf1.data.count('AAAAA') - leaf1.data.count('AAAAT') - leaf1.data.count('AAAAC') + leaf1.data.count("AAAAA") + leaf1.data.count("AAAAT") + leaf1.data.count("AAAAC") leaf2 = Leaf("b", factory()) - leaf2.data.count('AAAAA') - leaf2.data.count('AAAAT') - leaf2.data.count('AAAAG') + leaf2.data.count("AAAAA") + leaf2.data.count("AAAAT") + leaf2.data.count("AAAAG") leaf3 = Leaf("c", factory()) - leaf3.data.count('AAAAA') - leaf3.data.count('AAAAT') - leaf3.data.count('CAAAA') + leaf3.data.count("AAAAA") + leaf3.data.count("AAAAT") + leaf3.data.count("CAAAA") leaf4 = Leaf("d", factory()) - leaf4.data.count('AAAAA') - leaf4.data.count('CAAAA') - leaf4.data.count('GAAAA') + leaf4.data.count("AAAAA") + leaf4.data.count("CAAAA") + leaf4.data.count("GAAAA") leaf5 = Leaf("e", factory()) - leaf5.data.count('AAAAA') - leaf5.data.count('AAAAT') - leaf5.data.count('GAAAA') + leaf5.data.count("AAAAA") + leaf5.data.count("AAAAT") + leaf5.data.count("GAAAA") root.add_node(leaf1) root.add_node(leaf2) @@ -58,8 +56,8 @@ def test_simple(runtmp, n_children): def search_kmer(leaf, kmer): return leaf.data.get(kmer) - leaves = [leaf1, leaf2, leaf3, leaf4, leaf5 ] - kmers = [ "AAAAA", "AAAAT", "AAAAG", "CAAAA", "GAAAA" ] + leaves = [leaf1, leaf2, leaf3, leaf4, leaf5] + kmers = ["AAAAA", "AAAAT", "AAAAG", "CAAAA", "GAAAA"] # define an exhaustive search function that looks in all the leaf nodes. 
def search_kmer_in_list(kmer): @@ -75,20 +73,20 @@ def search_kmer_in_list(kmer): for kmer in kmers: assert set(root._find_nodes(search_kmer, kmer)) == search_kmer_in_list(kmer) - print('-----') - print([ x.metadata for x in root._find_nodes(search_kmer, "AAAAA") ]) - print([ x.metadata for x in root._find_nodes(search_kmer, "AAAAT") ]) - print([ x.metadata for x in root._find_nodes(search_kmer, "AAAAG") ]) - print([ x.metadata for x in root._find_nodes(search_kmer, "CAAAA") ]) - print([ x.metadata for x in root._find_nodes(search_kmer, "GAAAA") ]) + print("-----") + print([x.metadata for x in root._find_nodes(search_kmer, "AAAAA")]) + print([x.metadata for x in root._find_nodes(search_kmer, "AAAAT")]) + print([x.metadata for x in root._find_nodes(search_kmer, "AAAAG")]) + print([x.metadata for x in root._find_nodes(search_kmer, "CAAAA")]) + print([x.metadata for x in root._find_nodes(search_kmer, "GAAAA")]) # save SBT to a directory and then reload - root.save(runtmp.output('demo')) - root = SBT.load(runtmp.output('demo')) + root.save(runtmp.output("demo")) + root = SBT.load(runtmp.output("demo")) for kmer in kmers: new_result = {str(r) for r in root._find_nodes(search_kmer, kmer)} - print(*new_result, sep='\n') + print(*new_result, sep="\n") assert new_result == {str(r) for r in search_kmer_in_list(kmer)} @@ -99,29 +97,29 @@ def test_longer_search(n_children): root = SBT(factory, d=n_children) leaf1 = Leaf("a", factory()) - leaf1.data.count('AAAAA') - leaf1.data.count('AAAAT') - leaf1.data.count('AAAAC') + leaf1.data.count("AAAAA") + leaf1.data.count("AAAAT") + leaf1.data.count("AAAAC") leaf2 = Leaf("b", factory()) - leaf2.data.count('AAAAA') - leaf2.data.count('AAAAT') - leaf2.data.count('AAAAG') + leaf2.data.count("AAAAA") + leaf2.data.count("AAAAT") + leaf2.data.count("AAAAG") leaf3 = Leaf("c", factory()) - leaf3.data.count('AAAAA') - leaf3.data.count('AAAAT') - leaf3.data.count('CAAAA') + leaf3.data.count("AAAAA") + leaf3.data.count("AAAAT") + leaf3.data.count("CAAAA") leaf4 = Leaf("d", factory()) - leaf4.data.count('AAAAA') - leaf4.data.count('CAAAA') - leaf4.data.count('GAAAA') + leaf4.data.count("AAAAA") + leaf4.data.count("CAAAA") + leaf4.data.count("GAAAA") leaf5 = Leaf("e", factory()) - leaf5.data.count('AAAAA') - leaf5.data.count('AAAAT') - leaf5.data.count('GAAAA') + leaf5.data.count("AAAAA") + leaf5.data.count("AAAAT") + leaf5.data.count("GAAAA") root.add_node(leaf1) root.add_node(leaf2) @@ -131,32 +129,32 @@ def test_longer_search(n_children): def kmers(k, seq): for start in range(len(seq) - k + 1): - yield seq[start:start + k] + yield seq[start : start + k] def search_transcript(node, seq, threshold): - presence = [ node.data.get(kmer) for kmer in kmers(ksize, seq) ] + presence = [node.data.get(kmer) for kmer in kmers(ksize, seq)] if sum(presence) >= int(threshold * (len(seq) - ksize + 1)): return 1 return 0 - try1 = [ x.metadata for x in root._find_nodes(search_transcript, "AAAAT", 1.0) ] - assert set(try1) == set([ 'a', 'b', 'c', 'e' ]), try1 # no 'd' + try1 = [x.metadata for x in root._find_nodes(search_transcript, "AAAAT", 1.0)] + assert set(try1) == set(["a", "b", "c", "e"]), try1 # no 'd' - try2 = [ x.metadata for x in root._find_nodes(search_transcript, "GAAAAAT", 0.6) ] - assert set(try2) == set([ 'a', 'b', 'c', 'd', 'e' ]) + try2 = [x.metadata for x in root._find_nodes(search_transcript, "GAAAAAT", 0.6)] + assert set(try2) == set(["a", "b", "c", "d", "e"]) - try3 = [ x.metadata for x in root._find_nodes(search_transcript, "GAAAA", 1.0) ] - assert set(try3) == 
set([ 'd', 'e' ]), try3 + try3 = [x.metadata for x in root._find_nodes(search_transcript, "GAAAA", 1.0)] + assert set(try3) == set(["d", "e"]), try3 -#@pytest.mark.parametrize("old_version", ["v1", "v2", "v3", "v4", "v5"]) +# @pytest.mark.parametrize("old_version", ["v1", "v2", "v3", "v4", "v5"]) @pytest.mark.parametrize("old_version", ["v3", "v4", "v5"]) def test_tree_old_load(old_version): - tree_old = SBT.load(utils.get_test_data('{}.sbt.json'.format(old_version)), - leaf_loader=SigLeaf.load) + tree_old = SBT.load( + utils.get_test_data(f"{old_version}.sbt.json"), leaf_loader=SigLeaf.load + ) - tree_cur = SBT.load(utils.get_test_data('v6.sbt.json'), - leaf_loader=SigLeaf.load) + tree_cur = SBT.load(utils.get_test_data("v6.sbt.json"), leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) to_search = load_one_signature(testdata1) @@ -177,8 +175,8 @@ def test_tree_old_load(old_version): def test_load_future(tmpdir): - with open(str(tmpdir.join("v9999.sbt.json")), 'w') as f: - json.dump({'version': 9999}, f) + with open(str(tmpdir.join("v9999.sbt.json")), "w") as f: + json.dump({"version": 9999}, f) with pytest.raises(IndexNotSupported) as excinfo: SBT.load(str(tmpdir.join("v9999.sbt.json"))) @@ -196,21 +194,20 @@ def test_tree_save_load(runtmp, n_children): tree.add_node(leaf) to_search = leaf - print('*' * 60) - print("{}:".format(to_search.metadata)) + print("*" * 60) + print(f"{to_search.metadata}:") search_obj = make_jaccard_search_query(threshold=0.1) old_result = {str(s.signature) for s in tree.find(search_obj, to_search.data)} - print(*old_result, sep='\n') + print(*old_result, sep="\n") - tree.save(runtmp.output('demo')) - tree = SBT.load(runtmp.output('demo'), - leaf_loader=SigLeaf.load) + tree.save(runtmp.output("demo")) + tree = SBT.load(runtmp.output("demo"), leaf_loader=SigLeaf.load) - print('*' * 60) - print("{}:".format(to_search.metadata)) + print("*" * 60) + print(f"{to_search.metadata}:") search_obj = make_jaccard_search_query(threshold=0.1) new_result = {str(s.signature) for s in tree.find(search_obj, to_search.data)} - print(*new_result, sep='\n') + print(*new_result, sep="\n") assert old_result == new_result @@ -219,7 +216,6 @@ def test_search_minhashes(): factory = GraphFactory(31, 1e5, 4) tree = SBT(factory) - n_leaves = 0 for f in utils.SIG_FILES: sig = load_one_signature(utils.get_test_data(f)) leaf = SigLeaf(os.path.basename(f), sig) @@ -257,12 +253,12 @@ def test_binary_nary_tree(): assert all([len(list(t.leaves())) == n_leaves for t in trees.values()]) results = {} - print('*' * 60) - print("{}:".format(to_search.metadata)) + print("*" * 60) + print(f"{to_search.metadata}:") for d, tree in trees.items(): search_obj = make_jaccard_search_query(threshold=0.1) results[d] = {str(s.signature) for s in tree.find(search_obj, to_search.data)} - print(*results[2], sep='\n') + print(*results[2], sep="\n") assert results[2] == results[5] assert results[5] == results[10] @@ -327,26 +323,26 @@ def test_sbt_fsstorage(runtmp): tree.add_node(leaf) to_search = leaf - print('*' * 60) - print("{}:".format(to_search.metadata)) + print("*" * 60) + print(f"{to_search.metadata}:") search_obj = make_jaccard_search_query(threshold=0.1) old_result = {str(s.signature) for s in tree.find(search_obj, to_search.data)} - print(*old_result, sep='\n') + print(*old_result, sep="\n") - with FSStorage(runtmp.location, '.fstree') as storage: - tree.save(runtmp.output('tree.sbt.json'), storage=storage) + with FSStorage(runtmp.location, ".fstree") as storage: + 
tree.save(runtmp.output("tree.sbt.json"), storage=storage) - tree = SBT.load(runtmp.output('tree.sbt.json'), leaf_loader=SigLeaf.load) - print('*' * 60) - print("{}:".format(to_search.metadata)) + tree = SBT.load(runtmp.output("tree.sbt.json"), leaf_loader=SigLeaf.load) + print("*" * 60) + print(f"{to_search.metadata}:") search_obj = make_jaccard_search_query(threshold=0.1) new_result = {str(s.signature) for s in tree.find(search_obj, to_search.data)} - print(*new_result, sep='\n') + print(*new_result, sep="\n") assert old_result == new_result assert os.path.exists(runtmp.output(tree.storage.subdir)) - assert os.path.exists(runtmp.output('.fstree')) + assert os.path.exists(runtmp.output(".fstree")) def test_sbt_zipstorage(tmpdir): @@ -361,31 +357,31 @@ def test_sbt_zipstorage(tmpdir): tree.add_node(leaf) to_search = leaf - print('*' * 60) - print("{}:".format(to_search.metadata)) + print("*" * 60) + print(f"{to_search.metadata}:") search_obj = make_jaccard_search_query(threshold=0.1) old_result = {str(s.signature) for s in tree.find(search_obj, to_search.data)} - print(*old_result, sep='\n') + print(*old_result, sep="\n") with ZipStorage(str(tmpdir.join("tree.sbt.zip")), mode="w") as storage: tree.save(str(tmpdir.join("tree.sbt.json")), storage=storage) with ZipStorage(str(tmpdir.join("tree.sbt.zip"))) as storage: - tree = SBT.load(str(tmpdir.join("tree.sbt.json")), - leaf_loader=SigLeaf.load, - storage=storage) + tree = SBT.load( + str(tmpdir.join("tree.sbt.json")), leaf_loader=SigLeaf.load, storage=storage + ) - print('*' * 60) - print("{}:".format(to_search.metadata)) + print("*" * 60) + print(f"{to_search.metadata}:") search_obj = make_jaccard_search_query(threshold=0.1) new_result = {str(s.signature) for s in tree.find(search_obj, to_search.data)} - print(*new_result, sep='\n') + print(*new_result, sep="\n") assert old_result == new_result def test_sbt_ipfsstorage(runtmp): - ipfshttpclient = pytest.importorskip('ipfshttpclient') + ipfshttpclient = pytest.importorskip("ipfshttpclient") factory = GraphFactory(31, 1e5, 4) tree = SBT(factory) @@ -397,34 +393,34 @@ def test_sbt_ipfsstorage(runtmp): tree.add_node(leaf) to_search = leaf - print('*' * 60) - print("{}:".format(to_search.metadata)) + print("*" * 60) + print(f"{to_search.metadata}:") search_obj = make_jaccard_search_query(threshold=0.1) old_result = {str(s.signature) for s in tree.find(search_obj, to_search.data)} - print(*old_result, sep='\n') + print(*old_result, sep="\n") try: with IPFSStorage() as storage: - tree.save(runtmp.output('tree.sbt.json'), storage=storage) + tree.save(runtmp.output("tree.sbt.json"), storage=storage) except ipfshttpclient.exceptions.ConnectionError: pytest.xfail("ipfs not installed/functioning probably") with IPFSStorage() as storage: - tree = SBT.load(runtmp.output('tree.sbt.json'), - leaf_loader=SigLeaf.load, - storage=storage) + tree = SBT.load( + runtmp.output("tree.sbt.json"), leaf_loader=SigLeaf.load, storage=storage + ) - print('*' * 60) - print("{}:".format(to_search.metadata)) + print("*" * 60) + print(f"{to_search.metadata}:") search_obj = make_jaccard_search_query(threshold=0.1) new_result = {str(s.signature) for s in tree.find(search_obj, to_search.data)} - print(*new_result, sep='\n') + print(*new_result, sep="\n") assert old_result == new_result def test_sbt_redisstorage(runtmp): - redis = pytest.importorskip('redis') + redis = pytest.importorskip("redis") factory = GraphFactory(31, 1e5, 4) tree = SBT(factory) @@ -435,28 +431,28 @@ def test_sbt_redisstorage(runtmp): tree.add_node(leaf) 
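The Redis and IPFS storage tests in this area follow a common pattern for optional external services: pytest.importorskip skips the test when the client library isn't installed, and a try/except around the first connection turns an unreachable daemon into an xfail rather than an error. A generic sketch of that pattern (the test body is illustrative, not from this diff):

    import pytest

    redis = pytest.importorskip("redis")  # skip if the client library is absent


    def test_requires_redis_server():
        try:
            conn = redis.Redis()
            conn.ping()  # raises ConnectionError when no server is listening
        except redis.exceptions.ConnectionError:
            pytest.xfail("couldn't connect to a redis server")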
to_search = leaf - print('*' * 60) - print("{}:".format(to_search.metadata)) + print("*" * 60) + print(f"{to_search.metadata}:") search_obj = make_jaccard_search_query(threshold=0.1) old_result = {str(s.signature) for s in tree.find(search_obj, to_search.data)} - print(*old_result, sep='\n') + print(*old_result, sep="\n") try: with RedisStorage() as storage: - tree.save(runtmp.output('tree.sbt.json'), storage=storage) + tree.save(runtmp.output("tree.sbt.json"), storage=storage) except redis.exceptions.ConnectionError: pytest.xfail("Couldn't connect to redis server") with RedisStorage() as storage: - tree = SBT.load(runtmp.output('tree.sbt.json'), - leaf_loader=SigLeaf.load, - storage=storage) + tree = SBT.load( + runtmp.output("tree.sbt.json"), leaf_loader=SigLeaf.load, storage=storage + ) - print('*' * 60) - print("{}:".format(to_search.metadata)) + print("*" * 60) + print(f"{to_search.metadata}:") search_obj = make_jaccard_search_query(threshold=0.1) new_result = {str(s.signature) for s in tree.find(search_obj, to_search.data)} - print(*new_result, sep='\n') + print(*new_result, sep="\n") assert old_result == new_result @@ -475,12 +471,12 @@ def test_save_zip(tmpdir): new_tree = SBT.load(str(newsbt), leaf_loader=SigLeaf.load) assert isinstance(new_tree.storage, ZipStorage) - assert new_tree.storage.list_sbts() == ['new.sbt.json'] + assert new_tree.storage.list_sbts() == ["new.sbt.json"] to_search = load_one_signature(utils.get_test_data(utils.SIG_FILES[0])) print("*" * 60) - print("{}:".format(to_search)) + print(f"{to_search}:") search_obj = make_jaccard_search_query(threshold=0.1) old_result = {str(s.signature) for s in tree.find(search_obj, to_search)} new_result = {str(s.signature) for s in new_tree.find(search_obj, to_search)} @@ -502,7 +498,7 @@ def test_load_zip(tmpdir): to_search = load_one_signature(utils.get_test_data(utils.SIG_FILES[0])) print("*" * 60) - print("{}:".format(to_search)) + print(f"{to_search}:") search_obj = make_jaccard_search_query(threshold=0.1) new_result = {str(s.signature) for s in tree.find(search_obj, to_search)} print(*new_result, sep="\n") @@ -516,7 +512,7 @@ def test_load_zip_uncompressed(tmpdir): testdata = utils.get_test_data("v6.sbt.zip") testsbt = tmpdir.join("v6.sbt.json") - with zipfile.ZipFile(testdata, 'r') as z: + with zipfile.ZipFile(testdata, "r") as z: z.extractall(str(tmpdir)) tree = SBT.load(str(testsbt), leaf_loader=SigLeaf.load) @@ -524,7 +520,7 @@ def test_load_zip_uncompressed(tmpdir): to_search = load_one_signature(utils.get_test_data(utils.SIG_FILES[0])) print("*" * 60) - print("{}:".format(to_search)) + print(f"{to_search}:") search_obj = make_jaccard_search_query(threshold=0.1) new_result = {str(s.signature) for s in tree.find(search_obj, to_search)} print(*new_result, sep="\n") @@ -532,11 +528,11 @@ def test_load_zip_uncompressed(tmpdir): def test_tree_repair(): - tree_repair = SBT.load(utils.get_test_data('leaves.sbt.json'), - leaf_loader=SigLeaf.load) + tree_repair = SBT.load( + utils.get_test_data("leaves.sbt.json"), leaf_loader=SigLeaf.load + ) - tree_cur = SBT.load(utils.get_test_data('v3.sbt.json'), - leaf_loader=SigLeaf.load) + tree_cur = SBT.load(utils.get_test_data("v3.sbt.json"), leaf_loader=SigLeaf.load) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) to_search = load_one_signature(testdata1) @@ -550,8 +546,9 @@ def test_tree_repair(): def test_tree_repair_insert(): - tree_repair = SBT.load(utils.get_test_data('leaves.sbt.json'), - leaf_loader=SigLeaf.load) + tree_repair = SBT.load( + 
utils.get_test_data("leaves.sbt.json"), leaf_loader=SigLeaf.load + ) for f in utils.SIG_FILES: sig = load_one_signature(utils.get_test_data(f)) @@ -579,23 +576,23 @@ def test_save_sparseness(runtmp, n_children): tree.add_node(leaf) to_search = leaf - print('*' * 60) - print("{}:".format(to_search.metadata)) + print("*" * 60) + print(f"{to_search.metadata}:") search_obj = make_jaccard_search_query(threshold=0.1) old_result = {str(s.signature) for s in tree.find(search_obj, to_search.data)} - print(*old_result, sep='\n') + print(*old_result, sep="\n") - tree.save(runtmp.output('demo'), sparseness=1.0) - tree_loaded = SBT.load(runtmp.output('demo'), - leaf_loader=SigLeaf.load) + tree.save(runtmp.output("demo"), sparseness=1.0) + tree_loaded = SBT.load(runtmp.output("demo"), leaf_loader=SigLeaf.load) assert all(not isinstance(n, Node) for _, n in tree_loaded) - print('*' * 60) - print("{}:".format(to_search.metadata)) - new_result = {str(s.signature) for s in tree_loaded.find(search_obj, - to_search.data)} - print(*new_result, sep='\n') + print("*" * 60) + print(f"{to_search.metadata}:") + new_result = { + str(s.signature) for s in tree_loaded.find(search_obj, to_search.data) + } + print(*new_result, sep="\n") assert old_result == new_result @@ -615,8 +612,8 @@ def test_sbt_as_index_select(): factory = GraphFactory(31, 1e5, 4) tree = SBT(factory, d=2) - sig47 = load_one_signature(utils.get_test_data('47.fa.sig')) - sig63 = load_one_signature(utils.get_test_data('63.fa.sig')) + sig47 = load_one_signature(utils.get_test_data("47.fa.sig")) + sig63 = load_one_signature(utils.get_test_data("63.fa.sig")) tree.insert(sig47) tree.insert(sig63) @@ -624,7 +621,7 @@ def test_sbt_as_index_select(): xx = tree.select(ksize=31) assert xx == tree - xx = tree.select(moltype='DNA') + xx = tree.select(moltype="DNA") assert xx == tree xx = tree.select(abund=False) @@ -634,7 +631,7 @@ def test_sbt_as_index_select(): tree.select(ksize=21) with pytest.raises(ValueError): - tree.select(moltype='protein') + tree.select(moltype="protein") with pytest.raises(ValueError): tree.select(abund=True) @@ -646,15 +643,15 @@ def test_sbt_as_index_select_picklist(): factory = GraphFactory(31, 1e5, 4) tree = SBT(factory, d=2) - sig47 = load_one_signature(utils.get_test_data('47.fa.sig')) - sig63 = load_one_signature(utils.get_test_data('63.fa.sig')) + sig47 = load_one_signature(utils.get_test_data("47.fa.sig")) + sig63 = load_one_signature(utils.get_test_data("63.fa.sig")) tree.insert(sig47) tree.insert(sig63) # construct a picklist... - picklist = SignaturePicklist('md5prefix8') - picklist.init(['09a08691']) + picklist = SignaturePicklist("md5prefix8") + picklist.init(["09a08691"]) # select on picklist tree = tree.select(picklist=picklist) @@ -663,7 +660,7 @@ def test_sbt_as_index_select_picklist(): ss = siglist[0] assert ss.minhash.ksize == 31 - assert ss.md5sum().startswith('09a08691c') + assert ss.md5sum().startswith("09a08691c") def test_sbt_as_index_select_picklist_exclude(): @@ -672,15 +669,15 @@ def test_sbt_as_index_select_picklist_exclude(): factory = GraphFactory(31, 1e5, 4) tree = SBT(factory, d=2) - sig47 = load_one_signature(utils.get_test_data('47.fa.sig')) - sig63 = load_one_signature(utils.get_test_data('63.fa.sig')) + sig47 = load_one_signature(utils.get_test_data("47.fa.sig")) + sig63 = load_one_signature(utils.get_test_data("63.fa.sig")) tree.insert(sig47) tree.insert(sig63) # construct a picklist... 
- picklist = SignaturePicklist('md5prefix8', pickstyle=PickStyle.EXCLUDE) - picklist.init(['09a08691']) + picklist = SignaturePicklist("md5prefix8", pickstyle=PickStyle.EXCLUDE) + picklist.init(["09a08691"]) # select on picklist tree = tree.select(picklist=picklist) @@ -689,7 +686,7 @@ def test_sbt_as_index_select_picklist_exclude(): ss = siglist[0] assert ss.minhash.ksize == 31 - assert ss.md5sum().startswith('38729c637') + assert ss.md5sum().startswith("38729c637") def test_sbt_as_index_find_picklist(): @@ -698,15 +695,15 @@ def test_sbt_as_index_find_picklist(): factory = GraphFactory(31, 1e5, 4) tree = SBT(factory, d=2) - sig47 = load_one_signature(utils.get_test_data('47.fa.sig')) - sig63 = load_one_signature(utils.get_test_data('63.fa.sig')) + sig47 = load_one_signature(utils.get_test_data("47.fa.sig")) + sig63 = load_one_signature(utils.get_test_data("63.fa.sig")) tree.insert(sig47) tree.insert(sig63) # construct a picklist... - picklist = SignaturePicklist('md5prefix8') - picklist.init(['09a08691']) + picklist = SignaturePicklist("md5prefix8") + picklist.init(["09a08691"]) # run a 'find' with sig63, should find 47 and 63 both. search_obj = make_jaccard_search_query(do_containment=True, threshold=0.0) @@ -723,7 +720,7 @@ def test_sbt_as_index_find_picklist(): # and check that it is the expected one! ss = results[0].signature assert ss.minhash.ksize == 31 - assert ss.md5sum().startswith('09a08691c') + assert ss.md5sum().startswith("09a08691c") def test_sbt_as_index_find_picklist_exclude(): @@ -732,15 +729,15 @@ def test_sbt_as_index_find_picklist_exclude(): factory = GraphFactory(31, 1e5, 4) tree = SBT(factory, d=2) - sig47 = load_one_signature(utils.get_test_data('47.fa.sig')) - sig63 = load_one_signature(utils.get_test_data('63.fa.sig')) + sig47 = load_one_signature(utils.get_test_data("47.fa.sig")) + sig63 = load_one_signature(utils.get_test_data("63.fa.sig")) tree.insert(sig47) tree.insert(sig63) # construct a picklist... - picklist = SignaturePicklist('md5prefix8', pickstyle=PickStyle.EXCLUDE) - picklist.init(['09a08691']) + picklist = SignaturePicklist("md5prefix8", pickstyle=PickStyle.EXCLUDE) + picklist.init(["09a08691"]) # run a 'find' with sig63, should find 47 and 63 both. search_obj = make_jaccard_search_query(do_containment=True, threshold=0.0) @@ -757,7 +754,7 @@ def test_sbt_as_index_find_picklist_exclude(): # and check that it is the expected one! ss = results[0].signature assert ss.minhash.ksize == 31 - assert ss.md5sum().startswith('38729c637') + assert ss.md5sum().startswith("38729c637") def test_sbt_as_index_find_picklist_twice(): @@ -766,15 +763,15 @@ def test_sbt_as_index_find_picklist_twice(): factory = GraphFactory(31, 1e5, 4) tree = SBT(factory, d=2) - sig47 = load_one_signature(utils.get_test_data('47.fa.sig')) - sig63 = load_one_signature(utils.get_test_data('63.fa.sig')) + sig47 = load_one_signature(utils.get_test_data("47.fa.sig")) + sig63 = load_one_signature(utils.get_test_data("63.fa.sig")) tree.insert(sig47) tree.insert(sig63) # construct a picklist... - picklist = SignaturePicklist('md5prefix8') - picklist.init(['09a08691']) + picklist = SignaturePicklist("md5prefix8") + picklist.init(["09a08691"]) # run a 'find' with sig63, should find 47 and 63 both. 
search_obj = make_jaccard_search_query(do_containment=True, threshold=0.0) @@ -787,7 +784,9 @@ def test_sbt_as_index_find_picklist_twice(): with pytest.raises(ValueError): tree = tree.select(picklist=picklist) - assert "we do not (yet) support multiple picklists for SBT databases" in str(exc) + assert "we do not (yet) support multiple picklists for SBT databases" in str( + exc + ) def test_sbt_as_index_signatures(): @@ -795,8 +794,8 @@ def test_sbt_as_index_signatures(): factory = GraphFactory(31, 1e5, 4) tree = SBT(factory, d=2) - sig47 = load_one_signature(utils.get_test_data('47.fa.sig')) - sig63 = load_one_signature(utils.get_test_data('63.fa.sig')) + sig47 = load_one_signature(utils.get_test_data("47.fa.sig")) + sig63 = load_one_signature(utils.get_test_data("63.fa.sig")) tree.insert(sig47) tree.insert(sig63) @@ -813,9 +812,9 @@ def test_sbt_gather_threshold_1(): factory = GraphFactory(31, 1e5, 4) tree = SBT(factory, d=2) - sig2 = load_one_signature(utils.get_test_data('2.fa.sig'), ksize=31) - sig47 = load_one_signature(utils.get_test_data('47.fa.sig'), ksize=31) - sig63 = load_one_signature(utils.get_test_data('63.fa.sig'), ksize=31) + sig2 = load_one_signature(utils.get_test_data("2.fa.sig"), ksize=31) + sig47 = load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + sig63 = load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) tree.insert(sig47) tree.insert(sig63) @@ -861,7 +860,7 @@ def test_sbt_gather_threshold_1(): assert name is None # check with a too-high threshold -> should be no results. - print('len mh', len(new_mh)) + print("len mh", len(new_mh)) with pytest.raises(ValueError): tree.best_containment(SourmashSignature(new_mh), threshold_bp=5000) @@ -871,9 +870,9 @@ def test_sbt_gather_threshold_5(): factory = GraphFactory(31, 1e5, 4) tree = SBT(factory, d=2) - sig2 = load_one_signature(utils.get_test_data('2.fa.sig'), ksize=31) - sig47 = load_one_signature(utils.get_test_data('47.fa.sig'), ksize=31) - sig63 = load_one_signature(utils.get_test_data('63.fa.sig'), ksize=31) + sig2 = load_one_signature(utils.get_test_data("2.fa.sig"), ksize=31) + sig47 = load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + sig63 = load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) tree.insert(sig47) tree.insert(sig63) @@ -902,7 +901,7 @@ def test_sbt_gather_threshold_5(): assert name is None # now, check with a threshold_bp that should be meet-able. 
-    results = tree.best_containment(SourmashSignature(new_mh), threshold_bp=5000)
+    result = tree.best_containment(SourmashSignature(new_mh), threshold_bp=5000)
     assert result
     containment, match_sig, name = result
     assert containment == 1.0

@@ -913,9 +912,9 @@
 @utils.in_tempdir
 def test_gather_single_return(c):
     # test gather() number of returns
-    sig2file = utils.get_test_data('2.fa.sig')
-    sig47file = utils.get_test_data('47.fa.sig')
-    sig63file = utils.get_test_data('63.fa.sig')
+    sig2file = utils.get_test_data("2.fa.sig")
+    sig47file = utils.get_test_data("47.fa.sig")
+    sig63file = utils.get_test_data("63.fa.sig")

     sig2 = load_one_signature(sig2file, ksize=31)
     sig47 = load_one_signature(sig47file, ksize=31)

@@ -953,10 +952,10 @@ def test_sbt_jaccard_ordering(runtmp):
     def _intersect(x, y):
         return x.intersection_and_union_size(y)[0]

-    print('a intersect b:', _intersect(a, b))
-    print('a intersect c:', _intersect(a, c))
-    print('a jaccard b:', a.jaccard(b))
-    print('a jaccard c:', a.jaccard(c))
+    print("a intersect b:", _intersect(a, b))
+    print("a intersect c:", _intersect(a, c))
+    print("a jaccard b:", a.jaccard(b))
+    print("a jaccard c:", a.jaccard(c))

     assert _intersect(a, b) > _intersect(a, c)
     assert a.jaccard(b) < a.jaccard(c)

@@ -965,9 +964,9 @@ def _intersect(x, y):
     assert a.jaccard(c) > 0.15

     # now - make signatures, try out :)
-    ss_a = sourmash.SourmashSignature(a, name='A')
-    ss_b = sourmash.SourmashSignature(b, name='B')
-    ss_c = sourmash.SourmashSignature(c, name='C')
+    ss_a = sourmash.SourmashSignature(a, name="A")
+    ss_b = sourmash.SourmashSignature(b, name="B")
+    ss_c = sourmash.SourmashSignature(c, name="C")

     factory = GraphFactory(31, 1e5, 4)
     db = SBT(factory, d=2)

@@ -988,16 +987,21 @@ def test_sbt_protein_command_index(runtmp):
     c = runtmp

     # test command-line creation of SBT database with protein sigs
-    sigfile1 = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
-    sigfile2 = utils.get_test_data('prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig')
+    sigfile1 = utils.get_test_data(
+        "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig"
+    )
+    sigfile2 = utils.get_test_data(
+        "prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig"
+    )

-    db_out = c.output('protein.sbt.zip')
+    db_out = c.output("protein.sbt.zip")

-    c.run_sourmash('index', db_out, sigfile1, sigfile2,
-                   '--scaled', '100', '-k', '19', '--protein')
+    c.run_sourmash(
+        "index", db_out, sigfile1, sigfile2, "--scaled", "100", "-k", "19", "--protein"
+    )

     # check to make sure .sbt.protein directory doesn't get created
-    assert not os.path.exists(c.output('.sbt.protein'))
+    assert not os.path.exists(c.output(".sbt.protein"))

     db2 = load_sbt_index(db_out)

@@ -1005,14 +1009,19 @@ def test_sbt_protein_command_index(runtmp):
     sig2 = sourmash.load_one_signature(sigfile2)

     # check reconstruction --
-    mh_list = [ x.minhash for x in db2.signatures() ]
+    mh_list = [x.minhash for x in db2.signatures()]
     assert len(mh_list) == 2
     assert sig1.minhash in mh_list
     assert sig2.minhash in mh_list

     # and search, gather
-    results = db2.search(sig1, threshold=0.0, ignore_abundance=True,
-                         do_containment=False, best_only=False)
+    results = db2.search(
+        sig1,
+        threshold=0.0,
+        ignore_abundance=True,
+        do_containment=False,
+        best_only=False,
+    )
     assert len(results) == 2

     result = db2.best_containment(sig2)

@@ -1024,13 +1033,18 @@
 @utils.in_tempdir
 def test_sbt_protein_search_no_threshold(c):
     # test the '.search' method on SBTs
w/no threshold - sigfile1 = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + sigfile1 = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) - db_out = c.output('protein.sbt.zip') + db_out = c.output("protein.sbt.zip") - c.run_sourmash('index', db_out, sigfile1, sigfile2, - '--scaled', '100', '-k', '19', '--protein') + c.run_sourmash( + "index", db_out, sigfile1, sigfile2, "--scaled", "100", "-k", "19", "--protein" + ) db2 = load_sbt_index(db_out) @@ -1038,34 +1052,41 @@ def test_sbt_protein_search_no_threshold(c): # and search, gather with pytest.raises(TypeError) as exc: - results = db2.search(sig1) + db2.search(sig1) assert "'search' requires 'threshold'" in str(exc) @utils.in_thisdir def test_sbt_protein_command_search(c): # test command-line search/gather of SBT database with protein sigs - sigfile1 = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db_out = utils.get_test_data('prot/protein.sbt.zip') + sigfile1 = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + db_out = utils.get_test_data("prot/protein.sbt.zip") - c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches' in c.last_result.out + c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0") + assert "2 matches" in c.last_result.out - c.run_sourmash('gather', sigfile1, db_out) - assert 'found 1 matches total' in c.last_result.out - assert 'the recovered matches hit 100.0% of the query' in c.last_result.out + c.run_sourmash("gather", sigfile1, db_out) + assert "found 1 matches total" in c.last_result.out + assert "the recovered matches hit 100.0% of the query" in c.last_result.out @utils.in_tempdir def test_sbt_hp_command_index(c): # test command-line creation of SBT database with hp sigs - sigfile1 = utils.get_test_data('prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/hp/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + sigfile1 = utils.get_test_data( + "prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/hp/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) - db_out = c.output('hp.sbt.zip') + db_out = c.output("hp.sbt.zip") - c.run_sourmash('index', db_out, sigfile1, sigfile2, - '--scaled', '100', '-k', '19', '--hp') + c.run_sourmash( + "index", db_out, sigfile1, sigfile2, "--scaled", "100", "-k", "19", "--hp" + ) db2 = load_sbt_index(db_out) @@ -1073,14 +1094,19 @@ def test_sbt_hp_command_index(c): sig2 = sourmash.load_one_signature(sigfile2) # check reconstruction -- - mh_list = [ x.minhash for x in db2.signatures() ] + mh_list = [x.minhash for x in db2.signatures()] assert len(mh_list) == 2 assert sig1.minhash in mh_list assert sig2.minhash in mh_list # and search, gather - results = db2.search(sig1, threshold=0.0, ignore_abundance=True, - do_containment=False, best_only=False) + results = db2.search( + sig1, + threshold=0.0, + ignore_abundance=True, + do_containment=False, + best_only=False, + ) assert results result = db2.best_containment(sig2) @@ -1092,27 +1118,34 @@ def test_sbt_hp_command_index(c): @utils.in_thisdir def test_sbt_hp_command_search(c): # test command-line search/gather of SBT database with hp sigs - sigfile1 = 
utils.get_test_data('prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db_out = utils.get_test_data('prot/hp.sbt.zip') + sigfile1 = utils.get_test_data( + "prot/hp/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + db_out = utils.get_test_data("prot/hp.sbt.zip") - c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches' in c.last_result.out + c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0") + assert "2 matches" in c.last_result.out - c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0') - assert 'found 1 matches total' in c.last_result.out - assert 'the recovered matches hit 100.0% of the query' in c.last_result.out + c.run_sourmash("gather", sigfile1, db_out, "--threshold", "0.0") + assert "found 1 matches total" in c.last_result.out + assert "the recovered matches hit 100.0% of the query" in c.last_result.out @utils.in_tempdir def test_sbt_dayhoff_command_index(c): # test command-line creation of SBT database with dayhoff sigs - sigfile1 = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - sigfile2 = utils.get_test_data('prot/dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') + sigfile1 = utils.get_test_data( + "prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + sigfile2 = utils.get_test_data( + "prot/dayhoff/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) - db_out = c.output('dayhoff.sbt.zip') + db_out = c.output("dayhoff.sbt.zip") - c.run_sourmash('index', db_out, sigfile1, sigfile2, - '--scaled', '100', '-k', '19', '--dayhoff') + c.run_sourmash( + "index", db_out, sigfile1, sigfile2, "--scaled", "100", "-k", "19", "--dayhoff" + ) db2 = load_sbt_index(db_out) @@ -1120,14 +1153,19 @@ def test_sbt_dayhoff_command_index(c): sig2 = sourmash.load_one_signature(sigfile2) # check reconstruction -- - mh_list = [ x.minhash for x in db2.signatures() ] + mh_list = [x.minhash for x in db2.signatures()] assert len(mh_list) == 2 assert sig1.minhash in mh_list assert sig2.minhash in mh_list # and search, gather - results = db2.search(sig1, threshold=0.0, ignore_abundance=True, - do_containment=False, best_only=False) + results = db2.search( + sig1, + threshold=0.0, + ignore_abundance=True, + do_containment=False, + best_only=False, + ) assert len(results) == 2 result = db2.best_containment(sig2) @@ -1139,21 +1177,23 @@ def test_sbt_dayhoff_command_index(c): @utils.in_thisdir def test_sbt_dayhoff_command_search(c): # test command-line search/gather of SBT database with dayhoff sigs - sigfile1 = utils.get_test_data('prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db_out = utils.get_test_data('prot/dayhoff.sbt.zip') + sigfile1 = utils.get_test_data( + "prot/dayhoff/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + db_out = utils.get_test_data("prot/dayhoff.sbt.zip") - c.run_sourmash('search', sigfile1, db_out, '--threshold', '0.0') - assert '2 matches' in c.last_result.out + c.run_sourmash("search", sigfile1, db_out, "--threshold", "0.0") + assert "2 matches" in c.last_result.out - c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0') - assert 'found 1 matches total' in c.last_result.out - assert 'the recovered matches hit 100.0% of the query' in c.last_result.out + c.run_sourmash("gather", sigfile1, db_out, "--threshold", "0.0") + assert "found 1 matches total" in c.last_result.out + assert "the recovered matches hit 100.0% of the query" in c.last_result.out def test_sbt_node_cache(): - tree = SBT.load(utils.get_test_data('v6.sbt.json'), - 
leaf_loader=SigLeaf.load, - cache_size=1) + tree = SBT.load( + utils.get_test_data("v6.sbt.json"), leaf_loader=SigLeaf.load, cache_size=1 + ) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) to_search = load_one_signature(testdata1) @@ -1172,28 +1212,28 @@ def test_sbt_node_cache(): def test_sbt_no_containment_on_num(): - tree = SBT.load(utils.get_test_data('v6.sbt.json'), - leaf_loader=SigLeaf.load, - cache_size=1) + tree = SBT.load( + utils.get_test_data("v6.sbt.json"), leaf_loader=SigLeaf.load, cache_size=1 + ) testdata1 = utils.get_test_data(utils.SIG_FILES[0]) to_search = load_one_signature(testdata1) search_obj = make_jaccard_search_query(do_containment=True, threshold=0.05) with pytest.raises(TypeError) as exc: - results = list(tree.find(search_obj, to_search)) + list(tree.find(search_obj, to_search)) assert "this search requires a scaled signature" in str(exc) def test_build_sbt_zip_with_dups(runtmp): - dups_data = utils.get_test_data('duplicate-sigs') + dups_data = utils.get_test_data("duplicate-sigs") all_sigs = set(sourmash.load_file_as_signatures(dups_data)) assert len(all_sigs) == 4 - runtmp.run_sourmash('index', 'dups.sbt.zip', dups_data) - outfile = runtmp.output('dups.sbt.zip') + runtmp.run_sourmash("index", "dups.sbt.zip", dups_data) + outfile = runtmp.output("dups.sbt.zip") sbt_sigs = set(sourmash.load_file_as_signatures(outfile)) assert len(sbt_sigs) == 4 @@ -1202,17 +1242,17 @@ def test_build_sbt_zip_with_dups(runtmp): def test_build_sbt_zip_with_dups_exists(runtmp): - dups_data = utils.get_test_data('duplicate-sigs') + dups_data = utils.get_test_data("duplicate-sigs") all_sigs = set(sourmash.load_file_as_signatures(dups_data)) assert len(all_sigs) == 4 - runtmp.run_sourmash('index', 'dups.sbt.zip', dups_data) - outfile = runtmp.output('dups.sbt.zip') + runtmp.run_sourmash("index", "dups.sbt.zip", dups_data) + outfile = runtmp.output("dups.sbt.zip") # run again, to see what happens :) - runtmp.run_sourmash('index', 'dups.sbt.zip', dups_data) - outfile = runtmp.output('dups.sbt.zip') + runtmp.run_sourmash("index", "dups.sbt.zip", dups_data) + outfile = runtmp.output("dups.sbt.zip") sbt_sigs = set(sourmash.load_file_as_signatures(outfile)) assert len(sbt_sigs) == 4 @@ -1221,13 +1261,13 @@ def test_build_sbt_zip_with_dups_exists(runtmp): def test_build_sbt_json_with_dups(runtmp): - dups_data = utils.get_test_data('duplicate-sigs') + dups_data = utils.get_test_data("duplicate-sigs") all_sigs = set(sourmash.load_file_as_signatures(dups_data)) assert len(all_sigs) == 4 - runtmp.run_sourmash('index', 'dups.sbt.json', dups_data) - outfile = runtmp.output('dups.sbt.json') + runtmp.run_sourmash("index", "dups.sbt.json", dups_data) + outfile = runtmp.output("dups.sbt.json") sbt_sigs = set(sourmash.load_file_as_signatures(outfile)) assert len(sbt_sigs) == 4 @@ -1236,17 +1276,17 @@ def test_build_sbt_json_with_dups(runtmp): def test_build_sbt_json_with_dups_exists(runtmp): - dups_data = utils.get_test_data('duplicate-sigs') + dups_data = utils.get_test_data("duplicate-sigs") all_sigs = set(sourmash.load_file_as_signatures(dups_data)) assert len(all_sigs) == 4 - runtmp.run_sourmash('index', 'dups.sbt.json', dups_data) - outfile = runtmp.output('dups.sbt.json') + runtmp.run_sourmash("index", "dups.sbt.json", dups_data) + outfile = runtmp.output("dups.sbt.json") # run again, see what happens! 
-    runtmp.run_sourmash('index', 'dups.sbt.json', dups_data)
-    outfile = runtmp.output('dups.sbt.json')
+    runtmp.run_sourmash("index", "dups.sbt.json", dups_data)
+    outfile = runtmp.output("dups.sbt.json")

     sbt_sigs = set(sourmash.load_file_as_signatures(outfile))
     assert len(sbt_sigs) == 4
@@ -1258,9 +1298,9 @@ def test_load_fail_on_file_not_dir(runtmp):
     # make sure the load function raises a ValueError for {filename}/sbt,
     # rather than a NotADirectoryError
-    filename = runtmp.output('foo')
-    with open(filename, 'wt') as fp:
-        fp.write('something')
+    filename = runtmp.output("foo")
+    with open(filename, "w") as fp:
+        fp.write("something")

-    with pytest.raises(ValueError) as exc:
-        x = SBT.load(runtmp.output('foo/bar.sbt.json'))
+    with pytest.raises(ValueError):
+        SBT.load(runtmp.output("foo/bar.sbt.json"))
diff --git a/tests/test_search.py b/tests/test_search.py
index a1b8171cfd..c9c6d601cc 100644
--- a/tests/test_search.py
+++ b/tests/test_search.py
@@ -5,9 +5,13 @@
 import sourmash_tst_utils as utils
 from sourmash import search, SourmashSignature, MinHash, load_one_signature
-from sourmash.search import (make_jaccard_search_query,
-                             make_containment_query,
-                             SearchResult, PrefetchResult, GatherResult)
+from sourmash.search import (
+    make_jaccard_search_query,
+    make_containment_query,
+    SearchResult,
+    PrefetchResult,
+    GatherResult,
+)

 from sourmash.index import LinearIndex
@@ -20,8 +24,7 @@ def test_make_jaccard_search_query():


 def test_make_jaccard_search_query_cont():
-    search_obj = make_jaccard_search_query(do_containment=True,
-                                           threshold=0)
+    search_obj = make_jaccard_search_query(do_containment=True, threshold=0)

     assert search_obj.score_fn == search_obj.score_containment
     assert search_obj.require_scaled
@@ -29,8 +32,7 @@ def test_make_jaccard_search_query_cont():


 def test_make_jaccard_search_query_max_cont():
-    search_obj = make_jaccard_search_query(do_max_containment=True,
-                                           threshold=0)
+    search_obj = make_jaccard_search_query(do_max_containment=True, threshold=0)

     assert search_obj.score_fn == search_obj.score_max_containment
     assert search_obj.require_scaled
@@ -55,16 +57,18 @@ def test_make_jaccard_search_query_no_threshold_none():

 def test_make_jaccard_search_query_cont_and_max_cont():
     with pytest.raises(TypeError) as exc:
-        search_obj = make_jaccard_search_query(do_containment=True,
-                                               do_max_containment=True)
+        make_jaccard_search_query(do_containment=True, do_max_containment=True)

-    assert str(exc.value) == "'do_containment' and 'do_max_containment' cannot both be True"
+    assert (
+        str(exc.value)
+        == "'do_containment' and 'do_max_containment' cannot both be True"
+    )


 def test_cont_requires_scaled():
     search_obj = make_jaccard_search_query(do_containment=True)
     assert search_obj.require_scaled
-    
+
     mh = MinHash(n=500, ksize=31)
     with pytest.raises(TypeError) as exc:
         search_obj.check_is_compatible(SourmashSignature(mh))
@@ -73,7 +77,7 @@ def test_cont_requires_scaled():

 def test_search_requires_flat():
     search_obj = make_jaccard_search_query()
-    
+
     mh = MinHash(n=500, ksize=31, track_abundance=True)
     with pytest.raises(TypeError) as exc:
         search_obj.check_is_compatible(SourmashSignature(mh))
@@ -164,7 +168,7 @@ def test_make_containment_query_num_minhash():
         mh.add_hash(i)

     with pytest.raises(TypeError) as exc:
-        search_obj = make_containment_query(mh, 5e4)
+        make_containment_query(mh, 5e4)

     assert str(exc.value) == "query signature must be calculated with scaled"
@@ -177,7 +181,7 @@ def test_make_containment_query_empty_minhash():
         mh.add_hash(i)

     with pytest.raises(TypeError) as exc:
-        search_obj = make_containment_query(mh, -1)
+        make_containment_query(mh, -1)

     assert str(exc.value) == "threshold_bp must be non-negative"
@@ -191,7 +195,7 @@ def test_make_containment_query_high_threshold():

     # effective threshold > 1; raise ValueError
     with pytest.raises(ValueError):
-        search_obj = make_containment_query(mh, 200000)
+        make_containment_query(mh, 200000)


 class FakeIndex(LinearIndex):
@@ -240,29 +244,31 @@ def test_search_with_abund_query():
     query = SourmashSignature(mh)

     with pytest.raises(TypeError):
-        search.search_databases_with_abund_query(query, [],
-                                                 threshold=0,
-                                                 do_containment=True)
+        search.search_databases_with_abund_query(
+            query, [], threshold=0, do_containment=True
+        )

     with pytest.raises(TypeError):
-        search.search_databases_with_abund_query(query, [],
-                                                 threshold=0,
-                                                 do_max_containment=True)
+        search.search_databases_with_abund_query(
+            query, [], threshold=0, do_max_containment=True
+        )


 def test_scaledSearchResult():
     # check that values get stored/calculated correctly
-    ss47_file = utils.get_test_data('47.fa.sig')
-    ss4763_file = utils.get_test_data('47+63.fa.sig')
-    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna')
-    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna')
+    ss47_file = utils.get_test_data("47.fa.sig")
+    ss4763_file = utils.get_test_data("47+63.fa.sig")
+    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna")
+    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna")
     ss4763 = ss4763.to_mutable()
     ss4763.filename = ss4763_file

     scaled = ss47.minhash.scaled

-    res = SearchResult(ss47, ss4763, cmp_scaled=scaled, similarity= ss47.contained_by(ss4763))
+    res = SearchResult(
+        ss47, ss4763, cmp_scaled=scaled, similarity=ss47.contained_by(ss4763)
+    )

     assert res.query_name == ss47.name
     assert res.match_name == ss4763.name
@@ -271,16 +277,16 @@ def test_scaledSearchResult():
     assert res.cmp_scaled == 1000
     assert res.query_abundance == ss47.minhash.track_abundance
     assert res.match_abundance == ss4763.minhash.track_abundance
-#    assert res.query_bp == len(ss47.minhash) * scaled
-#    assert res.match_bp == len(ss4763.minhash) * scaled
+    # assert res.query_bp == len(ss47.minhash) * scaled
+    # assert res.match_bp == len(ss4763.minhash) * scaled
     assert res.ksize == 31
-    assert res.moltype == 'DNA'
-    assert res.query_filename == '47.fa'
+    assert res.moltype == "DNA"
+    assert res.query_filename == "47.fa"
     assert res.match_filename == ss4763_file
     assert res.query_md5 == ss47.md5sum()
     assert res.match_md5 == ss4763.md5sum()
-    # assert res.query_n_hashes == len(ss47.minhash) 
-    # assert res.match_n_hashes == len(ss4763.minhash) 
+    # assert res.query_n_hashes == len(ss47.minhash)
+    # assert res.match_n_hashes == len(ss4763.minhash)
     assert res.md5 == ss4763.md5sum()
     assert res.name == ss4763.name
     assert res.filename == ss4763.filename
@@ -289,18 +295,19 @@ def test_scaledSearchResult():
     # check that we _can_ get avg_containment_ani
     assert res.cmp.avg_containment_ani == np.mean([queryc_ani.ani, matchc_ani.ani])

+
 def test_numSearchResult():
     # check that values get stored/calculated correctly
-    ss47_file = utils.get_test_data('num/47.fa.sig')
-    ss63_file = utils.get_test_data('num/63.fa.sig')
-    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna')
-    ss63 = load_one_signature(ss63_file, ksize=31, select_moltype='dna')
+    ss47_file = utils.get_test_data("num/47.fa.sig")
+    ss63_file = utils.get_test_data("num/63.fa.sig")
+    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna")
+    ss63 = load_one_signature(ss63_file, ksize=31, select_moltype="dna")
     ss63 = ss63.to_mutable()
     ss63.filename = ss63_file

     assert ss47.minhash.num and ss63.minhash.num

-    res = SearchResult(ss47, ss63, similarity= ss47.jaccard(ss63))
+    res = SearchResult(ss47, ss63, similarity=ss47.jaccard(ss63))
     print(res.cmp_num)
     assert res.mh1.num
     assert res.cmp.cmp_num == 500
@@ -311,8 +318,8 @@ def test_numSearchResult():
     assert res.query_abundance == ss47.minhash.track_abundance
     assert res.match_abundance == ss63.minhash.track_abundance
     assert res.ksize == 31
-    assert res.moltype == 'DNA'
-    assert res.query_filename == '47.fa'
+    assert res.moltype == "DNA"
+    assert res.query_filename == "47.fa"
     assert res.match_filename == ss63_file
     assert res.query_md5 == ss47.md5sum()
     assert res.match_md5 == ss63.md5sum()
@@ -323,7 +330,7 @@ def test_numSearchResult():
     # check that we can't get ani
     with pytest.raises(TypeError) as exc:
         res.estimate_search_ani()
-    assert("ANI can only be estimated from scaled signatures.") in str(exc)
+    assert ("ANI can only be estimated from scaled signatures.") in str(exc)

     # get result as dictionary (of just items to write)
     resD = res.resultdict
@@ -333,10 +340,10 @@ def test_numSearchResult():


 def test_SearchResult_incompatible_sigs():
-    ss47_file = utils.get_test_data('num/47.fa.sig')
-    ss4763_file = utils.get_test_data('47+63.fa.sig')
-    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna')
-    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna')
+    ss47_file = utils.get_test_data("num/47.fa.sig")
+    ss4763_file = utils.get_test_data("47+63.fa.sig")
+    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna")
+    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna")

     with pytest.raises(TypeError) as exc:
         SearchResult(ss47, ss4763, similarity=10)
@@ -345,8 +352,8 @@ def test_SearchResult_incompatible_sigs():


 def test_SearchResult_notsigs():
-    ss47_file = utils.get_test_data('num/47.fa.sig')
-    ss4763_file = utils.get_test_data('47+63.fa.sig')
+    ss47_file = utils.get_test_data("num/47.fa.sig")
+    ss4763_file = utils.get_test_data("47+63.fa.sig")

     with pytest.raises(AttributeError) as exc:
         SearchResult(ss47_file, ss4763_file, similarity=10)
@@ -356,10 +363,10 @@ def test_SearchResult_notsigs():

 def test_SearchResult_no_similarity():
     # check that values get stored/calculated correctly
-    ss47_file = utils.get_test_data('47.fa.sig')
-    ss4763_file = utils.get_test_data('47+63.fa.sig')
-    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna')
-    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna')
+    ss47_file = utils.get_test_data("47.fa.sig")
+    ss4763_file = utils.get_test_data("47+63.fa.sig")
+    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna")
+    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna")

     with pytest.raises(ValueError) as exc:
         SearchResult(ss47, ss4763)
@@ -369,10 +376,10 @@ def test_SearchResult_no_similarity():

 def test_PrefetchResult():
     # check that values get stored/calculated correctly
-    ss47_file = utils.get_test_data('47.fa.sig')
-    ss4763_file = utils.get_test_data('47+63.fa.sig')
-    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna')
-    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna')
+    ss47_file = utils.get_test_data("47.fa.sig")
+    ss4763_file = utils.get_test_data("47+63.fa.sig")
+    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna")
+    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna")
     ss4763 = ss4763.to_mutable()
     ss4763.filename = ss4763_file
@@ -381,14 +388,14 @@ def test_PrefetchResult():
     intersect_mh = ss47.minhash.intersection(ss4763.minhash)
     intersect_bp = len(intersect_mh) * scaled

-    jaccard=ss4763.jaccard(ss47)
-    max_containment=ss4763.max_containment(ss47)
-    f_match_query=ss47.contained_by(ss4763)
-    f_query_match=ss4763.contained_by(ss47)
+    jaccard = ss4763.jaccard(ss47)
+    max_containment = ss4763.max_containment(ss47)
+    f_match_query = ss47.contained_by(ss4763)
+    f_query_match = ss4763.contained_by(ss47)
     queryc_ani = ss47.containment_ani(ss4763)
     matchc_ani = ss4763.containment_ani(ss47)

-    res = PrefetchResult(ss47, ss4763, cmp_scaled = scaled)
+    res = PrefetchResult(ss47, ss4763, cmp_scaled=scaled)

     assert res.query_name == ss47.name
     assert res.match_name == ss4763.name
@@ -400,8 +407,8 @@ def test_PrefetchResult():
     assert res.query_bp == len(ss47.minhash) * scaled
     assert res.match_bp == len(ss4763.minhash) * scaled
     assert res.ksize == 31
-    assert res.moltype == 'DNA'
-    assert res.query_filename == '47.fa'
+    assert res.moltype == "DNA"
+    assert res.query_filename == "47.fa"
     assert res.match_filename == ss4763_file
     assert res.query_md5 == ss47.md5sum()
     assert res.match_md5 == ss4763.md5sum()
@@ -426,23 +433,26 @@ def test_PrefetchResult():


 def test_PrefetchResult_incompatible_sigs():
-    ss47_file = utils.get_test_data('num/47.fa.sig')
-    ss4763_file = utils.get_test_data('47+63.fa.sig')
-    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna')
-    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna')
+    ss47_file = utils.get_test_data("num/47.fa.sig")
+    ss4763_file = utils.get_test_data("47+63.fa.sig")
+    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna")
+    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna")

     with pytest.raises(TypeError) as exc:
         PrefetchResult(ss47, ss4763)
     print(str(exc))
-    assert "Error: prefetch and gather results must be between scaled signatures." in str(exc)
+    assert (
+        "Error: prefetch and gather results must be between scaled signatures."
+        in str(exc)
+    )


 def test_GatherResult():
     # check that values get stored/calculated correctly
-    ss47_file = utils.get_test_data('track_abund/47.fa.sig')
-    ss4763_file = utils.get_test_data('47+63.fa.sig')
-    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna')
-    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna')
+    ss47_file = utils.get_test_data("track_abund/47.fa.sig")
+    ss4763_file = utils.get_test_data("47+63.fa.sig")
+    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna")
+    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna")
     ss4763 = ss4763.to_mutable()
     ss4763.filename = ss4763_file
@@ -454,8 +464,8 @@ def test_GatherResult():
     remaining_mh.remove_many(intersect_mh)
     intersect_bp = len(intersect_mh) * scaled

-    max_containment=ss4763.max_containment(ss47)
-    f_match_query = ss47.contained_by(ss4763)
+    max_containment = ss4763.max_containment(ss47)
+    ss47.contained_by(ss4763)
     orig_query_abunds = ss47.minhash.hashes
     queryc_ani = ss47.containment_ani(ss4763)
     matchc_ani = ss4763.containment_ani(ss47)
@@ -464,12 +474,16 @@ def test_GatherResult():
     gather_result_rank = 1
     sum_abunds = 1000

-    res = GatherResult(ss47, ss4763, cmp_scaled=scaled,
-                       gather_querymh=remaining_mh,
-                       gather_result_rank=gather_result_rank,
-                       total_weighted_hashes = sum_abunds,
-                       orig_query_len=len(ss47.minhash),
-                       orig_query_abunds=orig_query_abunds)
+    res = GatherResult(
+        ss47,
+        ss4763,
+        cmp_scaled=scaled,
+        gather_querymh=remaining_mh,
+        gather_result_rank=gather_result_rank,
+        total_weighted_hashes=sum_abunds,
+        orig_query_len=len(ss47.minhash),
+        orig_query_abunds=orig_query_abunds,
+    )

     assert res.query_name == ss47.name
     assert res.match_name == ss4763.name
@@ -481,8 +495,8 @@ def test_GatherResult():
     assert res.query_bp == len(ss47.minhash) * scaled
     assert res.match_bp == len(ss4763.minhash) * scaled
     assert res.ksize == 31
-    assert res.moltype == 'DNA'
-    assert res.query_filename == 'podar-ref/47.fa'
+    assert res.moltype == "DNA"
+    assert res.query_filename == "podar-ref/47.fa"
     assert res.match_filename == ss4763_file
     assert res.query_md5 == ss47.md5sum()
     assert res.match_md5 == ss4763.md5sum()
@@ -516,10 +530,10 @@ def test_GatherResult():

 def test_GatherResult_ci():
     # check that values get stored/calculated correctly
-    ss47_file = utils.get_test_data('track_abund/47.fa.sig')
-    ss4763_file = utils.get_test_data('47+63.fa.sig')
-    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna')
-    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna')
+    ss47_file = utils.get_test_data("track_abund/47.fa.sig")
+    ss4763_file = utils.get_test_data("47+63.fa.sig")
+    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna")
+    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna")
     ss4763 = ss4763.to_mutable()
     ss4763.filename = ss4763_file
@@ -531,20 +545,24 @@ def test_GatherResult_ci():
     remaining_mh.remove_many(intersect_mh)

     orig_query_abunds = ss47.minhash.hashes
-    queryc_ani = ss47.containment_ani(ss4763,estimate_ci=True)
+    queryc_ani = ss47.containment_ani(ss4763, estimate_ci=True)
     matchc_ani = ss4763.containment_ani(ss47, estimate_ci=True)

     # make some fake vals to check
     gather_result_rank = 1
     sum_abunds = 1000

-    res = GatherResult(ss47, ss4763, cmp_scaled=scaled,
-                       gather_querymh=remaining_mh,
-                       gather_result_rank=gather_result_rank,
-                       total_weighted_hashes = sum_abunds,
-                       orig_query_len=len(ss47.minhash),
-                       orig_query_abunds=orig_query_abunds,
-                       estimate_ani_ci=True)
+    res = GatherResult(
+        ss47,
+        ss4763,
+        cmp_scaled=scaled,
+        gather_querymh=remaining_mh,
+        gather_result_rank=gather_result_rank,
+        total_weighted_hashes=sum_abunds,
+        orig_query_len=len(ss47.minhash),
+        orig_query_abunds=orig_query_abunds,
+        estimate_ani_ci=True,
+    )

     # check that we can write prefetch result directly from gather
     pf = PrefetchResult(ss47, ss4763, cmp_scaled=scaled, estimate_ani_ci=True)
@@ -568,130 +586,183 @@ def test_GatherResult_ci():


 def test_GatherResult_incompatible_sigs():
-    ss47_file = utils.get_test_data('num/47.fa.sig')
-    ss4763_file = utils.get_test_data('47+63.fa.sig')
-    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna')
-    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna')
+    ss47_file = utils.get_test_data("num/47.fa.sig")
+    ss4763_file = utils.get_test_data("47+63.fa.sig")
+    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna")
+    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna")

     orig_query_abunds = ss47.minhash.hashes
     with pytest.raises(TypeError) as exc:
-        GatherResult(ss47, ss4763, cmp_scaled=1,
-                     gather_querymh=ss47.minhash,
-                     gather_result_rank=1,
-                     total_weighted_hashes = 1,
-                     orig_query_len=len(ss47.minhash),
-                     orig_query_abunds=orig_query_abunds)
+        GatherResult(
+            ss47,
+            ss4763,
+            cmp_scaled=1,
+            gather_querymh=ss47.minhash,
+            gather_result_rank=1,
+            total_weighted_hashes=1,
+            orig_query_len=len(ss47.minhash),
+            orig_query_abunds=orig_query_abunds,
+        )
     print(str(exc))
-    assert "Error: prefetch and gather results must be between scaled signatures." in str(exc)
+    assert (
+        "Error: prefetch and gather results must be between scaled signatures."
+        in str(exc)
+    )


 def test_GatherResult_incomplete_input_cmpscaled():
-    ss47_file = utils.get_test_data('47.fa.sig')
-    ss4763_file = utils.get_test_data('47+63.fa.sig')
-    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna')
-    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna')
+    ss47_file = utils.get_test_data("47.fa.sig")
+    ss4763_file = utils.get_test_data("47+63.fa.sig")
+    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna")
+    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna")
     orig_query_abunds = ss47.minhash.hashes

     with pytest.raises(ValueError) as exc:
-        GatherResult(ss47, ss4763, cmp_scaled=None,
-                     gather_querymh=ss47.minhash,
-                     gather_result_rank=1,
-                     total_weighted_hashes = 1,
-                     orig_query_len=len(ss47.minhash),
-                     orig_query_abunds=orig_query_abunds)
+        GatherResult(
+            ss47,
+            ss4763,
+            cmp_scaled=None,
+            gather_querymh=ss47.minhash,
+            gather_result_rank=1,
+            total_weighted_hashes=1,
+            orig_query_len=len(ss47.minhash),
+            orig_query_abunds=orig_query_abunds,
+        )
     print(str(exc))
-    assert "Error: must provide comparison scaled value ('cmp_scaled') for GatherResult" in str(exc)
+    assert (
+        "Error: must provide comparison scaled value ('cmp_scaled') for GatherResult"
+        in str(exc)
+    )


 def test_GatherResult_incomplete_input_gathermh():
-    ss47_file = utils.get_test_data('47.fa.sig')
-    ss4763_file = utils.get_test_data('47+63.fa.sig')
-    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna')
-    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna')
+    ss47_file = utils.get_test_data("47.fa.sig")
+    ss4763_file = utils.get_test_data("47+63.fa.sig")
+    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna")
+    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna")
     orig_query_abunds = ss47.minhash.hashes

     with pytest.raises(ValueError) as exc:
-        GatherResult(ss47, ss4763, cmp_scaled=1000,
-                     gather_querymh=None,
-                     gather_result_rank=1,
-                     total_weighted_hashes = 1,
-                     orig_query_len=len(ss47.minhash),
-                     orig_query_abunds=orig_query_abunds)
+        GatherResult(
+            ss47,
+            ss4763,
+            cmp_scaled=1000,
+            gather_querymh=None,
+            gather_result_rank=1,
+            total_weighted_hashes=1,
+            orig_query_len=len(ss47.minhash),
+            orig_query_abunds=orig_query_abunds,
+        )
     print(str(exc))
-    assert "Error: must provide current gather sketch (remaining hashes) for GatherResult" in str(exc)
+    assert (
+        "Error: must provide current gather sketch (remaining hashes) for GatherResult"
+        in str(exc)
+    )


 def test_GatherResult_incomplete_input_gather_result_rank():
-    ss47_file = utils.get_test_data('47.fa.sig')
-    ss4763_file = utils.get_test_data('47+63.fa.sig')
-    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna')
-    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna')
+    ss47_file = utils.get_test_data("47.fa.sig")
+    ss4763_file = utils.get_test_data("47+63.fa.sig")
+    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna")
+    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna")
     orig_query_abunds = ss47.minhash.hashes

     with pytest.raises(ValueError) as exc:
-        GatherResult(ss47, ss4763, cmp_scaled=1000,
-                     gather_querymh=ss47.minhash,
-                     gather_result_rank=None,
-                     total_weighted_hashes = 1,
-                     orig_query_len=len(ss47.minhash),
-                     orig_query_abunds=orig_query_abunds)
+        GatherResult(
+            ss47,
+            ss4763,
+            cmp_scaled=1000,
+            gather_querymh=ss47.minhash,
+            gather_result_rank=None,
+            total_weighted_hashes=1,
+            orig_query_len=len(ss47.minhash),
+            orig_query_abunds=orig_query_abunds,
+        )
     print(str(exc))
     assert "Error: must provide 'gather_result_rank' to GatherResult" in str(exc)


 def test_GatherResult_incomplete_input_total_weighted_hashes():
-    ss47_file = utils.get_test_data('47.fa.sig')
-    ss4763_file = utils.get_test_data('47+63.fa.sig')
-    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna')
-    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna')
+    ss47_file = utils.get_test_data("47.fa.sig")
+    ss4763_file = utils.get_test_data("47+63.fa.sig")
+    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna")
+    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna")
     orig_query_abunds = ss47.minhash.hashes

     with pytest.raises(ValueError) as exc:
-        GatherResult(ss47, ss4763, cmp_scaled=1000,
-                     gather_querymh=ss47.minhash,
-                     gather_result_rank=1,
-                     total_weighted_hashes = None,
-                     orig_query_len=len(ss47.minhash),
-                     orig_query_abunds=orig_query_abunds)
+        GatherResult(
+            ss47,
+            ss4763,
+            cmp_scaled=1000,
+            gather_querymh=ss47.minhash,
+            gather_result_rank=1,
+            total_weighted_hashes=None,
+            orig_query_len=len(ss47.minhash),
+            orig_query_abunds=orig_query_abunds,
+        )
     print(str(exc))
-    assert "Error: must provide sum of all abundances ('total_weighted_hashes') to GatherResult" in str(exc)
+    assert (
+        "Error: must provide sum of all abundances ('total_weighted_hashes') to GatherResult"
+        in str(exc)
+    )

     with pytest.raises(ValueError) as exc:
-        GatherResult(ss47, ss4763, cmp_scaled=1000,
-                     gather_querymh=ss47.minhash,
-                     gather_result_rank=1,
-                     total_weighted_hashes = 0,
-                     orig_query_len=len(ss47.minhash),
-                     orig_query_abunds=orig_query_abunds)
+        GatherResult(
+            ss47,
+            ss4763,
+            cmp_scaled=1000,
+            gather_querymh=ss47.minhash,
+            gather_result_rank=1,
+            total_weighted_hashes=0,
+            orig_query_len=len(ss47.minhash),
+            orig_query_abunds=orig_query_abunds,
+        )
     print(str(exc))
-    assert "Error: must provide sum of all abundances ('total_weighted_hashes') to GatherResult" in str(exc)
+    assert (
+        "Error: must provide sum of all abundances ('total_weighted_hashes') to GatherResult"
+        in str(exc)
+    )


 def test_GatherResult_incomplete_input_orig_query_abunds():
-    ss47_file = utils.get_test_data('47.fa.sig')
-    ss4763_file = utils.get_test_data('47+63.fa.sig')
-    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype='dna')
-    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype='dna')
+    ss47_file = utils.get_test_data("47.fa.sig")
+    ss4763_file = utils.get_test_data("47+63.fa.sig")
+    ss47 = load_one_signature(ss47_file, ksize=31, select_moltype="dna")
+    ss4763 = load_one_signature(ss4763_file, ksize=31, select_moltype="dna")
     orig_query_abunds = None

     with pytest.raises(ValueError) as exc:
-        GatherResult(ss47, ss4763, cmp_scaled=1000,
-                     gather_querymh=ss47.minhash,
-                     gather_result_rank=1,
-                     total_weighted_hashes = 1,
-                     orig_query_len=len(ss47.minhash),
-                     orig_query_abunds=orig_query_abunds)
+        GatherResult(
+            ss47,
+            ss4763,
+            cmp_scaled=1000,
+            gather_querymh=ss47.minhash,
+            gather_result_rank=1,
+            total_weighted_hashes=1,
+            orig_query_len=len(ss47.minhash),
+            orig_query_abunds=orig_query_abunds,
+        )
     print(str(exc))
-    assert "Error: must provide original query abundances ('orig_query_abunds') to GatherResult" in str(exc)
+    assert (
+        "Error: must provide original query abundances ('orig_query_abunds') to GatherResult"
+        in str(exc)
+    )

     orig_query_abunds = {}
     with pytest.raises(ValueError) as exc:
-        GatherResult(ss47, ss4763, cmp_scaled=1000,
-                     gather_querymh=ss47.minhash,
-                     gather_result_rank=1,
-                     total_weighted_hashes = 1,
-                     orig_query_len=len(ss47.minhash),
-                     orig_query_abunds=orig_query_abunds)
+        GatherResult(
+            ss47,
+            ss4763,
+            cmp_scaled=1000,
+            gather_querymh=ss47.minhash,
+            gather_result_rank=1,
+            total_weighted_hashes=1,
+            orig_query_len=len(ss47.minhash),
+            orig_query_abunds=orig_query_abunds,
+        )
     print(str(exc))
-    assert "Error: must provide original query abundances ('orig_query_abunds') to GatherResult" in str(exc)
+    assert (
+        "Error: must provide original query abundances ('orig_query_abunds') to GatherResult"
+        in str(exc)
+    )
diff --git a/tests/test_signature.py b/tests/test_signature.py
index 95ea058dc4..b82a02364e 100644
--- a/tests/test_signature.py
+++ b/tests/test_signature.py
@@ -3,8 +3,13 @@
 import pytest

 import sourmash
-from sourmash.signature import SourmashSignature, save_signatures, \
-    load_signatures, load_one_signature, FrozenSourmashSignature
+from sourmash.signature import (
+    SourmashSignature,
+    save_signatures,
+    load_signatures,
+    load_one_signature,
+    FrozenSourmashSignature,
+)
 import sourmash_tst_utils as utils
 from sourmash.minhash import MinHash, FrozenMinHash
 from sourmash_tst_utils import SourmashCommandFailed
@@ -13,7 +18,7 @@ def test_minhash_copy(track_abundance):
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     e.add_kmer("AT" * 10)
-    sig = SourmashSignature(e, name='foo')
+    SourmashSignature(e, name="foo")
     f = e.copy()

     assert e == f
@@ -21,7 +26,7 @@ def test_sig_copy(track_abundance):
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     e.add_kmer("AT" * 10)
-    sig1 = SourmashSignature(e, name='foo')
+    sig1 = SourmashSignature(e, name="foo")
     sig2 = sig1.copy()

     assert sig1 == sig2
@@ -29,35 +34,35 @@ def test_sig_copy_frozen(track_abundance):
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     e.add_kmer("AT" * 10)
-    sig1 = SourmashSignature(e, name='foo')
+    sig1 = SourmashSignature(e, name="foo")
     sig2 = sig1.copy()

     assert sig1 == sig2
     with pytest.raises(TypeError) as e:
         sig2.minhash.add_hash(5)
-    assert 'FrozenMinHash does not support modification' in str(e.value)
+    assert "FrozenMinHash does not support modification" in str(e.value)


 def test_sig_copy_frozen_mutable(track_abundance):
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     e.add_kmer("AT" * 10)
-    sig1 = SourmashSignature(e, name='foo')
+    sig1 = SourmashSignature(e, name="foo")
     sig1.minhash = sig1.minhash.to_mutable()
     sig2 = sig1.copy()

     assert sig1 == sig2
     with pytest.raises(TypeError) as e:
         sig2.minhash.add_hash(5)
-    assert 'FrozenMinHash does not support modification' in str(e.value)
+    assert "FrozenMinHash does not support modification" in str(e.value)


 def test_compare(track_abundance):
     # same content, same name -> equal
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     e.add_kmer("AT" * 10)
-    sig1 = SourmashSignature(e, name='foo')
+    SourmashSignature(e, name="foo")

     f = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     f.add_kmer("AT" * 10)
-    sig2 = SourmashSignature(f, name='foo')
+    SourmashSignature(f, name="foo")

     assert e == f
@@ -66,11 +71,11 @@ def test_compare_ne(track_abundance):
     # same content, different names -> different
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     e.add_kmer("AT" * 10)
-    sig1 = SourmashSignature(e, name='foo')
+    sig1 = SourmashSignature(e, name="foo")

     f = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     f.add_kmer("AT" * 10)
-    sig2 = SourmashSignature(f, name='bar')
+    sig2 = SourmashSignature(f, name="bar")

     assert sig1 != sig2
@@ -79,11 +84,11 @@ def test_compare_ne2(track_abundance):
     # same content, different filename -> different
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     e.add_kmer("AT" * 10)
-    sig1 = SourmashSignature(e, name='foo', filename='a')
+    sig1 = SourmashSignature(e, name="foo", filename="a")

     f = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     f.add_kmer("AT" * 10)
-    sig2 = SourmashSignature(f, name='foo', filename='b')
+    sig2 = SourmashSignature(f, name="foo", filename="b")

     assert sig1 != sig2
     assert sig2 != sig1
@@ -93,11 +98,11 @@ def test_compare_ne2_reverse(track_abundance):
     # same content, one has filename, other does not -> different
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     e.add_kmer("AT" * 10)
-    sig1 = SourmashSignature(e, name='foo')
+    sig1 = SourmashSignature(e, name="foo")

     f = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     f.add_kmer("AT" * 10)
-    sig2 = SourmashSignature(f, filename='b')
+    sig2 = SourmashSignature(f, filename="b")

     assert sig2 != sig1
     assert sig1 != sig2
@@ -124,8 +129,8 @@ def test_str(track_abundance):
     print(sig)
     assert repr(sig) == "SourmashSignature('', 59502a74)"

-    sig._name = 'fizbar'
-    assert repr(sig) == 'SourmashSignature(\'fizbar\', 59502a74)'
+    sig._name = "fizbar"
+    assert repr(sig) == "SourmashSignature('fizbar', 59502a74)"


 def test_roundtrip(track_abundance):
@@ -135,7 +140,6 @@ def test_roundtrip(track_abundance):
     s = save_signatures([sig])
     siglist = list(load_signatures(s))
     sig2 = siglist[0]
-    e2 = sig2.minhash

     assert sig.similarity(sig2) == 1.0
     assert sig2.similarity(sig) == 1.0
@@ -164,9 +168,8 @@ def test_load_signature_ksize_nonint(track_abundance):
     e.add_kmer("AT" * 10)
     sig = SourmashSignature(e)
     s = save_signatures([sig])
-    siglist = list(load_signatures(s, ksize='20'))
+    siglist = list(load_signatures(s, ksize="20"))
     sig2 = siglist[0]
-    e2 = sig2.minhash

     assert sig.similarity(sig2) == 1.0
     assert sig2.similarity(sig) == 1.0
@@ -180,15 +183,13 @@ def test_roundtrip_empty(track_abundance):
     s = save_signatures([sig])
     siglist = list(load_signatures(s))
     sig2 = siglist[0]
-    e2 = sig2.minhash

     assert sig.similarity(sig2) == 0
     assert sig2.similarity(sig) == 0


 def test_roundtrip_scaled(track_abundance):
-    e = MinHash(n=0, ksize=20, track_abundance=track_abundance,
-                max_hash=10)
+    e = MinHash(n=0, ksize=20, track_abundance=track_abundance, max_hash=10)
     e.add_hash(5)
     sig = SourmashSignature(e)
     s = save_signatures([sig])
@@ -203,8 +204,7 @@ def test_roundtrip_scaled(track_abundance):

 def test_roundtrip_seed(track_abundance):
-    e = MinHash(n=1, ksize=20, track_abundance=track_abundance,
-                seed=10)
+    e = MinHash(n=1, ksize=20, track_abundance=track_abundance, seed=10)
     e.add_hash(5)
     sig = SourmashSignature(e)
     s = save_signatures([sig])
@@ -219,26 +219,24 @@ def test_roundtrip_seed(track_abundance):

 def test_similarity_downsample(track_abundance):
-    e = MinHash(n=0, ksize=20, track_abundance=track_abundance,
-                max_hash=2**63)
-    f = MinHash(n=0, ksize=20, track_abundance=track_abundance,
-                max_hash=2**2)
+    e = MinHash(n=0, ksize=20, track_abundance=track_abundance, max_hash=2**63)
+    f = MinHash(n=0, ksize=20, track_abundance=track_abundance, max_hash=2**2)

     e.add_hash(1)
     e.add_hash(5)
     assert len(e.hashes) == 2

     f.add_hash(1)
-    f.add_hash(5) # should be discarded due to max_hash
+    f.add_hash(5)  # should be discarded due to max_hash
     assert len(f.hashes) == 1

     ee = SourmashSignature(e)
     ff = SourmashSignature(f)

-    with pytest.raises(ValueError) as e: # mismatch in max_hash
+    with pytest.raises(ValueError) as e:  # mismatch in max_hash
         ee.similarity(ff)

-    assert 'mismatch in scaled; comparison fail' in str(e.value)
+    assert "mismatch in scaled; comparison fail" in str(e.value)

     x = ee.similarity(ff, downsample=True)
     assert round(x, 1) == 1.0
@@ -252,33 +250,32 @@ def test_add_sequence_bad_dna(track_abundance):
     with pytest.raises(ValueError) as e:
         sig.add_sequence("N" * 21, force=False)

-    assert 'invalid DNA character in input k-mer: NNNNNNNNNNNNNNNNNNNNN' in str(e.value)
+    assert "invalid DNA character in input k-mer: NNNNNNNNNNNNNNNNNNNNN" in str(e.value)


 def test_md5(track_abundance):
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     e.add_hash(5)
     sig = SourmashSignature(e)
-    assert sig.md5sum() == 'eae27d77ca20db309e056e3d2dcd7d69', sig.md5sum()
+    assert sig.md5sum() == "eae27d77ca20db309e056e3d2dcd7d69", sig.md5sum()


 def test_str_1(track_abundance):
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
-    sig = SourmashSignature(e, name='foo')
-    assert str(sig) == 'foo'
+    sig = SourmashSignature(e, name="foo")
+    assert str(sig) == "foo"


 def test_str_2(track_abundance):
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
-    sig = SourmashSignature(e, filename='foo.txt')
-    assert str(sig) == 'foo.txt'
+    sig = SourmashSignature(e, filename="foo.txt")
+    assert str(sig) == "foo.txt"


 def test_str_3(track_abundance):
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
-    sig = SourmashSignature(e, name='foo',
-                            filename='foo.txt')
-    assert str(sig) == 'foo'
+    sig = SourmashSignature(e, name="foo", filename="foo.txt")
+    assert str(sig) == "foo"


 def test_name_4(track_abundance):
@@ -300,7 +297,7 @@ def test_save_load_multisig(track_abundance):
     print(x)

     assert len(y) == 2
-    assert sig1 in y # order not guaranteed, note.
+    assert sig1 in y  # order not guaranteed, note.
     assert sig2 in y

     assert sig1 != sig2
@@ -309,7 +306,7 @@ def test_load_one_fail_nosig(track_abundance):
     x = save_signatures([])
     print((x,))
     with pytest.raises(ValueError):
-        y = load_one_signature(x)
+        load_one_signature(x)


 def test_load_one_succeed(track_abundance):
@@ -332,7 +329,7 @@ def test_load_one_fail_multisig(track_abundance):
     x = save_signatures([sig1, sig2])

     with pytest.raises(ValueError):
-        y = load_one_signature(x)
+        load_one_signature(x)


 def test_save_minified(track_abundance):
@@ -343,24 +340,24 @@ def test_save_minified(track_abundance):
     sig2 = SourmashSignature(e2, name="bar baz")

     x = save_signatures([sig1, sig2])
-    assert b'\n' not in x
-    assert len(x.split(b'\n')) == 1
+    assert b"\n" not in x
+    assert len(x.split(b"\n")) == 1

     y = list(load_signatures(x))
     assert len(y) == 2
-    assert any(sig.name == 'foo' for sig in y)
-    assert any(sig.name == 'bar baz' for sig in y)
+    assert any(sig.name == "foo" for sig in y)
+    assert any(sig.name == "bar baz" for sig in y)


 def test_load_minified(track_abundance):
-    sigfile = utils.get_test_data('genome-s10+s11.sig')
+    sigfile = utils.get_test_data("genome-s10+s11.sig")
     sigs = load_signatures(sigfile)

     minified = save_signatures(sigs)
-    with open(sigfile, 'r') as f:
+    with open(sigfile) as f:
         orig_file = f.read()
     assert len(minified) < len(orig_file)
-    assert b'\n' not in minified
+    assert b"\n" not in minified


 def test_load_compressed(track_abundance):
@@ -372,8 +369,8 @@ def test_load_compressed(track_abundance):
     y = load_one_signature(x)
     assert sig1 == y

-    sigfile = utils.get_test_data('genome-s10+s11.sig.gz')
-    sigs = load_signatures(sigfile)
+    sigfile = utils.get_test_data("genome-s10+s11.sig.gz")
+    load_signatures(sigfile)


 def test_binary_fp(tmpdir, track_abundance):
@@ -381,9 +378,9 @@ def test_binary_fp(tmpdir, track_abundance):
     e.add_kmer("AT" * 10)

     path = tmpdir.join("1.sig")
-    with open(str(path), 'wb') as fp:
+    with open(str(path), "wb") as fp:
         sig = SourmashSignature(e)
-        s = save_signatures([sig], fp)
+        save_signatures([sig], fp)


 def test_load_signatures_no_file_do_raise(tmpdir):
@@ -409,10 +406,10 @@ def test_max_containment():
     ss1 = SourmashSignature(mh1)
     ss2 = SourmashSignature(mh2)

-    assert ss1.contained_by(ss2) == 1/4
-    assert ss2.contained_by(ss1) == 1/2
-    assert ss1.max_containment(ss2) == 1/2
-    assert ss2.max_containment(ss1) == 1/2
+    assert ss1.contained_by(ss2) == 1 / 4
+    assert ss2.contained_by(ss1) == 1 / 2
+    assert ss1.max_containment(ss2) == 1 / 2
+    assert ss2.max_containment(ss1) == 1 / 2


 def test_max_containment_empty():
@@ -447,32 +444,44 @@ def test_max_containment_equal():

 def test_containment_ANI():
-    f1 = utils.get_test_data('2.fa.sig')
-    f2 = utils.get_test_data('2+63.fa.sig')
+    f1 = utils.get_test_data("2.fa.sig")
+    f2 = utils.get_test_data("2+63.fa.sig")
     ss1 = sourmash.load_one_signature(f1, ksize=31)
     ss2 = sourmash.load_one_signature(f2, ksize=31)

-    s1_cont_s2 = ss1.containment_ani(ss2, estimate_ci =True)
-    s2_cont_s1 = ss2.containment_ani(ss1, estimate_ci =True)
+    s1_cont_s2 = ss1.containment_ani(ss2, estimate_ci=True)
+    s2_cont_s1 = ss2.containment_ani(ss1, estimate_ci=True)
     print("\nss1 contained by ss2", s1_cont_s2)
     print("ss2 contained by ss1", s2_cont_s1)

-    assert (round(s1_cont_s2.ani,3), s1_cont_s2.ani_low, s1_cont_s2.ani_high) == (1.0,1.0,1.0)
-    assert (round(s2_cont_s1.ani,3), round(s2_cont_s1.ani_low,3), round(s2_cont_s1.ani_high,3)) == (0.966, 0.965, 0.967)
-
-    s1_mc_s2 = ss1.max_containment_ani(ss2, estimate_ci =True)
-    s2_mc_s1 = ss2.max_containment_ani(ss1, estimate_ci =True)
+    assert (round(s1_cont_s2.ani, 3), s1_cont_s2.ani_low, s1_cont_s2.ani_high) == (
+        1.0,
+        1.0,
+        1.0,
+    )
+    assert (
+        round(s2_cont_s1.ani, 3),
+        round(s2_cont_s1.ani_low, 3),
+        round(s2_cont_s1.ani_high, 3),
+    ) == (0.966, 0.965, 0.967)
+
+    s1_mc_s2 = ss1.max_containment_ani(ss2, estimate_ci=True)
+    s2_mc_s1 = ss2.max_containment_ani(ss1, estimate_ci=True)
     print("mh1 max containment", s1_mc_s2)
     print("mh2 max containment", s2_mc_s1)

     s1_mc_s2.size_is_inaccurate = False
     s2_mc_s1.size_is_inaccurate = False

     assert s1_mc_s2 == s2_mc_s1
-    assert (round(s1_mc_s2.ani, 3), round(s1_mc_s2.ani_low, 3), round(s1_mc_s2.ani_high, 3)) == (1.0,1.0,1.0)
+    assert (
+        round(s1_mc_s2.ani, 3),
+        round(s1_mc_s2.ani_low, 3),
+        round(s1_mc_s2.ani_high, 3),
+    ) == (1.0, 1.0, 1.0)


 def test_containment_ANI_precalc_containment():
-    f1 = utils.get_test_data('47+63.fa.sig')
-    f2 = utils.get_test_data('2+63.fa.sig')
+    f1 = utils.get_test_data("47+63.fa.sig")
+    f2 = utils.get_test_data("2+63.fa.sig")
     ss1 = sourmash.load_one_signature(f1, ksize=31)
     ss2 = sourmash.load_one_signature(f2, ksize=31)
     # precalc containments and assert same results
@@ -480,38 +489,53 @@ def test_containment_ANI_precalc_containment():
     s2c = ss2.contained_by(ss1)
     mc = max(s1c, s2c)

-    assert ss1.containment_ani(ss2, estimate_ci=True) == ss1.containment_ani(ss2, containment=s1c, estimate_ci=True)
-    assert ss2.containment_ani(ss1) == ss2.containment_ani(ss1, containment=s2c)
-    assert ss1.max_containment_ani(ss2) == ss2.max_containment_ani(ss1)
-    assert ss1.max_containment_ani(ss2) == ss1.max_containment_ani(ss2, max_containment=mc)
-    assert ss1.max_containment_ani(ss2) == ss2.max_containment_ani(ss1, max_containment=mc)
+    assert ss1.containment_ani(ss2, estimate_ci=True) == ss1.containment_ani(
+        ss2, containment=s1c, estimate_ci=True
+    )
+    assert ss2.containment_ani(ss1) == ss2.containment_ani(ss1, containment=s2c)
+    assert ss1.max_containment_ani(ss2) == ss2.max_containment_ani(ss1)
+    assert ss1.max_containment_ani(ss2) == ss1.max_containment_ani(
+        ss2, max_containment=mc
+    )
+    assert ss1.max_containment_ani(ss2) == ss2.max_containment_ani(
+        ss1, max_containment=mc
+    )


 def test_avg_containment():
-    f1 = utils.get_test_data('47+63.fa.sig')
-    f2 = utils.get_test_data('2+63.fa.sig')
+    f1 = utils.get_test_data("47+63.fa.sig")
+    f2 = utils.get_test_data("2+63.fa.sig")
     ss1 = sourmash.load_one_signature(f1, ksize=31)
     ss2 = sourmash.load_one_signature(f2, ksize=31)

     # check average_containment_ani
     ac_s1 = ss1.avg_containment(ss2)
     ac_s2 = ss2.avg_containment(ss1)
-    assert ac_s1 == ac_s2 == (ss1.contained_by(ss2) + ss2.contained_by(ss1))/2 == 0.6619979467456603
+    assert (
+        ac_s1
+        == ac_s2
+        == (ss1.contained_by(ss2) + ss2.contained_by(ss1)) / 2
+        == 0.6619979467456603
+    )


 def test_avg_containment_ani():
-    f1 = utils.get_test_data('47+63.fa.sig')
-    f2 = utils.get_test_data('2+63.fa.sig')
+    f1 = utils.get_test_data("47+63.fa.sig")
+    f2 = utils.get_test_data("2+63.fa.sig")
     ss1 = sourmash.load_one_signature(f1, ksize=31)
     ss2 = sourmash.load_one_signature(f2, ksize=31)

     # check average_containment_ani
     ac_s1 = ss1.avg_containment_ani(ss2)
     ac_s2 = ss2.avg_containment_ani(ss1)
-    assert ac_s1 == ac_s2 == (ss1.containment_ani(ss2).ani + ss2.containment_ani(ss1).ani)/2
+    assert (
+        ac_s1
+        == ac_s2
+        == (ss1.containment_ani(ss2).ani + ss2.containment_ani(ss1).ani) / 2
+    )


 def test_containment_ANI_downsample():
-    f2 = utils.get_test_data('2+63.fa.sig')
-    f3 = utils.get_test_data('47+63.fa.sig')
+    f2 = utils.get_test_data("2+63.fa.sig")
+    f3 = utils.get_test_data("47+63.fa.sig")
     ss2 = sourmash.load_one_signature(f2, ksize=31)
     ss3 = sourmash.load_one_signature(f3, ksize=31)
     # check that downsampling works properly
@@ -522,8 +546,8 @@ def test_containment_ANI_downsample():
     assert ss2.minhash.scaled != ss3.minhash.scaled
     ds_s3c = ss2.containment_ani(ss3, downsample=True)
     ds_s4c = ss3.containment_ani(ss2, downsample=True)
-    mc_w_ds_1 =  ss2.max_containment_ani(ss3, downsample=True)
-    mc_w_ds_2 =  ss3.max_containment_ani(ss2, downsample=True)
+    mc_w_ds_1 = ss2.max_containment_ani(ss3, downsample=True)
+    mc_w_ds_2 = ss3.max_containment_ani(ss2, downsample=True)

     with pytest.raises(ValueError) as e:
         ss2.containment_ani(ss3)
@@ -538,15 +562,15 @@ def test_containment_ANI_downsample():
     assert ss2.minhash.scaled == ss3.minhash.scaled
     ds_s3c_manual = ss2.containment_ani(ss3)
     ds_s4c_manual = ss3.containment_ani(ss2)
-    ds_mc_manual =  ss2.max_containment_ani(ss3)
+    ds_mc_manual = ss2.max_containment_ani(ss3)
     assert ds_s3c == ds_s3c_manual
     assert ds_s4c == ds_s4c_manual
     assert mc_w_ds_1 == mc_w_ds_2 == ds_mc_manual


 def test_jaccard_ANI():
-    f1 = utils.get_test_data('2.fa.sig')
-    f2 = utils.get_test_data('2+63.fa.sig')
+    f1 = utils.get_test_data("2.fa.sig")
+    f2 = utils.get_test_data("2+63.fa.sig")
     ss1 = sourmash.load_one_signature(f1, ksize=31)
     ss2 = sourmash.load_one_signature(f2)
@@ -556,12 +580,16 @@ def test_jaccard_ANI():
     s2_jani_s1 = ss2.jaccard_ani(ss1)

     assert s1_jani_s2 == s2_jani_s1
-    assert (s1_jani_s2.ani, s1_jani_s2.p_nothing_in_common, s1_jani_s2.jaccard_error) == (0.9783711630110239, 0.0, 3.891666770716877e-07)
+    assert (
+        s1_jani_s2.ani,
+        s1_jani_s2.p_nothing_in_common,
+        s1_jani_s2.jaccard_error,
+    ) == (0.9783711630110239, 0.0, 3.891666770716877e-07)


 def test_jaccard_ANI_untrustworthy():
-    f1 = utils.get_test_data('2.fa.sig')
-    f2 = utils.get_test_data('2+63.fa.sig')
+    f1 = utils.get_test_data("2.fa.sig")
+    f2 = utils.get_test_data("2+63.fa.sig")
     ss1 = sourmash.load_one_signature(f1, ksize=31)
     ss2 = sourmash.load_one_signature(f2)
@@ -572,28 +600,32 @@ def test_jaccard_ANI_untrustworthy():

     # since size is inaccurate on 2.fa.sig, need to override to be able to get ani
     s1_jani_s2.size_is_inaccurate = False

-    assert s1_jani_s2.ani == None
-    assert s1_jani_s2.je_exceeds_threshold==True
+    assert s1_jani_s2.ani is None
+    assert s1_jani_s2.je_exceeds_threshold == True
     assert s1_jani_s2.je_threshold == 1e-7


 def test_jaccard_ANI_precalc_jaccard():
-    f1 = utils.get_test_data('47+63.fa.sig')
-    f2 = utils.get_test_data('2+63.fa.sig')
+    f1 = utils.get_test_data("47+63.fa.sig")
+    f2 = utils.get_test_data("2+63.fa.sig")
     ss1 = sourmash.load_one_signature(f1, ksize=31)
     ss2 = sourmash.load_one_signature(f2)
     # precalc jaccard and assert same result
     jaccard = ss1.jaccard(ss2)
-    print("\nJACCARD_ANI", ss1.jaccard_ani(ss2,jaccard=jaccard))
+    print("\nJACCARD_ANI", ss1.jaccard_ani(ss2, jaccard=jaccard))

-    assert ss1.jaccard_ani(ss2) == ss1.jaccard_ani(ss2, jaccard=jaccard) == ss2.jaccard_ani(ss1, jaccard=jaccard)
+    assert (
+        ss1.jaccard_ani(ss2)
+        == ss1.jaccard_ani(ss2, jaccard=jaccard)
+        == ss2.jaccard_ani(ss1, jaccard=jaccard)
+    )

     wrong_jaccard = jaccard - 0.1
     assert ss1.jaccard_ani(ss2) != ss1.jaccard_ani(ss2, jaccard=wrong_jaccard)


 def test_jaccard_ANI_downsample():
-    f1 = utils.get_test_data('47+63.fa.sig')
-    f2 = utils.get_test_data('2+63.fa.sig')
+    f1 = utils.get_test_data("47+63.fa.sig")
+    f2 = utils.get_test_data("2+63.fa.sig")
     ss1 = sourmash.load_one_signature(f1, ksize=31)
     ss2 = sourmash.load_one_signature(f2)
@@ -619,10 +651,10 @@ def test_frozen_signature_update_1(track_abundance):
     # setting .name should fail on a FrozenSourmashSignature
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     e.add_kmer("AT" * 10)
-    ss = SourmashSignature(e, name='foo').to_frozen()
+    ss = SourmashSignature(e, name="foo").to_frozen()

     with pytest.raises(ValueError):
-        ss.name = 'foo2'
+        ss.name = "foo2"


 def test_frozen_signature_update_2(track_abundance):
@@ -630,7 +662,7 @@ def test_frozen_signature_update_2(track_abundance):
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     e.add_kmer("AT" * 10)
     e2 = e.copy_and_clear()
-    ss = SourmashSignature(e, name='foo').to_frozen()
+    ss = SourmashSignature(e, name="foo").to_frozen()

     with pytest.raises(ValueError):
         ss.minhash = e2
@@ -640,9 +672,9 @@ def test_frozen_signature_update_3(track_abundance):
     # setting .minhash should succeed with update() context manager
     e = MinHash(n=1, ksize=20, track_abundance=track_abundance)
     e.add_kmer("AT" * 10)
-    ss = SourmashSignature(e, name='foo').to_frozen()
+    ss = SourmashSignature(e, name="foo").to_frozen()

     with ss.update() as ss2:
-        ss2.name = 'foo2'
+        ss2.name = "foo2"

-    assert ss2.name == 'foo2'
+    assert ss2.name == "foo2"
diff --git a/tests/test_sketchcomparison.py b/tests/test_sketchcomparison.py
index 30282895fc..5b7e78537d 100644
--- a/tests/test_sketchcomparison.py
+++ b/tests/test_sketchcomparison.py
@@ -11,14 +11,15 @@
 import sourmash_tst_utils as utils


+# can we parameterize scaled too (so don't need separate downsample tests?)
 def test_FracMinHashComparison(track_abundance):
     # build FracMinHash Comparison and check values
     a = MinHash(0, 21, scaled=1, track_abundance=track_abundance)
     b = MinHash(0, 21, scaled=1, track_abundance=track_abundance)

-    a_values = { 1:5, 3:3, 5:2, 8:2}
-    b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }
+    a_values = {1: 5, 3: 3, 5: 2, 8: 2}
+    b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

     if track_abundance:
         a.set_abundances(a_values)
@@ -43,23 +44,45 @@ def test_FracMinHashComparison(track_abundance):
     intersect_mh = a.flatten().intersection(b.flatten())
     assert cmp.intersect_mh == intersect_mh == b.flatten().intersection(a.flatten())
     assert cmp.total_unique_intersect_hashes == 4
-    assert cmp.pass_threshold # default threshold is 0; this should pass
+    assert cmp.pass_threshold  # default threshold is 0; this should pass

     if track_abundance:
-        assert cmp.angular_similarity == a.angular_similarity(b) == b.angular_similarity(a)
-        assert cmp.cosine_similarity == a.angular_similarity(b) == b.angular_similarity(a)
-        assert cmp.weighted_intersection(from_mh=cmp.mh1).hashes == intersect_mh.inflate(a).hashes
-        assert cmp.weighted_intersection(from_mh=cmp.mh2).hashes == intersect_mh.inflate(b).hashes
-        assert cmp.weighted_intersection(from_abundD=a_values).hashes == intersect_mh.inflate(a).hashes
-        assert cmp.weighted_intersection(from_abundD=b_values).hashes == intersect_mh.inflate(b).hashes
+        assert (
+            cmp.angular_similarity == a.angular_similarity(b) == b.angular_similarity(a)
+        )
+        assert (
+            cmp.cosine_similarity == a.angular_similarity(b) == b.angular_similarity(a)
+        )
+        assert (
+            cmp.weighted_intersection(from_mh=cmp.mh1).hashes
+            == intersect_mh.inflate(a).hashes
+        )
+        assert (
+            cmp.weighted_intersection(from_mh=cmp.mh2).hashes
+            == intersect_mh.inflate(b).hashes
+        )
+        assert (
+            cmp.weighted_intersection(from_abundD=a_values).hashes
+            == intersect_mh.inflate(a).hashes
+        )
+        assert (
+            cmp.weighted_intersection(from_abundD=b_values).hashes
+            == intersect_mh.inflate(b).hashes
+        )
     else:
         with pytest.raises(TypeError) as exc:
             cmp.angular_similarity
         print(str(exc))
-        assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc)
+        assert (
+            "Error: Angular (cosine) similarity requires both sketches to track hash abundance."
+            in str(exc)
+        )
         with pytest.raises(TypeError) as exc:
             cmp.cosine_similarity
         print(str(exc))
-        assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc)
+        assert (
+            "Error: Angular (cosine) similarity requires both sketches to track hash abundance."
+            in str(exc)
+        )
         assert cmp.weighted_intersection(from_mh=cmp.mh1).hashes == intersect_mh.hashes
         assert cmp.weighted_intersection(from_mh=cmp.mh2).hashes == intersect_mh.hashes
@@ -69,8 +92,8 @@ def test_FracMinHashComparison_downsample(track_abundance):
     a = MinHash(0, 21, scaled=1, track_abundance=track_abundance)
     b = MinHash(0, 21, scaled=1, track_abundance=track_abundance)

-    a_values = { 1:5, 3:3, 5:2, 8:2}
-    b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }
+    a_values = {1: 5, 3: 3, 5: 2, 8: 2}
+    b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}

     if track_abundance:
         a.set_abundances(a_values)
@@ -84,7 +107,7 @@ def test_FracMinHashComparison_downsample(track_abundance):
     ds_b = b.downsample(scaled=cmp_scaled)

     # build FracMinHashComparison
-    cmp = FracMinHashComparison(a, b, cmp_scaled = cmp_scaled)
+    cmp = FracMinHashComparison(a, b, cmp_scaled=cmp_scaled)
     assert cmp.mh1 == a
     assert cmp.mh2 == b
     assert cmp.mh1_cmp == ds_a
@@ -99,27 +122,59 @@ def test_FracMinHashComparison_downsample(track_abundance):
     assert cmp.max_containment == ds_a.max_containment(ds_b)
     assert cmp.jaccard == ds_a.jaccard(ds_b) == ds_b.jaccard(ds_a)
     intersect_mh = ds_a.flatten().intersection(ds_b.flatten())
-    assert cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
+    assert (
+        cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
+    )
     assert cmp.total_unique_intersect_hashes == 8
-    assert cmp.pass_threshold # default threshold is 0; this should pass
+    assert cmp.pass_threshold  # default threshold is 0; this should pass

     if track_abundance:
-        assert cmp.angular_similarity == ds_a.angular_similarity(ds_b) == ds_b.angular_similarity(ds_a)
-        assert cmp.cosine_similarity == ds_a.angular_similarity(ds_b) == ds_b.angular_similarity(ds_a)
-        assert cmp.weighted_intersection(from_mh=cmp.mh1_cmp).hashes == intersect_mh.inflate(ds_a).hashes
-        assert cmp.weighted_intersection(from_mh=cmp.mh2_cmp).hashes == intersect_mh.inflate(ds_b).hashes
-        assert cmp.weighted_intersection(from_abundD=cmp.mh1_cmp.hashes).hashes == intersect_mh.inflate(ds_a).hashes
-        assert cmp.weighted_intersection(from_abundD=cmp.mh2_cmp.hashes).hashes == intersect_mh.inflate(ds_b).hashes
+        assert (
+            cmp.angular_similarity
+            == ds_a.angular_similarity(ds_b)
+            == ds_b.angular_similarity(ds_a)
+        )
+        assert (
+            cmp.cosine_similarity
+            == ds_a.angular_similarity(ds_b)
+            == ds_b.angular_similarity(ds_a)
+        )
+        assert (
+            cmp.weighted_intersection(from_mh=cmp.mh1_cmp).hashes
+            == intersect_mh.inflate(ds_a).hashes
+        )
+        assert (
+            cmp.weighted_intersection(from_mh=cmp.mh2_cmp).hashes
+            == intersect_mh.inflate(ds_b).hashes
+        )
+        assert (
+            cmp.weighted_intersection(from_abundD=cmp.mh1_cmp.hashes).hashes
+            == intersect_mh.inflate(ds_a).hashes
+        )
+        assert (
+            cmp.weighted_intersection(from_abundD=cmp.mh2_cmp.hashes).hashes
+            == intersect_mh.inflate(ds_b).hashes
+        )
     else:
         with pytest.raises(TypeError) as exc:
cmp.angular_similarity print(str(exc)) - assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc) + assert ( + "Error: Angular (cosine) similarity requires both sketches to track hash abundance." + in str(exc) + ) with pytest.raises(TypeError) as exc: cmp.cosine_similarity print(str(exc)) - assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc) - assert cmp.weighted_intersection(from_mh=cmp.mh1_cmp).hashes == intersect_mh.hashes - assert cmp.weighted_intersection(from_mh=cmp.mh2_cmp).hashes == intersect_mh.hashes + assert ( + "Error: Angular (cosine) similarity requires both sketches to track hash abundance." + in str(exc) + ) + assert ( + cmp.weighted_intersection(from_mh=cmp.mh1_cmp).hashes == intersect_mh.hashes + ) + assert ( + cmp.weighted_intersection(from_mh=cmp.mh2_cmp).hashes == intersect_mh.hashes + ) def test_FracMinHashComparison_autodownsample(track_abundance): @@ -127,8 +182,8 @@ def test_FracMinHashComparison_autodownsample(track_abundance): a = MinHash(0, 21, scaled=1, track_abundance=track_abundance) b = MinHash(0, 21, scaled=2, track_abundance=track_abundance) - a_values = { 1:5, 3:3, 5:2, 8:2} - b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } + a_values = {1: 5, 3: 3, 5: 2, 8: 2} + b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} if track_abundance: a.set_abundances(a_values) @@ -157,27 +212,59 @@ def test_FracMinHashComparison_autodownsample(track_abundance): assert cmp.max_containment == ds_a.max_containment(ds_b) assert cmp.jaccard == ds_a.jaccard(ds_b) == ds_b.jaccard(ds_a) intersect_mh = ds_a.flatten().intersection(ds_b.flatten()) - assert cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten()) + assert ( + cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten()) + ) assert cmp.total_unique_intersect_hashes == 8 - assert cmp.pass_threshold # default threshold is 0; this should pass + assert cmp.pass_threshold # default threshold is 0; this should pass if track_abundance: - assert cmp.angular_similarity == ds_a.angular_similarity(ds_b) == ds_b.angular_similarity(ds_a) - assert cmp.cosine_similarity == ds_a.angular_similarity(ds_b) == ds_b.angular_similarity(ds_a) - assert cmp.weighted_intersection(from_mh=cmp.mh1_cmp).hashes == intersect_mh.inflate(ds_a).hashes - assert cmp.weighted_intersection(from_mh=cmp.mh2_cmp).hashes == intersect_mh.inflate(ds_b).hashes - assert cmp.weighted_intersection(from_abundD=a_values).hashes == intersect_mh.inflate(a).hashes - assert cmp.weighted_intersection(from_abundD=b_values).hashes == intersect_mh.inflate(b).hashes + assert ( + cmp.angular_similarity + == ds_a.angular_similarity(ds_b) + == ds_b.angular_similarity(ds_a) + ) + assert ( + cmp.cosine_similarity + == ds_a.angular_similarity(ds_b) + == ds_b.angular_similarity(ds_a) + ) + assert ( + cmp.weighted_intersection(from_mh=cmp.mh1_cmp).hashes + == intersect_mh.inflate(ds_a).hashes + ) + assert ( + cmp.weighted_intersection(from_mh=cmp.mh2_cmp).hashes + == intersect_mh.inflate(ds_b).hashes + ) + assert ( + cmp.weighted_intersection(from_abundD=a_values).hashes + == intersect_mh.inflate(a).hashes + ) + assert ( + cmp.weighted_intersection(from_abundD=b_values).hashes + == intersect_mh.inflate(b).hashes + ) else: with pytest.raises(TypeError) as exc: cmp.angular_similarity print(str(exc)) - assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." 
in str(exc) + assert ( + "Error: Angular (cosine) similarity requires both sketches to track hash abundance." + in str(exc) + ) with pytest.raises(TypeError) as exc: cmp.cosine_similarity print(str(exc)) - assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc) - assert cmp.weighted_intersection(from_mh=cmp.mh1_cmp).hashes == intersect_mh.hashes - assert cmp.weighted_intersection(from_mh=cmp.mh2_cmp).hashes == intersect_mh.hashes + assert ( + "Error: Angular (cosine) similarity requires both sketches to track hash abundance." + in str(exc) + ) + assert ( + cmp.weighted_intersection(from_mh=cmp.mh1_cmp).hashes == intersect_mh.hashes + ) + assert ( + cmp.weighted_intersection(from_mh=cmp.mh2_cmp).hashes == intersect_mh.hashes + ) def test_FracMinHashComparison_ignore_abundance(track_abundance): @@ -185,9 +272,8 @@ def test_FracMinHashComparison_ignore_abundance(track_abundance): a = MinHash(0, 21, scaled=1, track_abundance=track_abundance) b = MinHash(0, 21, scaled=1, track_abundance=track_abundance) - a_values = { 1:5, 3:3, 5:2, 8:2} - b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } - intersection_w_abund = {1:8, 3:5, 5:3, 8:3} + a_values = {1: 5, 3: 3, 5: 2, 8: 2} + b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} if track_abundance: a.set_abundances(a_values) @@ -201,7 +287,7 @@ def test_FracMinHashComparison_ignore_abundance(track_abundance): ds_b = b.flatten().downsample(scaled=cmp_scaled) # build FracMinHashComparison - cmp = FracMinHashComparison(a, b, cmp_scaled = cmp_scaled, ignore_abundance=True) + cmp = FracMinHashComparison(a, b, cmp_scaled=cmp_scaled, ignore_abundance=True) assert cmp.mh1 == a assert cmp.mh2 == b assert cmp.mh1_cmp == ds_a @@ -216,18 +302,26 @@ def test_FracMinHashComparison_ignore_abundance(track_abundance): assert cmp.max_containment == ds_a.max_containment(ds_b) assert cmp.jaccard == ds_a.jaccard(ds_b) == ds_b.jaccard(ds_a) intersect_mh = ds_a.flatten().intersection(ds_b.flatten()) - assert cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten()) + assert ( + cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten()) + ) assert cmp.total_unique_intersect_hashes == 8 - assert cmp.pass_threshold # default threshold is 0; this should pass + assert cmp.pass_threshold # default threshold is 0; this should pass # with ignore_abundance = True, all of these should not be usable. Do we want errors, or ""/None? with pytest.raises(TypeError) as exc: cmp.angular_similarity print(str(exc)) - assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc) + assert ( + "Error: Angular (cosine) similarity requires both sketches to track hash abundance." + in str(exc) + ) with pytest.raises(TypeError) as exc: cmp.cosine_similarity print(str(exc)) - assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc) + assert ( + "Error: Angular (cosine) similarity requires both sketches to track hash abundance." 
+ in str(exc) + ) assert not cmp.mh1_cmp.track_abundance assert not cmp.mh2_cmp.track_abundance assert cmp.weighted_intersection(from_mh=cmp.mh1_cmp).hashes == intersect_mh.hashes @@ -239,8 +333,8 @@ def test_FracMinHashComparison_fail_threshold(track_abundance): a = MinHash(0, 21, scaled=1, track_abundance=track_abundance) b = MinHash(0, 21, scaled=1, track_abundance=track_abundance) - a_values = { 1:5, 3:3, 5:2, 8:2} - b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } + a_values = {1: 5, 3: 3, 5: 2, 8: 2} + b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} if track_abundance: a.set_abundances(a_values) @@ -254,7 +348,7 @@ def test_FracMinHashComparison_fail_threshold(track_abundance): ds_b = b.flatten().downsample(scaled=cmp_scaled) # build FracMinHashComparison - cmp = FracMinHashComparison(a, b, cmp_scaled = cmp_scaled, threshold_bp=40) + cmp = FracMinHashComparison(a, b, cmp_scaled=cmp_scaled, threshold_bp=40) assert cmp.mh1 == a assert cmp.mh2 == b assert cmp.ignore_abundance == False @@ -267,15 +361,19 @@ def test_FracMinHashComparison_fail_threshold(track_abundance): assert cmp.max_containment == ds_a.max_containment(ds_b) assert cmp.jaccard == a.jaccard(b) == b.jaccard(a) intersect_mh = ds_a.flatten().intersection(ds_b.flatten()) - assert cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten()) + assert ( + cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten()) + ) assert cmp.total_unique_intersect_hashes == 8 - assert not cmp.pass_threshold # threshold is 40; this should fail + assert not cmp.pass_threshold # threshold is 40; this should fail def test_FracMinHashComparison_potential_false_negative(): - f1 = utils.get_test_data('scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz') - f2 = utils.get_test_data('scaled100/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz') - f3 = utils.get_test_data('scaled100/GCF_000783305.1_ASM78330v1_genomic.fna.gz.sig.gz') + f1 = utils.get_test_data("scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz") + f2 = utils.get_test_data("scaled100/GCF_000006945.1_ASM694v1_genomic.fna.gz.sig.gz") + f3 = utils.get_test_data( + "scaled100/GCF_000783305.1_ASM78330v1_genomic.fna.gz.sig.gz" + ) a = load_one_signature(f1, ksize=21).minhash b = load_one_signature(f2).minhash c = load_one_signature(f3).minhash @@ -289,9 +387,17 @@ def test_FracMinHashComparison_potential_false_negative(): cmp.estimate_jaccard_ani() assert cmp.jaccard_ani == a.jaccard_ani(b).ani == b.jaccard_ani(a).ani print(cmp.jaccard_ani) - assert cmp.potential_false_negative == a.jaccard_ani(b).p_exceeds_threshold == b.jaccard_ani(a).p_exceeds_threshold + assert ( + cmp.potential_false_negative + == a.jaccard_ani(b).p_exceeds_threshold + == b.jaccard_ani(a).p_exceeds_threshold + ) assert cmp.potential_false_negative == False - assert cmp.jaccard_ani_untrustworthy == a.jaccard_ani(b).je_exceeds_threshold == b.jaccard_ani(a).je_exceeds_threshold + assert ( + cmp.jaccard_ani_untrustworthy + == a.jaccard_ani(b).je_exceeds_threshold + == b.jaccard_ani(a).je_exceeds_threshold + ) cmp.estimate_ani_from_mh1_containment_in_mh2() a_cont_ani_manual = a.containment_ani(b) @@ -308,12 +414,18 @@ def test_FracMinHashComparison_potential_false_negative(): cmp.estimate_max_containment_ani() mc_ani_manual = a.max_containment_ani(b) - assert cmp.max_containment_ani == max(a.containment_ani(b).ani, b.containment_ani(a).ani) == mc_ani_manual.ani + assert ( + cmp.max_containment_ani + == max(a.containment_ani(b).ani, b.containment_ani(a).ani) + == mc_ani_manual.ani 
+ ) assert cmp.potential_false_negative == mc_ani_manual.p_exceeds_threshold - assert cmp.avg_containment_ani == np.mean([a.containment_ani(b).ani, b.containment_ani(a).ani]) + assert cmp.avg_containment_ani == np.mean( + [a.containment_ani(b).ani, b.containment_ani(a).ani] + ) assert cmp.potential_false_negative == False - #downsample to where it becomes a potential false negative + # downsample to where it becomes a potential false negative cmp = FracMinHashComparison(a, b, cmp_scaled=16000) cmp.estimate_ani_from_mh1_containment_in_mh2() assert cmp.potential_false_negative == True @@ -323,8 +435,8 @@ def test_FracMinHashComparison_incompatible_ksize(track_abundance): a = MinHash(0, 31, scaled=1, track_abundance=track_abundance) b = MinHash(0, 21, scaled=2, track_abundance=track_abundance) - a_values = { 1:5, 3:3, 5:2, 8:2} - b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } + a_values = {1: 5, 3: 3, 5: 2, 8: 2} + b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} if track_abundance: a.set_abundances(a_values) @@ -343,8 +455,8 @@ def test_FracMinHashComparison_incompatible_moltype(track_abundance): a = MinHash(0, 31, scaled=1, track_abundance=track_abundance) b = MinHash(0, 31, scaled=2, is_protein=True, track_abundance=track_abundance) - a_values = { 1:5, 3:3, 5:2, 8:2} - b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } + a_values = {1: 5, 3: 3, 5: 2, 8: 2} + b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} if track_abundance: a.set_abundances(a_values) @@ -363,8 +475,8 @@ def test_FracMinHashComparison_incompatible_sketchtype(track_abundance): a = MinHash(0, 31, scaled=1, track_abundance=track_abundance) b = MinHash(10, 31, track_abundance=track_abundance) - a_values = { 1:5, 3:3, 5:2, 8:2} - b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } + a_values = {1: 5, 3: 3, 5: 2, 8: 2} + b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} if track_abundance: a.set_abundances(a_values) @@ -384,8 +496,8 @@ def test_FracMinHashComparison_incompatible_cmp_scaled(track_abundance): a = MinHash(0, 31, scaled=1, track_abundance=track_abundance) b = MinHash(0, 31, scaled=10, track_abundance=track_abundance) - a_values = { 1:5, 3:3, 5:2, 8:2} - b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } + a_values = {1: 5, 3: 3, 5: 2, 8: 2} + b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} if track_abundance: a.set_abundances(a_values) @@ -395,7 +507,7 @@ def test_FracMinHashComparison_incompatible_cmp_scaled(track_abundance): b.add_many(b_values.keys()) with pytest.raises(ValueError) as exc: - FracMinHashComparison(a, b, cmp_scaled = 1) + FracMinHashComparison(a, b, cmp_scaled=1) print(str(exc)) assert "new scaled 1 is lower than current sample scaled 10" in str(exc) @@ -404,8 +516,8 @@ def test_FracMinHashComparison_redownsample_without_scaled(track_abundance): a = MinHash(0, 31, scaled=1, track_abundance=track_abundance) b = MinHash(0, 31, scaled=10, track_abundance=track_abundance) - a_values = { 1:5, 3:3, 5:2, 8:2} - b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } + a_values = {1: 5, 3: 3, 5: 2, 8: 2} + b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} if track_abundance: a.set_abundances(a_values) @@ -429,8 +541,8 @@ def test_NumMinHashComparison(track_abundance): a = MinHash(10, 21, scaled=0, track_abundance=track_abundance) b = MinHash(10, 21, scaled=0, track_abundance=track_abundance) - a_values = { 1:5, 3:3, 5:2, 8:2} - b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } + a_values = {1: 5, 3: 3, 5: 2, 8: 2} + b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} if track_abundance: a.set_abundances(a_values) @@ -453,17 +565,27 @@ def 
test_NumMinHashComparison(track_abundance): intersect_mh = a.flatten().intersection(b.flatten()) assert cmp.intersect_mh == intersect_mh == b.flatten().intersection(a.flatten()) if track_abundance: - assert cmp.angular_similarity == a.angular_similarity(b) == b.angular_similarity(a) - assert cmp.cosine_similarity == a.angular_similarity(b) == b.angular_similarity(a) + assert ( + cmp.angular_similarity == a.angular_similarity(b) == b.angular_similarity(a) + ) + assert ( + cmp.cosine_similarity == a.angular_similarity(b) == b.angular_similarity(a) + ) else: with pytest.raises(TypeError) as exc: cmp.angular_similarity print(str(exc)) - assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc) + assert ( + "Error: Angular (cosine) similarity requires both sketches to track hash abundance." + in str(exc) + ) with pytest.raises(TypeError) as exc: cmp.cosine_similarity print(str(exc)) - assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc) + assert ( + "Error: Angular (cosine) similarity requires both sketches to track hash abundance." + in str(exc) + ) def test_NumMinHashComparison_downsample(track_abundance): @@ -471,8 +593,8 @@ def test_NumMinHashComparison_downsample(track_abundance): a = MinHash(10, 21, scaled=0, track_abundance=track_abundance) b = MinHash(10, 21, scaled=0, track_abundance=track_abundance) - a_values = { 1:5, 3:3, 5:2, 8:2} - b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } + a_values = {1: 5, 3: 3, 5: 2, 8: 2} + b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} if track_abundance: a.set_abundances(a_values) @@ -487,7 +609,7 @@ def test_NumMinHashComparison_downsample(track_abundance): ds_a = a.downsample(num=cmp_num) ds_b = b.downsample(num=cmp_num) # build NumMinHashComparison - cmp = NumMinHashComparison(a, b, cmp_num = cmp_num) + cmp = NumMinHashComparison(a, b, cmp_num=cmp_num) assert cmp.mh1 == a assert cmp.mh2 == b assert cmp.ignore_abundance == False @@ -496,19 +618,35 @@ def test_NumMinHashComparison_downsample(track_abundance): assert cmp.moltype == "DNA" assert cmp.jaccard == ds_a.jaccard(ds_b) == ds_b.jaccard(ds_a) intersect_mh = ds_a.flatten().intersection(ds_b.flatten()) - assert cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten()) + assert ( + cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten()) + ) if track_abundance: - assert cmp.angular_similarity == ds_a.angular_similarity(ds_b) == ds_b.angular_similarity(ds_a) - assert cmp.cosine_similarity == ds_a.angular_similarity(ds_b) == ds_b.angular_similarity(ds_a) + assert ( + cmp.angular_similarity + == ds_a.angular_similarity(ds_b) + == ds_b.angular_similarity(ds_a) + ) + assert ( + cmp.cosine_similarity + == ds_a.angular_similarity(ds_b) + == ds_b.angular_similarity(ds_a) + ) else: with pytest.raises(TypeError) as exc: cmp.angular_similarity print(str(exc)) - assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc) + assert ( + "Error: Angular (cosine) similarity requires both sketches to track hash abundance." + in str(exc) + ) with pytest.raises(TypeError) as exc: cmp.cosine_similarity print(str(exc)) - assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc) + assert ( + "Error: Angular (cosine) similarity requires both sketches to track hash abundance." 
+ in str(exc) + ) def test_NumMinHashComparison_autodownsample(track_abundance): @@ -516,8 +654,8 @@ def test_NumMinHashComparison_autodownsample(track_abundance): a = MinHash(10, 21, scaled=0, track_abundance=track_abundance) b = MinHash(5, 21, scaled=0, track_abundance=track_abundance) - a_values = { 1:5, 3:3, 5:2, 8:2} - b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } + a_values = {1: 5, 3: 3, 5: 2, 8: 2} + b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} if track_abundance: a.set_abundances(a_values) @@ -541,27 +679,43 @@ def test_NumMinHashComparison_autodownsample(track_abundance): assert cmp.moltype == "DNA" assert cmp.jaccard == ds_a.jaccard(ds_b) == ds_b.jaccard(ds_a) intersect_mh = ds_a.flatten().intersection(ds_b.flatten()) - assert cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten()) + assert ( + cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten()) + ) if track_abundance: - assert cmp.angular_similarity == ds_a.angular_similarity(ds_b) == ds_b.angular_similarity(ds_a) - assert cmp.cosine_similarity == ds_a.angular_similarity(ds_b) == ds_b.angular_similarity(ds_a) + assert ( + cmp.angular_similarity + == ds_a.angular_similarity(ds_b) + == ds_b.angular_similarity(ds_a) + ) + assert ( + cmp.cosine_similarity + == ds_a.angular_similarity(ds_b) + == ds_b.angular_similarity(ds_a) + ) else: with pytest.raises(TypeError) as exc: cmp.angular_similarity print(str(exc)) - assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc) + assert ( + "Error: Angular (cosine) similarity requires both sketches to track hash abundance." + in str(exc) + ) with pytest.raises(TypeError) as exc: cmp.cosine_similarity print(str(exc)) - assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc) + assert ( + "Error: Angular (cosine) similarity requires both sketches to track hash abundance." 
+ in str(exc) + ) def test_NumMinHashComparison_incompatible_ksize(track_abundance): a_num = MinHash(20, 31, track_abundance=track_abundance) b_num = MinHash(10, 21, track_abundance=track_abundance) - a_values = { 1:5, 3:3, 5:2, 8:2} - b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } + a_values = {1: 5, 3: 3, 5: 2, 8: 2} + b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} if track_abundance: a_num.set_abundances(a_values) @@ -581,8 +735,8 @@ def test_NumMinHashComparison_incompatible_moltype(track_abundance): a_num = MinHash(20, 31, track_abundance=track_abundance) b_num = MinHash(10, 31, is_protein=True, track_abundance=track_abundance) - a_values = { 1:5, 3:3, 5:2, 8:2} - b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } + a_values = {1: 5, 3: 3, 5: 2, 8: 2} + b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} if track_abundance: a_num.set_abundances(a_values) @@ -601,8 +755,8 @@ def test_NumMinHashComparison_incompatible_sketchtype(track_abundance): a = MinHash(0, 31, scaled=1, track_abundance=track_abundance) b = MinHash(10, 31, track_abundance=track_abundance) - a_values = { 1:5, 3:3, 5:2, 8:2} - b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } + a_values = {1: 5, 3: 3, 5: 2, 8: 2} + b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} if track_abundance: a.set_abundances(a_values) @@ -621,8 +775,8 @@ def test_NumMinHashComparison_redownsample_without_num(track_abundance): a = MinHash(10, 31, track_abundance=track_abundance) b = MinHash(5, 31, track_abundance=track_abundance) - a_values = { 1:5, 3:3, 5:2, 8:2} - b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } + a_values = {1: 5, 3: 3, 5: 2, 8: 2} + b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} if track_abundance: a.set_abundances(a_values) @@ -645,8 +799,8 @@ def test_NumMinHashComparison_incompatible_cmp_num(track_abundance): a = MinHash(200, 31, track_abundance=track_abundance) b = MinHash(100, 31, track_abundance=track_abundance) - a_values = { 1:5, 3:3, 5:2, 8:2} - b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } + a_values = {1: 5, 3: 3, 5: 2, 8: 2} + b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} if track_abundance: a.set_abundances(a_values) @@ -656,7 +810,7 @@ def test_NumMinHashComparison_incompatible_cmp_num(track_abundance): b.add_many(b_values.keys()) with pytest.raises(ValueError) as exc: - NumMinHashComparison(a, b, cmp_num = 150) + NumMinHashComparison(a, b, cmp_num=150) print(str(exc)) assert "new sample num is higher than current sample num" in str(exc) @@ -664,11 +818,11 @@ def test_NumMinHashComparison_incompatible_cmp_num(track_abundance): def test_FracMinHashComparison_ANI(track_abundance): # need real mh here, small test data fails if track_abundance: - f1 = utils.get_test_data('track_abund/47.fa.sig') - f2 = utils.get_test_data('track_abund/63.fa.sig') + f1 = utils.get_test_data("track_abund/47.fa.sig") + f2 = utils.get_test_data("track_abund/63.fa.sig") else: - f1 = utils.get_test_data('47.fa.sig') - f2 = utils.get_test_data('63.fa.sig') + f1 = utils.get_test_data("47.fa.sig") + f2 = utils.get_test_data("63.fa.sig") a = load_one_signature(f1, ksize=31).minhash b = load_one_signature(f2, ksize=31).minhash @@ -677,40 +831,54 @@ def test_FracMinHashComparison_ANI(track_abundance): # check jaccard ani cmp.estimate_jaccard_ani() assert cmp.jaccard_ani == a.jaccard_ani(b).ani == b.jaccard_ani(a).ani - assert cmp.potential_false_negative == a.jaccard_ani(b).p_exceeds_threshold == b.jaccard_ani(a).p_exceeds_threshold - assert cmp.jaccard_ani_untrustworthy == a.jaccard_ani(b).je_exceeds_threshold == b.jaccard_ani(a).je_exceeds_threshold + 
assert ( + cmp.potential_false_negative + == a.jaccard_ani(b).p_exceeds_threshold + == b.jaccard_ani(a).p_exceeds_threshold + ) + assert ( + cmp.jaccard_ani_untrustworthy + == a.jaccard_ani(b).je_exceeds_threshold + == b.jaccard_ani(a).je_exceeds_threshold + ) cmp.estimate_ani_from_mh1_containment_in_mh2() a_cont_ani_manual = a.containment_ani(b) assert cmp.ani_from_mh1_containment_in_mh2 == a_cont_ani_manual.ani assert cmp.potential_false_negative == a_cont_ani_manual.p_exceeds_threshold -# assert cmp.mh1_containment_ani_low is None -# assert cmp.mh1_containment_ani_high is None + # assert cmp.mh1_containment_ani_low is None + # assert cmp.mh1_containment_ani_high is None cmp.estimate_ani_from_mh2_containment_in_mh1() b_cont_ani_manual = b.containment_ani(a) assert cmp.ani_from_mh2_containment_in_mh1 == b_cont_ani_manual.ani assert cmp.potential_false_negative == b_cont_ani_manual.p_exceeds_threshold -# assert cmp.mh2_containment_ani_low is None -# assert cmp.mh2_containment_ani_high is None + # assert cmp.mh2_containment_ani_low is None + # assert cmp.mh2_containment_ani_high is None cmp.estimate_max_containment_ani() mc_ani_manual = a.max_containment_ani(b) - assert cmp.max_containment_ani == max(a.containment_ani(b).ani, b.containment_ani(a).ani) == mc_ani_manual.ani + assert ( + cmp.max_containment_ani + == max(a.containment_ani(b).ani, b.containment_ani(a).ani) + == mc_ani_manual.ani + ) assert cmp.potential_false_negative == mc_ani_manual.p_exceeds_threshold -# assert cmp.max_containment_ani_low is None -# assert cmp.max_containment_ani_high is None - assert cmp.avg_containment_ani == np.mean([a.containment_ani(b).ani, b.containment_ani(a).ani]) + # assert cmp.max_containment_ani_low is None + # assert cmp.max_containment_ani_high is None + assert cmp.avg_containment_ani == np.mean( + [a.containment_ani(b).ani, b.containment_ani(a).ani] + ) def test_FracMinHashComparison_ANI_provide_similarity(track_abundance): # need real mh here, small test data fails if track_abundance: - f1 = utils.get_test_data('track_abund/47.fa.sig') - f2 = utils.get_test_data('track_abund/63.fa.sig') + f1 = utils.get_test_data("track_abund/47.fa.sig") + f2 = utils.get_test_data("track_abund/63.fa.sig") else: - f1 = utils.get_test_data('47.fa.sig') - f2 = utils.get_test_data('63.fa.sig') + f1 = utils.get_test_data("47.fa.sig") + f2 = utils.get_test_data("63.fa.sig") a = load_one_signature(f1, ksize=31).minhash b = load_one_signature(f2, ksize=31).minhash @@ -720,8 +888,16 @@ def test_FracMinHashComparison_ANI_provide_similarity(track_abundance): jaccard = a.jaccard(b) cmp.estimate_jaccard_ani(jaccard=jaccard) assert cmp.jaccard_ani == a.jaccard_ani(b).ani == b.jaccard_ani(a).ani - assert cmp.potential_false_negative == a.jaccard_ani(b).p_exceeds_threshold == b.jaccard_ani(a).p_exceeds_threshold - assert cmp.jaccard_ani_untrustworthy == a.jaccard_ani(b).je_exceeds_threshold == b.jaccard_ani(a).je_exceeds_threshold + assert ( + cmp.potential_false_negative + == a.jaccard_ani(b).p_exceeds_threshold + == b.jaccard_ani(a).p_exceeds_threshold + ) + assert ( + cmp.jaccard_ani_untrustworthy + == a.jaccard_ani(b).je_exceeds_threshold + == b.jaccard_ani(a).je_exceeds_threshold + ) a_cont = a.contained_by(b) b_cont = b.contained_by(a) @@ -739,19 +915,25 @@ def test_FracMinHashComparison_ANI_provide_similarity(track_abundance): cmp.estimate_max_containment_ani(max_containment=mc) mc_ani_manual = a.max_containment_ani(b) - assert cmp.max_containment_ani == max(a.containment_ani(b).ani, b.containment_ani(a).ani) == 
mc_ani_manual.ani + assert ( + cmp.max_containment_ani + == max(a.containment_ani(b).ani, b.containment_ani(a).ani) + == mc_ani_manual.ani + ) assert cmp.potential_false_negative == mc_ani_manual.p_exceeds_threshold - assert cmp.avg_containment_ani == np.mean([a.containment_ani(b).ani, b.containment_ani(a).ani]) + assert cmp.avg_containment_ani == np.mean( + [a.containment_ani(b).ani, b.containment_ani(a).ani] + ) def test_FracMinHashComparison_ANI_estimate_CI(track_abundance): # need real mh here, small test data fails if track_abundance: - f1 = utils.get_test_data('track_abund/47.fa.sig') - f2 = utils.get_test_data('track_abund/63.fa.sig') + f1 = utils.get_test_data("track_abund/47.fa.sig") + f2 = utils.get_test_data("track_abund/63.fa.sig") else: - f1 = utils.get_test_data('47.fa.sig') - f2 = utils.get_test_data('63.fa.sig') + f1 = utils.get_test_data("47.fa.sig") + f2 = utils.get_test_data("63.fa.sig") a = load_one_signature(f1, ksize=31).minhash b = load_one_signature(f2, ksize=31).minhash @@ -759,8 +941,16 @@ def test_FracMinHashComparison_ANI_estimate_CI(track_abundance): cmp = FracMinHashComparison(a, b, estimate_ani_ci=True) cmp.estimate_jaccard_ani() assert cmp.jaccard_ani == a.jaccard_ani(b).ani == b.jaccard_ani(a).ani - assert cmp.potential_false_negative == a.jaccard_ani(b).p_exceeds_threshold == b.jaccard_ani(a).p_exceeds_threshold - assert cmp.jaccard_ani_untrustworthy == a.jaccard_ani(b).je_exceeds_threshold == b.jaccard_ani(a).je_exceeds_threshold + assert ( + cmp.potential_false_negative + == a.jaccard_ani(b).p_exceeds_threshold + == b.jaccard_ani(a).p_exceeds_threshold + ) + assert ( + cmp.jaccard_ani_untrustworthy + == a.jaccard_ani(b).je_exceeds_threshold + == b.jaccard_ani(a).je_exceeds_threshold + ) cmp.estimate_ani_from_mh1_containment_in_mh2() a_cont_ani_manual = a.containment_ani(b, estimate_ci=True) @@ -778,20 +968,24 @@ def test_FracMinHashComparison_ANI_estimate_CI(track_abundance): cmp.estimate_max_containment_ani() mc_ani_manual = a.max_containment_ani(b, estimate_ci=True) - assert cmp.max_containment_ani == max(a.containment_ani(b).ani, b.containment_ani(a).ani) == mc_ani_manual.ani + assert ( + cmp.max_containment_ani + == max(a.containment_ani(b).ani, b.containment_ani(a).ani) + == mc_ani_manual.ani + ) assert cmp.potential_false_negative == mc_ani_manual.p_exceeds_threshold assert cmp.max_containment_ani_low == mc_ani_manual.ani_low - assert cmp.max_containment_ani_high ==mc_ani_manual.ani_high + assert cmp.max_containment_ani_high == mc_ani_manual.ani_high def test_FracMinHashComparison_ANI_estimate_CI_ci99(track_abundance): # need real mh here, small test data fails if track_abundance: - f1 = utils.get_test_data('track_abund/47.fa.sig') - f2 = utils.get_test_data('track_abund/63.fa.sig') + f1 = utils.get_test_data("track_abund/47.fa.sig") + f2 = utils.get_test_data("track_abund/63.fa.sig") else: - f1 = utils.get_test_data('47.fa.sig') - f2 = utils.get_test_data('63.fa.sig') + f1 = utils.get_test_data("47.fa.sig") + f2 = utils.get_test_data("63.fa.sig") a = load_one_signature(f1, ksize=31).minhash b = load_one_signature(f2, ksize=31).minhash @@ -814,20 +1008,24 @@ def test_FracMinHashComparison_ANI_estimate_CI_ci99(track_abundance): cmp.estimate_max_containment_ani() mc_ani_manual = a.max_containment_ani(b, estimate_ci=True, confidence=0.99) - assert cmp.max_containment_ani == max(a.containment_ani(b).ani, b.containment_ani(a).ani) == mc_ani_manual.ani + assert ( + cmp.max_containment_ani + == max(a.containment_ani(b).ani, b.containment_ani(a).ani) + == 
mc_ani_manual.ani + ) assert cmp.potential_false_negative == mc_ani_manual.p_exceeds_threshold assert cmp.max_containment_ani_low == mc_ani_manual.ani_low - assert cmp.max_containment_ani_high ==mc_ani_manual.ani_high + assert cmp.max_containment_ani_high == mc_ani_manual.ani_high def test_FracMinHashComparison_ANI_downsample(track_abundance): # need real mh here, small test data fails if track_abundance: - f1 = utils.get_test_data('track_abund/47.fa.sig') - f2 = utils.get_test_data('track_abund/63.fa.sig') + f1 = utils.get_test_data("track_abund/47.fa.sig") + f2 = utils.get_test_data("track_abund/63.fa.sig") else: - f1 = utils.get_test_data('47.fa.sig') - f2 = utils.get_test_data('63.fa.sig') + f1 = utils.get_test_data("47.fa.sig") + f2 = utils.get_test_data("63.fa.sig") a = load_one_signature(f1, ksize=31).minhash b = load_one_signature(f2, ksize=31).minhash @@ -841,8 +1039,16 @@ def test_FracMinHashComparison_ANI_downsample(track_abundance): # check jaccard ani cmp.estimate_jaccard_ani() assert cmp.jaccard_ani == a.jaccard_ani(b).ani == b.jaccard_ani(a).ani - assert cmp.potential_false_negative == a.jaccard_ani(b).p_exceeds_threshold == b.jaccard_ani(a).p_exceeds_threshold - assert cmp.jaccard_ani_untrustworthy == a.jaccard_ani(b).je_exceeds_threshold == b.jaccard_ani(a).je_exceeds_threshold + assert ( + cmp.potential_false_negative + == a.jaccard_ani(b).p_exceeds_threshold + == b.jaccard_ani(a).p_exceeds_threshold + ) + assert ( + cmp.jaccard_ani_untrustworthy + == a.jaccard_ani(b).je_exceeds_threshold + == b.jaccard_ani(a).je_exceeds_threshold + ) cmp.estimate_ani_from_mh1_containment_in_mh2() a_cont_ani_manual = a.containment_ani(b, estimate_ci=True) @@ -860,7 +1066,11 @@ def test_FracMinHashComparison_ANI_downsample(track_abundance): cmp.estimate_max_containment_ani() mc_ani_manual = a.max_containment_ani(b, estimate_ci=True) - assert cmp.max_containment_ani == max(a.containment_ani(b).ani, b.containment_ani(a).ani) == mc_ani_manual.ani + assert ( + cmp.max_containment_ani + == max(a.containment_ani(b).ani, b.containment_ani(a).ani) + == mc_ani_manual.ani + ) assert cmp.potential_false_negative == mc_ani_manual.p_exceeds_threshold assert cmp.max_containment_ani_low == mc_ani_manual.ani_low - assert cmp.max_containment_ani_high ==mc_ani_manual.ani_high + assert cmp.max_containment_ani_high == mc_ani_manual.ani_high diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 9ee703f6f7..7aaac0446e 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -26,7 +26,8 @@ try: import matplotlib - matplotlib.use('Agg') + + matplotlib.use("Agg") except ImportError: pass @@ -40,30 +41,30 @@ def test_citation_file(): import yaml thisdir = os.path.dirname(__file__) - citation_file = os.path.join(thisdir, '../CITATION.cff') + citation_file = os.path.join(thisdir, "../CITATION.cff") with open(citation_file) as fp: x = yaml.safe_load(fp) - assert x['title'] == "sourmash: a library for MinHash sketching of DNA", x + assert x["title"] == "sourmash: a library for MinHash sketching of DNA", x def test_run_sourmash(): - status, out, err = utils.runscript('sourmash', [], fail_ok=True) - assert status != 0 # no args provided, ok ;) + status, out, err = utils.runscript("sourmash", [], fail_ok=True) + assert status != 0 # no args provided, ok ;) def test_run_sourmash_badcmd(): - status, out, err = utils.runscript('sourmash', ['foobarbaz'], fail_ok=True) - assert status != 0 # bad arg! 
+ status, out, err = utils.runscript("sourmash", ["foobarbaz"], fail_ok=True) + assert status != 0 # bad arg! assert "cmd: invalid choice" in err def test_run_sourmash_subcmd_help(): - status, out, err = utils.runscript('sourmash', ['sbt'], fail_ok=True) + status, out, err = utils.runscript("sourmash", ["sbt"], fail_ok=True) print(out) print(err) - assert status != 0 # should fail + assert status != 0 # should fail assert "invalid choice:" in err assert "'sbt' (choose from" in err @@ -73,7 +74,7 @@ def test_run_sourmash_subcmd_help(): def test_sourmash_info(): - status, out, err = utils.runscript('sourmash', ['info'], fail_ok=False) + status, out, err = utils.runscript("sourmash", ["info"], fail_ok=False) # no output to stdout assert not out @@ -83,7 +84,7 @@ def test_sourmash_info(): def test_sourmash_info_verbose(): - status, out, err = utils.runscript('sourmash', ['info', '-v']) + status, out, err = utils.runscript("sourmash", ["info", "-v"]) # no output to stdout assert not out @@ -94,6 +95,7 @@ def test_sourmash_info_verbose(): def test_load_pathlist_from_file_does_not_exist(): from sourmash.sourmash_args import load_pathlist_from_file + with pytest.raises(ValueError) as e: load_pathlist_from_file("") assert "file '' does not exist" in str(e.value) @@ -122,7 +124,7 @@ def test_load_pathlist_from_file_badly_formatted(c): @utils.in_tempdir def test_load_pathlist_from_file_badly_formatted_2(c): file_list = c.output("file_list") - sig1 = utils.get_test_data('compare/genome-s10.fa.gz.sig') + sig1 = utils.get_test_data("compare/genome-s10.fa.gz.sig") with open(file_list, "w") as fp: fp.write(sig1 + "\n") fp.write("{'a':1}") @@ -134,12 +136,12 @@ def test_load_pathlist_from_file_badly_formatted_2(c): @utils.in_tempdir def test_load_pathlist_from_file_duplicate(c): file_list = c.output("file_list") - sig1 = utils.get_test_data('compare/genome-s10.fa.gz.sig') + sig1 = utils.get_test_data("compare/genome-s10.fa.gz.sig") with open(file_list, "w") as fp: fp.write(sig1 + "\n") fp.write(sig1 + "\n") check = load_pathlist_from_file(file_list) - print (check) + print(check) assert len(check) == 1 @@ -147,19 +149,18 @@ def test_compare_serial(runtmp): # try doing a compare serially c = runtmp - testsigs = utils.get_test_data('genome-s1*.sig') + testsigs = utils.get_test_data("genome-s1*.sig") testsigs = glob.glob(testsigs) - c.run_sourmash('compare', '-o', 'cmp', '-k', '21', '--dna', *testsigs) + c.run_sourmash("compare", "-o", "cmp", "-k", "21", "--dna", *testsigs) - cmp_outfile = c.output('cmp') + cmp_outfile = c.output("cmp") assert os.path.exists(cmp_outfile) cmp_out = numpy.load(cmp_outfile) sigs = [] for fn in testsigs: - sigs.append(sourmash.load_one_signature(fn, ksize=21, - select_moltype='dna')) + sigs.append(sourmash.load_one_signature(fn, ksize=21, select_moltype="dna")) cmp_calc = numpy.zeros([len(sigs), len(sigs)]) for i, si in enumerate(sigs): @@ -168,8 +169,7 @@ def test_compare_serial(runtmp): sigs = [] for fn in testsigs: - sigs.append(sourmash.load_one_signature(fn, ksize=21, - select_moltype='dna')) + sigs.append(sourmash.load_one_signature(fn, ksize=21, select_moltype="dna")) assert (cmp_out == cmp_calc).all() @@ -177,20 +177,18 @@ def test_compare_serial_distance(runtmp): # try doing a compare serially, with --distance output c = runtmp - testsigs = utils.get_test_data('genome-s1*.sig') + testsigs = utils.get_test_data("genome-s1*.sig") testsigs = glob.glob(testsigs) - c.run_sourmash('compare', '-o', 'cmp', '-k', '21', '--dna', *testsigs, - '--distance') + 
c.run_sourmash("compare", "-o", "cmp", "-k", "21", "--dna", *testsigs, "--distance") - cmp_outfile = c.output('cmp') + cmp_outfile = c.output("cmp") assert os.path.exists(cmp_outfile) cmp_out = numpy.load(cmp_outfile) sigs = [] for fn in testsigs: - sigs.append(sourmash.load_one_signature(fn, ksize=21, - select_moltype='dna')) + sigs.append(sourmash.load_one_signature(fn, ksize=21, select_moltype="dna")) cmp_calc = numpy.zeros([len(sigs), len(sigs)]) for i, si in enumerate(sigs): @@ -199,8 +197,7 @@ def test_compare_serial_distance(runtmp): sigs = [] for fn in testsigs: - sigs.append(sourmash.load_one_signature(fn, ksize=21, - select_moltype='dna')) + sigs.append(sourmash.load_one_signature(fn, ksize=21, select_moltype="dna")) assert (cmp_out == cmp_calc).all() @@ -208,20 +205,20 @@ def test_compare_parallel(runtmp): # try doing a compare parallel c = runtmp - testsigs = utils.get_test_data('genome-s1*.sig') + testsigs = utils.get_test_data("genome-s1*.sig") testsigs = glob.glob(testsigs) - c.run_sourmash('compare', '-o', 'cmp', '-k', '21', '--dna', - "--processes", "2", *testsigs) + c.run_sourmash( + "compare", "-o", "cmp", "-k", "21", "--dna", "--processes", "2", *testsigs + ) - cmp_outfile = c.output('cmp') + cmp_outfile = c.output("cmp") assert os.path.exists(cmp_outfile) cmp_out = numpy.load(cmp_outfile) sigs = [] for fn in testsigs: - sigs.append(sourmash.load_one_signature(fn, ksize=21, - select_moltype='dna')) + sigs.append(sourmash.load_one_signature(fn, ksize=21, select_moltype="dna")) cmp_calc = numpy.zeros([len(sigs), len(sigs)]) for i, si in enumerate(sigs): @@ -230,32 +227,31 @@ def test_compare_parallel(runtmp): sigs = [] for fn in testsigs: - sigs.append(sourmash.load_one_signature(fn, ksize=21, - select_moltype='dna')) + sigs.append(sourmash.load_one_signature(fn, ksize=21, select_moltype="dna")) assert (cmp_out == cmp_calc).all() def test_compare_do_serial_compare_with_from_file(runtmp): # try doing a compare serial c = runtmp - testsigs = utils.get_test_data('genome-s1*.sig') + testsigs = utils.get_test_data("genome-s1*.sig") testsigs = glob.glob(testsigs) - file_list = c.output('file.list') - with open(file_list, 'wt') as fp: + file_list = c.output("file.list") + with open(file_list, "w") as fp: print("\n".join(testsigs), file=fp) - c.run_sourmash('compare', '-o', 'cmp', '-k', '21', '--dna', - '--from-file', file_list) + c.run_sourmash( + "compare", "-o", "cmp", "-k", "21", "--dna", "--from-file", file_list + ) - cmp_outfile = c.output('cmp') + cmp_outfile = c.output("cmp") assert os.path.exists(cmp_outfile) cmp_out = numpy.load(cmp_outfile) sigs = [] for fn in testsigs: - sigs.append(sourmash.load_one_signature(fn, ksize=21, - select_moltype='dna')) + sigs.append(sourmash.load_one_signature(fn, ksize=21, select_moltype="dna")) cmp_calc = numpy.zeros([len(sigs), len(sigs)]) for i, si in enumerate(sigs): @@ -264,8 +260,7 @@ def test_compare_do_serial_compare_with_from_file(runtmp): sigs = [] for fn in testsigs: - sigs.append(sourmash.load_one_signature(fn, ksize=21, - select_moltype='dna')) + sigs.append(sourmash.load_one_signature(fn, ksize=21, select_moltype="dna")) assert numpy.array_equal(numpy.sort(cmp_out.flat), numpy.sort(cmp_calc.flat)) @@ -274,19 +269,18 @@ def test_compare_do_basic_compare_using_rna_arg(runtmp): # try doing a basic compare using --rna instead of --dna c = runtmp - testsigs = utils.get_test_data('genome-s1*.sig') + testsigs = utils.get_test_data("genome-s1*.sig") testsigs = glob.glob(testsigs) - c.run_sourmash('compare', '-o', 'cmp', '-k', '21', 
'--rna', *testsigs) + c.run_sourmash("compare", "-o", "cmp", "-k", "21", "--rna", *testsigs) - cmp_outfile = c.output('cmp') + cmp_outfile = c.output("cmp") assert os.path.exists(cmp_outfile) cmp_out = numpy.load(cmp_outfile) sigs = [] for fn in testsigs: - sigs.append(sourmash.load_one_signature(fn, ksize=21, - select_moltype='dna')) + sigs.append(sourmash.load_one_signature(fn, ksize=21, select_moltype="dna")) cmp_calc = numpy.zeros([len(sigs), len(sigs)]) for i, si in enumerate(sigs): @@ -299,19 +293,18 @@ def test_compare_do_basic_compare_using_rna_arg(runtmp): def test_compare_do_basic_using_nucleotide_arg(runtmp): # try doing a basic compare using --nucleotide instead of --dna/--rna c = runtmp - testsigs = utils.get_test_data('genome-s1*.sig') + testsigs = utils.get_test_data("genome-s1*.sig") testsigs = glob.glob(testsigs) - c.run_sourmash('compare', '-o', 'cmp', '-k', '21', '--nucleotide', *testsigs) + c.run_sourmash("compare", "-o", "cmp", "-k", "21", "--nucleotide", *testsigs) - cmp_outfile = c.output('cmp') + cmp_outfile = c.output("cmp") assert os.path.exists(cmp_outfile) cmp_out = numpy.load(cmp_outfile) sigs = [] for fn in testsigs: - sigs.append(sourmash.load_one_signature(fn, ksize=21, - select_moltype='dna')) + sigs.append(sourmash.load_one_signature(fn, ksize=21, select_moltype="dna")) cmp_calc = numpy.zeros([len(sigs), len(sigs)]) for i, si in enumerate(sigs): @@ -324,22 +317,24 @@ def test_compare_do_basic_using_nucleotide_arg(runtmp): def test_compare_quiet(runtmp): # test 'compare -q' has no output c = runtmp - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - c.run_sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata1, testdata2) + c.run_sourmash("sketch", "translate", "-p", "k=31,num=500", testdata1, testdata2) - c.run_sourmash('compare', 'short.fa.sig', - 'short2.fa.sig', '--csv', 'xxx', '-q') + c.run_sourmash("compare", "short.fa.sig", "short2.fa.sig", "--csv", "xxx", "-q") assert not c.last_result.out assert not c.last_result.err def test_compare_do_traverse_directory_parse_args(runtmp): # test 'compare' on a directory, using sourmash.cli.parse_args. 
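     # parse_args builds the argparse Namespace that sourmash.commands.compare consumes below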
- import sourmash.commands, sourmash.cli - args = sourmash.cli.parse_args(['compare', '-k', '21', '--dna', - utils.get_test_data('compare')]) + import sourmash.commands + import sourmash.cli + + args = sourmash.cli.parse_args( + ["compare", "-k", "21", "--dna", utils.get_test_data("compare")] + ) sourmash.commands.compare(args) @@ -347,41 +342,39 @@ def test_compare_do_traverse_directory_parse_args(runtmp): def test_compare_do_traverse_directory(runtmp): # test 'compare' on a directory c = runtmp - c.run_sourmash('compare', '-k 21', - '--dna', utils.get_test_data('compare')) + c.run_sourmash("compare", "-k 21", "--dna", utils.get_test_data("compare")) print(c.last_result.out) - assert 'genome-s10.fa.gz' in c.last_result.out - assert 'genome-s11.fa.gz' in c.last_result.out + assert "genome-s10.fa.gz" in c.last_result.out + assert "genome-s11.fa.gz" in c.last_result.out def test_compare_do_traverse_directory_compare_force(runtmp): # test 'compare' on a directory, with -f c = runtmp - sig1 = utils.get_test_data('compare/genome-s10.fa.gz.sig') - sig2 = utils.get_test_data('compare/genome-s11.fa.gz.sig') - newdir = c.output('newdir') + sig1 = utils.get_test_data("compare/genome-s10.fa.gz.sig") + sig2 = utils.get_test_data("compare/genome-s11.fa.gz.sig") + newdir = c.output("newdir") os.mkdir(newdir) - shutil.copyfile(sig1, os.path.join(newdir, 'sig1')) - shutil.copyfile(sig2, os.path.join(newdir, 'sig2')) + shutil.copyfile(sig1, os.path.join(newdir, "sig1")) + shutil.copyfile(sig2, os.path.join(newdir, "sig2")) - c.run_sourmash('compare', '-k 21', - '--dna', newdir, '-f') + c.run_sourmash("compare", "-k 21", "--dna", newdir, "-f") print(c.last_result.out) - assert 'genome-s10.fa.gz' in c.last_result.out - assert 'genome-s11.fa.gz' in c.last_result.out + assert "genome-s10.fa.gz" in c.last_result.out + assert "genome-s11.fa.gz" in c.last_result.out def test_compare_output_csv(runtmp): # test 'sourmash compare --csv' c = runtmp - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - c.run_sourmash('sketch', 'dna', '-p', 'k=31,num=500', testdata1, testdata2) - c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '--csv', 'xxx') + c.run_sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) + c.run_sourmash("compare", "short.fa.sig", "short2.fa.sig", "--csv", "xxx") - with open(c.output('xxx')) as fp: + with open(c.output("xxx")) as fp: r = iter(csv.reader(fp)) row = next(r) print(row) @@ -394,21 +387,20 @@ def test_compare_output_csv(runtmp): assert float(row[1]) == 1.0 # exactly three lines - with pytest.raises(StopIteration) as e: + with pytest.raises(StopIteration): next(r) def test_compare_output_csv_gz(runtmp): # test 'sourmash compare --csv' with a .gz file c = runtmp - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - c.run_sourmash('sketch', 'dna', '-p', 'k=31,num=500', testdata1, testdata2) - c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', - '--csv', 'xxx.gz') + c.run_sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) + c.run_sourmash("compare", "short.fa.sig", "short2.fa.sig", "--csv", "xxx.gz") - with gzip.open(c.output('xxx.gz'), 'rt', newline='') as fp: + with gzip.open(c.output("xxx.gz"), "rt", newline="") as fp: r = iter(csv.reader(fp)) row = next(r) print(row) @@ 
-421,85 +413,97 @@ def test_compare_output_csv_gz(runtmp): assert float(row[1]) == 1.0 # exactly three lines - with pytest.raises(StopIteration) as e: + with pytest.raises(StopIteration): next(r) def test_compare_downsample(runtmp): # test 'compare' with implicit downsampling c = runtmp - testdata1 = utils.get_test_data('short.fa') - c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=200', testdata1) + testdata1 = utils.get_test_data("short.fa") + c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=200", testdata1) - testdata2 = utils.get_test_data('short2.fa') - c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=100', testdata2) + testdata2 = utils.get_test_data("short2.fa") + c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=100", testdata2) - c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '--csv', 'xxx') + c.run_sourmash("compare", "short.fa.sig", "short2.fa.sig", "--csv", "xxx") print(c.last_result.status, c.last_result.out, c.last_result.err) - assert 'downsampling to scaled value of 200' in c.last_result.err - with open(c.output('xxx')) as fp: + assert "downsampling to scaled value of 200" in c.last_result.err + with open(c.output("xxx")) as fp: lines = fp.readlines() assert len(lines) == 3 - assert lines[1].startswith('1.0,0.6666') - assert lines[2].startswith('0.6666') + assert lines[1].startswith("1.0,0.6666") + assert lines[2].startswith("0.6666") def test_compare_downsample_scaled(runtmp): # test 'compare' with explicit --scaled downsampling c = runtmp - testdata1 = utils.get_test_data('short.fa') - c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=200', testdata1) + testdata1 = utils.get_test_data("short.fa") + c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=200", testdata1) - testdata2 = utils.get_test_data('short2.fa') - c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=100', testdata2) + testdata2 = utils.get_test_data("short2.fa") + c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=100", testdata2) - c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '--csv', 'xxx', - '--scaled', '300') + c.run_sourmash( + "compare", "short.fa.sig", "short2.fa.sig", "--csv", "xxx", "--scaled", "300" + ) print(c.last_result.status, c.last_result.out, c.last_result.err) - assert 'downsampling to scaled value of 300' in c.last_result.err - with open(c.output('xxx')) as fp: + assert "downsampling to scaled value of 300" in c.last_result.err + with open(c.output("xxx")) as fp: lines = fp.readlines() assert len(lines) == 3 - assert lines[1].startswith('1.0,0.0') - assert lines[2].startswith('0.0') + assert lines[1].startswith("1.0,0.0") + assert lines[2].startswith("0.0") def test_compare_downsample_scaled_too_low(runtmp): # test 'compare' with explicit --scaled downsampling, but lower than min c = runtmp - testdata1 = utils.get_test_data('short.fa') - c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=200', testdata1) + testdata1 = utils.get_test_data("short.fa") + c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=200", testdata1) - testdata2 = utils.get_test_data('short2.fa') - c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=100', testdata2) + testdata2 = utils.get_test_data("short2.fa") + c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=100", testdata2) - c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '--csv', 'xxx', - '--scaled', '100') + c.run_sourmash( + "compare", "short.fa.sig", "short2.fa.sig", "--csv", "xxx", "--scaled", "100" + ) print(c.last_result.status, c.last_result.out, c.last_result.err) - assert 'downsampling to scaled value of 200' 
in c.last_result.err - assert "WARNING: --scaled specified 100, but max scaled of sketches is 200" in c.last_result.err - with open(c.output('xxx')) as fp: + assert "downsampling to scaled value of 200" in c.last_result.err + assert ( + "WARNING: --scaled specified 100, but max scaled of sketches is 200" + in c.last_result.err + ) + with open(c.output("xxx")) as fp: lines = fp.readlines() assert len(lines) == 3 - assert lines[1].startswith('1.0,0.6666') - assert lines[2].startswith('0.6666') + assert lines[1].startswith("1.0,0.6666") + assert lines[2].startswith("0.6666") def test_compare_downsample_scaled_fail_num(runtmp): # test 'compare' with explicit --scaled downsampling; fail on num sketch c = runtmp - testdata1 = utils.get_test_data('short.fa') - c.run_sourmash('sketch', 'dna', '-p', 'k=31,num=20', testdata1) + testdata1 = utils.get_test_data("short.fa") + c.run_sourmash("sketch", "dna", "-p", "k=31,num=20", testdata1) - testdata2 = utils.get_test_data('short2.fa') - c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=100', testdata2) + testdata2 = utils.get_test_data("short2.fa") + c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=100", testdata2) - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', - '--csv', 'xxx', '--scaled', '300') + with pytest.raises(SourmashCommandFailed): + c.run_sourmash( + "compare", + "short.fa.sig", + "short2.fa.sig", + "--csv", + "xxx", + "--scaled", + "300", + ) print(c.last_result.status, c.last_result.out, c.last_result.err) assert "cannot mix scaled signatures with num signatures" in c.last_result.err @@ -508,75 +512,88 @@ def test_compare_downsample_scaled_fail_num(runtmp): def test_compare_downsample_scaled_fail_all_num(runtmp): # test 'compare' with explicit --scaled downsampling; fail on all num sketches c = runtmp - testdata1 = utils.get_test_data('short.fa') - c.run_sourmash('sketch', 'dna', '-p', 'k=31,num=20', testdata1) + testdata1 = utils.get_test_data("short.fa") + c.run_sourmash("sketch", "dna", "-p", "k=31,num=20", testdata1) - testdata2 = utils.get_test_data('short2.fa') - c.run_sourmash('sketch', 'dna', '-p', 'k=31,num=30', testdata2) + testdata2 = utils.get_test_data("short2.fa") + c.run_sourmash("sketch", "dna", "-p", "k=31,num=30", testdata2) - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', - '--csv', 'xxx', '--scaled', '300') + with pytest.raises(SourmashCommandFailed): + c.run_sourmash( + "compare", + "short.fa.sig", + "short2.fa.sig", + "--csv", + "xxx", + "--scaled", + "300", + ) print(c.last_result.status, c.last_result.out, c.last_result.err) - assert "ERROR: cannot specify --scaled with non-scaled signatures." in c.last_result.err + assert ( + "ERROR: cannot specify --scaled with non-scaled signatures." 
+ in c.last_result.err + ) def test_compare_output_multiple_k(runtmp): # test 'compare' when given multiple k-mer sizes -> should fail c = runtmp - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - c.run_sourmash('sketch', 'translate', '-p', 'k=21,num=500', testdata1) - c.run_sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata2) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + c.run_sourmash("sketch", "translate", "-p", "k=21,num=500", testdata1) + c.run_sourmash("sketch", "translate", "-p", "k=31,num=500", testdata2) - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '--csv', 'xxx', - fail_ok=True) + with pytest.raises(SourmashCommandFailed): + c.run_sourmash( + "compare", "short.fa.sig", "short2.fa.sig", "--csv", "xxx", fail_ok=True + ) print(c.last_result.status, c.last_result.out, c.last_result.err) assert c.last_result.status == -1 - assert 'multiple k-mer sizes loaded; please specify one' in c.last_result.err - assert '(saw k-mer sizes 21, 31)' in c.last_result.err + assert "multiple k-mer sizes loaded; please specify one" in c.last_result.err + assert "(saw k-mer sizes 21, 31)" in c.last_result.err def test_compare_output_multiple_moltype(runtmp): # 'compare' should fail when given multiple moltypes c = runtmp - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - c.run_sourmash('sketch', 'dna', '-p', 'k=21,num=500', testdata1) - c.run_sourmash('sketch', 'translate', '-p', 'k=21,num=500', testdata2) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + c.run_sourmash("sketch", "dna", "-p", "k=21,num=500", testdata1) + c.run_sourmash("sketch", "translate", "-p", "k=21,num=500", testdata2) - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '--csv', 'xxx', - fail_ok=True) + with pytest.raises(SourmashCommandFailed): + c.run_sourmash( + "compare", "short.fa.sig", "short2.fa.sig", "--csv", "xxx", fail_ok=True + ) assert c.last_result.status == -1 print(c.last_result.err) - assert 'multiple molecule types loaded;' in c.last_result.err + assert "multiple molecule types loaded;" in c.last_result.err def test_compare_dayhoff(runtmp): # test 'compare' works with dayhoff moltype c = runtmp - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - c.run_sourmash('sketch', 'translate', '-p', 'k=21,num=500', '--dayhoff', testdata1) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + c.run_sourmash("sketch", "translate", "-p", "k=21,num=500", "--dayhoff", testdata1) assert c.last_result.status == 0 - c.run_sourmash('sketch', 'translate', '-p', 'k=21,num=500', '--dayhoff', testdata2) + c.run_sourmash("sketch", "translate", "-p", "k=21,num=500", "--dayhoff", testdata2) assert c.last_result.status == 0 - c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', - '--dayhoff', '--csv', 'xxx') - true_out = '''[1. 0.94] + c.run_sourmash( + "compare", "short.fa.sig", "short2.fa.sig", "--dayhoff", "--csv", "xxx" + ) + true_out = """[1. 0.94] [0.94 1. 
] -min similarity in matrix: 0.940'''.splitlines() +min similarity in matrix: 0.940""".splitlines() for line in c.last_result.out: - cleaned_line = line.split('...')[-1].strip() + cleaned_line = line.split("...")[-1].strip() cleaned_line in true_out assert c.last_result.status == 0 @@ -584,21 +601,20 @@ def test_compare_dayhoff(runtmp): def test_compare_hp(runtmp): # test that 'compare' works with --hp moltype c = runtmp - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - c.run_sourmash('sketch', 'translate', '-p', 'k=21,num=500', '--hp', testdata1) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + c.run_sourmash("sketch", "translate", "-p", "k=21,num=500", "--hp", testdata1) assert c.last_result.status == 0 - c.run_sourmash('sketch', 'translate', '-p', 'k=21,num=500', '--hp', testdata2) + c.run_sourmash("sketch", "translate", "-p", "k=21,num=500", "--hp", testdata2) assert c.last_result.status == 0 - c.run_sourmash('compare', 'short.fa.sig', - 'short2.fa.sig', '--hp', '--csv', 'xxx') - true_out = '''[1. 0.94] + c.run_sourmash("compare", "short.fa.sig", "short2.fa.sig", "--hp", "--csv", "xxx") + true_out = """[1. 0.94] [0.94 1. ] -min similarity in matrix: 0.940'''.splitlines() +min similarity in matrix: 0.940""".splitlines() for line in c.last_result.out: - cleaned_line = line.split('...')[-1].strip() + cleaned_line = line.split("...")[-1].strip() cleaned_line in true_out assert c.last_result.status == 0 @@ -607,7 +623,7 @@ def _load_compare_matrix_and_sigs(compare_csv, sigfiles, *, ksize=31): # load in the output of 'compare' together with sigs # load compare CSV - with open(compare_csv, 'rt', newline="") as fp: + with open(compare_csv, newline="") as fp: r = iter(csv.reader(fp)) headers = next(r) @@ -619,7 +635,7 @@ def _load_compare_matrix_and_sigs(compare_csv, sigfiles, *, ksize=31): print(mat) # load in all the input signatures - idx_to_sig = dict() + idx_to_sig = {} for idx, filename in enumerate(sigfiles): ss = sourmash.load_one_signature(filename, ksize=ksize) idx_to_sig[idx] = ss @@ -631,15 +647,17 @@ def test_compare_containment(runtmp): # test compare --containment c = runtmp - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - c.run_sourmash('compare', '--containment', '-k', '31', - '--csv', 'output.csv', *testdata_sigs) + c.run_sourmash( + "compare", "--containment", "-k", "31", "--csv", "output.csv", *testdata_sigs + ) # load the matrix output - mat, idx_to_sig = _load_compare_matrix_and_sigs(c.output('output.csv'), - testdata_sigs) + mat, idx_to_sig = _load_compare_matrix_and_sigs( + c.output("output.csv"), testdata_sigs + ) # check explicit containment against output of compare for i in range(len(idx_to_sig)): @@ -657,15 +675,24 @@ def test_compare_containment_distance(runtmp): # test compare --containment --distance-matrix c = runtmp - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - c.run_sourmash('compare', '--containment', '--distance-matrix', '-k', '31', - '--csv', 'output.csv', *testdata_sigs) + c.run_sourmash( + "compare", + "--containment", + "--distance-matrix", + "-k", + "31", + "--csv", + "output.csv", + *testdata_sigs, + ) # load the matrix output - mat, idx_to_sig = _load_compare_matrix_and_sigs(c.output('output.csv'), - testdata_sigs) + mat, idx_to_sig = 
_load_compare_matrix_and_sigs( + c.output("output.csv"), testdata_sigs + ) # check explicit containment against output of compare for i in range(len(idx_to_sig)): @@ -683,15 +710,23 @@ def test_compare_max_containment(runtmp): # test compare --max-containment c = runtmp - testdata_glob = utils.get_test_data('scaled/*.sig') + testdata_glob = utils.get_test_data("scaled/*.sig") testdata_sigs = glob.glob(testdata_glob) - c.run_sourmash('compare', '--max-containment', '-k', '31', - '--csv', 'output.csv', *testdata_sigs) + c.run_sourmash( + "compare", + "--max-containment", + "-k", + "31", + "--csv", + "output.csv", + *testdata_sigs, + ) # load the matrix output - mat, idx_to_sig = _load_compare_matrix_and_sigs(c.output('output.csv'), - testdata_sigs) + mat, idx_to_sig = _load_compare_matrix_and_sigs( + c.output("output.csv"), testdata_sigs + ) # check explicit containment against output of compare for i in range(len(idx_to_sig)): @@ -709,15 +744,23 @@ def test_compare_avg_containment(runtmp): # test compare --avg-containment c = runtmp - testdata_glob = utils.get_test_data('scaled/*.sig') + testdata_glob = utils.get_test_data("scaled/*.sig") testdata_sigs = glob.glob(testdata_glob) - c.run_sourmash('compare', '--avg-containment', '-k', '31', - '--csv', 'output.csv', *testdata_sigs) + c.run_sourmash( + "compare", + "--avg-containment", + "-k", + "31", + "--csv", + "output.csv", + *testdata_sigs, + ) # load the matrix output - mat, idx_to_sig = _load_compare_matrix_and_sigs(c.output('output.csv'), - testdata_sigs) + mat, idx_to_sig = _load_compare_matrix_and_sigs( + c.output("output.csv"), testdata_sigs + ) # check explicit containment against output of compare for i in range(len(idx_to_sig)): @@ -735,93 +778,125 @@ def test_compare_max_containment_and_containment(runtmp): # make sure that can't specify both --max-containment and --containment c = runtmp - testdata_glob = utils.get_test_data('scaled/*.sig') + testdata_glob = utils.get_test_data("scaled/*.sig") testdata_sigs = glob.glob(testdata_glob) - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('compare', '--max-containment', '-k', '31', - '--containment', - '--csv', 'output.csv', *testdata_sigs) + with pytest.raises(SourmashCommandFailed): + c.run_sourmash( + "compare", + "--max-containment", + "-k", + "31", + "--containment", + "--csv", + "output.csv", + *testdata_sigs, + ) print(c.last_result.err) - assert "ERROR: cannot specify more than one containment argument!" in c.last_result.err + assert ( + "ERROR: cannot specify more than one containment argument!" in c.last_result.err + ) def test_compare_avg_containment_and_containment(runtmp): # make sure that can't specify both --avg-containment and --containment c = runtmp - testdata_glob = utils.get_test_data('scaled/*.sig') + testdata_glob = utils.get_test_data("scaled/*.sig") testdata_sigs = glob.glob(testdata_glob) - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('compare', '--avg-containment', '-k', '31', - '--containment', - '--csv', 'output.csv', *testdata_sigs) + with pytest.raises(SourmashCommandFailed): + c.run_sourmash( + "compare", + "--avg-containment", + "-k", + "31", + "--containment", + "--csv", + "output.csv", + *testdata_sigs, + ) print(c.last_result.err) - assert "ERROR: cannot specify more than one containment argument!" in c.last_result.err + assert ( + "ERROR: cannot specify more than one containment argument!" 
in c.last_result.err + ) def test_compare_avg_containment_and_max_containment(runtmp): # make sure that can't specify both --avg-containment and --max-containment c = runtmp - testdata_glob = utils.get_test_data('scaled/*.sig') + testdata_glob = utils.get_test_data("scaled/*.sig") testdata_sigs = glob.glob(testdata_glob) - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('compare', '--avg-containment', '-k', '31', - '--max-containment', - '--csv', 'output.csv', *testdata_sigs) + with pytest.raises(SourmashCommandFailed): + c.run_sourmash( + "compare", + "--avg-containment", + "-k", + "31", + "--max-containment", + "--csv", + "output.csv", + *testdata_sigs, + ) print(c.last_result.err) - assert "ERROR: cannot specify more than one containment argument!" in c.last_result.err + assert ( + "ERROR: cannot specify more than one containment argument!" in c.last_result.err + ) def test_compare_containment_abund_flatten_warning(runtmp): # check warning message about ignoring abund signatures - c = runtmp - s47 = utils.get_test_data('track_abund/47.fa.sig') - s63 = utils.get_test_data('track_abund/63.fa.sig') + c = runtmp + s47 = utils.get_test_data("track_abund/47.fa.sig") + s63 = utils.get_test_data("track_abund/63.fa.sig") - c.run_sourmash('compare', '--containment', '-k', '31', s47, s63) + c.run_sourmash("compare", "--containment", "-k", "31", s47, s63) print(c.last_result.out) print(c.last_result.err) - assert 'NOTE: --containment, --max-containment, --avg-containment, and --estimate-ani ignore signature abundances.' in \ - c.last_result.err + assert ( + "NOTE: --containment, --max-containment, --avg-containment, and --estimate-ani ignore signature abundances." + in c.last_result.err + ) def test_compare_ani_abund_flatten(runtmp): # check warning message about ignoring abund signatures c = runtmp - s47 = utils.get_test_data('track_abund/47.fa.sig') - s63 = utils.get_test_data('track_abund/63.fa.sig') + s47 = utils.get_test_data("track_abund/47.fa.sig") + s63 = utils.get_test_data("track_abund/63.fa.sig") - c.run_sourmash('compare', '--estimate-ani', '-k', '31', s47, s63) + c.run_sourmash("compare", "--estimate-ani", "-k", "31", s47, s63) print(c.last_result.out) print(c.last_result.err) - assert 'NOTE: --containment, --max-containment, --avg-containment, and --estimate-ani ignore signature abundances.' in \ - c.last_result.err + assert ( + "NOTE: --containment, --max-containment, --avg-containment, and --estimate-ani ignore signature abundances." 
+ in c.last_result.err + ) def test_compare_containment_require_scaled(runtmp): # check warning message about scaled signatures & containment c = runtmp - s47 = utils.get_test_data('num/47.fa.sig') - s63 = utils.get_test_data('num/63.fa.sig') + s47 = utils.get_test_data("num/47.fa.sig") + s63 = utils.get_test_data("num/63.fa.sig") - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('compare', '--containment', '-k', '31', s47, s63, - fail_ok=True) + with pytest.raises(SourmashCommandFailed): + c.run_sourmash("compare", "--containment", "-k", "31", s47, s63, fail_ok=True) - assert 'must use scaled signatures with --containment, --max-containment, and --avg-containment' in \ - c.last_result.err + assert ( + "must use scaled signatures with --containment, --max-containment, and --avg-containment" + in c.last_result.err + ) assert c.last_result.status != 0 @@ -829,13 +904,13 @@ def test_do_plot_comparison(runtmp): # make sure 'plot' outputs files ;) c = runtmp - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - c.run_sourmash('sketch', 'dna', '-p', 'k=31,num=500', testdata1, testdata2) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + c.run_sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) - c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '-o', 'cmp') + c.run_sourmash("compare", "short.fa.sig", "short2.fa.sig", "-o", "cmp") - c.run_sourmash('plot', 'cmp') + c.run_sourmash("plot", "cmp") assert os.path.exists(c.output("cmp.dendro.png")) assert os.path.exists(c.output("cmp.matrix.png")) @@ -845,13 +920,13 @@ def test_do_plot_comparison_2_pdf(runtmp): # test plot --pdf c = runtmp - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - c.run_sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata1, testdata2) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + c.run_sourmash("sketch", "translate", "-p", "k=31,num=500", testdata1, testdata2) - c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '-o', 'cmp') + c.run_sourmash("compare", "short.fa.sig", "short2.fa.sig", "-o", "cmp") - c.run_sourmash('plot', 'cmp', '--pdf') + c.run_sourmash("plot", "cmp", "--pdf") assert os.path.exists(c.output("cmp.dendro.pdf")) assert os.path.exists(c.output("cmp.matrix.pdf")) @@ -860,13 +935,13 @@ def test_do_plot_comparison_3(runtmp): # test plot --labels c = runtmp - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - c.run_sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata1, testdata2) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + c.run_sourmash("sketch", "translate", "-p", "k=31,num=500", testdata1, testdata2) - c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '-o', 'cmp') + c.run_sourmash("compare", "short.fa.sig", "short2.fa.sig", "-o", "cmp") - c.run_sourmash('plot', 'cmp', '--labels') + c.run_sourmash("plot", "cmp", "--labels") assert os.path.exists(c.output("cmp.dendro.png")) assert os.path.exists(c.output("cmp.matrix.png")) @@ -876,15 +951,15 @@ def test_do_plot_comparison_4_output_dir(runtmp): # test plot --output-dir c = runtmp - output_dir = c.output('xyz_test') + output_dir = c.output("xyz_test") - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - c.run_sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata1, 
testdata2) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + c.run_sourmash("sketch", "translate", "-p", "k=31,num=500", testdata1, testdata2) - c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '-o', 'cmp') + c.run_sourmash("compare", "short.fa.sig", "short2.fa.sig", "-o", "cmp") - c.run_sourmash('plot', 'cmp', '--labels', '--output-dir', output_dir) + c.run_sourmash("plot", "cmp", "--labels", "--output-dir", output_dir) assert os.path.exists(os.path.join(output_dir, "cmp.dendro.png")) assert os.path.exists(os.path.join(output_dir, "cmp.matrix.png")) @@ -896,13 +971,13 @@ def test_do_plot_comparison_5_force(runtmp): D = numpy.zeros([2, 2]) D[0, 0] = 5 - with open(c.output('cmp'), 'wb') as fp: + with open(c.output("cmp"), "wb") as fp: numpy.save(fp, D) - with open(c.output('cmp.labels.txt'), 'wt') as fp: + with open(c.output("cmp.labels.txt"), "w") as fp: fp.write("a\nb\n") - c.run_sourmash('plot', 'cmp', '--labels', '-f') + c.run_sourmash("plot", "cmp", "--labels", "-f") print(c.last_result.status, c.last_result.out, c.last_result.err) assert c.last_result.status == 0 @@ -913,14 +988,14 @@ def test_do_plot_comparison_4_fail_not_distance(runtmp): D = numpy.zeros([2, 2]) D[0, 0] = 5 - with open(c.output('cmp'), 'wb') as fp: + with open(c.output("cmp"), "wb") as fp: numpy.save(fp, D) - with open(c.output('cmp.labels.txt'), 'wt') as fp: + with open(c.output("cmp.labels.txt"), "w") as fp: fp.write("a\nb\n") - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('plot', 'cmp', '--labels', fail_ok=True) + with pytest.raises(SourmashCommandFailed): + c.run_sourmash("plot", "cmp", "--labels", fail_ok=True) print(c.last_result.status, c.last_result.out, c.last_result.err) assert c.last_result.status != 0 @@ -928,14 +1003,25 @@ def test_do_plot_comparison_4_fail_not_distance(runtmp): def test_plot_6_labels_default(runtmp): # plot --labels is default - testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') - testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') - testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') - testdata4 = utils.get_test_data('genome-s10+s11.sig') - - runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna') - - runtmp.sourmash('plot', 'cmp', '--labels') + testdata1 = utils.get_test_data("genome-s10.fa.gz.sig") + testdata2 = utils.get_test_data("genome-s11.fa.gz.sig") + testdata3 = utils.get_test_data("genome-s12.fa.gz.sig") + testdata4 = utils.get_test_data("genome-s10+s11.sig") + + runtmp.run_sourmash( + "compare", + testdata1, + testdata2, + testdata3, + testdata4, + "-o", + "cmp", + "-k", + "21", + "--dna", + ) + + runtmp.sourmash("plot", "cmp", "--labels") print(runtmp.last_result.out) @@ -949,14 +1035,25 @@ def test_plot_6_labels_default(runtmp): def test_plot_6_labels(runtmp): # specifying --labels gives the right result - testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') - testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') - testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') - testdata4 = utils.get_test_data('genome-s10+s11.sig') - - runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna') - - runtmp.sourmash('plot', 'cmp', '--labels') + testdata1 = utils.get_test_data("genome-s10.fa.gz.sig") + testdata2 = utils.get_test_data("genome-s11.fa.gz.sig") + testdata3 = utils.get_test_data("genome-s12.fa.gz.sig") + testdata4 = utils.get_test_data("genome-s10+s11.sig") + 
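+ # build a 4x4 comparison matrix across the four genome sketches, then plot it with --labels given explicitly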
+ runtmp.run_sourmash( + "compare", + testdata1, + testdata2, + testdata3, + testdata4, + "-o", + "cmp", + "-k", + "21", + "--dna", + ) + + runtmp.sourmash("plot", "cmp", "--labels") print(runtmp.last_result.out) @@ -970,14 +1067,25 @@ def test_plot_6_labels(runtmp): def test_plot_6_indices(runtmp): # test plot --indices - testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') - testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') - testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') - testdata4 = utils.get_test_data('genome-s10+s11.sig') - - runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna') - - runtmp.sourmash('plot', 'cmp', '--indices') + testdata1 = utils.get_test_data("genome-s10.fa.gz.sig") + testdata2 = utils.get_test_data("genome-s11.fa.gz.sig") + testdata3 = utils.get_test_data("genome-s12.fa.gz.sig") + testdata4 = utils.get_test_data("genome-s10+s11.sig") + + runtmp.run_sourmash( + "compare", + testdata1, + testdata2, + testdata3, + testdata4, + "-o", + "cmp", + "-k", + "21", + "--dna", + ) + + runtmp.sourmash("plot", "cmp", "--indices") print(runtmp.last_result.out) @@ -991,14 +1099,25 @@ def test_plot_6_indices(runtmp): def test_plot_6_no_labels(runtmp): # test plot --no-labels - testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') - testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') - testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') - testdata4 = utils.get_test_data('genome-s10+s11.sig') - - runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna') - - runtmp.sourmash('plot', 'cmp', '--no-labels') + testdata1 = utils.get_test_data("genome-s10.fa.gz.sig") + testdata2 = utils.get_test_data("genome-s11.fa.gz.sig") + testdata3 = utils.get_test_data("genome-s12.fa.gz.sig") + testdata4 = utils.get_test_data("genome-s10+s11.sig") + + runtmp.run_sourmash( + "compare", + testdata1, + testdata2, + testdata3, + testdata4, + "-o", + "cmp", + "-k", + "21", + "--dna", + ) + + runtmp.sourmash("plot", "cmp", "--no-labels") print(runtmp.last_result.out) @@ -1012,14 +1131,25 @@ def test_plot_6_no_labels(runtmp): def test_plot_6_no_indices(runtmp): # test plot --no-indices - testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') - testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') - testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') - testdata4 = utils.get_test_data('genome-s10+s11.sig') - - runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', 
'--dna') - - runtmp.sourmash('plot', 'cmp', '--no-labels') + testdata1 = utils.get_test_data("genome-s10.fa.gz.sig") + testdata2 = utils.get_test_data("genome-s11.fa.gz.sig") + testdata3 = utils.get_test_data("genome-s12.fa.gz.sig") + testdata4 = utils.get_test_data("genome-s10+s11.sig") + + runtmp.run_sourmash( + "compare", + testdata1, + testdata2, + testdata3, + testdata4, + "-o", + "cmp", + "-k", + "21", + "--dna", + ) + + runtmp.sourmash("plot", "cmp", "--no-indices") print(runtmp.last_result.out) @@ -1033,14 +1163,25 @@ def test_plot_6_no_indices(runtmp): def test_plot_6_no_labels_no_indices(runtmp): # test plot --no-labels --no-indices - testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') - testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') - testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') - testdata4 = utils.get_test_data('genome-s10+s11.sig') - - runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna') - - runtmp.sourmash('plot', 'cmp', '--no-labels', '--no-indices') + testdata1 = utils.get_test_data("genome-s10.fa.gz.sig") + testdata2 = utils.get_test_data("genome-s11.fa.gz.sig") + testdata3 = utils.get_test_data("genome-s12.fa.gz.sig") + testdata4 = utils.get_test_data("genome-s10+s11.sig") + + runtmp.run_sourmash( + "compare", + testdata1, + testdata2, + testdata3, + testdata4, + "-o", + "cmp", + "-k", + "21", + "--dna", + ) + + runtmp.sourmash("plot", "cmp", "--no-labels", "--no-indices") print((runtmp.last_result.out,)) @@ -1054,14 +1195,25 @@ def test_plot_6_no_labels_no_indices(runtmp): def test_plot_6_indices_labels(runtmp): # check that --labels --indices => --labels - testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') - testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') - testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') - testdata4 = utils.get_test_data('genome-s10+s11.sig') - - runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna') - - runtmp.sourmash('plot', 'cmp', '--labels', '--indices') + testdata1 = utils.get_test_data("genome-s10.fa.gz.sig") + testdata2 = utils.get_test_data("genome-s11.fa.gz.sig") + testdata3 = utils.get_test_data("genome-s12.fa.gz.sig") + testdata4 = utils.get_test_data("genome-s10+s11.sig") + + runtmp.run_sourmash( + "compare", + testdata1, + testdata2, + testdata3, + testdata4, + "-o", + "cmp", + "-k", + "21", + "--dna", + ) + + runtmp.sourmash("plot", "cmp", "--labels", "--indices") print(runtmp.last_result.out) @@ -1075,21 +1227,32 @@ def test_plot_6_indices_labels(runtmp): def test_plot_override_labeltext(runtmp): # test overriding labeltext - testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') - testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') - testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') - testdata4 = utils.get_test_data('genome-s10+s11.sig') - - runtmp.run_sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna') - - with open(runtmp.output('new.labels.txt'), 'wt') as fp: - fp.write('a\nb\nc\nd\n') - - runtmp.sourmash('plot', 'cmp', '--labeltext', 'new.labels.txt') + testdata1 = utils.get_test_data("genome-s10.fa.gz.sig") + testdata2 = utils.get_test_data("genome-s11.fa.gz.sig") + testdata3 = utils.get_test_data("genome-s12.fa.gz.sig") + testdata4 = utils.get_test_data("genome-s10+s11.sig") + + runtmp.run_sourmash( + "compare", + testdata1, + testdata2, + testdata3, + testdata4, + "-o", + "cmp", + "-k", + "21", + "--dna", + ) + + with open(runtmp.output("new.labels.txt"), "w") as fp: + fp.write("a\nb\nc\nd\n") + + runtmp.sourmash("plot", "cmp", "--labeltext", "new.labels.txt") print(runtmp.last_result.out) - assert 'loading labels from new.labels.txt' in runtmp.last_result.err + assert "loading labels from new.labels.txt" in runtmp.last_result.err expected = """\ 0\ta @@ -1101,46 +1264,59 @@ def test_plot_override_labeltext(runtmp): def test_plot_override_labeltext_fail(runtmp): # test failed override of labeltext - testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') - testdata2 = 
utils.get_test_data("genome-s11.fa.gz.sig") + testdata3 = utils.get_test_data("genome-s12.fa.gz.sig") + testdata4 = utils.get_test_data("genome-s10+s11.sig") + + runtmp.sourmash( + "compare", + testdata1, + testdata2, + testdata3, + testdata4, + "-o", + "cmp", + "-k", + "21", + "--dna", + ) + + with open(runtmp.output("new.labels.txt"), "w") as fp: + fp.write("a\nb\nc\n") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('plot', 'cmp', '--labeltext', 'new.labels.txt') + runtmp.sourmash("plot", "cmp", "--labeltext", "new.labels.txt") print(runtmp.last_result.out) print(runtmp.last_result.err) assert runtmp.last_result.status != 0 - assert 'loading labels from new.labels.txt' in runtmp.last_result.err - assert '3 labels != matrix size, exiting' in runtmp.last_result.err + assert "loading labels from new.labels.txt" in runtmp.last_result.err + assert "3 labels != matrix size, exiting" in runtmp.last_result.err def test_plot_reordered_labels_csv(runtmp): # test 'plot --csv' & correct ordering of labels c = runtmp - ss2 = utils.get_test_data('2.fa.sig') - ss47 = utils.get_test_data('47.fa.sig') - ss63 = utils.get_test_data('63.fa.sig') + ss2 = utils.get_test_data("2.fa.sig") + ss47 = utils.get_test_data("47.fa.sig") + ss63 = utils.get_test_data("63.fa.sig") - c.run_sourmash('compare', '-k', '31', '-o', 'cmp', ss2, ss47, ss63) - c.run_sourmash('plot', 'cmp', '--csv', 'neworder.csv') + c.run_sourmash("compare", "-k", "31", "-o", "cmp", ss2, ss47, ss63) + c.run_sourmash("plot", "cmp", "--csv", "neworder.csv") - with open(c.output('neworder.csv'), newline="") as fp: + with open(c.output("neworder.csv"), newline="") as fp: r = csv.DictReader(fp) akker_vals = set() for row in r: - akker_vals.add(row['CP001071.1 Akkermansia muciniphila ATCC BAA-835, complete genome']) + akker_vals.add( + row["CP001071.1 Akkermansia muciniphila ATCC BAA-835, complete genome"] + ) - assert '1.0' in akker_vals - assert '0.0' in akker_vals + assert "1.0" in akker_vals + assert "0.0" in akker_vals assert len(akker_vals) == 2 @@ -1148,35 +1324,48 @@ def test_plot_reordered_labels_csv_gz(runtmp): # test 'plot --csv' with a .gz output c = runtmp - ss2 = utils.get_test_data('2.fa.sig') - ss47 = utils.get_test_data('47.fa.sig') - ss63 = utils.get_test_data('63.fa.sig') + ss2 = utils.get_test_data("2.fa.sig") + ss47 = utils.get_test_data("47.fa.sig") + ss63 = utils.get_test_data("63.fa.sig") - c.run_sourmash('compare', '-k', '31', '-o', 'cmp', ss2, ss47, ss63) - c.run_sourmash('plot', 'cmp', '--csv', 'neworder.csv.gz') + c.run_sourmash("compare", "-k", "31", "-o", "cmp", ss2, ss47, ss63) + c.run_sourmash("plot", "cmp", "--csv", "neworder.csv.gz") - with gzip.open(c.output('neworder.csv.gz'), 'rt', newline="") as fp: + with gzip.open(c.output("neworder.csv.gz"), "rt", newline="") as fp: r = csv.DictReader(fp) akker_vals = set() for row in r: - akker_vals.add(row['CP001071.1 Akkermansia muciniphila ATCC BAA-835, complete genome']) + akker_vals.add( + row["CP001071.1 Akkermansia muciniphila ATCC BAA-835, complete genome"] + ) - assert '1.0' in akker_vals - assert '0.0' in akker_vals + assert "1.0" in akker_vals + assert "0.0" in akker_vals assert len(akker_vals) == 2 def test_plot_subsample_1(runtmp): # test plotting with --subsample - testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') - testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') - testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') - testdata4 = utils.get_test_data('genome-s10+s11.sig') - - runtmp.sourmash('compare', testdata1, testdata2, 
testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna') - - runtmp.sourmash('plot', 'cmp', '--subsample', '3') + testdata1 = utils.get_test_data("genome-s10.fa.gz.sig") + testdata2 = utils.get_test_data("genome-s11.fa.gz.sig") + testdata3 = utils.get_test_data("genome-s12.fa.gz.sig") + testdata4 = utils.get_test_data("genome-s10+s11.sig") + + runtmp.sourmash( + "compare", + testdata1, + testdata2, + testdata3, + testdata4, + "-o", + "cmp", + "-k", + "21", + "--dna", + ) + + runtmp.sourmash("plot", "cmp", "--subsample", "3") print(runtmp.last_result.out) @@ -1189,14 +1378,25 @@ def test_plot_subsample_1(runtmp): def test_plot_subsample_2(runtmp): # test plotting --subsample with --subsample-seed - testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') - testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') - testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') - testdata4 = utils.get_test_data('genome-s10+s11.sig') - - runtmp.sourmash('compare', testdata1, testdata2, testdata3, testdata4, '-o', 'cmp', '-k', '21', '--dna') - - runtmp.sourmash('plot', 'cmp', '--subsample', '3', '--subsample-seed=2') + testdata1 = utils.get_test_data("genome-s10.fa.gz.sig") + testdata2 = utils.get_test_data("genome-s11.fa.gz.sig") + testdata3 = utils.get_test_data("genome-s12.fa.gz.sig") + testdata4 = utils.get_test_data("genome-s10+s11.sig") + + runtmp.sourmash( + "compare", + testdata1, + testdata2, + testdata3, + testdata4, + "-o", + "cmp", + "-k", + "21", + "--dna", + ) + + runtmp.sourmash("plot", "cmp", "--subsample", "3", "--subsample-seed=2") print(runtmp.last_result.out) expected = """\ @@ -1208,25 +1408,25 @@ def test_plot_subsample_2(runtmp): @utils.in_tempdir def test_search_query_sig_does_not_exist(c): - testdata1 = utils.get_test_data('short.fa') - c.run_sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata1) + testdata1 = utils.get_test_data("short.fa") + c.run_sourmash("sketch", "translate", "-p", "k=31,num=500", testdata1) - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('search', 'short2.fa.sig', 'short.fa.sig', fail_ok=True) + with pytest.raises(SourmashCommandFailed): + c.run_sourmash("search", "short2.fa.sig", "short.fa.sig", fail_ok=True) print(c.last_result.status, c.last_result.out, c.last_result.err) assert c.last_result.status == -1 assert "Cannot open query file 'short2.fa.sig'" in c.last_result.err - assert len(c.last_result.err.split('\n\r')) < 5 + assert len(c.last_result.err.split("\n\r")) < 5 @utils.in_tempdir def test_search_subject_sig_does_not_exist(c): - testdata1 = utils.get_test_data('short.fa') - c.run_sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata1) + testdata1 = utils.get_test_data("short.fa") + c.run_sourmash("sketch", "translate", "-p", "k=31,num=500", testdata1) - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('search', 'short.fa.sig', 'short2.fa.sig', fail_ok=True) + with pytest.raises(SourmashCommandFailed): + c.run_sourmash("search", "short.fa.sig", "short2.fa.sig", fail_ok=True) print(c.last_result.status, c.last_result.out, c.last_result.err) assert c.last_result.status == -1 @@ -1235,12 +1435,13 @@ def test_search_subject_sig_does_not_exist(c): @utils.in_tempdir def test_search_second_subject_sig_does_not_exist(c): - testdata1 = utils.get_test_data('short.fa') - c.run_sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata1) + testdata1 = utils.get_test_data("short.fa") + c.run_sourmash("sketch", "translate", "-p", "k=31,num=500", testdata1) - with pytest.raises(SourmashCommandFailed) 
as exc: - c.run_sourmash('search', 'short.fa.sig', 'short.fa.sig', - 'short2.fa.sig', fail_ok=True) + with pytest.raises(SourmashCommandFailed): + c.run_sourmash( + "search", "short.fa.sig", "short.fa.sig", "short2.fa.sig", fail_ok=True + ) print(c.last_result.status, c.last_result.out, c.last_result.err) assert c.last_result.status == -1 @@ -1249,35 +1450,35 @@ def test_search_second_subject_sig_does_not_exist(c): @utils.in_tempdir def test_search(c): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - c.run_sourmash('sketch', 'dna', '-p', 'k=31,num=500', testdata1, testdata2) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + c.run_sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) - c.run_sourmash('search', 'short.fa.sig', 'short2.fa.sig') + c.run_sourmash("search", "short.fa.sig", "short2.fa.sig") print(c.last_result.status, c.last_result.out, c.last_result.err) - assert '1 matches' in c.last_result.out - assert '93.0%' in c.last_result.out + assert "1 matches" in c.last_result.out + assert "93.0%" in c.last_result.out def test_search_ignore_abundance(runtmp): # note: uses num signatures. - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - runtmp.sourmash('sketch', 'dna', '-p','k=31,num=500,abund', testdata1, testdata2) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500,abund", testdata1, testdata2) # Make sure there's different percent matches when using or # not using abundance - runtmp.sourmash('search', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("search", "short.fa.sig", "short2.fa.sig") out1 = runtmp.last_result.out print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '1 matches' in runtmp.last_result.out - assert '81.5%' in runtmp.last_result.out + assert "1 matches" in runtmp.last_result.out + assert "81.5%" in runtmp.last_result.out - runtmp.sourmash('search', '--ignore-abundance', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("search", "--ignore-abundance", "short.fa.sig", "short2.fa.sig") out2 = runtmp.last_result.out print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '1 matches' in runtmp.last_result.out - assert '93.0%' in runtmp.last_result.out + assert "1 matches" in runtmp.last_result.out + assert "93.0%" in runtmp.last_result.out # Make sure results are different! 
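+ # (the abundance-weighted search above reports 81.5%, vs. the flat 93.0% Jaccard similarity with --ignore-abundance, so the two outputs cannot match)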
assert out1 != out2 @@ -1285,102 +1486,104 @@ def test_search_ignore_abundance(runtmp): def test_search_abund_subj_flat(runtmp): # test Index.search_abund requires an abund subj - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('search', sig47, sig63) + runtmp.sourmash("search", sig47, sig63) - assert "'search_abund' requires subject signatures with abundance information" in str(exc.value) + assert ( + "'search_abund' requires subject signatures with abundance information" + in str(exc.value) + ) def test_search_abund_csv(runtmp): # test search with abundance signatures, look at CSV output - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - runtmp.sourmash('sketch', 'dna', '-p','k=31,scaled=1,abund', testdata1, testdata2) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + runtmp.sourmash("sketch", "dna", "-p", "k=31,scaled=1,abund", testdata1, testdata2) - runtmp.sourmash('search', 'short.fa.sig', 'short2.fa.sig', '-o', 'xxx.csv') - out1 = runtmp.last_result.out + runtmp.sourmash("search", "short.fa.sig", "short2.fa.sig", "-o", "xxx.csv") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '1 matches' in runtmp.last_result.out - assert '82.7%' in runtmp.last_result.out + assert "1 matches" in runtmp.last_result.out + assert "82.7%" in runtmp.last_result.out - with open(runtmp.output('xxx.csv'), newline="") as fp: + with open(runtmp.output("xxx.csv"), newline="") as fp: r = csv.DictReader(fp) row = next(r) print(row) - assert float(row['similarity']) == 0.8266277454288367 - assert row['md5'] == 'bf752903d635b1eb83c53fe4aae951db' - assert row['filename'].endswith('short2.fa.sig') - assert row['md5'] == 'bf752903d635b1eb83c53fe4aae951db' - assert row['query_filename'].endswith('short.fa') - assert row['query_name'] == '' - assert row['query_md5'] == '9191284a' - assert row['filename'] == 'short2.fa.sig', row['filename'] + assert float(row["similarity"]) == 0.8266277454288367 + assert row["md5"] == "bf752903d635b1eb83c53fe4aae951db" + assert row["filename"].endswith("short2.fa.sig") + assert row["md5"] == "bf752903d635b1eb83c53fe4aae951db" + assert row["query_filename"].endswith("short.fa") + assert row["query_name"] == "" + assert row["query_md5"] == "9191284a" + assert row["filename"] == "short2.fa.sig", row["filename"] @utils.in_tempdir def test_search_csv(c): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - c.run_sourmash('sketch', 'dna', '-p', 'k=31,num=500', testdata1, testdata2) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + c.run_sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) - c.run_sourmash('search', 'short.fa.sig', 'short2.fa.sig', '-o', 'xxx.csv') + c.run_sourmash("search", "short.fa.sig", "short2.fa.sig", "-o", "xxx.csv") print(c.last_result.status, c.last_result.out, c.last_result.err) - csv_file = c.output('xxx.csv') + csv_file = c.output("xxx.csv") with open(csv_file) as fp: reader = csv.DictReader(fp) row = next(reader) print(row) - assert float(row['similarity']) == 0.93 - assert row['filename'].endswith('short2.fa.sig') - assert row['md5'] == '914591cd1130aa915fe0c0c63db8f19d' - assert row['query_filename'].endswith('short.fa') 
- assert row['query_name'] == '' - assert row['query_md5'] == 'e26a306d' + assert float(row["similarity"]) == 0.93 + assert row["filename"].endswith("short2.fa.sig") + assert row["md5"] == "914591cd1130aa915fe0c0c63db8f19d" + assert row["query_filename"].endswith("short.fa") + assert row["query_name"] == "" + assert row["query_md5"] == "e26a306d" @utils.in_tempdir def test_search_lca_db(c): # can we do a 'sourmash search' on an LCA database? - query = utils.get_test_data('47.fa.sig') - lca_db = utils.get_test_data('lca/47+63.lca.json') + query = utils.get_test_data("47.fa.sig") + lca_db = utils.get_test_data("lca/47+63.lca.json") - c.run_sourmash('search', query, lca_db) + c.run_sourmash("search", query, lca_db) print(c) - assert 'NC_009665.1 Shewanella baltica OS185, complete genome' in str(c) + assert "NC_009665.1 Shewanella baltica OS185, complete genome" in str(c) def test_search_query_db_md5(runtmp): # pull a search query out of a database with an md5sum - db = utils.get_test_data('prot/protein.sbt.zip') - runtmp.run_sourmash('search', db, db, '--md5', '16869d2c8a1') + db = utils.get_test_data("prot/protein.sbt.zip") + runtmp.run_sourmash("search", db, db, "--md5", "16869d2c8a1") - assert '100.0% GCA_001593925' in str(runtmp) + assert "100.0% GCA_001593925" in str(runtmp) def test_gather_query_db_md5(runtmp, linear_gather, prefetch_gather): # pull a search query out of a database with an md5sum - db = utils.get_test_data('prot/protein.sbt.zip') - runtmp.run_sourmash('gather', db, db, '--md5', '16869d2c8a1', - linear_gather, prefetch_gather) + db = utils.get_test_data("prot/protein.sbt.zip") + runtmp.run_sourmash( + "gather", db, db, "--md5", "16869d2c8a1", linear_gather, prefetch_gather + ) - assert '340.9 kbp 100.0% 100.0% GCA_001593925' in str(runtmp) + assert "340.9 kbp 100.0% 100.0% GCA_001593925" in str(runtmp) def test_gather_query_db_md5_ambiguous(runtmp, linear_gather, prefetch_gather): c = runtmp # what if we give an ambiguous md5 prefix? - db = utils.get_test_data('prot/protein.sbt.zip') + db = utils.get_test_data("prot/protein.sbt.zip") - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('gather', db, db, '--md5', '1', linear_gather, - prefetch_gather) + with pytest.raises(SourmashCommandFailed): + c.run_sourmash("gather", db, db, "--md5", "1", linear_gather, prefetch_gather) err = c.last_result.err assert "Error! Multiple signatures start with md5 '1'" in err @@ -1388,38 +1591,46 @@ def test_gather_query_db_md5_ambiguous(runtmp, linear_gather, prefetch_gather): def test_gather_lca_db(runtmp, linear_gather, prefetch_gather): # can we do a 'sourmash gather' on an LCA database? 
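+ # note: the LCA database is saved at scaled=10000, so gather must downsample the scaled=1000 query and warn about it (both checked below)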
- query = utils.get_test_data('47+63.fa.sig') - lca_db = utils.get_test_data('lca/47+63.lca.json') + query = utils.get_test_data("47+63.fa.sig") + lca_db = utils.get_test_data("lca/47+63.lca.json") - runtmp.sourmash('gather', query, lca_db, linear_gather, prefetch_gather) + runtmp.sourmash("gather", query, lca_db, linear_gather, prefetch_gather) print(runtmp) out = runtmp.last_result.out - assert 'NC_009665.1 Shewanella baltica OS185' in out - assert 'WARNING: final scaled was 10000, vs query scaled of 1000' in out + assert "NC_009665.1 Shewanella baltica OS185" in out + assert "WARNING: final scaled was 10000, vs query scaled of 1000" in out def test_gather_csv_output_filename_bug(runtmp, linear_gather, prefetch_gather): c = runtmp # check a bug where the database filename in the output CSV was incorrect - query = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - lca_db_1 = utils.get_test_data('lca/delmont-1.lca.json') - lca_db_2 = utils.get_test_data('lca/delmont-2.lca.json') - - c.run_sourmash('gather', query, lca_db_1, lca_db_2, '-o', 'out.csv', - linear_gather, prefetch_gather) - with open(c.output('out.csv'), 'rt') as fp: + query = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + lca_db_1 = utils.get_test_data("lca/delmont-1.lca.json") + lca_db_2 = utils.get_test_data("lca/delmont-2.lca.json") + + c.run_sourmash( + "gather", + query, + lca_db_1, + lca_db_2, + "-o", + "out.csv", + linear_gather, + prefetch_gather, + ) + with open(c.output("out.csv")) as fp: r = csv.DictReader(fp) row = next(r) - assert row['filename'] == lca_db_1 + assert row["filename"] == lca_db_1 def test_compare_no_such_file(runtmp): # 'compare' fails on nonexistent files c = runtmp - with pytest.raises(SourmashCommandFailed) as e: - c.run_sourmash('compare', 'nosuchfile.sig') + with pytest.raises(SourmashCommandFailed): + c.run_sourmash("compare", "nosuchfile.sig") assert "Error while reading signatures from 'nosuchfile.sig'." in c.last_result.err @@ -1427,8 +1638,8 @@ def test_compare_no_such_file_force(runtmp): # can still run compare on nonexistent with -f c = runtmp - with pytest.raises(SourmashCommandFailed) as e: - c.run_sourmash('compare', 'nosuchfile.sig', '-f') + with pytest.raises(SourmashCommandFailed): + c.run_sourmash("compare", "nosuchfile.sig", "-f") print(c.last_result.err) - assert "Error while reading signatures from 'nosuchfile.sig'." + assert "Error while reading signatures from 'nosuchfile.sig'." in c.last_result.err @@ -1437,191 +1648,197 @@ def test_compare_no_matching_sigs(runtmp): # compare fails when no sketches found with desired ksize c = runtmp - query = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') + query = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") - with pytest.raises(SourmashCommandFailed) as exc: - c.last_result.status, c.last_result.out, c.last_result.err = \ - c.run_sourmash('compare', '-k', '100', query, fail_ok=True) + with pytest.raises(SourmashCommandFailed): + c.last_result.status, c.last_result.out, c.last_result.err = c.run_sourmash( + "compare", "-k", "100", query, fail_ok=True ) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status - assert 'warning: no signatures loaded at given ksize/molecule type' in c.last_result.err - assert 'no signatures found! exiting.' in c.last_result.err + assert ( + "warning: no signatures loaded at given ksize/molecule type" + in c.last_result.err + ) + assert "no signatures found! exiting." 
in c.last_result.err def test_compare_deduce_molecule(runtmp): # deduce DNA vs protein from query, if it is unique - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'translate', '-p', 'k=10,num=500', testdata1,testdata2) + runtmp.sourmash("sketch", "translate", "-p", "k=10,num=500", testdata1, testdata2) - runtmp.sourmash('compare', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("compare", "short.fa.sig", "short2.fa.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert 'min similarity in matrix: 0.91' in runtmp.last_result.out + assert "min similarity in matrix: 0.91" in runtmp.last_result.out def test_compare_choose_molecule_dna(runtmp): # choose molecule type - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('compute', '-k', '30', '--dna', '--protein', testdata1, testdata2) + runtmp.sourmash("compute", "-k", "30", "--dna", "--protein", testdata1, testdata2) - runtmp.sourmash('compare', '--dna', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("compare", "--dna", "short.fa.sig", "short2.fa.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert 'min similarity in matrix: 0.938' in runtmp.last_result.out + assert "min similarity in matrix: 0.938" in runtmp.last_result.out def test_compare_choose_molecule_protein(runtmp): # choose molecule type - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('compute', '-k', '30', '--dna', '--protein', testdata1, testdata2) + runtmp.sourmash("compute", "-k", "30", "--dna", "--protein", testdata1, testdata2) - runtmp.sourmash('compare', '--protein', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("compare", "--protein", "short.fa.sig", "short2.fa.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert 'min similarity in matrix: 0.91' in runtmp.last_result.out + assert "min similarity in matrix: 0.91" in runtmp.last_result.out def test_compare_no_choose_molecule_fail(runtmp): # choose molecule type - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'dna', '-p', 'k=30,num=500',testdata1) + runtmp.sourmash("sketch", "dna", "-p", "k=30,num=500", testdata1) - runtmp.sourmash('sketch', 'protein', '-p', 'k=30,num=500', testdata2) + runtmp.sourmash("sketch", "protein", "-p", "k=30,num=500", testdata2) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('compare', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("compare", "short.fa.sig", "short2.fa.sig") - assert 'multiple molecule types loaded; please specify' in runtmp.last_result.err + assert "multiple molecule types loaded; please specify" in runtmp.last_result.err assert runtmp.last_result.status != 0 def test_compare_deduce_ksize(runtmp): # deduce ksize, if it is unique - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = 
utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'dna', '-p', 'k=29,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=29,num=500", testdata1, testdata2) - runtmp.sourmash('compare', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("compare", "short.fa.sig", "short2.fa.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert 'min similarity in matrix: 0.938' in runtmp.last_result.out + assert "min similarity in matrix: 0.938" in runtmp.last_result.out def test_search_deduce_molecule(runtmp): # deduce DNA vs protein from query, if it is unique - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'translate', '-p', 'k=10,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "translate", "-p", "k=10,num=500", testdata1, testdata2) - runtmp.sourmash('search', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("search", "short.fa.sig", "short2.fa.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '1 matches' in runtmp.last_result.out - assert '(k=10, protein)' in runtmp.last_result.err + assert "1 matches" in runtmp.last_result.out + assert "(k=10, protein)" in runtmp.last_result.err def test_search_deduce_ksize(runtmp): # deduce ksize from query, if it is unique - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'translate', '-p', 'k=23,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "translate", "-p", "k=23,num=500", testdata1, testdata2) - runtmp.sourmash('search', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("search", "short.fa.sig", "short2.fa.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '1 matches' in runtmp.last_result.out - assert 'k=23' in runtmp.last_result.err + assert "1 matches" in runtmp.last_result.out + assert "k=23" in runtmp.last_result.err def test_do_sourmash_index_multik_fail(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata1) + runtmp.sourmash("sketch", "translate", "-p", "k=31,num=500", testdata1) - runtmp.sourmash('sketch', 'translate', '-p', 'k=32,num=500', testdata2) + runtmp.sourmash("sketch", "translate", "-p", "k=32,num=500", testdata2) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("index", "zzz", "short.fa.sig", "short2.fa.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) assert runtmp.last_result.status == -1 def test_do_sourmash_index_multimol_fail(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'translate', testdata1) + runtmp.sourmash("sketch", "translate", testdata1) - runtmp.sourmash('sketch', 'translate', '-p', 'k=30,num=500', testdata2) + runtmp.sourmash("sketch", "translate", "-p", "k=30,num=500", testdata2) with 
pytest.raises(SourmashCommandFailed): - runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("index", "zzz", "short.fa.sig", "short2.fa.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) assert runtmp.last_result.status == -1 def test_do_sourmash_index_multinum_fail(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata1) + runtmp.sourmash("sketch", "translate", "-p", "k=31,num=500", testdata1) - runtmp.sourmash('sketch', 'translate', '-p', 'k=31,num=1000', testdata2) + runtmp.sourmash("sketch", "translate", "-p", "k=31,num=1000", testdata2) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("index", "zzz", "short.fa.sig", "short2.fa.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) assert runtmp.last_result.status == -1 - assert 'trying to build an SBT with incompatible signatures.' in runtmp.last_result.err + assert ( + "trying to build an SBT with incompatible signatures." in runtmp.last_result.err + ) def test_do_sourmash_index_multiscaled_fail(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'dna', '-p', 'scaled=10', testdata1) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", testdata1) - runtmp.sourmash('sketch', 'dna', '-p', 'scaled=1', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=1", testdata2) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("index", "-k", "31", "zzz", "short.fa.sig", "short2.fa.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) assert runtmp.last_result.status == -1 - assert 'trying to build an SBT with incompatible signatures.' in runtmp.last_result.err + assert ( + "trying to build an SBT with incompatible signatures." 
in runtmp.last_result.err + ) @utils.in_tempdir def test_do_sourmash_index_multiscaled_rescale(c): # test sourmash index --scaled - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - c.run_sourmash('sketch', 'dna', '-p', 'scaled=10', testdata1) - c.run_sourmash('sketch', 'dna', '-p', 'scaled=1', testdata2) + c.run_sourmash("sketch", "dna", "-p", "scaled=10", testdata1) + c.run_sourmash("sketch", "dna", "-p", "scaled=1", testdata2) - c.run_sourmash('index', 'zzz', - 'short.fa.sig', - 'short2.fa.sig', - '-k', '31', - '--scaled', '10') + c.run_sourmash( + "index", "zzz", "short.fa.sig", "short2.fa.sig", "-k", "31", "--scaled", "10" + ) print(c) assert c.last_result.status == 0 @@ -1630,190 +1847,202 @@ def test_do_sourmash_index_multiscaled_rescale(c): @utils.in_tempdir def test_do_sourmash_index_multiscaled_rescale_fail(c): # test sourmash index --scaled with invalid rescaling (10 -> 5) - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - c.run_sourmash('sketch', 'dna', '-p', 'scaled=10', testdata1) - c.run_sourmash('sketch', 'dna', '-p', 'scaled=1', testdata2) + c.run_sourmash("sketch", "dna", "-p", "scaled=10", testdata1) + c.run_sourmash("sketch", "dna", "-p", "scaled=1", testdata2) # this should fail: cannot go from a scaled value of 10 to 5 with pytest.raises(SourmashCommandFailed) as e: - c.run_sourmash('index', 'zzz', - 'short.fa.sig', - 'short2.fa.sig', - '-k', '31', - '--scaled', '5') + c.run_sourmash( + "index", "zzz", "short.fa.sig", "short2.fa.sig", "-k", "31", "--scaled", "5" + ) print(e.value) assert c.last_result.status == -1 - assert 'new scaled 5 is lower than current sample scaled 10' in c.last_result.err + assert "new scaled 5 is lower than current sample scaled 10" in c.last_result.err def test_do_sourmash_sbt_search_output(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'dna', '-p', 'k=31,num=500', testdata1,testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) - runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz", "short.fa.sig", "short2.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('search', 'short.fa.sig', 'zzz', '-o', 'foo') + runtmp.sourmash("search", "short.fa.sig", "zzz", "-o", "foo") - output = Path(runtmp.output('foo')).read_text() + output = Path(runtmp.output("foo")).read_text() print(output) - assert 'e26a306d26512' in output - assert '914591cd1130aa915' in output + assert "e26a306d26512" in output + assert "914591cd1130aa915" in output # check against a bug in sbt search triggered by incorrect max Jaccard # calculation. 
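+ # (the assert on min_n_below below guards the fix: node 0, presumably the root, must record the smaller leaf's hash count of 431, since an incorrect value skews the max-Jaccard estimate used during search)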
def test_do_sourmash_sbt_search_check_bug(runtmp): # mins: 431 - testdata1 = utils.get_test_data('sbt-search-bug/nano.sig') + testdata1 = utils.get_test_data("sbt-search-bug/nano.sig") # mins: 6264 - testdata2 = utils.get_test_data('sbt-search-bug/bacteroides.sig') + testdata2 = utils.get_test_data("sbt-search-bug/bacteroides.sig") - runtmp.sourmash('index', 'zzz', testdata1, testdata2, '-k', '31') + runtmp.sourmash("index", "zzz", testdata1, testdata2, "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('search', testdata1, 'zzz') + runtmp.sourmash("search", testdata1, "zzz") - assert '1 matches' in runtmp.last_result.out + assert "1 matches" in runtmp.last_result.out - tree = load_sbt_index(runtmp.output('zzz.sbt.zip')) - assert tree._nodes[0].metadata['min_n_below'] == 431 + tree = load_sbt_index(runtmp.output("zzz.sbt.zip")) + assert tree._nodes[0].metadata["min_n_below"] == 431 def test_do_sourmash_sbt_search_empty_sig(runtmp): # mins: 431 - testdata1 = utils.get_test_data('sbt-search-bug/nano.sig') + testdata1 = utils.get_test_data("sbt-search-bug/nano.sig") # mins: 0 - testdata2 = utils.get_test_data('sbt-search-bug/empty.sig') + testdata2 = utils.get_test_data("sbt-search-bug/empty.sig") - runtmp.sourmash('index', 'zzz', testdata1, testdata2, '-k', '31') + runtmp.sourmash("index", "zzz", testdata1, testdata2, "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('search', testdata1, 'zzz') + runtmp.sourmash("search", testdata1, "zzz") - assert '1 matches' in runtmp.last_result.out + assert "1 matches" in runtmp.last_result.out - tree = load_sbt_index(runtmp.output('zzz.sbt.zip')) - assert tree._nodes[0].metadata['min_n_below'] == 1 + tree = load_sbt_index(runtmp.output("zzz.sbt.zip")) + assert tree._nodes[0].metadata["min_n_below"] == 1 def test_do_sourmash_sbt_move_and_search_output(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'dna', '-p', 'k=31,num=500', testdata1,testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) - runtmp.sourmash('index', 'zzz.sbt.json', 'short.fa.sig', 'short2.fa.sig', '-k', '31') + runtmp.sourmash( + "index", "zzz.sbt.json", "short.fa.sig", "short2.fa.sig", "-k", "31" + ) - assert os.path.exists(runtmp.output('zzz.sbt.json')) + assert os.path.exists(runtmp.output("zzz.sbt.json")) print(runtmp.last_result.out) - with open(runtmp.output('zzz.sbt.json')) as fp: + with open(runtmp.output("zzz.sbt.json")) as fp: d = json.load(fp) - assert d['storage']['args']['path'] == '.sbt.zzz' + assert d["storage"]["args"]["path"] == ".sbt.zzz" - newpath = runtmp.output('subdir') + newpath = runtmp.output("subdir") os.mkdir(newpath) # move both JSON file and subdirectory. 
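+ # (zzz.sbt.json references its storage by the relative path .sbt.zzz, per the d["storage"]["args"]["path"] check above, so the index only keeps working if the two move together)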
- shutil.move(runtmp.output('zzz.sbt.json'), newpath) - shutil.move(runtmp.output('.sbt.zzz'), newpath) + shutil.move(runtmp.output("zzz.sbt.json"), newpath) + shutil.move(runtmp.output(".sbt.zzz"), newpath) - status, out, err = utils.runscript('sourmash', - ['search', '../short.fa.sig', - 'zzz.sbt.json', '-o', 'foo'], - in_directory=newpath) + status, out, err = utils.runscript( + "sourmash", + ["search", "../short.fa.sig", "zzz.sbt.json", "-o", "foo"], + in_directory=newpath, + ) - output = Path(os.path.join(newpath, 'foo')).read_text() + output = Path(os.path.join(newpath, "foo")).read_text() print(output) - assert '914591cd1130aa91' in output - assert 'e26a306d2651' in output + assert "914591cd1130aa91" in output + assert "e26a306d2651" in output def test_search_deduce_ksize_and_select_appropriate(runtmp): # deduce ksize from query and select correct signature from DB - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'translate', '-p', 'k=24,num=500', testdata1) + runtmp.sourmash("sketch", "translate", "-p", "k=24,num=500", testdata1) # The DB contains signatures for multiple ksizes - runtmp.sourmash('sketch', 'translate', '-p', 'k=23,num=500', '-p', 'k=24,num=500', testdata2) + runtmp.sourmash( + "sketch", "translate", "-p", "k=23,num=500", "-p", "k=24,num=500", testdata2 + ) - runtmp.sourmash('search', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("search", "short.fa.sig", "short2.fa.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '1 matches' in runtmp.last_result.out - assert 'k=24' in runtmp.last_result.err + assert "1 matches" in runtmp.last_result.out + assert "k=24" in runtmp.last_result.err def test_search_deduce_ksize_not_unique(runtmp): # deduce ksize from query, fail because it is not unique - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - - runtmp.sourmash('sketch', 'translate', '-p', 'k=23,num=500', '-p', 'k=25,num=500', testdata1, testdata2) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + + runtmp.sourmash( + "sketch", + "translate", + "-p", + "k=23,num=500", + "-p", + "k=25,num=500", + testdata1, + testdata2, + ) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('search', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("search", "short.fa.sig", "short2.fa.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) assert runtmp.last_result.status == -1 - assert '2 signatures matching ksize' in runtmp.last_result.err + assert "2 signatures matching ksize" in runtmp.last_result.err @utils.in_tempdir def test_search_deduce_ksize_no_match(c): # no matching sigs in search sig list - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - c.run_sourmash('sketch', 'translate', '-p', 'k=23,num=500', testdata1) - c.run_sourmash('sketch', 'translate', '-p', 'k=25,num=500', testdata2) + c.run_sourmash("sketch", "translate", "-p", "k=23,num=500", testdata1) + c.run_sourmash("sketch", "translate", "-p", "k=25,num=500", testdata2) with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('search', 'short.fa.sig', 'short2.fa.sig') + c.run_sourmash("search", "short.fa.sig", "short2.fa.sig") assert "no 
compatible signatures found in 'short2.fa.sig'" in str(exc.value) def test_search_deduce_ksize_vs_user_specified(runtmp): # user specified ksize is not available - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'translate', '-p', 'k=23,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "translate", "-p", "k=23,num=500", testdata1, testdata2) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('search', '-k', '24', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("search", "-k", "24", "short.fa.sig", "short2.fa.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) assert runtmp.last_result.status == -1 - assert '0 signatures matching ksize' in runtmp.last_result.err + assert "0 signatures matching ksize" in runtmp.last_result.err def test_search_containment(runtmp): # search with --containment in signatures - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'dna', '-p', 'scaled=1', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=1", testdata1, testdata2) - runtmp.sourmash('search', 'short.fa.sig', 'short2.fa.sig', '--containment') + runtmp.sourmash("search", "short.fa.sig", "short2.fa.sig", "--containment") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '1 matches' in runtmp.last_result.out - assert '95.6%' in runtmp.last_result.out + assert "1 matches" in runtmp.last_result.out + assert "95.6%" in runtmp.last_result.out def test_search_containment_abund(runtmp): @@ -1830,28 +2059,34 @@ def test_search_containment_abund(runtmp): mh2.add_many((1, 5)) # build signatures - x = sourmash.SourmashSignature(mh1, name='a') - y = sourmash.SourmashSignature(mh2, name='b') + x = sourmash.SourmashSignature(mh1, name="a") + y = sourmash.SourmashSignature(mh2, name="b") # save! - with open(runtmp.output('a.sig'), 'wt') as fp: + with open(runtmp.output("a.sig"), "w") as fp: sourmash.save_signatures([x], fp) - with open(runtmp.output('b.sig'), 'wt') as fp: + with open(runtmp.output("b.sig"), "w") as fp: sourmash.save_signatures([y], fp) # run sourmash search --containment with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('search', 'a.sig', 'b.sig', '-o', 'xxx.csv', - '--containment') + runtmp.sourmash("search", "a.sig", "b.sig", "-o", "xxx.csv", "--containment") - assert "ERROR: cannot do containment searches on an abund signature; maybe specify --ignore-abundance?" in str(exc) + assert ( + "ERROR: cannot do containment searches on an abund signature; maybe specify --ignore-abundance?" + in str(exc) + ) # run sourmash search --max-containment with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('search', 'a.sig', 'b.sig', '-o', 'xxx.csv', - '--max-containment') + runtmp.sourmash( + "search", "a.sig", "b.sig", "-o", "xxx.csv", "--max-containment" + ) - assert "ERROR: cannot do containment searches on an abund signature; maybe specify --ignore-abundance?" in str(exc) + assert ( + "ERROR: cannot do containment searches on an abund signature; maybe specify --ignore-abundance?" 
+ in str(exc) + ) def test_search_containment_abund_ignore(runtmp): @@ -1868,25 +2103,32 @@ def test_search_containment_abund_ignore(runtmp): mh2.add_many((1, 5)) # build signatures - x = sourmash.SourmashSignature(mh1, name='a') - y = sourmash.SourmashSignature(mh2, name='b') + x = sourmash.SourmashSignature(mh1, name="a") + y = sourmash.SourmashSignature(mh2, name="b") # save! - with open(runtmp.output('a.sig'), 'wt') as fp: + with open(runtmp.output("a.sig"), "w") as fp: sourmash.save_signatures([x], fp) - with open(runtmp.output('b.sig'), 'wt') as fp: + with open(runtmp.output("b.sig"), "w") as fp: sourmash.save_signatures([y], fp) # run sourmash search - runtmp.sourmash('search', 'a.sig', 'b.sig', '-o', 'xxx.csv', - '--containment', '--ignore-abundance') + runtmp.sourmash( + "search", + "a.sig", + "b.sig", + "-o", + "xxx.csv", + "--containment", + "--ignore-abundance", + ) # check results - with open(runtmp.output('xxx.csv'), 'rt') as fp: + with open(runtmp.output("xxx.csv")) as fp: r = csv.DictReader(fp) row = next(r) - similarity = row['similarity'] - print(f'search output: similarity is {similarity}') + similarity = row["similarity"] + print(f"search output: similarity is {similarity}") print(mh1.contained_by(mh2)) assert float(similarity) == mh1.contained_by(mh2) @@ -1895,150 +2137,154 @@ def test_search_containment_abund_ignore(runtmp): def test_search_containment_sbt(runtmp): # search with --containment in an SBT - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'dna', '-p', 'scaled=1', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=1", testdata1, testdata2) - runtmp.sourmash('index', '-k', '31', 'zzz', 'short2.fa.sig') + runtmp.sourmash("index", "-k", "31", "zzz", "short2.fa.sig") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('search', 'short.fa.sig', 'zzz', '--containment') + runtmp.sourmash("search", "short.fa.sig", "zzz", "--containment") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '1 matches' in runtmp.last_result.out - assert '95.6%' in runtmp.last_result.out + assert "1 matches" in runtmp.last_result.out + assert "95.6%" in runtmp.last_result.out def test_search_containment_s10(runtmp): # check --containment for s10/s10-small - q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig') - q2 = utils.get_test_data('scaled/genome-s10-small.fa.gz.sig') + q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig") + q2 = utils.get_test_data("scaled/genome-s10-small.fa.gz.sig") - runtmp.sourmash('search', q1, q2, '--containment') + runtmp.sourmash("search", q1, q2, "--containment") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '1 matches' in runtmp.last_result.out - assert '16.7%' in runtmp.last_result.out + assert "1 matches" in runtmp.last_result.out + assert "16.7%" in runtmp.last_result.out def test_search_containment_s10_no_max(run): # check --containment for s10/s10-small - q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig') - q2 = utils.get_test_data('scaled/genome-s10-small.fa.gz.sig') + q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig") + q2 = utils.get_test_data("scaled/genome-s10-small.fa.gz.sig") - with pytest.raises(SourmashCommandFailed) as exc: - run.run_sourmash('search', q1, q2, '--containment', - 
'--max-containment') + with pytest.raises(SourmashCommandFailed): + run.run_sourmash("search", q1, q2, "--containment", "--max-containment") print(run.last_result.out) print(run.last_result.err) - assert "ERROR: cannot specify both --containment and --max-containment!" in run.last_result.err + assert ( + "ERROR: cannot specify both --containment and --max-containment!" + in run.last_result.err + ) def test_search_max_containment_s10_pairwise(runtmp): # check --max-containment for s10/s10-small - q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig') - q2 = utils.get_test_data('scaled/genome-s10-small.fa.gz.sig') + q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig") + q2 = utils.get_test_data("scaled/genome-s10-small.fa.gz.sig") - runtmp.sourmash('search', q1, q2,'--max-containment') + runtmp.sourmash("search", q1, q2, "--max-containment") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '1 matches' in runtmp.last_result.out - assert '100.0%' in runtmp.last_result.out + assert "1 matches" in runtmp.last_result.out + assert "100.0%" in runtmp.last_result.out def test_search_containment_s10_siglist(runtmp): # check --containment for s10/s10-small - q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig') - q2 = utils.get_test_data('scaled/*.sig') + q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig") + q2 = utils.get_test_data("scaled/*.sig") q2 = glob.glob(q2) - runtmp.sourmash('search', q1, *q2, '--containment') + runtmp.sourmash("search", q1, *q2, "--containment") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '3 matches' in runtmp.last_result.out - assert ' 16.7% ../genome-s10-small.fa.gz' in runtmp.last_result.out - assert '100.0% ../genome-s10.fa.gz' in runtmp.last_result.out - assert '100.0% ../genome-s10+s11.fa.gz' in runtmp.last_result.out + assert "3 matches" in runtmp.last_result.out + assert " 16.7% ../genome-s10-small.fa.gz" in runtmp.last_result.out + assert "100.0% ../genome-s10.fa.gz" in runtmp.last_result.out + assert "100.0% ../genome-s10+s11.fa.gz" in runtmp.last_result.out def test_search_max_containment_s10_siglist(runtmp): # check --max-containment for s10/s10-small - q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig') - q2 = utils.get_test_data('scaled/*.sig') + q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig") + q2 = utils.get_test_data("scaled/*.sig") q2 = glob.glob(q2) - runtmp.sourmash('search', q1, *q2, '--max-containment') + runtmp.sourmash("search", q1, *q2, "--max-containment") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '3 matches' in runtmp.last_result.out - assert '100.0% ../genome-s10-small.fa.gz' in runtmp.last_result.out - assert '100.0% ../genome-s10.fa.gz' in runtmp.last_result.out - assert '100.0% ../genome-s10+s11.fa.gz' in runtmp.last_result.out + assert "3 matches" in runtmp.last_result.out + assert "100.0% ../genome-s10-small.fa.gz" in runtmp.last_result.out + assert "100.0% ../genome-s10.fa.gz" in runtmp.last_result.out + assert "100.0% ../genome-s10+s11.fa.gz" in runtmp.last_result.out def test_search_containment_s10_sbt(runtmp): # check --containment for s10/s10-small - q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig') - q2 = utils.get_test_data('scaled/all.sbt.zip') + q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig") + q2 = utils.get_test_data("scaled/all.sbt.zip") - runtmp.sourmash('search', q1, q2, '--containment') + runtmp.sourmash("search", q1, q2, "--containment") 
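# Note on the expected values in these s10/s10-small tests: --containment
# reports the fraction of the *query* found in the match, so the small
# subset genome scores 16.7%; --max-containment instead takes the larger
# of the two directional containments, so the same pair reports 100.0%
# because genome-s10-small is entirely contained in genome-s10.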
print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '3 matches' in runtmp.last_result.out - assert '100.0% ../genome-s10+s11.fa.gz' in runtmp.last_result.out - assert '100.0% ../genome-s10.fa.gz' in runtmp.last_result.out - assert ' 16.7% ../genome-s10-small.fa.gz' in runtmp.last_result.out + assert "3 matches" in runtmp.last_result.out + assert "100.0% ../genome-s10+s11.fa.gz" in runtmp.last_result.out + assert "100.0% ../genome-s10.fa.gz" in runtmp.last_result.out + assert " 16.7% ../genome-s10-small.fa.gz" in runtmp.last_result.out def test_search_containment_s10_sbt_best_only(runtmp): # check --containment for s10/s10-small - q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig') - q2 = utils.get_test_data('scaled/all.sbt.zip') + q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig") + q2 = utils.get_test_data("scaled/all.sbt.zip") - runtmp.sourmash('search', q1, q2, '--containment', '--best-only') + runtmp.sourmash("search", q1, q2, "--containment", "--best-only") print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '100.0% ' in runtmp.last_result.out # there are at least two perfect matches! + assert ( + "100.0% " in runtmp.last_result.out + ) # there are at least two perfect matches! assert runtmp.last_result.status == 0 def test_search_containment_s10_sbt_empty(runtmp): # check --containment for s10/s10-small at absurd scaled/empty mh - q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig') - q2 = utils.get_test_data('scaled/all.sbt.zip') + q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig") + q2 = utils.get_test_data("scaled/all.sbt.zip") - runtmp.sourmash('search', q1, q2, '--scaled', '1e7', '--containment') + runtmp.sourmash("search", q1, q2, "--scaled", "1e7", "--containment") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '0 matches' in runtmp.last_result.out + assert "0 matches" in runtmp.last_result.out def test_search_max_containment_s10_sbt(runtmp): # check --max-containment for s10/s10-small - q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig') - q2 = utils.get_test_data('scaled/all.sbt.zip') + q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig") + q2 = utils.get_test_data("scaled/all.sbt.zip") - runtmp.sourmash('search', q1, q2, '--max-containment') + runtmp.sourmash("search", q1, q2, "--max-containment") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '3 matches' in runtmp.last_result.out - assert '100.0% ../genome-s10-small.fa.gz' in runtmp.last_result.out - assert '100.0% ../genome-s10.fa.gz' in runtmp.last_result.out - assert '100.0% ../genome-s10+s11.fa.gz' in runtmp.last_result.out + assert "3 matches" in runtmp.last_result.out + assert "100.0% ../genome-s10-small.fa.gz" in runtmp.last_result.out + assert "100.0% ../genome-s10.fa.gz" in runtmp.last_result.out + assert "100.0% ../genome-s10+s11.fa.gz" in runtmp.last_result.out def test_search_max_containment_s10_sbt_best_only(runtmp): # check --max-containment for s10/s10-small - q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig') - q2 = utils.get_test_data('scaled/all.sbt.zip') + q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig") + q2 = utils.get_test_data("scaled/all.sbt.zip") - runtmp.sourmash('search', q1, q2, '--max-containment', '--best-only') + runtmp.sourmash("search", q1, q2, "--max-containment", "--best-only") print(runtmp.last_result.out) print(runtmp.last_result.err) @@ -2048,120 +2294,142 @@ def 
test_search_max_containment_s10_sbt_best_only(runtmp): def test_search_max_containment_s10_sbt_empty(runtmp): # check --max-containment for s10/s10-small at absurd scaled/empty mh. - q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig') - q2 = utils.get_test_data('scaled/all.sbt.zip') + q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig") + q2 = utils.get_test_data("scaled/all.sbt.zip") - runtmp.sourmash('search', q1, q2, '--scaled', '1e7', '--max-containment') + runtmp.sourmash("search", q1, q2, "--scaled", "1e7", "--max-containment") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '0 matches' in runtmp.last_result.out + assert "0 matches" in runtmp.last_result.out def test_search_containment_s10_lca(runtmp): # check --containment for s10/s10-small - q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig') - q2 = utils.get_test_data('scaled/all.lca.json') + q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig") + q2 = utils.get_test_data("scaled/all.lca.json") - runtmp.sourmash('search', q1, q2, '--containment') + runtmp.sourmash("search", q1, q2, "--containment") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '3 matches' in runtmp.last_result.out - assert '100.0% 455c2f95' in runtmp.last_result.out - assert '100.0% 684aa226' in runtmp.last_result.out - assert ' 16.7% 7f7835d2' in runtmp.last_result.out + assert "3 matches" in runtmp.last_result.out + assert "100.0% 455c2f95" in runtmp.last_result.out + assert "100.0% 684aa226" in runtmp.last_result.out + assert " 16.7% 7f7835d2" in runtmp.last_result.out def test_search_max_containment_s10_lca(runtmp): # check --max-containment for s10/s10-small - q1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig') - q2 = utils.get_test_data('scaled/all.lca.json') + q1 = utils.get_test_data("scaled/genome-s10.fa.gz.sig") + q2 = utils.get_test_data("scaled/all.lca.json") - runtmp.sourmash('search', q1, q2, '--max-containment') + runtmp.sourmash("search", q1, q2, "--max-containment") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '3 matches' in runtmp.last_result.out - assert '100.0% 455c2f95' in runtmp.last_result.out - assert '100.0% 684aa226' in runtmp.last_result.out - assert '100.0% 7f7835d2' in runtmp.last_result.out + assert "3 matches" in runtmp.last_result.out + assert "100.0% 455c2f95" in runtmp.last_result.out + assert "100.0% 684aa226" in runtmp.last_result.out + assert "100.0% 7f7835d2" in runtmp.last_result.out def test_search_gzip(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna','-p','k=31,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) - data = Path(runtmp.output('short.fa.sig')).read_bytes() - with gzip.open(runtmp.output('zzz.gz'), 'wb') as fp: + data = Path(runtmp.output("short.fa.sig")).read_bytes() + with gzip.open(runtmp.output("zzz.gz"), "wb") as fp: fp.write(data) - data = Path(runtmp.output('short2.fa.sig')).read_bytes() - with gzip.open(runtmp.output('yyy.gz'), 'wb') as fp: + data = Path(runtmp.output("short2.fa.sig")).read_bytes() + with gzip.open(runtmp.output("yyy.gz"), "wb") as fp: fp.write(data) - runtmp.sourmash('search', 'zzz.gz', 'yyy.gz') + runtmp.sourmash("search", "zzz.gz", "yyy.gz") print(runtmp.last_result.status, runtmp.last_result.out, 
runtmp.last_result.err) - assert '1 matches' in runtmp.last_result.out - assert '93.0%' in runtmp.last_result.out + assert "1 matches" in runtmp.last_result.out + assert "93.0%" in runtmp.last_result.out def test_search_2(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + testdata3 = utils.get_test_data("short3.fa") - runtmp.sourmash('sketch','dna','-p','k=31,num=500', testdata1, testdata2, testdata3) + runtmp.sourmash( + "sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2, testdata3 + ) - runtmp.sourmash('search', 'short.fa.sig', 'short2.fa.sig', 'short3.fa.sig') + runtmp.sourmash("search", "short.fa.sig", "short2.fa.sig", "short3.fa.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '2 matches' in runtmp.last_result.out - assert '93.0%' in runtmp.last_result.out - assert '89.6%' in runtmp.last_result.out + assert "2 matches" in runtmp.last_result.out + assert "93.0%" in runtmp.last_result.out + assert "89.6%" in runtmp.last_result.out def test_search_3(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + testdata3 = utils.get_test_data("short3.fa") - runtmp.sourmash('sketch','dna','-p','k=31,num=500', testdata1, testdata2, testdata3) + runtmp.sourmash( + "sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2, testdata3 + ) - runtmp.sourmash('search', '-n', '1', 'short.fa.sig', 'short2.fa.sig', 'short3.fa.sig') + runtmp.sourmash( + "search", "-n", "1", "short.fa.sig", "short2.fa.sig", "short3.fa.sig" + ) print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '2 matches above threshold 0.080; showing first 1:' in runtmp.last_result.out + assert "2 matches above threshold 0.080; showing first 1:" in runtmp.last_result.out def test_search_4(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + testdata3 = utils.get_test_data("short3.fa") - runtmp.sourmash('sketch','dna','-p','k=31,num=500', testdata1, testdata2, testdata3) + runtmp.sourmash( + "sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2, testdata3 + ) - runtmp.sourmash('search', '-n', '0', 'short.fa.sig', 'short2.fa.sig', 'short3.fa.sig') + runtmp.sourmash( + "search", "-n", "0", "short.fa.sig", "short2.fa.sig", "short3.fa.sig" + ) print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '2 matches above threshold 0.080:' in runtmp.last_result.out - assert 'short2.fa' in runtmp.last_result.out - assert 'short3.fa' in runtmp.last_result.out + assert "2 matches above threshold 0.080:" in runtmp.last_result.out + assert "short2.fa" in runtmp.last_result.out + assert "short3.fa" in runtmp.last_result.out def test_search_5_num_results(runtmp): - query = utils.get_test_data('gather/combined.sig') - against = glob.glob(utils.get_test_data('gather/GCF*.sig')) + query = utils.get_test_data("gather/combined.sig") + against = glob.glob(utils.get_test_data("gather/GCF*.sig")) - runtmp.sourmash('search', '-n', '5', query, *against) + 
runtmp.sourmash("search", "-n", "5", query, *against) print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '12 matches above threshold 0.080; showing first 5:' in runtmp.last_result.out + assert ( + "12 matches above threshold 0.080; showing first 5:" in runtmp.last_result.out + ) def test_index_check_scaled_bounds_negative(runtmp): with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig', '-k', '31', '--scaled', '-5', '--dna') + runtmp.sourmash( + "index", + "zzz", + "short.fa.sig", + "short2.fa.sig", + "-k", + "31", + "--scaled", + "-5", + "--dna", + ) print(runtmp.last_result.err) @@ -2170,37 +2438,70 @@ def test_index_check_scaled_bounds_negative(runtmp): def test_index_check_scaled_bounds_less_than_minimum(runtmp): with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig', '-k', '31', '--scaled', '50', '--dna') - - assert "WARNING: scaled value should be >= 100. Continuing anyway." in runtmp.last_result.err + runtmp.sourmash( + "index", + "zzz", + "short.fa.sig", + "short2.fa.sig", + "-k", + "31", + "--scaled", + "50", + "--dna", + ) + + assert ( + "WARNING: scaled value should be >= 100. Continuing anyway." + in runtmp.last_result.err + ) def test_index_check_scaled_bounds_more_than_maximum(runtmp): with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig', '-k', '31', '--scaled', '1e9', '--dna') - - assert "WARNING: scaled value should be <= 1e6. Continuing anyway." in runtmp.last_result.err + runtmp.sourmash( + "index", + "zzz", + "short.fa.sig", + "short2.fa.sig", + "-k", + "31", + "--scaled", + "1e9", + "--dna", + ) + + assert ( + "WARNING: scaled value should be <= 1e6. Continuing anyway." + in runtmp.last_result.err + ) @utils.in_tempdir def test_index_metagenome_fromfile(c): # test index --from-file - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") # construct a file list - with open(c.output('sig.list'), 'wt') as fp: + with open(c.output("sig.list"), "w") as fp: fp.write("\n".join(testdata_sigs)) - cmd = ['index', 'gcf_all', testdata_sigs[0], '-k', '21', - '--from-file', c.output('sig.list')] + cmd = [ + "index", + "gcf_all", + testdata_sigs[0], + "-k", + "21", + "--from-file", + c.output("sig.list"), + ] c.run_sourmash(*cmd) - assert os.path.exists(c.output('gcf_all.sbt.zip')) + assert os.path.exists(c.output("gcf_all.sbt.zip")) - cmd = 'search {} gcf_all -k 21'.format(query_sig) + cmd = f"search {query_sig} gcf_all -k 21" cmd = cmd.split() c.run_sourmash(*cmd) @@ -2208,28 +2509,31 @@ def test_index_metagenome_fromfile(c): print(out) print(c.last_result.err) - assert ' 33.2% NC_003198.1 Salmonella enterica subsp. enterica serovar T...' in out - assert '12 matches above threshold 0.080; showing first 3:' in out + assert ( + " 33.2% NC_003198.1 Salmonella enterica subsp. enterica serovar T..." 
+ in out + ) + assert "12 matches above threshold 0.080; showing first 3:" in out + @utils.in_tempdir def test_index_metagenome_fromfile_no_cmdline_sig(c): # test index --from-file - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") # construct a file list - with open(c.output('sig.list'), 'wt') as fp: + with open(c.output("sig.list"), "w") as fp: fp.write("\n".join(testdata_sigs)) - cmd = ['index', 'gcf_all', '-k', '21', - '--from-file', c.output('sig.list')] + cmd = ["index", "gcf_all", "-k", "21", "--from-file", c.output("sig.list")] c.run_sourmash(*cmd) - assert os.path.exists(c.output('gcf_all.sbt.zip')) + assert os.path.exists(c.output("gcf_all.sbt.zip")) - cmd = 'search {} gcf_all -k 21'.format(query_sig) + cmd = f"search {query_sig} gcf_all -k 21" cmd = cmd.split() c.run_sourmash(*cmd) @@ -2237,81 +2541,98 @@ def test_index_metagenome_fromfile_no_cmdline_sig(c): print(out) print(c.last_result.err) - assert ' 33.2% NC_003198.1 Salmonella enterica subsp. enterica serovar T' in out - assert '12 matches above threshold 0.080; showing first 3:' in out + assert ( + " 33.2% NC_003198.1 Salmonella enterica subsp. enterica serovar T" in out + ) + assert "12 matches above threshold 0.080; showing first 3:" in out def test_search_metagenome(runtmp): - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) runtmp.sourmash(*cmd) - assert os.path.exists(runtmp.output('gcf_all.sbt.zip')) + assert os.path.exists(runtmp.output("gcf_all.sbt.zip")) - runtmp.sourmash('search', query_sig, 'gcf_all', '-k', '21') + runtmp.sourmash("search", query_sig, "gcf_all", "-k", "21") print(runtmp.last_result.out) print(runtmp.last_result.err) - assert ' 33.2% NC_003198.1 Salmonella enterica subsp. enterica serovar T' in runtmp.last_result.out - assert '12 matches above threshold 0.080; showing first 3:' in runtmp.last_result.out + assert ( + " 33.2% NC_003198.1 Salmonella enterica subsp. enterica serovar T" + in runtmp.last_result.out + ) + assert ( + "12 matches above threshold 0.080; showing first 3:" in runtmp.last_result.out + ) def test_search_metagenome_traverse(runtmp): - testdata_dir = utils.get_test_data('gather') + testdata_dir = utils.get_test_data("gather") - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - runtmp.sourmash('search', query_sig, testdata_dir, '-k', '21') + runtmp.sourmash("search", query_sig, testdata_dir, "-k", "21") print(runtmp.last_result.out) print(runtmp.last_result.err) - assert ' 33.2% NC_003198.1 Salmonella enterica subsp. enterica serovar T' in runtmp.last_result.out - assert '13 matches above threshold 0.080; showing first 3:' in runtmp.last_result.out + assert ( + " 33.2% NC_003198.1 Salmonella enterica subsp. 
enterica serovar T" + in runtmp.last_result.out + ) + assert ( + "13 matches above threshold 0.080; showing first 3:" in runtmp.last_result.out + ) def test_search_metagenome_traverse_check_csv(runtmp): # this test confirms that the CSV 'filename' output for signatures loaded # via directory traversal properly contains the actual path to the # signature file from which the signature was loaded. - testdata_dir = utils.get_test_data('gather') + testdata_dir = utils.get_test_data("gather") - query_sig = utils.get_test_data('gather/combined.sig') - out_csv = runtmp.output('out.csv') + query_sig = utils.get_test_data("gather/combined.sig") + out_csv = runtmp.output("out.csv") - runtmp.sourmash('search', query_sig, testdata_dir, '-k', '21', '-o', out_csv) + runtmp.sourmash("search", query_sig, testdata_dir, "-k", "21", "-o", out_csv) print(runtmp.last_result.out) print(runtmp.last_result.err) - with open(out_csv, 'rt') as fp: + with open(out_csv) as fp: prefix_len = len(testdata_dir) r = csv.DictReader(fp) for row in r: print(row) - filename = row['filename'] + filename = row["filename"] assert filename.startswith(testdata_dir), filename # should have full path to file sig was loaded from assert len(filename) > prefix_len - assert ' 33.2% NC_003198.1 Salmonella enterica subsp. enterica serovar T' in runtmp.last_result.out - assert '13 matches above threshold 0.080; showing first 3:' in runtmp.last_result.out + assert ( + " 33.2% NC_003198.1 Salmonella enterica subsp. enterica serovar T" + in runtmp.last_result.out + ) + assert ( + "13 matches above threshold 0.080; showing first 3:" in runtmp.last_result.out + ) @utils.in_thisdir def test_search_incompatible(c): - num_sig = utils.get_test_data('num/47.fa.sig') - scaled_sig = utils.get_test_data('47.fa.sig') + num_sig = utils.get_test_data("num/47.fa.sig") + scaled_sig = utils.get_test_data("47.fa.sig") - with pytest.raises(SourmashCommandFailed) as exc: + with pytest.raises(SourmashCommandFailed): c.run_sourmash("search", scaled_sig, num_sig, fail_ok=True) assert c.last_result.status != 0 print(c.last_result.out) @@ -2324,52 +2645,61 @@ def test_search_incompatible(c): def test_search_traverse_incompatible(c): # build a directory with some signatures in it, search for compatible # signatures. 
- searchdir = c.output('searchme') + searchdir = c.output("searchme") os.mkdir(searchdir) - num_sig = utils.get_test_data('num/47.fa.sig') - scaled_sig = utils.get_test_data('47.fa.sig') - shutil.copyfile(num_sig, c.output('searchme/num.sig')) - shutil.copyfile(scaled_sig, c.output('searchme/scaled.sig')) + num_sig = utils.get_test_data("num/47.fa.sig") + scaled_sig = utils.get_test_data("47.fa.sig") + shutil.copyfile(num_sig, c.output("searchme/num.sig")) + shutil.copyfile(scaled_sig, c.output("searchme/scaled.sig")) - c.run_sourmash("search", scaled_sig, c.output('searchme')) - assert '100.0% NC_009665.1 Shewanella baltica OS185, complete genome' in c.last_result.out + c.run_sourmash("search", scaled_sig, c.output("searchme")) + assert ( + "100.0% NC_009665.1 Shewanella baltica OS185, complete genome" + in c.last_result.out + ) def test_search_check_scaled_bounds_negative(runtmp): - testdata_glob = utils.get_test_data('gather/GCF*.sig') - testdata_sigs = glob.glob(testdata_glob) + testdata_glob = utils.get_test_data("gather/GCF*.sig") + glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('search', query_sig, 'gcf_all', '-k', '21', '--scaled', '-5') + runtmp.sourmash("search", query_sig, "gcf_all", "-k", "21", "--scaled", "-5") assert "ERROR: scaled value must be positive" in runtmp.last_result.err def test_search_check_scaled_bounds_less_than_minimum(runtmp): - testdata_glob = utils.get_test_data('gather/GCF*.sig') - testdata_sigs = glob.glob(testdata_glob) + testdata_glob = utils.get_test_data("gather/GCF*.sig") + glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('search', query_sig, 'gcf_all', '-k', '21', '--scaled', '50') + runtmp.sourmash("search", query_sig, "gcf_all", "-k", "21", "--scaled", "50") - assert "WARNING: scaled value should be >= 100. Continuing anyway." in runtmp.last_result.err + assert ( + "WARNING: scaled value should be >= 100. Continuing anyway." + in runtmp.last_result.err + ) def test_search_check_scaled_bounds_more_than_maximum(runtmp): - testdata_glob = utils.get_test_data('gather/GCF*.sig') - testdata_sigs = glob.glob(testdata_glob) + testdata_glob = utils.get_test_data("gather/GCF*.sig") + glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('search', query_sig, 'gcf_all', '-k', '21', '--scaled', '1e9') + runtmp.sourmash("search", query_sig, "gcf_all", "-k", "21", "--scaled", "1e9") - assert "WARNING: scaled value should be <= 1e6. Continuing anyway." in runtmp.last_result.err + assert ( + "WARNING: scaled value should be <= 1e6. Continuing anyway." + in runtmp.last_result.err + ) # explanation: you cannot downsample a scaled SBT to match a scaled @@ -2377,77 +2707,108 @@ def test_search_check_scaled_bounds_more_than_maximum(runtmp): # (you *can* downsample a signature to match an SBT.) 
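The asymmetry described in the comment above is easy to see in the Python
API. A minimal sketch (illustrative only; the hash values are made up, and
the exact exception raised on an invalid downsample is an assumption based
on sourmash 4.x behavior):

    import sourmash

    mh = sourmash.MinHash(n=0, ksize=21, scaled=10000)  # scaled sketch
    mh.add_many(range(0, 10**6, 97))  # hypothetical hash values

    # a sketch can always be downsampled to a coarser (larger) scaled value:
    coarse = mh.downsample(scaled=100000)
    assert coarse.scaled == 100000

    # ...but not to a finer (smaller) one -- the discarded hashes are gone:
    try:
        mh.downsample(scaled=1000)
    except ValueError:
        pass  # sourmash refuses rather than invent missing hashes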
def test_search_metagenome_sbt_downsample_fail(runtmp): # test downsample on SBT => failure, with --fail-on-empty-databases - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) runtmp.sourmash(*cmd) - assert os.path.exists(runtmp.output('gcf_all.sbt.zip')) + assert os.path.exists(runtmp.output("gcf_all.sbt.zip")) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('search', query_sig, 'gcf_all', '-k', '21', '--scaled', '100000') + runtmp.sourmash( + "search", query_sig, "gcf_all", "-k", "21", "--scaled", "100000" + ) print(runtmp.last_result.out) print(runtmp.last_result.err) assert runtmp.last_result.status == -1 assert "ERROR: cannot use 'gcf_all' for this query." in runtmp.last_result.err - assert "search scaled value 100000 is less than database scaled value of 10000" in runtmp.last_result.err + assert ( + "search scaled value 100000 is less than database scaled value of 10000" + in runtmp.last_result.err + ) def test_search_metagenome_sbt_downsample_nofail(runtmp): # test downsample on SBT => failure but ok with --no-fail-on-empty-database - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) runtmp.sourmash(*cmd) - assert os.path.exists(runtmp.output('gcf_all.sbt.zip')) + assert os.path.exists(runtmp.output("gcf_all.sbt.zip")) - runtmp.sourmash('search', query_sig, 'gcf_all', '-k', '21', '--scaled', '100000', '--no-fail-on-empty-database') + runtmp.sourmash( + "search", + query_sig, + "gcf_all", + "-k", + "21", + "--scaled", + "100000", + "--no-fail-on-empty-database", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) assert runtmp.last_result.status == 0 assert "ERROR: cannot use 'gcf_all' for this query." 
in runtmp.last_result.err - assert "search scaled value 100000 is less than database scaled value of 10000" in runtmp.last_result.err + assert ( + "search scaled value 100000 is less than database scaled value of 10000" + in runtmp.last_result.err + ) assert "0 matches" in runtmp.last_result.out def test_search_metagenome_downsample_containment(runtmp): - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) runtmp.sourmash(*cmd) - assert os.path.exists(runtmp.output('gcf_all.sbt.zip')) + assert os.path.exists(runtmp.output("gcf_all.sbt.zip")) - runtmp.sourmash('search', query_sig, 'gcf_all', '-k', '21', '--scaled', '100000', '--containment') + runtmp.sourmash( + "search", + query_sig, + "gcf_all", + "-k", + "21", + "--scaled", + "100000", + "--containment", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert ' 32.9% NC_003198.1 Salmonella enterica subsp. enterica serovar T' in runtmp.last_result.out - assert '12 matches above threshold 0.080; showing first 3:' in runtmp.last_result.out + assert ( + " 32.9% NC_003198.1 Salmonella enterica subsp. enterica serovar T" + in runtmp.last_result.out + ) + assert ( + "12 matches above threshold 0.080; showing first 3:" in runtmp.last_result.out + ) @utils.in_tempdir @@ -2455,36 +2816,46 @@ def test_search_metagenome_downsample_index(c): # does same search as search_metagenome_downsample_containment but # rescales during indexing - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") # downscale during indexing, rather than during search. - c.run_sourmash('index', 'gcf_all', *testdata_sigs, '-k', '21', - '--scaled', '100000') + c.run_sourmash("index", "gcf_all", *testdata_sigs, "-k", "21", "--scaled", "100000") - assert os.path.exists(c.output('gcf_all.sbt.zip')) + assert os.path.exists(c.output("gcf_all.sbt.zip")) - c.run_sourmash('search', query_sig, 'gcf_all', '-k', '21', - '--containment') + c.run_sourmash("search", query_sig, "gcf_all", "-k", "21", "--containment") print(c) - assert ' 32.9% NC_003198.1 Salmonella enterica subsp. enterica serovar T' in str( - c) - assert ' 29.7% NC_003197.2 Salmonella enterica subsp. enterica serovar T' in str( - c) - assert '12 matches above threshold 0.080; showing first 3:' in str(c) + assert ( + " 32.9% NC_003198.1 Salmonella enterica subsp. enterica serovar T" + in str(c) + ) + assert ( + " 29.7% NC_003197.2 Salmonella enterica subsp. 
enterica serovar T" + in str(c) + ) + assert "12 matches above threshold 0.080; showing first 3:" in str(c) def test_search_with_picklist(runtmp): # test 'sourmash search' with picklists - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') - - runtmp.sourmash('search', metag_sig, *gcf_sigs, '--containment', - '-k', '21', '--picklist', f"{picklist}:md5:md5") + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") + + runtmp.sourmash( + "search", + metag_sig, + *gcf_sigs, + "--containment", + "-k", + "21", + "--picklist", + f"{picklist}:md5:md5", + ) err = runtmp.last_result.err print(err) @@ -2502,12 +2873,20 @@ def test_search_with_picklist(runtmp): def test_search_with_picklist_exclude(runtmp): # test 'sourmash search' with picklists - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') - - runtmp.sourmash('search', metag_sig, *gcf_sigs, '--containment', - '-k', '21', '--picklist', f"{picklist}:md5:md5:exclude") + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") + + runtmp.sourmash( + "search", + metag_sig, + *gcf_sigs, + "--containment", + "-k", + "21", + "--picklist", + f"{picklist}:md5:md5:exclude", + ) err = runtmp.last_result.err print(err) @@ -2524,11 +2903,19 @@ def test_search_with_picklist_exclude(runtmp): def test_search_with_pattern_include(runtmp): # test 'sourmash search' with --include-db-pattern - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - - runtmp.sourmash('search', metag_sig, *gcf_sigs, '--containment', - '-k', '21', '--include', "thermotoga") + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + + runtmp.sourmash( + "search", + metag_sig, + *gcf_sigs, + "--containment", + "-k", + "21", + "--include", + "thermotoga", + ) err = runtmp.last_result.err print(err) @@ -2543,11 +2930,19 @@ def test_search_with_pattern_include(runtmp): def test_search_with_pattern_exclude(runtmp): # test 'sourmash search' with --exclude-db-pattern - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - - runtmp.sourmash('search', metag_sig, *gcf_sigs, '--containment', - '-k', '21', '--exclude', "thermotoga") + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + + runtmp.sourmash( + "search", + metag_sig, + *gcf_sigs, + "--containment", + "-k", + "21", + "--exclude", + "thermotoga", + ) err = runtmp.last_result.err print(err) @@ -2562,13 +2957,12 @@ def test_search_with_pattern_exclude(runtmp): def test_search_empty_db_fail(runtmp): # search should fail on empty db with --fail-on-empty-database - query = utils.get_test_data('2.fa.sig') - against = utils.get_test_data('47.fa.sig') - against2 = utils.get_test_data('lca/47+63.lca.json') + query = utils.get_test_data("2.fa.sig") + against = utils.get_test_data("47.fa.sig") + against2 = utils.get_test_data("lca/47+63.lca.json") with 
pytest.raises(SourmashCommandFailed): - runtmp.sourmash('search', query, against, against2, '-k', '51') - + runtmp.sourmash("search", query, against, against2, "-k", "51") err = runtmp.last_result.err assert "no compatible signatures found in " in err @@ -2576,12 +2970,13 @@ def test_search_empty_db_fail(runtmp): def test_search_empty_db_nofail(runtmp): # search should not fail on empty db with --no-fail-on-empty-database - query = utils.get_test_data('2.fa.sig') - against = utils.get_test_data('47.fa.sig') - against2 = utils.get_test_data('lca/47+63.lca.json') + query = utils.get_test_data("2.fa.sig") + against = utils.get_test_data("47.fa.sig") + against2 = utils.get_test_data("lca/47+63.lca.json") - runtmp.sourmash('search', query, against, against2, '-k', '51', - '--no-fail-on-empty-data') + runtmp.sourmash( + "search", query, against, against2, "-k", "51", "--no-fail-on-empty-data" + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -2589,206 +2984,239 @@ def test_search_empty_db_nofail(runtmp): print(err) assert "no compatible signatures found in " in err - assert "ksize on this database is 31; this is different from requested ksize of 51" in err + assert ( + "ksize on this database is 31; this is different from requested ksize of 51" + in err + ) assert "loaded 50 total signatures from 2 locations" in err assert "after selecting signatures compatible with search, 0 remain." in err def test_mash_csv_to_sig(runtmp): - testdata1 = utils.get_test_data('short.fa.msh.dump') - testdata2 = utils.get_test_data('short.fa') + testdata1 = utils.get_test_data("short.fa.msh.dump") + testdata2 = utils.get_test_data("short.fa") - runtmp.sourmash('import_csv', testdata1, '-o', 'xxx.sig') + runtmp.sourmash("import_csv", testdata1, "-o", "xxx.sig") - runtmp.sourmash('sketch', 'dna', '-p','k=31,num=970',testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=970", testdata2) - runtmp.sourmash('search', '-k', '31', 'short.fa.sig', 'xxx.sig') + runtmp.sourmash("search", "-k", "31", "short.fa.sig", "xxx.sig") print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err) - assert '1 matches' in runtmp.last_result.out - assert '100.0% short.fa' in runtmp.last_result.out + assert "1 matches" in runtmp.last_result.out + assert "100.0% short.fa" in runtmp.last_result.out def test_do_sourmash_index_bad_args(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna','-p','k=31,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig', '-k', '31', '--dna', '--protein') + runtmp.sourmash( + "index", + "zzz", + "short.fa.sig", + "short2.fa.sig", + "-k", + "31", + "--dna", + "--protein", + ) print(runtmp.last_result.out, runtmp.last_result.err) - assert 'cannot specify more than one of --dna/--rna/--nucleotide/--protein/--hp/--dayhoff' in runtmp.last_result.err + assert ( + "cannot specify more than one of --dna/--rna/--nucleotide/--protein/--hp/--dayhoff" + in runtmp.last_result.err + ) assert runtmp.last_result.status != 0 def test_do_sourmash_sbt_search(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - 
runtmp.sourmash('sketch','dna','-p','k=31,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) - runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz", "short.fa.sig", "short2.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('search', 'short.fa.sig', 'zzz') + runtmp.sourmash("search", "short.fa.sig", "zzz") print(runtmp.last_result.out) - assert 'short.fa' in runtmp.last_result.out - assert 'short2.fa' in runtmp.last_result.out + assert "short.fa" in runtmp.last_result.out + assert "short2.fa" in runtmp.last_result.out def test_do_sourmash_sbt_search_wrong_ksize(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'translate', '-p', 'k=31,num=500', '-p', 'k=51,num=500', testdata1, testdata2) + runtmp.sourmash( + "sketch", + "translate", + "-p", + "k=31,num=500", + "-p", + "k=51,num=500", + testdata1, + testdata2, + ) - runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz", "short.fa.sig", "short2.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('search', '-k', '51', 'short.fa.sig', 'zzz') + runtmp.sourmash("search", "-k", "51", "short.fa.sig", "zzz") assert runtmp.last_result.status == -1 print(runtmp.last_result.out) print(runtmp.last_result.err) assert "ERROR: cannot use 'zzz' for this query." in runtmp.last_result.err - assert "search ksize 51 is different from database ksize 31" in runtmp.last_result.err + assert ( + "search ksize 51 is different from database ksize 31" in runtmp.last_result.err + ) def test_do_sourmash_sbt_search_multiple(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna','-p','k=31,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) - runtmp.sourmash('index', 'zzz', 'short.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz", "short.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('index', 'zzz2', 'short2.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz2", "short2.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz2.sbt.zip')) + assert os.path.exists(runtmp.output("zzz2.sbt.zip")) - runtmp.sourmash('search', 'short.fa.sig', 'zzz', 'zzz2') + runtmp.sourmash("search", "short.fa.sig", "zzz", "zzz2") print(runtmp.last_result.out) - assert 'short.fa' in runtmp.last_result.out - assert 'short2.fa' in runtmp.last_result.out + assert "short.fa" in runtmp.last_result.out + assert "short2.fa" in runtmp.last_result.out def test_do_sourmash_sbt_search_and_sigs(runtmp): # search an SBT and a signature at same time. 
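# (Positional arguments after the query can mix standalone .sig files and
# SBT databases freely; matches from both are merged into one result list,
# which is what the combined short.fa/short2.fa output below checks.)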
- testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna','-p','k=31,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) - runtmp.sourmash('index', 'zzz', 'short.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz", "short.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('search', 'short.fa.sig', 'zzz', 'short2.fa.sig') + runtmp.sourmash("search", "short.fa.sig", "zzz", "short2.fa.sig") print(runtmp.last_result.out) - assert 'short.fa' in runtmp.last_result.out - assert 'short2.fa' in runtmp.last_result.out + assert "short.fa" in runtmp.last_result.out + assert "short2.fa" in runtmp.last_result.out def test_do_sourmash_sbt_search_downsample(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'dna', '-p', 'k=31,scaled=10', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,scaled=10", testdata1, testdata2) - testdata1 = utils.get_test_data('short.fa') + testdata1 = utils.get_test_data("short.fa") - runtmp.sourmash('sketch','dna','-p','k=31,scaled=5', '-o', 'query.sig', testdata1) + runtmp.sourmash( + "sketch", "dna", "-p", "k=31,scaled=5", "-o", "query.sig", testdata1 + ) - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("index", "-k", "31", "zzz", "short.fa.sig", "short2.fa.sig") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('search', 'query.sig', 'zzz') + runtmp.sourmash("search", "query.sig", "zzz") print(runtmp.last_result.out) - assert 'short.fa' in runtmp.last_result.out - assert 'short2.fa' in runtmp.last_result.out + assert "short.fa" in runtmp.last_result.out + assert "short2.fa" in runtmp.last_result.out def test_do_sourmash_sbt_search_downsample_2(runtmp): - testdata1 = utils.get_test_data('lca-root/TARA_MED_MAG_00029.fa.sig') - testdata2 = utils.get_test_data('lca-root/TOBG_MED-875.fna.gz.sig') + testdata1 = utils.get_test_data("lca-root/TARA_MED_MAG_00029.fa.sig") + testdata2 = utils.get_test_data("lca-root/TOBG_MED-875.fna.gz.sig") - sbtname = 'foo' + sbtname = "foo" - runtmp.sourmash('index', '-k', '31', sbtname, testdata2) + runtmp.sourmash("index", "-k", "31", sbtname, testdata2) assert runtmp.last_result.status == 0 with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('search', testdata1, sbtname, '--scaled=100000', '--threshold=0.01') + runtmp.sourmash( + "search", testdata1, sbtname, "--scaled=100000", "--threshold=0.01" + ) assert runtmp.last_result.status == -1 print(runtmp.last_result.out) print(runtmp.last_result.err) assert "ERROR: cannot use 'foo' for this query." in runtmp.last_result.err - assert "search scaled value 100000 is less than database scaled value of 2000" in runtmp.last_result.err + assert ( + "search scaled value 100000 is less than database scaled value of 2000" + in runtmp.last_result.err + ) @utils.in_tempdir def test_do_sourmash_index_abund(c): # 'sourmash index' should flatten signatures w/track_abund. 
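# (SBT internal nodes are presence/absence Bloom filters and cannot carry
# abundances, so indexing flattens each signature; the track_abundance
# True -> False checks below verify exactly that.)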
- testdata2 = utils.get_test_data('lca-root/TOBG_MED-875.fna.gz.sig') + testdata2 = utils.get_test_data("lca-root/TOBG_MED-875.fna.gz.sig") - with open(testdata2, 'rt') as fp: + with open(testdata2): ss = sourmash.load_one_signature(testdata2, ksize=31) assert ss.minhash.track_abundance == True - sbtname = 'foo' + sbtname = "foo" - c.run_sourmash('index', '-k', '31', sbtname, testdata2) + c.run_sourmash("index", "-k", "31", sbtname, testdata2) for kk in sourmash.load_file_as_signatures(c.output(sbtname)): assert kk.minhash.track_abundance == False def test_do_sourmash_index_single(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna','-p','k=31,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig') + runtmp.sourmash("index", "-k", "31", "zzz", "short.fa.sig") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('search', 'short.fa.sig', 'zzz') + runtmp.sourmash("search", "short.fa.sig", "zzz") print(runtmp.last_result.out) - assert 'short.fa' in runtmp.last_result.out + assert "short.fa" in runtmp.last_result.out def test_do_sourmash_sbt_search_selectprot(runtmp): # index should fail when run on signatures with multiple types - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - args = ['sketch', 'dna', '-p', 'k=30,num=500',testdata1, testdata2] + args = ["sketch", "dna", "-p", "k=30,num=500", testdata1, testdata2] runtmp.sourmash(*args) - args = ['index', '-k', '31', 'zzz', 'short.fa.sig', 'short2.fa.sig'] + args = ["index", "-k", "31", "zzz", "short.fa.sig", "short2.fa.sig"] with pytest.raises(SourmashCommandFailed): runtmp.sourmash(*args) @@ -2801,122 +3229,130 @@ def test_do_sourmash_sbt_search_selectprot(runtmp): def test_do_sourmash_search_multimoltype_query(runtmp): # 'search' should fail if multiple sigs are given as query, due to # multiple molecule types. - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") # first, calculate signatures with multiple molecule types - args = ['sketch', 'translate', testdata1, testdata2, - '-p', 'protein', '-p', 'dayhoff'] + args = [ + "sketch", + "translate", + testdata1, + testdata2, + "-p", + "protein", + "-p", + "dayhoff", + ] runtmp.sourmash(*args) # now, index one of 'em - args = ['index', 'zzz', 'short.fa.sig', 'short2.fa.sig', '--protein'] + args = ["index", "zzz", "short.fa.sig", "short2.fa.sig", "--protein"] runtmp.sourmash(*args) # output exists, yes? - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) # now, try searching. Should raise error. 
- args = ['search', 'short.fa.sig', 'zzz'] - with pytest.raises(SourmashCommandFailed) as exc: + args = ["search", "short.fa.sig", "zzz"] + with pytest.raises(SourmashCommandFailed): runtmp.sourmash(*args) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'need exactly one' in runtmp.last_result.err + assert "need exactly one" in runtmp.last_result.err def test_do_sourmash_index_traverse(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna','-p','k=31,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) - runtmp.sourmash('index', '-k', '31', 'zzz', '.') + runtmp.sourmash("index", "-k", "31", "zzz", ".") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) - assert 'loaded 2 sigs; saving SBT under' in runtmp.last_result.err + assert os.path.exists(runtmp.output("zzz.sbt.zip")) + assert "loaded 2 sigs; saving SBT under" in runtmp.last_result.err - runtmp.sourmash('search', 'short.fa.sig', 'zzz') + runtmp.sourmash("search", "short.fa.sig", "zzz") print(runtmp.last_result.out) - assert 'short.fa' in runtmp.last_result.out - assert 'short2.fa' in runtmp.last_result.out + assert "short.fa" in runtmp.last_result.out + assert "short2.fa" in runtmp.last_result.out @utils.in_tempdir def test_do_sourmash_index_traverse_force(c): # test loading of files that don't end with .sig with -f - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - outdir = c.output('sigs') + outdir = c.output("sigs") os.mkdir(outdir) - out1 = os.path.join(outdir, 'short1') - out2 = os.path.join(outdir, 'short2') + out1 = os.path.join(outdir, "short1") + out2 = os.path.join(outdir, "short2") - c.run_sourmash('sketch','dna','-p','k=31,scaled=5', '-o', out1, testdata1) - c.run_sourmash('sketch','dna','-p','k=31,scaled=5', '-o', out2, testdata2) + c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=5", "-o", out1, testdata1) + c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=5", "-o", out2, testdata2) - c.run_sourmash('index', '-k', '31', 'zzz', '.', '-f') + c.run_sourmash("index", "-k", "31", "zzz", ".", "-f") err = c.last_result.err - assert os.path.exists(c.output('zzz.sbt.zip')) - assert 'loaded 2 sigs; saving SBT under' in err + assert os.path.exists(c.output("zzz.sbt.zip")) + assert "loaded 2 sigs; saving SBT under" in err - c.run_sourmash('search', out1, 'zzz') + c.run_sourmash("search", out1, "zzz") out = c.last_result.out print(out) - assert 'short.fa' in out - assert 'short2.fa' in out + assert "short.fa" in out + assert "short2.fa" in out def test_do_sourmash_index_sparseness(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna','-p','k=31,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) - runtmp.sourmash('index', '-k', '31', 'zzz.sbt.json', '.', '--sparseness', '1.0') + runtmp.sourmash("index", "-k", "31", "zzz.sbt.json", ".", "--sparseness", "1.0") - assert os.path.exists(runtmp.output('zzz.sbt.json')) - assert 'loaded 2 sigs; saving SBT under' in runtmp.last_result.err + assert 
os.path.exists(runtmp.output("zzz.sbt.json")) + assert "loaded 2 sigs; saving SBT under" in runtmp.last_result.err - runtmp.sourmash('search', 'short.fa.sig', 'zzz.sbt.json') + runtmp.sourmash("search", "short.fa.sig", "zzz.sbt.json") print(runtmp.last_result.out) - assert len(glob.glob(runtmp.output('.sbt.zzz/*'))) == 3 - assert not glob.glob(runtmp.output('.sbt.zzz/*internal*')) + assert len(glob.glob(runtmp.output(".sbt.zzz/*"))) == 3 + assert not glob.glob(runtmp.output(".sbt.zzz/*internal*")) - assert 'short.fa' in runtmp.last_result.out - assert 'short2.fa' in runtmp.last_result.out + assert "short.fa" in runtmp.last_result.out + assert "short2.fa" in runtmp.last_result.out def test_do_sourmash_sbt_combine(runtmp): files = [utils.get_test_data(f) for f in utils.SIG_FILES] - runtmp.sourmash('index', '-k', '31', 'zzz', *files) + runtmp.sourmash("index", "-k", "31", "zzz", *files) - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('sbt_combine', 'joined', 'zzz.sbt.zip', 'zzz.sbt.zip') + runtmp.sourmash("sbt_combine", "joined", "zzz.sbt.zip", "zzz.sbt.zip") - assert os.path.exists(runtmp.output('joined.sbt.zip')) + assert os.path.exists(runtmp.output("joined.sbt.zip")) filename = os.path.splitext(os.path.basename(utils.SIG_FILES[0]))[0] - runtmp.sourmash('search', files[0], 'zzz') + runtmp.sourmash("search", files[0], "zzz") print(runtmp.last_result.out) # we get notification of signature loading, too - so notify + result. assert runtmp.last_result.out.count(filename) == 1 - runtmp.sourmash('search', files[0], 'joined') + runtmp.sourmash("search", files[0], "joined") print(runtmp.last_result.out) @@ -2924,130 +3360,148 @@ def test_do_sourmash_sbt_combine(runtmp): def test_do_sourmash_index_append(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + testdata3 = utils.get_test_data("short3.fa") - runtmp.sourmash('sketch','dna', '-p', 'k=31,num=500', testdata1, testdata2, testdata3) + runtmp.sourmash( + "sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2, testdata3 + ) - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("index", "-k", "31", "zzz", "short.fa.sig", "short2.fa.sig") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - sbt_name = runtmp.output('zzz',) - sig_loc = runtmp.output('short3.fa.sig') + sbt_name = runtmp.output( + "zzz", + ) + sig_loc = runtmp.output("short3.fa.sig") - runtmp.sourmash('search', sig_loc, sbt_name) + runtmp.sourmash("search", sig_loc, sbt_name) print(runtmp.last_result.out) - assert 'short.fa' in runtmp.last_result.out - assert 'short2.fa' in runtmp.last_result.out - assert 'short3.fa' not in runtmp.last_result.out + assert "short.fa" in runtmp.last_result.out + assert "short2.fa" in runtmp.last_result.out + assert "short3.fa" not in runtmp.last_result.out - runtmp.sourmash('index', '-k', '31', '--append', 'zzz', 'short3.fa.sig') + runtmp.sourmash("index", "-k", "31", "--append", "zzz", "short3.fa.sig") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - sbt_name = runtmp.output('zzz',) - sig_loc = runtmp.output('short3.fa.sig') + sbt_name = runtmp.output( + "zzz", + ) + sig_loc = runtmp.output("short3.fa.sig") - 
runtmp.sourmash('search', '--threshold', '0.95', sig_loc, sbt_name) + runtmp.sourmash("search", "--threshold", "0.95", sig_loc, sbt_name) print(runtmp.last_result.out) - assert 'short.fa' not in runtmp.last_result.out - assert 'short2.fa' in runtmp.last_result.out - assert 'short3.fa' in runtmp.last_result.out + assert "short.fa" not in runtmp.last_result.out + assert "short2.fa" in runtmp.last_result.out + assert "short3.fa" in runtmp.last_result.out def test_do_sourmash_sbt_search_otherdir(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna', '-p', 'k=31,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) - runtmp.sourmash('index', '-k', '31', 'xxx/zzz', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("index", "-k", "31", "xxx/zzz", "short.fa.sig", "short2.fa.sig") - assert os.path.exists(runtmp.output('xxx/zzz.sbt.zip')) + assert os.path.exists(runtmp.output("xxx/zzz.sbt.zip")) - sbt_name = runtmp.output('xxx/zzz',) - sig_loc = runtmp.output('short.fa.sig') + sbt_name = runtmp.output( + "xxx/zzz", + ) + sig_loc = runtmp.output("short.fa.sig") - runtmp.sourmash('search', sig_loc, sbt_name) + runtmp.sourmash("search", sig_loc, sbt_name) print(runtmp.last_result.out) - assert 'short.fa' in runtmp.last_result.out - assert 'short2.fa' in runtmp.last_result.out + assert "short.fa" in runtmp.last_result.out + assert "short2.fa" in runtmp.last_result.out def test_do_sourmash_sbt_search_scaled_vs_num_1(runtmp): # should not work: scaled query against num tree - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna', '-p', 'k=31,num=500', testdata1) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1) - runtmp.sourmash('sketch','dna', '-p', 'scaled=1000', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=1000", testdata2) - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig') + runtmp.sourmash("index", "-k", "31", "zzz", "short.fa.sig") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - sbt_name = runtmp.output('zzz',) - sig_loc = runtmp.output('short2.fa.sig') + sbt_name = runtmp.output( + "zzz", + ) + sig_loc = runtmp.output("short2.fa.sig") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('search', sig_loc, sbt_name) + runtmp.sourmash("search", sig_loc, sbt_name) assert runtmp.last_result.status == -1 print(runtmp.last_result.out) print(runtmp.last_result.err) assert "ERROR: cannot use '" in runtmp.last_result.err - assert "this database was created with 'num' MinHash sketches, not 'scaled'" in runtmp.last_result.err + assert ( + "this database was created with 'num' MinHash sketches, not 'scaled'" + in runtmp.last_result.err + ) def test_do_sourmash_sbt_search_scaled_vs_num_2(runtmp): # should not work: num query against scaled tree - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna', '-p', 'k=31,num=500', testdata1) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1) - runtmp.sourmash('sketch','dna', '-p', 
'scaled=1000', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=1000", testdata2) - runtmp.sourmash('index', '-k', '31', 'zzz', 'short2.fa.sig') + runtmp.sourmash("index", "-k", "31", "zzz", "short2.fa.sig") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - sbt_name = runtmp.output('zzz',) - sig_loc = runtmp.output('short.fa.sig') + sbt_name = runtmp.output( + "zzz", + ) + sig_loc = runtmp.output("short.fa.sig") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('search', sig_loc, sbt_name) + runtmp.sourmash("search", sig_loc, sbt_name) assert runtmp.last_result.status == -1 print(runtmp.last_result.out) print(runtmp.last_result.err) assert "ERROR: cannot use '" in runtmp.last_result.err - assert "this database was created with 'scaled' MinHash sketches, not 'num'" in runtmp.last_result.err + assert ( + "this database was created with 'scaled' MinHash sketches, not 'num'" + in runtmp.last_result.err + ) def test_do_sourmash_sbt_search_scaled_vs_num_3(runtmp): # should not work: scaled query against num signature - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna', '-p', 'k=31,num=500', testdata1) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1) - runtmp.sourmash('sketch','dna', '-p', 'scaled=1000', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=1000", testdata2) - sig_loc = runtmp.output('short.fa.sig') - sig_loc2 = runtmp.output('short2.fa.sig') + sig_loc = runtmp.output("short.fa.sig") + sig_loc2 = runtmp.output("short2.fa.sig") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('search', sig_loc, sig_loc2) + runtmp.sourmash("search", sig_loc, sig_loc2) assert runtmp.last_result.status == -1 print(runtmp.last_result.out) @@ -3057,18 +3511,18 @@ def test_do_sourmash_sbt_search_scaled_vs_num_3(runtmp): def test_do_sourmash_sbt_search_scaled_vs_num_4(runtmp): # should not work: num query against scaled signature - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna', '-p', 'k=31,num=500', testdata1) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1) - runtmp.sourmash('sketch','dna', '-p', 'scaled=1000', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=1000", testdata2) - sig_loc = runtmp.output('short.fa.sig') - sig_loc2 = runtmp.output('short2.fa.sig') + sig_loc = runtmp.output("short.fa.sig") + sig_loc2 = runtmp.output("short2.fa.sig") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('search', sig_loc2, sig_loc) + runtmp.sourmash("search", sig_loc2, sig_loc) assert runtmp.last_result.status == -1 print(runtmp.last_result.out) @@ -3079,13 +3533,13 @@ def test_do_sourmash_sbt_search_scaled_vs_num_4(runtmp): def test_do_sourmash_check_search_vs_actual_similarity(runtmp): files = [utils.get_test_data(f) for f in utils.SIG_FILES] - runtmp.sourmash('index', '-k', '31', 'zzz', *files) + runtmp.sourmash("index", "-k", "31", "zzz", *files) - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - filename = os.path.splitext(os.path.basename(utils.SIG_FILES[0]))[0] + os.path.splitext(os.path.basename(utils.SIG_FILES[0]))[0] - runtmp.sourmash('search', files[0], 
'zzz') + runtmp.sourmash("search", files[0], "zzz") assert runtmp.last_result.status == 0 @@ -3093,9 +3547,9 @@ def test_do_sourmash_check_search_vs_actual_similarity(runtmp): def test_do_sourmash_check_sbt_filenames(runtmp): files = [utils.get_test_data(f) for f in utils.SIG_FILES] - runtmp.sourmash('index', '-k', '31', 'zzz.sbt.json', *files) + runtmp.sourmash("index", "-k", "31", "zzz.sbt.json", *files) - assert os.path.exists(runtmp.output('zzz.sbt.json')) + assert os.path.exists(runtmp.output("zzz.sbt.json")) sig_names = set() sig_md5s = set() @@ -3104,11 +3558,11 @@ def test_do_sourmash_check_sbt_filenames(runtmp): sig_names.add(sig.name) sig_md5s.add(sig.md5sum()) - sbt_files = glob.glob(runtmp.output('.sbt.zzz/*')) + sbt_files = glob.glob(runtmp.output(".sbt.zzz/*")) assert len(sbt_files) == 14 for f in sbt_files: - if 'internal' in f or f.endswith('zzz.manifest.csv'): + if "internal" in f or f.endswith("zzz.manifest.csv"): continue f = os.path.basename(f) assert f not in sig_names @@ -3116,161 +3570,208 @@ def test_do_sourmash_check_sbt_filenames(runtmp): def test_do_sourmash_sbt_search_bestonly(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna', '-p', 'k=31,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("index", "-k", "31", "zzz", "short.fa.sig", "short2.fa.sig") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('search', '--best-only', 'short.fa.sig', 'zzz') + runtmp.sourmash("search", "--best-only", "short.fa.sig", "zzz") print(runtmp.last_result.out) - assert 'short.fa' in runtmp.last_result.out + assert "short.fa" in runtmp.last_result.out def test_do_sourmash_sbt_search_bestonly_scaled(runtmp): # as currently implemented, the query signature will be automatically # downsampled to match the tree. 
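# Editor's note: a short sketch (not part of the patch) of the downsampling
# behavior described in the comment above, using sourmash's public Python
# API. A finer-scaled query (e.g. scaled=1) can be searched against a
# coarser-scaled tree (e.g. scaled=10) because the query is downsampled
# first; the DNA string below is an arbitrary illustration.
import sourmash

mh_query = sourmash.MinHash(n=0, ksize=31, scaled=1)
mh_query.add_sequence("ATGGCATTAACGATGCATGCAGTCAGTACGT" * 10)

# downsample() keeps only the hashes that would also have been selected at
# the coarser scaled value; comparisons then happen at scaled=10.
mh_down = mh_query.downsample(scaled=10)
assert mh_down.scaled == 10
assert set(mh_down.hashes).issubset(set(mh_query.hashes))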
- testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna', '-p', 'scaled=1', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=1", testdata1, testdata2) - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig', 'short2.fa.sig', '--scaled', '10') + runtmp.sourmash( + "index", "-k", "31", "zzz", "short.fa.sig", "short2.fa.sig", "--scaled", "10" + ) - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('search', '--best-only', 'short.fa.sig', 'zzz') + runtmp.sourmash("search", "--best-only", "short.fa.sig", "zzz") print(runtmp.last_result.out) - assert 'short.fa' in runtmp.last_result.out + assert "short.fa" in runtmp.last_result.out def test_sbt_search_order_dependence(runtmp): - testdata1 = utils.get_test_data('genome-s10.fa.gz') - testdata2 = utils.get_test_data('genome-s11.fa.gz') - testdata3 = utils.get_test_data('genome-s12.fa.gz') - testdata4 = utils.get_test_data('genome-s10+s11.fa.gz') - - runtmp.sourmash('sketch', 'dna', '-p', 'k=21,scaled=10000', '-p', 'k=31,scaled=10000', testdata1, testdata2, testdata3, testdata4) - - runtmp.sourmash('index', '-k', '21', '134', 'genome-s10+s11.fa.gz.sig', 'genome-s11.fa.gz.sig', 'genome-s12.fa.gz.sig') - - runtmp.sourmash('search', '-k', '21', 'genome-s11.fa.gz.sig', '134', '--best-only', '-k', '21', '--dna') + testdata1 = utils.get_test_data("genome-s10.fa.gz") + testdata2 = utils.get_test_data("genome-s11.fa.gz") + testdata3 = utils.get_test_data("genome-s12.fa.gz") + testdata4 = utils.get_test_data("genome-s10+s11.fa.gz") + + runtmp.sourmash( + "sketch", + "dna", + "-p", + "k=21,scaled=10000", + "-p", + "k=31,scaled=10000", + testdata1, + testdata2, + testdata3, + testdata4, + ) + + runtmp.sourmash( + "index", + "-k", + "21", + "134", + "genome-s10+s11.fa.gz.sig", + "genome-s11.fa.gz.sig", + "genome-s12.fa.gz.sig", + ) + + runtmp.sourmash( + "search", + "-k", + "21", + "genome-s11.fa.gz.sig", + "134", + "--best-only", + "-k", + "21", + "--dna", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '100.0%' in runtmp.last_result.out + assert "100.0%" in runtmp.last_result.out def test_sbt_search_order_dependence_2(runtmp): # *should* return the same result as test_sbt_search_order_dependence, # but does not due to a bug. 
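# Editor's note: an illustrative sketch (not sourmash's actual
# implementation) of why a '--best-only' search over a tree can be
# order-dependent, as the comment above notes: the running best score is
# used to prune subtrees via an upper-bound estimate, so the order in which
# signatures were inserted (and hence visited) can change which branches
# are pruned when the bound is loose.
def best_only_search(leaves, similarity, upper_bound):
    """Return the best-scoring leaf and its score.

    `similarity` and `upper_bound` are caller-supplied callables, with
    upper_bound(leaf) >= similarity(leaf) assumed for correct pruning.
    """
    best, best_score = None, 0.0
    for leaf in leaves:  # traversal order matters here...
        if upper_bound(leaf) < best_score:
            continue  # ...because pruning compares against the best so far.
        score = similarity(leaf)
        if score > best_score:
            best, best_score = leaf, score
    return best, best_score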
- testdata1 = utils.get_test_data('genome-s10.fa.gz') - testdata2 = utils.get_test_data('genome-s11.fa.gz') - testdata3 = utils.get_test_data('genome-s12.fa.gz') - testdata4 = utils.get_test_data('genome-s10+s11.fa.gz') - - runtmp.sourmash('sketch', 'dna', '-p', 'k=21,scaled=10000', '-p', 'k=31,scaled=10000', testdata1, testdata2, testdata3, testdata4) - - runtmp.sourmash('index', '-k', '21', '314', 'genome-s11.fa.gz.sig', 'genome-s10+s11.fa.gz.sig', 'genome-s12.fa.gz.sig') - - runtmp.sourmash('search', '-k', '21', 'genome-s11.fa.gz.sig', '314', '--best-only', '--dna') + testdata1 = utils.get_test_data("genome-s10.fa.gz") + testdata2 = utils.get_test_data("genome-s11.fa.gz") + testdata3 = utils.get_test_data("genome-s12.fa.gz") + testdata4 = utils.get_test_data("genome-s10+s11.fa.gz") + + runtmp.sourmash( + "sketch", + "dna", + "-p", + "k=21,scaled=10000", + "-p", + "k=31,scaled=10000", + testdata1, + testdata2, + testdata3, + testdata4, + ) + + runtmp.sourmash( + "index", + "-k", + "21", + "314", + "genome-s11.fa.gz.sig", + "genome-s10+s11.fa.gz.sig", + "genome-s12.fa.gz.sig", + ) + + runtmp.sourmash( + "search", "-k", "21", "genome-s11.fa.gz.sig", "314", "--best-only", "--dna" + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '100.0%' in runtmp.last_result.out + assert "100.0%" in runtmp.last_result.out def test_compare_with_abundance_1(runtmp): # create two signatures - E1 = MinHash(ksize=5, n=5, is_protein=False, - track_abundance=True) - E2 = MinHash(ksize=5, n=5, is_protein=False, - track_abundance=True) + E1 = MinHash(ksize=5, n=5, is_protein=False, track_abundance=True) + E2 = MinHash(ksize=5, n=5, is_protein=False, track_abundance=True) - E1.add_sequence('ATGGA') - E2.add_sequence('ATGGA') + E1.add_sequence("ATGGA") + E2.add_sequence("ATGGA") - s1 = signature.SourmashSignature(E1, filename='e1', name='e1') - s2 = signature.SourmashSignature(E2, filename='e2', name='e2') + s1 = signature.SourmashSignature(E1, filename="e1", name="e1") + s2 = signature.SourmashSignature(E2, filename="e2", name="e2") - with open(runtmp.output('e1.sig'), 'w') as f: + with open(runtmp.output("e1.sig"), "w") as f: signature.save_signatures([s1], f) - with open(runtmp.output('e2.sig'), 'w') as f: + with open(runtmp.output("e2.sig"), "w") as f: signature.save_signatures([s2], f) - runtmp.sourmash('search', 'e1.sig', 'e2.sig', '-k', '5') + runtmp.sourmash("search", "e1.sig", "e2.sig", "-k", "5") - assert '100.0%' in runtmp.last_result.out + assert "100.0%" in runtmp.last_result.out def test_compare_with_abundance_2(runtmp): # create two signatures - E1 = MinHash(ksize=5, n=5, is_protein=False, - track_abundance=True) - E2 = MinHash(ksize=5, n=5, is_protein=False, - track_abundance=True) + E1 = MinHash(ksize=5, n=5, is_protein=False, track_abundance=True) + E2 = MinHash(ksize=5, n=5, is_protein=False, track_abundance=True) - E1.add_sequence('ATGGA') + E1.add_sequence("ATGGA") - E1.add_sequence('ATGGA') - E2.add_sequence('ATGGA') + E1.add_sequence("ATGGA") + E2.add_sequence("ATGGA") - s1 = signature.SourmashSignature(E1, filename='e1', name='e1') - s2 = signature.SourmashSignature(E2, filename='e2', name='e2') + s1 = signature.SourmashSignature(E1, filename="e1", name="e1") + s2 = signature.SourmashSignature(E2, filename="e2", name="e2") - with open(runtmp.output('e1.sig'), 'w') as f: + with open(runtmp.output("e1.sig"), "w") as f: signature.save_signatures([s1], f) - with open(runtmp.output('e2.sig'), 'w') as f: + with open(runtmp.output("e2.sig"), "w") as f: 
signature.save_signatures([s2], f) - runtmp.sourmash('search', 'e1.sig', 'e2.sig', '-k', '5') + runtmp.sourmash("search", "e1.sig", "e2.sig", "-k", "5") - assert '100.0%' in runtmp.last_result.out + assert "100.0%" in runtmp.last_result.out def test_compare_with_abundance_3(runtmp): # create two signatures - E1 = MinHash(ksize=5, n=5, is_protein=False, - track_abundance=True) - E2 = MinHash(ksize=5, n=5, is_protein=False, - track_abundance=True) + E1 = MinHash(ksize=5, n=5, is_protein=False, track_abundance=True) + E2 = MinHash(ksize=5, n=5, is_protein=False, track_abundance=True) - E1.add_sequence('ATGGA') - E1.add_sequence('GGACA') + E1.add_sequence("ATGGA") + E1.add_sequence("GGACA") - E1.add_sequence('ATGGA') - E2.add_sequence('ATGGA') + E1.add_sequence("ATGGA") + E2.add_sequence("ATGGA") - s1 = signature.SourmashSignature(E1, filename='e1', name='e1') - s2 = signature.SourmashSignature(E2, filename='e2', name='e2') + s1 = signature.SourmashSignature(E1, filename="e1", name="e1") + s2 = signature.SourmashSignature(E2, filename="e2", name="e2") - with open(runtmp.output('e1.sig'), 'w') as f: + with open(runtmp.output("e1.sig"), "w") as f: signature.save_signatures([s1], f) - with open(runtmp.output('e2.sig'), 'w') as f: + with open(runtmp.output("e2.sig"), "w") as f: signature.save_signatures([s2], f) - runtmp.sourmash('search', 'e1.sig', 'e2.sig', '-k', '5') + runtmp.sourmash("search", "e1.sig", "e2.sig", "-k", "5") - assert '70.5%' in runtmp.last_result.out + assert "70.5%" in runtmp.last_result.out def test_compare_with_picklist(runtmp): # test 'sourmash compare' with picklists - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") - runtmp.sourmash('compare', *gcf_sigs, - '-k', '21', '--picklist', f"{picklist}:md5:md5") + runtmp.sourmash( + "compare", *gcf_sigs, "-k", "21", "--picklist", f"{picklist}:md5:md5" + ) err = runtmp.last_result.err out = runtmp.last_result.out @@ -3287,11 +3788,12 @@ def test_compare_with_picklist(runtmp): def test_compare_with_picklist_exclude(runtmp): # test 'sourmash compare' with picklists - exclude - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") - runtmp.sourmash('compare', *gcf_sigs, - '-k', '21', '--picklist', f"{picklist}:md5:md5:exclude") + runtmp.sourmash( + "compare", *gcf_sigs, "-k", "21", "--picklist", f"{picklist}:md5:md5:exclude" + ) err = runtmp.last_result.err out = runtmp.last_result.out @@ -3309,12 +3811,10 @@ def test_compare_with_picklist_exclude(runtmp): def test_compare_with_pattern_include(runtmp): # test 'sourmash compare' with --include-db-pattern - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) - runtmp.sourmash('compare', *gcf_sigs, - '-k', '21', '--include', "thermotoga") + runtmp.sourmash("compare", *gcf_sigs, "-k", "21", "--include", "thermotoga") - err = runtmp.last_result.err out = runtmp.last_result.out print(runtmp.last_result.out) print(runtmp.last_result.err) @@ -3326,12 +3826,10 @@ def test_compare_with_pattern_include(runtmp): def test_compare_with_pattern_exclude(runtmp): # test 'sourmash compare' with picklists - exclude - gcf_sigs 
= glob.glob(utils.get_test_data('gather/GCF*.sig')) + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) - runtmp.sourmash('compare', *gcf_sigs, - '-k', '21', '--exclude', "thermotoga") + runtmp.sourmash("compare", *gcf_sigs, "-k", "21", "--exclude", "thermotoga") - err = runtmp.last_result.err out = runtmp.last_result.out print(runtmp.last_result.out) print(runtmp.last_result.err) @@ -3344,310 +3842,443 @@ def test_compare_with_pattern_exclude(runtmp): def test_gather(runtmp, linear_gather, prefetch_gather): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'dna', '-p', 'scaled=10', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", testdata1, testdata2) - runtmp.sourmash('sketch','dna','-p','scaled=10', '-o', 'query.fa.sig', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", "-o", "query.fa.sig", testdata2) - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("index", "-k", "31", "zzz", "short.fa.sig", "short2.fa.sig") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('gather', 'query.fa.sig', 'zzz', '-o', 'foo.csv', '--threshold-bp=1', linear_gather, prefetch_gather) + runtmp.sourmash( + "gather", + "query.fa.sig", + "zzz", + "-o", + "foo.csv", + "--threshold-bp=1", + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '0.9 kbp 100.0% 100.0%' in runtmp.last_result.out + assert "0.9 kbp 100.0% 100.0%" in runtmp.last_result.out def test_gather_csv(runtmp, linear_gather, prefetch_gather): # test 'gather -o csvfile' - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - - runtmp.sourmash('sketch','dna','-p','scaled=10', '--name-from-first', testdata1, testdata2) - - runtmp.sourmash('sketch','dna','-p','scaled=10', '-o', 'query.fa.sig', '--name-from-first', testdata2) - - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig', 'short2.fa.sig') - - assert os.path.exists(runtmp.output('zzz.sbt.zip')) - - runtmp.sourmash('gather', 'query.fa.sig', 'zzz', '-o', 'foo.csv', '--threshold-bp=1', linear_gather, prefetch_gather) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + + runtmp.sourmash( + "sketch", "dna", "-p", "scaled=10", "--name-from-first", testdata1, testdata2 + ) + + runtmp.sourmash( + "sketch", + "dna", + "-p", + "scaled=10", + "-o", + "query.fa.sig", + "--name-from-first", + testdata2, + ) + + runtmp.sourmash("index", "-k", "31", "zzz", "short.fa.sig", "short2.fa.sig") + + assert os.path.exists(runtmp.output("zzz.sbt.zip")) + + runtmp.sourmash( + "gather", + "query.fa.sig", + "zzz", + "-o", + "foo.csv", + "--threshold-bp=1", + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - csv_file = runtmp.output('foo.csv') + csv_file = runtmp.output("foo.csv") with open(csv_file) as fp: reader = csv.DictReader(fp) row = next(reader) print(row) - assert float(row['intersect_bp']) == 910 - assert float(row['unique_intersect_bp']) == 910 - assert float(row['remaining_bp']) == 0 - assert float(row['f_orig_query']) == 1.0 - assert float(row['f_unique_to_query']) == 1.0 - assert float(row['f_match']) == 1.0 - assert row['filename'] == 'zzz' - assert row['name'] == 'tr1 4' - assert 
row['md5'] == 'c9d5a795eeaaf58e286fb299133e1938' - assert row['gather_result_rank'] == '0' - assert row['query_filename'].endswith('short2.fa') - assert row['query_name'] == 'tr1 4' - assert row['query_md5'] == 'c9d5a795' - assert row['query_bp'] == '910' - - assert row['query_abundance'] == 'False' - assert row['n_unique_weighted_found'] == '' + assert float(row["intersect_bp"]) == 910 + assert float(row["unique_intersect_bp"]) == 910 + assert float(row["remaining_bp"]) == 0 + assert float(row["f_orig_query"]) == 1.0 + assert float(row["f_unique_to_query"]) == 1.0 + assert float(row["f_match"]) == 1.0 + assert row["filename"] == "zzz" + assert row["name"] == "tr1 4" + assert row["md5"] == "c9d5a795eeaaf58e286fb299133e1938" + assert row["gather_result_rank"] == "0" + assert row["query_filename"].endswith("short2.fa") + assert row["query_name"] == "tr1 4" + assert row["query_md5"] == "c9d5a795" + assert row["query_bp"] == "910" + + assert row["query_abundance"] == "False" + assert row["n_unique_weighted_found"] == "" def test_gather_csv_gz(runtmp, linear_gather, prefetch_gather): # test 'gather -o csvfile.gz' - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - - runtmp.sourmash('sketch','dna','-p','scaled=10', '--name-from-first', testdata1, testdata2) - - runtmp.sourmash('sketch','dna','-p','scaled=10', '-o', 'query.fa.sig', '--name-from-first', testdata2) - - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig', 'short2.fa.sig') - - assert os.path.exists(runtmp.output('zzz.sbt.zip')) - - runtmp.sourmash('gather', 'query.fa.sig', 'zzz', '-o', 'foo.csv.gz', '--threshold-bp=1', linear_gather, prefetch_gather) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + + runtmp.sourmash( + "sketch", "dna", "-p", "scaled=10", "--name-from-first", testdata1, testdata2 + ) + + runtmp.sourmash( + "sketch", + "dna", + "-p", + "scaled=10", + "-o", + "query.fa.sig", + "--name-from-first", + testdata2, + ) + + runtmp.sourmash("index", "-k", "31", "zzz", "short.fa.sig", "short2.fa.sig") + + assert os.path.exists(runtmp.output("zzz.sbt.zip")) + + runtmp.sourmash( + "gather", + "query.fa.sig", + "zzz", + "-o", + "foo.csv.gz", + "--threshold-bp=1", + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - csv_file = runtmp.output('foo.csv.gz') + csv_file = runtmp.output("foo.csv.gz") with gzip.open(csv_file, "rt", newline="") as fp: reader = csv.DictReader(fp) row = next(reader) print(row) - assert float(row['intersect_bp']) == 910 - assert float(row['unique_intersect_bp']) == 910 - assert float(row['remaining_bp']) == 0 - assert float(row['f_orig_query']) == 1.0 - assert float(row['f_unique_to_query']) == 1.0 - assert float(row['f_match']) == 1.0 - assert row['filename'] == 'zzz' - assert row['name'] == 'tr1 4' - assert row['md5'] == 'c9d5a795eeaaf58e286fb299133e1938' - assert row['gather_result_rank'] == '0' - assert row['query_filename'].endswith('short2.fa') - assert row['query_name'] == 'tr1 4' - assert row['query_md5'] == 'c9d5a795' - assert row['query_bp'] == '910' + assert float(row["intersect_bp"]) == 910 + assert float(row["unique_intersect_bp"]) == 910 + assert float(row["remaining_bp"]) == 0 + assert float(row["f_orig_query"]) == 1.0 + assert float(row["f_unique_to_query"]) == 1.0 + assert float(row["f_match"]) == 1.0 + assert row["filename"] == "zzz" + assert row["name"] == "tr1 4" + assert row["md5"] == "c9d5a795eeaaf58e286fb299133e1938" + assert 
row["gather_result_rank"] == "0" + assert row["query_filename"].endswith("short2.fa") + assert row["query_name"] == "tr1 4" + assert row["query_md5"] == "c9d5a795" + assert row["query_bp"] == "910" def test_gather_abund_x_abund(runtmp, prefetch_gather, linear_gather): - sig47 = utils.get_test_data('track_abund/47.fa.sig') - sig63 = utils.get_test_data('track_abund/63.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") + sig63 = utils.get_test_data("track_abund/63.fa.sig") - runtmp.sourmash('gather', sig47, sig63, linear_gather, prefetch_gather) + runtmp.sourmash("gather", sig47, sig63, linear_gather, prefetch_gather) - assert '2.5 Mbp 49.2% 48.3% 1.0 NC_011663.1' in runtmp.last_result.out + assert ( + "2.5 Mbp 49.2% 48.3% 1.0 NC_011663.1" in runtmp.last_result.out + ) def test_gather_multiple_sbts(runtmp, prefetch_gather, linear_gather): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna', '-p', 'scaled=10', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", testdata1, testdata2) - runtmp.sourmash('sketch','dna','-p','scaled=10', '-o', 'query.fa.sig', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", "-o", "query.fa.sig", testdata2) - runtmp.sourmash('index', 'zzz', 'short.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz", "short.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('index', 'zzz2', 'short2.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz2", "short2.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('gather', 'query.fa.sig', 'zzz', 'zzz2', '-o', 'foo.csv', '--threshold-bp=1', linear_gather, prefetch_gather) + runtmp.sourmash( + "gather", + "query.fa.sig", + "zzz", + "zzz2", + "-o", + "foo.csv", + "--threshold-bp=1", + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '0.9 kbp 100.0% 100.0%' in runtmp.last_result.out + assert "0.9 kbp 100.0% 100.0%" in runtmp.last_result.out def test_gather_multiple_sbts_save_prefetch(runtmp, linear_gather): # test --save-prefetch with multiple databases - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna', '-p', 'scaled=10', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", testdata1, testdata2) - runtmp.sourmash('sketch','dna','-p','scaled=10', '-o', 'query.fa.sig', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", "-o", "query.fa.sig", testdata2) - runtmp.sourmash('index', 'zzz', 'short.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz", "short.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('index', 'zzz2', 'short2.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz2", "short2.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('gather', 'query.fa.sig', 'zzz', 'zzz2', '-o', 'foo.csv', '--save-prefetch', 'out.zip', '--threshold-bp=1', linear_gather) + runtmp.sourmash( + 
"gather", + "query.fa.sig", + "zzz", + "zzz2", + "-o", + "foo.csv", + "--save-prefetch", + "out.zip", + "--threshold-bp=1", + linear_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '0.9 kbp 100.0% 100.0%' in runtmp.last_result.out - assert os.path.exists(runtmp.output('out.zip')) + assert "0.9 kbp 100.0% 100.0%" in runtmp.last_result.out + assert os.path.exists(runtmp.output("out.zip")) def test_gather_multiple_sbts_save_prefetch_csv(runtmp, linear_gather): # test --save-prefetch-csv with multiple databases - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna', '-p', 'scaled=10', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", testdata1, testdata2) - runtmp.sourmash('sketch','dna','-p','scaled=10', '-o', 'query.fa.sig', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", "-o", "query.fa.sig", testdata2) - runtmp.sourmash('index', 'zzz', 'short.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz", "short.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('index', 'zzz2', 'short2.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz2", "short2.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('gather', 'query.fa.sig', 'zzz', 'zzz2', '-o', 'foo.csv', '--save-prefetch-csv', 'prefetch.csv', '--threshold-bp=1', linear_gather) + runtmp.sourmash( + "gather", + "query.fa.sig", + "zzz", + "zzz2", + "-o", + "foo.csv", + "--save-prefetch-csv", + "prefetch.csv", + "--threshold-bp=1", + linear_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '0.9 kbp 100.0% 100.0%' in runtmp.last_result.out - assert os.path.exists(runtmp.output('prefetch.csv')) - with open(runtmp.output('prefetch.csv')) as f: + assert "0.9 kbp 100.0% 100.0%" in runtmp.last_result.out + assert os.path.exists(runtmp.output("prefetch.csv")) + with open(runtmp.output("prefetch.csv")) as f: output = f.read() print((output,)) - assert '870,0.925531914893617,0.9666666666666667' in output + assert "870,0.925531914893617,0.9666666666666667" in output def test_gather_multiple_sbts_save_prefetch_csv_gz(runtmp, linear_gather): # test --save-prefetch-csv to a .gz file, with multiple databases - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna', '-p', 'scaled=10', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", testdata1, testdata2) - runtmp.sourmash('sketch','dna','-p','scaled=10', '-o', 'query.fa.sig', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", "-o", "query.fa.sig", testdata2) - runtmp.sourmash('index', 'zzz', 'short.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz", "short.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('index', 'zzz2', 'short2.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz2", "short2.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('gather', 'query.fa.sig', 
'zzz', 'zzz2', '-o', 'foo.csv', '--save-prefetch-csv', 'prefetch.csv.gz', '--threshold-bp=1', linear_gather) + runtmp.sourmash( + "gather", + "query.fa.sig", + "zzz", + "zzz2", + "-o", + "foo.csv", + "--save-prefetch-csv", + "prefetch.csv.gz", + "--threshold-bp=1", + linear_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '0.9 kbp 100.0% 100.0%' in runtmp.last_result.out - assert os.path.exists(runtmp.output('prefetch.csv.gz')) - with gzip.open(runtmp.output('prefetch.csv.gz'), 'rt', newline="") as f: + assert "0.9 kbp 100.0% 100.0%" in runtmp.last_result.out + assert os.path.exists(runtmp.output("prefetch.csv.gz")) + with gzip.open(runtmp.output("prefetch.csv.gz"), "rt", newline="") as f: output = f.read() print((output,)) - assert '870,0.925531914893617,0.9666666666666667' in output + assert "870,0.925531914893617,0.9666666666666667" in output def test_gather_multiple_sbts_save_prefetch_and_prefetch_csv(runtmp, linear_gather): # test --save-prefetch-csv with multiple databases - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch','dna', '-p', 'scaled=10', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", testdata1, testdata2) - runtmp.sourmash('sketch','dna','-p','scaled=10', '-o', 'query.fa.sig', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", "-o", "query.fa.sig", testdata2) - runtmp.sourmash('index', 'zzz', 'short.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz", "short.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('index', 'zzz2', 'short2.fa.sig', '-k', '31') + runtmp.sourmash("index", "zzz2", "short2.fa.sig", "-k", "31") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('gather', 'query.fa.sig', 'zzz', 'zzz2', '-o', 'foo.csv', '--save-prefetch', 'out.zip', '--save-prefetch-csv', 'prefetch.csv', '--threshold-bp=1', linear_gather) + runtmp.sourmash( + "gather", + "query.fa.sig", + "zzz", + "zzz2", + "-o", + "foo.csv", + "--save-prefetch", + "out.zip", + "--save-prefetch-csv", + "prefetch.csv", + "--threshold-bp=1", + linear_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '0.9 kbp 100.0% 100.0%' in runtmp.last_result.out - assert os.path.exists(runtmp.output('prefetch.csv')) - with open(runtmp.output('prefetch.csv')) as f: + assert "0.9 kbp 100.0% 100.0%" in runtmp.last_result.out + assert os.path.exists(runtmp.output("prefetch.csv")) + with open(runtmp.output("prefetch.csv")) as f: output = f.read() print((output,)) - assert '870,0.925531914893617,0.9666666666666667' in output - assert os.path.exists(runtmp.output('out.zip')) + assert "870,0.925531914893617,0.9666666666666667" in output + assert os.path.exists(runtmp.output("out.zip")) def test_gather_sbt_and_sigs(runtmp, linear_gather, prefetch_gather): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'dna', '-p', 'k=31,scaled=10', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,scaled=10", testdata1, testdata2) - runtmp.sourmash('sketch','dna','-p','scaled=10', '-o', 'query.fa.sig', testdata2) + 
runtmp.sourmash("sketch", "dna", "-p", "scaled=10", "-o", "query.fa.sig", testdata2) - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig') + runtmp.sourmash("index", "-k", "31", "zzz", "short.fa.sig") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('gather', 'query.fa.sig', 'zzz', 'short2.fa.sig', '-o', 'foo.csv', linear_gather, prefetch_gather, '--threshold-bp=1') + runtmp.sourmash( + "gather", + "query.fa.sig", + "zzz", + "short2.fa.sig", + "-o", + "foo.csv", + linear_gather, + prefetch_gather, + "--threshold-bp=1", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '0.9 kbp 100.0% 100.0%' in runtmp.last_result.out + assert "0.9 kbp 100.0% 100.0%" in runtmp.last_result.out def test_gather_file_output(runtmp, linear_gather, prefetch_gather): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'dna', '-p', 'scaled=10', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", testdata1, testdata2) - runtmp.sourmash('sketch','dna','-p','scaled=10', '-o', 'query.fa.sig', testdata2) + runtmp.sourmash("sketch", "dna", "-p", "scaled=10", "-o", "query.fa.sig", testdata2) - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("index", "-k", "31", "zzz", "short.fa.sig", "short2.fa.sig") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('gather', 'query.fa.sig', 'zzz', '--threshold-bp=500', linear_gather, prefetch_gather, '-o', 'foo.out') + runtmp.sourmash( + "gather", + "query.fa.sig", + "zzz", + "--threshold-bp=500", + linear_gather, + prefetch_gather, + "-o", + "foo.out", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '0.9 kbp 100.0% 100.0%' in runtmp.last_result.out - with open(runtmp.output('foo.out')) as f: + assert "0.9 kbp 100.0% 100.0%" in runtmp.last_result.out + with open(runtmp.output("foo.out")) as f: output = f.read() print((output,)) - assert '910,1.0,1.0' in output + assert "910,1.0,1.0" in output def test_gather_f_match_orig(runtmp, linear_gather, prefetch_gather): import copy - testdata_combined = utils.get_test_data('gather/combined.sig') - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_combined = utils.get_test_data("gather/combined.sig") + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - runtmp.sourmash('gather', testdata_combined, '-o', 'out.csv', - *testdata_sigs, linear_gather, prefetch_gather) + runtmp.sourmash( + "gather", + testdata_combined, + "-o", + "out.csv", + *testdata_sigs, + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) @@ -3658,25 +4289,25 @@ def test_gather_f_match_orig(runtmp, linear_gather, prefetch_gather): def approx_equal(a, b, n=5): return round(a, n) == round(b, n) - with open(runtmp.output('out.csv'), 'rt') as fp: + with open(runtmp.output("out.csv")) as fp: r = csv.DictReader(fp) for n, row in enumerate(r): - print(n, row['f_match'], row['f_match_orig']) + print(n, row["f_match"], row["f_match_orig"]) # each match is completely in the original query - assert row['f_match_orig'] == "1.0" + assert row["f_match_orig"] == "1.0" # double check -- should match 'search --containment'. 
# (this is kind of useless for a 1.0 contained_by, I guess) - filename = row['filename'] + filename = row["filename"] match = sourmash.load_one_signature(filename, ksize=21) assert match.contained_by(combined_sig) == 1.0 # check other fields, too. - f_orig_query = float(row['f_orig_query']) - f_match_orig = float(row['f_match_orig']) - f_match = float(row['f_match']) - f_unique_to_query = float(row['f_unique_to_query']) + f_orig_query = float(row["f_orig_query"]) + f_match_orig = float(row["f_match_orig"]) + f_match = float(row["f_match"]) + f_unique_to_query = float(row["f_unique_to_query"]) # f_orig_query is the containment of the query by the match. # (note, this only works because containment is 100% in combined). @@ -3687,8 +4318,7 @@ def approx_equal(a, b, n=5): assert approx_equal(match.contained_by(combined_sig), f_match_orig) # f_match is how much of the match is in the unallocated hashes - assert approx_equal(match.minhash.contained_by(remaining_mh), - f_match) + assert approx_equal(match.minhash.contained_by(remaining_mh), f_match) # f_unique_to_query is how much of the match is unique wrt # the original query. @@ -3704,14 +4334,21 @@ def approx_equal(a, b, n=5): def test_gather_nomatch(runtmp, linear_gather, prefetch_gather): testdata_query = utils.get_test_data( - 'gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig') - testdata_match = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - - out_csv = runtmp.output('results.csv') - - runtmp.sourmash('gather', testdata_query, testdata_match, - '-o', out_csv, - linear_gather, prefetch_gather) + "gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig" + ) + testdata_match = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + + out_csv = runtmp.output("results.csv") + + runtmp.sourmash( + "gather", + testdata_query, + testdata_match, + "-o", + out_csv, + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) @@ -3722,14 +4359,22 @@ def test_gather_nomatch(runtmp, linear_gather, prefetch_gather): def test_gather_nomatch_create_empty(runtmp, linear_gather, prefetch_gather): testdata_query = utils.get_test_data( - 'gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig') - testdata_match = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - - out_csv = runtmp.output('results.csv') - - runtmp.sourmash('gather', testdata_query, testdata_match, - '-o', out_csv, '--create-empty-results', - linear_gather, prefetch_gather) + "gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig" + ) + testdata_match = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + + out_csv = runtmp.output("results.csv") + + runtmp.sourmash( + "gather", + testdata_query, + testdata_match, + "-o", + out_csv, + "--create-empty-results", + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) @@ -3737,17 +4382,20 @@ def test_gather_nomatch_create_empty(runtmp, linear_gather, prefetch_gather): assert "No matches found for --threshold-bp at 50.0 kbp." 
in runtmp.last_result.err assert os.path.exists(out_csv) - with open(out_csv, 'rt') as fp: + with open(out_csv) as fp: data = fp.read() assert not data def test_gather_abund_nomatch(runtmp, linear_gather, prefetch_gather): - testdata_query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig') - testdata_match = utils.get_test_data('gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig') + testdata_query = utils.get_test_data("gather-abund/reads-s10x10-s11.sig") + testdata_match = utils.get_test_data( + "gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig" + ) - runtmp.sourmash('gather', testdata_query, testdata_match, - linear_gather, prefetch_gather) + runtmp.sourmash( + "gather", testdata_query, testdata_match, linear_gather, prefetch_gather + ) print(runtmp.last_result.out) print(runtmp.last_result.err) @@ -3756,50 +4404,58 @@ def test_gather_abund_nomatch(runtmp, linear_gather, prefetch_gather): def test_gather_metagenome(runtmp): - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) runtmp.sourmash(*cmd) - assert os.path.exists(runtmp.output('gcf_all.sbt.zip')) + assert os.path.exists(runtmp.output("gcf_all.sbt.zip")) - runtmp.sourmash('gather', query_sig, 'gcf_all', '-k', '21', '--threshold-bp=0') + runtmp.sourmash("gather", query_sig, "gcf_all", "-k", "21", "--threshold-bp=0") print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'found 12 matches total' in runtmp.last_result.out - assert 'the recovered matches hit 100.0% of the query' in runtmp.last_result.out - assert all(('4.9 Mbp 33.2% 100.0%' in runtmp.last_result.out, - 'NC_003198.1 Salmonella enterica subsp' in runtmp.last_result.out)) - assert all(('4.7 Mbp 0.5% 1.5%' in runtmp.last_result.out, - 'NC_011294.1 Salmonella enterica subs' in runtmp.last_result.out)) + assert "found 12 matches total" in runtmp.last_result.out + assert "the recovered matches hit 100.0% of the query" in runtmp.last_result.out + assert all( + ( + "4.9 Mbp 33.2% 100.0%" in runtmp.last_result.out, + "NC_003198.1 Salmonella enterica subsp" in runtmp.last_result.out, + ) + ) + assert all( + ( + "4.7 Mbp 0.5% 1.5%" in runtmp.last_result.out, + "NC_011294.1 Salmonella enterica subs" in runtmp.last_result.out, + ) + ) @utils.in_tempdir def test_gather_metagenome_num_results(c): # set a threshold on the number of results to be reported by gather - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) c.run_sourmash(*cmd) - assert os.path.exists(c.output('gcf_all.sbt.zip')) + assert os.path.exists(c.output("gcf_all.sbt.zip")) - cmd = 'gather {} gcf_all -k 21 --num-results 10'.format(query_sig) - cmd = cmd.split(' ') + cmd = f"gather {query_sig} gcf_all -k 21 --num-results 10" + cmd = cmd.split(" ") c.run_sourmash(*cmd) print(c.last_result.out) @@ -3807,85 +4463,122 @@ def test_gather_metagenome_num_results(c): out = c.last_result.out - assert 'found 10 
matches total' in out - assert '(truncated gather because --num-results=10)' in out - assert 'the recovered matches hit 99.4% of the query' in out - assert all(('4.9 Mbp 33.2% 100.0%' in out, - 'NC_003198.1 Salmonella enterica subsp' in out)) - assert '4.3 Mbp 2.1% 7.3% NC_006511.1 Salmonella enterica subsp' in out + assert "found 10 matches total" in out + assert "(truncated gather because --num-results=10)" in out + assert "the recovered matches hit 99.4% of the query" in out + assert all( + ( + "4.9 Mbp 33.2% 100.0%" in out, + "NC_003198.1 Salmonella enterica subsp" in out, + ) + ) + assert "4.3 Mbp 2.1% 7.3% NC_006511.1 Salmonella enterica subsp" in out def test_gather_metagenome_threshold_bp(runtmp, linear_gather, prefetch_gather): # set a threshold on the gather output - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) runtmp.sourmash(*cmd) - assert os.path.exists(runtmp.output('gcf_all.sbt.zip')) + assert os.path.exists(runtmp.output("gcf_all.sbt.zip")) - runtmp.sourmash('gather', query_sig, 'gcf_all', '-k', '21', - '--threshold-bp', '2e6', linear_gather, prefetch_gather) + runtmp.sourmash( + "gather", + query_sig, + "gcf_all", + "-k", + "21", + "--threshold-bp", + "2e6", + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'found 1 matches total' in runtmp.last_result.out - assert 'found less than 2.0 Mbp in common. => exiting' in runtmp.last_result.err - assert 'the recovered matches hit 33.2% of the query' in runtmp.last_result.out - assert all(('4.9 Mbp 33.2% 100.0%' in runtmp.last_result.out, - 'NC_003198.1 Salmonella enterica subsp' in runtmp.last_result.out)) + assert "found 1 matches total" in runtmp.last_result.out + assert "found less than 2.0 Mbp in common. => exiting" in runtmp.last_result.err + assert "the recovered matches hit 33.2% of the query" in runtmp.last_result.out + assert all( + ( + "4.9 Mbp 33.2% 100.0%" in runtmp.last_result.out, + "NC_003198.1 Salmonella enterica subsp" in runtmp.last_result.out, + ) + ) def test_gather_metagenome_threshold_bp_low(runtmp, linear_gather, prefetch_gather): # set a threshold on the gather output => too low - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) runtmp.sourmash(*cmd) - assert os.path.exists(runtmp.output('gcf_all.sbt.zip')) + assert os.path.exists(runtmp.output("gcf_all.sbt.zip")) - runtmp.sourmash('gather', query_sig, 'gcf_all', '-k', '21', - '--threshold-bp', '1', linear_gather, prefetch_gather) + runtmp.sourmash( + "gather", + query_sig, + "gcf_all", + "-k", + "21", + "--threshold-bp", + "1", + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'found 12 matches total' in runtmp.last_result.out - assert 'found less than 1 bp in common. 
=> exiting' in runtmp.last_result.err - assert 'the recovered matches hit 100.0% of the query' in runtmp.last_result.out + assert "found 12 matches total" in runtmp.last_result.out + assert "found less than 1 bp in common. => exiting" in runtmp.last_result.err + assert "the recovered matches hit 100.0% of the query" in runtmp.last_result.out -def test_gather_metagenome_threshold_bp_too_high(runtmp, linear_gather, prefetch_gather): +def test_gather_metagenome_threshold_bp_too_high( + runtmp, linear_gather, prefetch_gather +): # set a threshold on the gather output => no results - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) runtmp.sourmash(*cmd) - assert os.path.exists(runtmp.output('gcf_all.sbt.zip')) + assert os.path.exists(runtmp.output("gcf_all.sbt.zip")) - runtmp.sourmash('gather', query_sig, 'gcf_all', '-k', '21', - '--threshold-bp', '5e6', linear_gather, prefetch_gather) + runtmp.sourmash( + "gather", + query_sig, + "gcf_all", + "-k", + "21", + "--threshold-bp", + "5e6", + linear_gather, + prefetch_gather, + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -3896,46 +4589,67 @@ def test_gather_metagenome_threshold_bp_too_high(runtmp, linear_gather, prefetch def test_multigather_metagenome(runtmp): - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) runtmp.sourmash(*cmd) - assert os.path.exists(runtmp.output('gcf_all.sbt.zip')) + assert os.path.exists(runtmp.output("gcf_all.sbt.zip")) - runtmp.sourmash('multigather', '--query', query_sig, '--db', 'gcf_all', '-k', '21', '--threshold-bp=0') + runtmp.sourmash( + "multigather", + "--query", + query_sig, + "--db", + "gcf_all", + "-k", + "21", + "--threshold-bp=0", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'found 12 matches total' in runtmp.last_result.out - assert 'the recovered matches hit 100.0% of the query' in runtmp.last_result.out - assert all(('4.9 Mbp 33.2% 100.0%' in runtmp.last_result.out, - 'NC_003198.1 Salmonella enterica subsp' in runtmp.last_result.out)) - assert all(('4.7 Mbp 0.5% 1.5%' in runtmp.last_result.out, - 'NC_011294.1 Salmonella enterica subsp' in runtmp.last_result.out)) + assert "found 12 matches total" in runtmp.last_result.out + assert "the recovered matches hit 100.0% of the query" in runtmp.last_result.out + assert all( + ( + "4.9 Mbp 33.2% 100.0%" in runtmp.last_result.out, + "NC_003198.1 Salmonella enterica subsp" in runtmp.last_result.out, + ) + ) + assert all( + ( + "4.7 Mbp 0.5% 1.5%" in runtmp.last_result.out, + "NC_011294.1 Salmonella enterica subsp" in runtmp.last_result.out, + ) + ) def test_multigather_check_scaled_bounds_negative(runtmp): c = runtmp - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = 
utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) c.run_sourmash(*cmd) - cmd = 'multigather --query {} --db gcf_all -k 21 --scaled -5 --threshold-bp=0'.format(query_sig) - cmd = cmd.split(' ') + cmd = ( + "multigather --query {} --db gcf_all -k 21 --scaled -5 --threshold-bp=0".format( + query_sig + ) + ) + cmd = cmd.split(" ") with pytest.raises(SourmashCommandFailed) as exc: c.run_sourmash(*cmd) @@ -3944,67 +4658,80 @@ def test_multigather_check_scaled_bounds_negative(runtmp): def test_multigather_check_scaled_bounds_less_than_minimum(runtmp): c = runtmp - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) c.run_sourmash(*cmd) - cmd = 'multigather --query {} --db gcf_all -k 21 --scaled 50 --threshold-bp=0'.format(query_sig) - cmd = cmd.split(' ') + cmd = ( + "multigather --query {} --db gcf_all -k 21 --scaled 50 --threshold-bp=0".format( + query_sig + ) + ) + cmd = cmd.split(" ") # Note: this is the value error that is emitted, but we want the Warning from below to be generated instead. (ValueError: new scaled 50.0 is lower than current sample scaled 10000) with pytest.raises(SourmashCommandFailed) as exc: c.run_sourmash(*cmd) - assert "WARNING: scaled value should be >= 100. Continuing anyway." in str(exc.value) + assert "WARNING: scaled value should be >= 100. Continuing anyway." in str( + exc.value + ) def test_multigather_check_scaled_bounds_more_than_maximum(runtmp): c = runtmp - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) c.run_sourmash(*cmd) - cmd = 'multigather --query {} --db gcf_all -k 21 --scaled 1e9 --threshold-bp=0'.format(query_sig) - cmd = cmd.split(' ') + cmd = "multigather --query {} --db gcf_all -k 21 --scaled 1e9 --threshold-bp=0".format( + query_sig + ) + cmd = cmd.split(" ") c.run_sourmash(*cmd) - assert "WARNING: scaled value should be <= 1e6. Continuing anyway." in c.last_result.err + assert ( + "WARNING: scaled value should be <= 1e6. Continuing anyway." 
+ in c.last_result.err + ) def test_multigather_metagenome_query_from_file(runtmp): # test multigather --query-from-file c = runtmp - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) c.run_sourmash(*cmd) - assert os.path.exists(c.output('gcf_all.sbt.zip')) + assert os.path.exists(c.output("gcf_all.sbt.zip")) # make list w/query sig - query_list = c.output('query.list') - with open(query_list, 'wt') as fp: + query_list = c.output("query.list") + with open(query_list, "w") as fp: print(query_sig, file=fp) - cmd = 'multigather --query-from-file {} --db gcf_all -k 21 --threshold-bp=0'.format(query_list) - cmd = cmd.split(' ') + cmd = "multigather --query-from-file {} --db gcf_all -k 21 --threshold-bp=0".format( + query_list + ) + cmd = cmd.split(" ") c.run_sourmash(*cmd) out = c.last_result.out @@ -4012,36 +4739,44 @@ def test_multigather_metagenome_query_from_file(runtmp): err = c.last_result.err print(err) - assert 'found 12 matches total' in out - assert 'the recovered matches hit 100.0% of the query' in out - assert all(('4.9 Mbp 33.2% 100.0%' in out, - 'NC_003198.1 Salmonella enterica subsp' in out)) - assert all(('4.7 Mbp 0.5% 1.5%' in out, - 'NC_011294.1 Salmonella enterica subsp' in out)) + assert "found 12 matches total" in out + assert "the recovered matches hit 100.0% of the query" in out + assert all( + ( + "4.9 Mbp 33.2% 100.0%" in out, + "NC_003198.1 Salmonella enterica subsp" in out, + ) + ) + assert all( + ( + "4.7 Mbp 0.5% 1.5%" in out, + "NC_011294.1 Salmonella enterica subsp" in out, + ) + ) def test_multigather_metagenome_output(runtmp): # test multigather CSV output has more than one output line c = runtmp - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) c.run_sourmash(*cmd) - assert os.path.exists(c.output('gcf_all.sbt.zip')) + assert os.path.exists(c.output("gcf_all.sbt.zip")) - cmd = f'multigather --query {query_sig} --db gcf_all -k 21 --threshold-bp=0' - cmd = cmd.split(' ') + cmd = f"multigather --query {query_sig} --db gcf_all -k 21 --threshold-bp=0" + cmd = cmd.split(" ") c.run_sourmash(*cmd) - output_csv = runtmp.output('-.csv') + output_csv = runtmp.output("-.csv") assert os.path.exists(output_csv) - with open(output_csv, newline='') as fp: + with open(output_csv, newline="") as fp: x = fp.readlines() assert len(x) == 13 @@ -4049,50 +4784,49 @@ def test_multigather_metagenome_output(runtmp): def test_multigather_metagenome_output_outdir(runtmp): # test multigather CSV output to different location c = runtmp - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - 
cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) c.run_sourmash(*cmd) - assert os.path.exists(c.output('gcf_all.sbt.zip')) + assert os.path.exists(c.output("gcf_all.sbt.zip")) # create output directory - outdir = runtmp.output('savehere') + outdir = runtmp.output("savehere") os.mkdir(outdir) - cmd = f'multigather --query {query_sig} --db gcf_all -k 21 --threshold-bp=0 --output-dir {outdir}' - cmd = cmd.split(' ') + cmd = f"multigather --query {query_sig} --db gcf_all -k 21 --threshold-bp=0 --output-dir {outdir}" + cmd = cmd.split(" ") c.run_sourmash(*cmd) - output_csv = runtmp.output('savehere/-.csv') + output_csv = runtmp.output("savehere/-.csv") assert os.path.exists(output_csv) - with open(output_csv, newline='') as fp: + with open(output_csv, newline="") as fp: x = fp.readlines() assert len(x) == 13 @utils.in_tempdir def test_multigather_metagenome_query_with_sbt(c): - - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all.sbt.zip'] + cmd = ["index", "gcf_all.sbt.zip"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) c.run_sourmash(*cmd) - assert os.path.exists(c.output('gcf_all.sbt.zip')) + assert os.path.exists(c.output("gcf_all.sbt.zip")) - cmd = 'multigather --query gcf_all.sbt.zip --db gcf_all.sbt.zip -k 21 --threshold-bp=0' - cmd = cmd.split(' ') + cmd = "multigather --query gcf_all.sbt.zip --db gcf_all.sbt.zip -k 21 --threshold-bp=0" + cmd = cmd.split(" ") c.run_sourmash(*cmd) out = c.last_result.out @@ -4100,35 +4834,50 @@ def test_multigather_metagenome_query_with_sbt(c): err = c.last_result.err print(err) - assert 'conducted gather searches on 12 signatures' in err - assert 'the recovered matches hit 100.0% of the query' in out - assert all(('4.7 Mbp 100.0% 100.0%' in out, - 'NC_011080.1 Salmonella enterica subsp' in out)) - assert all(('4.5 Mbp 100.0% 100.0%' in out, - 'NC_004631.1 Salmonella enterica subsp' in out)) - assert all (('1.6 Mbp 100.0% 100.0%' in out, - 'NC_002163.1 Campylobacter jejuni subs' in out)) - assert all(('1.9 Mbp 100.0% 100.0%' in out, - 'NC_000853.1 Thermotoga maritima MSB8 ' in out)) + assert "conducted gather searches on 12 signatures" in err + assert "the recovered matches hit 100.0% of the query" in out + assert all( + ( + "4.7 Mbp 100.0% 100.0%" in out, + "NC_011080.1 Salmonella enterica subsp" in out, + ) + ) + assert all( + ( + "4.5 Mbp 100.0% 100.0%" in out, + "NC_004631.1 Salmonella enterica subsp" in out, + ) + ) + assert all( + ( + "1.6 Mbp 100.0% 100.0%" in out, + "NC_002163.1 Campylobacter jejuni subs" in out, + ) + ) + assert all( + ( + "1.9 Mbp 100.0% 100.0%" in out, + "NC_000853.1 Thermotoga maritima MSB8 " in out, + ) + ) @utils.in_tempdir def test_multigather_metagenome_query_with_lca(c): - - testdata_glob = utils.get_test_data('47*.fa.sig') + testdata_glob = utils.get_test_data("47*.fa.sig") testdata_sigs = glob.glob(testdata_glob) - lca_db = utils.get_test_data('lca/47+63.lca.json') + lca_db = utils.get_test_data("lca/47+63.lca.json") - cmd = ['index', '47+63.sbt.zip'] + cmd = ["index", "47+63.sbt.zip"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '31']) + cmd.extend(["-k", "31"]) c.run_sourmash(*cmd) - assert os.path.exists(c.output('47+63.sbt.zip')) + assert os.path.exists(c.output("47+63.sbt.zip")) - cmd = 'multigather --query {} --db 47+63.sbt.zip -k 31 
--threshold-bp=0'.format(lca_db) - cmd = cmd.split(' ') + cmd = f"multigather --query {lca_db} --db 47+63.sbt.zip -k 31 --threshold-bp=0" + cmd = cmd.split(" ") c.run_sourmash(*cmd) out = c.last_result.out @@ -4136,21 +4885,22 @@ def test_multigather_metagenome_query_with_lca(c): err = c.last_result.err print(err) - assert 'conducted gather searches on 2 signatures' in err - assert 'the recovered matches hit 100.0% of the query' in out -# assert '5.1 Mbp 100.0% 64.9% 491c0a81' in out - assert '5.5 Mbp 100.0% 69.4% 491c0a81' in out + assert "conducted gather searches on 2 signatures" in err + assert "the recovered matches hit 100.0% of the query" in out + # assert '5.1 Mbp 100.0% 64.9% 491c0a81' in out + assert "5.5 Mbp 100.0% 69.4% 491c0a81" in out @utils.in_tempdir def test_multigather_metagenome_query_on_lca_db(c): - - testdata_sig1 = utils.get_test_data('47.fa.sig') - testdata_sig2 = utils.get_test_data('63.fa.sig') - lca_db = utils.get_test_data('lca/47+63.lca.json') - - cmd = 'multigather --query {} {} --db {} -k 31 --threshold-bp=0'.format(testdata_sig1, testdata_sig2, lca_db) - cmd = cmd.split(' ') + testdata_sig1 = utils.get_test_data("47.fa.sig") + testdata_sig2 = utils.get_test_data("63.fa.sig") + lca_db = utils.get_test_data("lca/47+63.lca.json") + + cmd = "multigather --query {} {} --db {} -k 31 --threshold-bp=0".format( + testdata_sig1, testdata_sig2, lca_db + ) + cmd = cmd.split(" ") c.run_sourmash(*cmd) out = c.last_result.out @@ -4158,33 +4908,44 @@ def test_multigather_metagenome_query_on_lca_db(c): err = c.last_result.err print(err) - assert 'conducted gather searches on 2 signatures' in err - assert 'the recovered matches hit 100.0% of the query' in out - assert all(('5.1 Mbp 100.0% 100.0%' in out, - 'NC_009665.1 Shewanella baltica OS185,' in out)) - assert all(('5.5 Mbp 100.0% 100.0%' in out, - 'NC_011663.1 Shewanella baltica OS223,' in out)) + assert "conducted gather searches on 2 signatures" in err + assert "the recovered matches hit 100.0% of the query" in out + assert all( + ( + "5.1 Mbp 100.0% 100.0%" in out, + "NC_009665.1 Shewanella baltica OS185," in out, + ) + ) + assert all( + ( + "5.5 Mbp 100.0% 100.0%" in out, + "NC_011663.1 Shewanella baltica OS223," in out, + ) + ) @utils.in_tempdir def test_multigather_metagenome_query_with_sbt_addl_query(c): - - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all.sbt.zip'] + cmd = ["index", "gcf_all.sbt.zip"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) c.run_sourmash(*cmd) - assert os.path.exists(c.output('gcf_all.sbt.zip')) + assert os.path.exists(c.output("gcf_all.sbt.zip")) - another_query = utils.get_test_data('gather/GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig') + another_query = utils.get_test_data( + "gather/GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig" + ) - cmd = 'multigather --query {} gcf_all.sbt.zip --db gcf_all.sbt.zip -k 21 --threshold-bp=0'.format(another_query) - cmd = cmd.split(' ') + cmd = "multigather --query {} gcf_all.sbt.zip --db gcf_all.sbt.zip -k 21 --threshold-bp=0".format( + another_query + ) + cmd = cmd.split(" ") c.run_sourmash(*cmd) out = c.last_result.out @@ -4192,47 +4953,70 @@ def test_multigather_metagenome_query_with_sbt_addl_query(c): err = c.last_result.err print(err) - assert 'conducted gather searches on 13 signatures' 
in err - assert 'the recovered matches hit 100.0% of the query' in out - #check for matches to some of the sbt signatures - assert all(('4.7 Mbp 100.0% 100.0%' in out, - 'NC_011080.1 Salmonella enterica subsp' in out)) - assert all(('4.5 Mbp 100.0% 100.0%' in out, - 'NC_004631.1 Salmonella enterica subsp' in out)) - assert all (('1.6 Mbp 100.0% 100.0%' in out, - 'NC_002163.1 Campylobacter jejuni subs' in out)) - assert all(('1.9 Mbp 100.0% 100.0%' in out, - 'NC_000853.1 Thermotoga maritima MSB8 ' in out)) - - #check additional query sig - assert all(('4.9 Mbp 100.0% 100.0%' in out, - 'NC_003198.1 Salmonella enterica subsp' in out)) + assert "conducted gather searches on 13 signatures" in err + assert "the recovered matches hit 100.0% of the query" in out + # check for matches to some of the sbt signatures + assert all( + ( + "4.7 Mbp 100.0% 100.0%" in out, + "NC_011080.1 Salmonella enterica subsp" in out, + ) + ) + assert all( + ( + "4.5 Mbp 100.0% 100.0%" in out, + "NC_004631.1 Salmonella enterica subsp" in out, + ) + ) + assert all( + ( + "1.6 Mbp 100.0% 100.0%" in out, + "NC_002163.1 Campylobacter jejuni subs" in out, + ) + ) + assert all( + ( + "1.9 Mbp 100.0% 100.0%" in out, + "NC_000853.1 Thermotoga maritima MSB8 " in out, + ) + ) + + # check additional query sig + assert all( + ( + "4.9 Mbp 100.0% 100.0%" in out, + "NC_003198.1 Salmonella enterica subsp" in out, + ) + ) @utils.in_tempdir def test_multigather_metagenome_sbt_query_from_file_with_addl_query(c): - - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all.sbt.zip'] + cmd = ["index", "gcf_all.sbt.zip"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) c.run_sourmash(*cmd) - assert os.path.exists(c.output('gcf_all.sbt.zip')) + assert os.path.exists(c.output("gcf_all.sbt.zip")) # make list w/query sbt - query_list = c.output('query.list') - with open(query_list, 'wt') as fp: - print('gcf_all.sbt.zip', file=fp) - - another_query = utils.get_test_data('gather/GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig') - - cmd = 'multigather --query {} --query-from-file {} --db gcf_all.sbt.zip -k 21 --threshold-bp=0'.format(another_query, query_list) - cmd = cmd.split(' ') + query_list = c.output("query.list") + with open(query_list, "w") as fp: + print("gcf_all.sbt.zip", file=fp) + + another_query = utils.get_test_data( + "gather/GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig" + ) + + cmd = "multigather --query {} --query-from-file {} --db gcf_all.sbt.zip -k 21 --threshold-bp=0".format( + another_query, query_list + ) + cmd = cmd.split(" ") c.run_sourmash(*cmd) out = c.last_result.out @@ -4240,43 +5024,62 @@ def test_multigather_metagenome_sbt_query_from_file_with_addl_query(c): err = c.last_result.err print(err) - assert 'conducted gather searches on 13 signatures' in err - assert 'the recovered matches hit 100.0% of the query' in out - #check for matches to some of the sbt signatures - assert all(('4.7 Mbp 100.0% 100.0%' in out, - 'NC_011080.1 Salmonella enterica subsp' in out)) - assert all(('4.5 Mbp 100.0% 100.0%' in out, - 'NC_004631.1 Salmonella enterica subsp' in out)) - assert all (('1.6 Mbp 100.0% 100.0%' in out, - 'NC_002163.1 Campylobacter jejuni subs' in out)) - assert all(('1.9 Mbp 100.0% 100.0%' in out, - 'NC_000853.1 Thermotoga maritima MSB8 ' in out)) - - #check 
additional query sig - assert all(('4.9 Mbp 100.0% 100.0%' in out, - 'NC_003198.1 Salmonella enterica subsp' in out)) + assert "conducted gather searches on 13 signatures" in err + assert "the recovered matches hit 100.0% of the query" in out + # check for matches to some of the sbt signatures + assert all( + ( + "4.7 Mbp 100.0% 100.0%" in out, + "NC_011080.1 Salmonella enterica subsp" in out, + ) + ) + assert all( + ( + "4.5 Mbp 100.0% 100.0%" in out, + "NC_004631.1 Salmonella enterica subsp" in out, + ) + ) + assert all( + ( + "1.6 Mbp 100.0% 100.0%" in out, + "NC_002163.1 Campylobacter jejuni subs" in out, + ) + ) + assert all( + ( + "1.9 Mbp 100.0% 100.0%" in out, + "NC_000853.1 Thermotoga maritima MSB8 " in out, + ) + ) + + # check additional query sig + assert all( + ( + "4.9 Mbp 100.0% 100.0%" in out, + "NC_003198.1 Salmonella enterica subsp" in out, + ) + ) @utils.in_tempdir def test_multigather_metagenome_sbt_query_from_file_incorrect(c): - - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all.sbt.zip'] + cmd = ["index", "gcf_all.sbt.zip"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) c.run_sourmash(*cmd) - assert os.path.exists(c.output('gcf_all.sbt.zip')) + assert os.path.exists(c.output("gcf_all.sbt.zip")) # incorrectly query with sbt using `--query-from-file` - cmd = 'multigather --query-from-file gcf_all.sbt.zip --db gcf_all.sbt.zip -k 21 --threshold-bp=0' - cmd = cmd.split(' ') + cmd = "multigather --query-from-file gcf_all.sbt.zip --db gcf_all.sbt.zip -k 21 --threshold-bp=0" + cmd = cmd.split(" ") - with pytest.raises(SourmashCommandFailed) as e: + with pytest.raises(SourmashCommandFailed): c.run_sourmash(*cmd) print(c.last_result.out) @@ -4285,25 +5088,27 @@ def test_multigather_metagenome_sbt_query_from_file_incorrect(c): @utils.in_tempdir def test_multigather_metagenome_lca_query_from_file(c): - testdata_glob = utils.get_test_data('47*.fa.sig') + testdata_glob = utils.get_test_data("47*.fa.sig") testdata_sigs = glob.glob(testdata_glob) - lca_db = utils.get_test_data('lca/47+63.lca.json') + lca_db = utils.get_test_data("lca/47+63.lca.json") - cmd = ['index', '47+63.sbt.zip'] + cmd = ["index", "47+63.sbt.zip"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '31']) + cmd.extend(["-k", "31"]) c.run_sourmash(*cmd) - assert os.path.exists(c.output('47+63.sbt.zip')) + assert os.path.exists(c.output("47+63.sbt.zip")) # make list w/query sig - query_list = c.output('query.list') - with open(query_list, 'wt') as fp: + query_list = c.output("query.list") + with open(query_list, "w") as fp: print(lca_db, file=fp) - cmd = 'multigather --query-from-file {} --db 47+63.sbt.zip -k 31 --threshold-bp=0'.format(query_list) - cmd = cmd.split(' ') + cmd = "multigather --query-from-file {} --db 47+63.sbt.zip -k 31 --threshold-bp=0".format( + query_list + ) + cmd = cmd.split(" ") c.run_sourmash(*cmd) out = c.last_result.out @@ -4311,36 +5116,40 @@ def test_multigather_metagenome_lca_query_from_file(c): err = c.last_result.err print(err) - assert 'conducted gather searches on 2 signatures' in err - assert 'the recovered matches hit 100.0% of the query' in out -# assert '5.1 Mbp 100.0% 64.9% 491c0a81' in out - assert '5.5 Mbp 100.0% 69.4% 491c0a81' in out + assert "conducted gather searches on 2 signatures" in err + assert "the recovered 
matches hit 100.0% of the query" in out + # assert '5.1 Mbp 100.0% 64.9% 491c0a81' in out + assert "5.5 Mbp 100.0% 69.4% 491c0a81" in out @utils.in_tempdir def test_multigather_metagenome_query_from_file_with_addl_query(c): # test multigather --query-from-file and --query too - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) c.run_sourmash(*cmd) - assert os.path.exists(c.output('gcf_all.sbt.zip')) + assert os.path.exists(c.output("gcf_all.sbt.zip")) # make list w/query sig - query_list = c.output('query.list') - with open(query_list, 'wt') as fp: + query_list = c.output("query.list") + with open(query_list, "w") as fp: print(query_sig, file=fp) - another_query = utils.get_test_data('gather/GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig') + another_query = utils.get_test_data( + "gather/GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig" + ) - cmd = 'multigather --query-from-file {} --query {} --db gcf_all -k 21 --threshold-bp=0'.format(query_list, another_query) - cmd = cmd.split(' ') + cmd = "multigather --query-from-file {} --query {} --db gcf_all -k 21 --threshold-bp=0".format( + query_list, another_query + ) + cmd = cmd.split(" ") c.run_sourmash(*cmd) out = c.last_result.out @@ -4349,42 +5158,67 @@ def test_multigather_metagenome_query_from_file_with_addl_query(c): print(err) # first gather query - assert 'found 12 matches total' in out - assert 'the recovered matches hit 100.0% of the query' in out - assert all(('4.9 Mbp 33.2% 100.0%' in out, - 'NC_003198.1 Salmonella enterica subsp' in out)) - assert all(('4.7 Mbp 0.5% 1.5%' in out, - 'NC_011294.1 Salmonella enterica subsp' in out)) + assert "found 12 matches total" in out + assert "the recovered matches hit 100.0% of the query" in out + assert all( + ( + "4.9 Mbp 33.2% 100.0%" in out, + "NC_003198.1 Salmonella enterica subsp" in out, + ) + ) + assert all( + ( + "4.7 Mbp 0.5% 1.5%" in out, + "NC_011294.1 Salmonella enterica subsp" in out, + ) + ) # second gather query - assert '4.9 Mbp 100.0% 100.0% NC_003198.1 Salmonella enterica subsp' in out - assert 'found 1 matches total;' in out - assert 'the recovered matches hit 100.0% of the query' in out + assert "4.9 Mbp 100.0% 100.0% NC_003198.1 Salmonella enterica subsp" in out + assert "found 1 matches total;" in out + assert "the recovered matches hit 100.0% of the query" in out def test_gather_metagenome_traverse(runtmp, linear_gather, prefetch_gather): # set up a directory $location/gather that contains # everything in the 'tests/test-data/gather' directory # *except* the query sequence, which is 'combined.sig'. 
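# (Aside -- a sketch, not part of the patch: handing `sourmash gather` a
# directory makes it traverse and load every signature file inside, much as
# if each .sig were loaded by hand. Paths and ksize below are illustrative.)

import glob

import sourmash

found_sigs = [
    sourmash.load_one_signature(path, ksize=21)  # one sketch per .sig file
    for path in glob.glob("somesigs/*.sig")  # hypothetical copied directory
]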
- testdata_dir = utils.get_test_data('gather') - copy_testdata = runtmp.output('somesigs') + testdata_dir = utils.get_test_data("gather") + copy_testdata = runtmp.output("somesigs") shutil.copytree(testdata_dir, copy_testdata) - os.unlink(os.path.join(copy_testdata, 'combined.sig')) + os.unlink(os.path.join(copy_testdata, "combined.sig")) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") # now, feed in the new directory -- - runtmp.sourmash('gather', query_sig, copy_testdata, '-k', '21', '--threshold-bp=0', linear_gather, prefetch_gather) + runtmp.sourmash( + "gather", + query_sig, + copy_testdata, + "-k", + "21", + "--threshold-bp=0", + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'found 12 matches total' in runtmp.last_result.out - assert 'the recovered matches hit 100.0% of the query' in runtmp.last_result.out - assert all(('4.9 Mbp 33.2% 100.0%' in runtmp.last_result.out, - 'NC_003198.1 Salmonella enterica subsp' in runtmp.last_result.out)) - assert all(('4.7 Mbp 0.5% 1.5%' in runtmp.last_result.out, - 'NC_011294.1 Salmonella enterica subsp' in runtmp.last_result.out)) + assert "found 12 matches total" in runtmp.last_result.out + assert "the recovered matches hit 100.0% of the query" in runtmp.last_result.out + assert all( + ( + "4.9 Mbp 33.2% 100.0%" in runtmp.last_result.out, + "NC_003198.1 Salmonella enterica subsp" in runtmp.last_result.out, + ) + ) + assert all( + ( + "4.7 Mbp 0.5% 1.5%" in runtmp.last_result.out, + "NC_011294.1 Salmonella enterica subsp" in runtmp.last_result.out, + ) + ) def test_gather_metagenome_traverse_check_csv(runtmp, linear_gather, prefetch_gather): @@ -4394,300 +5228,483 @@ def test_gather_metagenome_traverse_check_csv(runtmp, linear_gather, prefetch_ga # set up a directory $location/gather that contains # everything in the 'tests/test-data/gather' directory # *except* the query sequence, which is 'combined.sig'. 
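# (Aside -- a sketch, not part of the patch: the CSV check below depends on
# gather writing, per row, the full path each match was loaded from. A
# standalone version of that provenance check; paths here are hypothetical.)

import csv

with open("out.csv", newline="") as fp:  # hypothetical `gather -o` output
    for row in csv.DictReader(fp):
        # every match should be traceable back to the file it was loaded from
        assert row["filename"].startswith("/tmp/somesigs")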
- testdata_dir = utils.get_test_data('gather') - copy_testdata = runtmp.output('somesigs') + testdata_dir = utils.get_test_data("gather") + copy_testdata = runtmp.output("somesigs") shutil.copytree(testdata_dir, copy_testdata) - os.unlink(os.path.join(copy_testdata, 'combined.sig')) + os.unlink(os.path.join(copy_testdata, "combined.sig")) - query_sig = utils.get_test_data('gather/combined.sig') - out_csv = runtmp.output('out.csv') + query_sig = utils.get_test_data("gather/combined.sig") + out_csv = runtmp.output("out.csv") # now, feed in the new directory -- - runtmp.sourmash('gather', query_sig, copy_testdata, '-k', '21', '--threshold-bp=0', '-o', out_csv, linear_gather, prefetch_gather) + runtmp.sourmash( + "gather", + query_sig, + copy_testdata, + "-k", + "21", + "--threshold-bp=0", + "-o", + out_csv, + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - with open(out_csv, 'rt') as fp: + with open(out_csv) as fp: prefix_len = len(copy_testdata) r = csv.DictReader(fp) for row in r: - filename = row['filename'] + filename = row["filename"] assert filename.startswith(copy_testdata), filename # should have full path to file sig was loaded from assert len(filename) > prefix_len - assert 'found 12 matches total' in runtmp.last_result.out - assert 'the recovered matches hit 100.0% of the query' in runtmp.last_result.out - assert all(('4.9 Mbp 33.2% 100.0%' in runtmp.last_result.out, - 'NC_003198.1 Salmonella enterica subsp' in runtmp.last_result.out)) - assert all(('4.7 Mbp 0.5% 1.5%' in runtmp.last_result.out, - 'NC_011294.1 Salmonella enterica subsp' in runtmp.last_result.out)) + assert "found 12 matches total" in runtmp.last_result.out + assert "the recovered matches hit 100.0% of the query" in runtmp.last_result.out + assert all( + ( + "4.9 Mbp 33.2% 100.0%" in runtmp.last_result.out, + "NC_003198.1 Salmonella enterica subsp" in runtmp.last_result.out, + ) + ) + assert all( + ( + "4.7 Mbp 0.5% 1.5%" in runtmp.last_result.out, + "NC_011294.1 Salmonella enterica subsp" in runtmp.last_result.out, + ) + ) @utils.in_tempdir def test_gather_traverse_incompatible(c): - searchdir = c.output('searchme') + searchdir = c.output("searchme") os.mkdir(searchdir) - num_sig = utils.get_test_data('num/47.fa.sig') - scaled_sig = utils.get_test_data('47.fa.sig') - shutil.copyfile(num_sig, c.output('searchme/num.sig')) - shutil.copyfile(scaled_sig, c.output('searchme/scaled.sig')) + num_sig = utils.get_test_data("num/47.fa.sig") + scaled_sig = utils.get_test_data("47.fa.sig") + shutil.copyfile(num_sig, c.output("searchme/num.sig")) + shutil.copyfile(scaled_sig, c.output("searchme/scaled.sig")) - c.run_sourmash("gather", scaled_sig, c.output('searchme')) + c.run_sourmash("gather", scaled_sig, c.output("searchme")) print(c.last_result.out) print(c.last_result.err) - assert "5.2 Mbp 100.0% 100.0% NC_009665.1 Shewanella baltica OS185," in c.last_result.out + assert ( + "5.2 Mbp 100.0% 100.0% NC_009665.1 Shewanella baltica OS185," + in c.last_result.out + ) def test_gather_metagenome_output_unassigned(runtmp): - testdata_glob = utils.get_test_data('gather/GCF_000195995*g') + testdata_glob = utils.get_test_data("gather/GCF_000195995*g") testdata_sigs = glob.glob(testdata_glob)[0] - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - runtmp.sourmash('gather', query_sig, testdata_sigs, '-k', '21', '--output-unassigned=unassigned.sig') + runtmp.sourmash( + "gather", + query_sig, + testdata_sigs, + "-k", + 
"21", + "--output-unassigned=unassigned.sig", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'found 1 matches total' in runtmp.last_result.out - assert 'the recovered matches hit 33.2% of the query' in runtmp.last_result.out - assert all(('4.9 Mbp 33.2% 100.0%' in runtmp.last_result.out, - 'NC_003198.1 Salmonella enterica subsp' in runtmp.last_result.out)) + assert "found 1 matches total" in runtmp.last_result.out + assert "the recovered matches hit 33.2% of the query" in runtmp.last_result.out + assert all( + ( + "4.9 Mbp 33.2% 100.0%" in runtmp.last_result.out, + "NC_003198.1 Salmonella enterica subsp" in runtmp.last_result.out, + ) + ) # now examine unassigned - testdata2_glob = utils.get_test_data('gather/GCF_000009505.1*.sig') + testdata2_glob = utils.get_test_data("gather/GCF_000009505.1*.sig") testdata2_sigs = glob.glob(testdata2_glob)[0] - runtmp.sourmash('gather', 'unassigned.sig', testdata_sigs, testdata2_sigs, '-k', '21') + runtmp.sourmash( + "gather", "unassigned.sig", testdata_sigs, testdata2_sigs, "-k", "21" + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert all(('1.3 Mbp 13.6% 28.2%' in runtmp.last_result.out, - 'NC_011294.1' in runtmp.last_result.out)) + assert all( + ( + "1.3 Mbp 13.6% 28.2%" in runtmp.last_result.out, + "NC_011294.1" in runtmp.last_result.out, + ) + ) def test_gather_metagenome_output_unassigned_as_zip(runtmp): - testdata_glob = utils.get_test_data('gather/GCF_000195995*g') + testdata_glob = utils.get_test_data("gather/GCF_000195995*g") testdata_sigs = glob.glob(testdata_glob)[0] - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - runtmp.sourmash('gather', query_sig, testdata_sigs, '-k', '21', '--output-unassigned=unassigned.sig.zip') + runtmp.sourmash( + "gather", + query_sig, + testdata_sigs, + "-k", + "21", + "--output-unassigned=unassigned.sig.zip", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'found 1 matches total' in runtmp.last_result.out - assert 'the recovered matches hit 33.2% of the query' in runtmp.last_result.out - assert all(('4.9 Mbp 33.2% 100.0%' in runtmp.last_result.out, - 'NC_003198.1 Salmonella enterica subsp' in runtmp.last_result.out)) + assert "found 1 matches total" in runtmp.last_result.out + assert "the recovered matches hit 33.2% of the query" in runtmp.last_result.out + assert all( + ( + "4.9 Mbp 33.2% 100.0%" in runtmp.last_result.out, + "NC_003198.1 Salmonella enterica subsp" in runtmp.last_result.out, + ) + ) - assert zipfile.is_zipfile(runtmp.output('unassigned.sig.zip')) + assert zipfile.is_zipfile(runtmp.output("unassigned.sig.zip")) # now examine unassigned - testdata2_glob = utils.get_test_data('gather/GCF_000009505.1*.sig') + testdata2_glob = utils.get_test_data("gather/GCF_000009505.1*.sig") testdata2_sigs = glob.glob(testdata2_glob)[0] - runtmp.sourmash('gather', 'unassigned.sig.zip', testdata_sigs, testdata2_sigs, '-k', '21') + runtmp.sourmash( + "gather", "unassigned.sig.zip", testdata_sigs, testdata2_sigs, "-k", "21" + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert all(('1.3 Mbp 13.6% 28.2%' in runtmp.last_result.out, - 'NC_011294.1' in runtmp.last_result.out)) + assert all( + ( + "1.3 Mbp 13.6% 28.2%" in runtmp.last_result.out, + "NC_011294.1" in runtmp.last_result.out, + ) + ) def test_gather_metagenome_output_unassigned_none(runtmp): # test what happens when there's nothing unassigned to output - testdata_glob = 
utils.get_test_data('gather/GCF_*.sig') + testdata_glob = utils.get_test_data("gather/GCF_*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - runtmp.sourmash('gather', query_sig, *testdata_sigs, '-k', '21', '--output-unassigned=unassigned.sig', '--threshold=0') + runtmp.sourmash( + "gather", + query_sig, + *testdata_sigs, + "-k", + "21", + "--output-unassigned=unassigned.sig", + "--threshold=0", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'found 12 matches total' in runtmp.last_result.out - assert 'the recovered matches hit 100.0% of the query' in runtmp.last_result.out - assert all(('4.9 Mbp 33.2% 100.0%' in runtmp.last_result.out, - 'NC_003198.1 Salmonella enterica subsp' in runtmp.last_result.out)) - assert all(('4.5 Mbp 0.1% 0.4%' in runtmp.last_result.out, - 'NC_004631.1 Salmonella enterica subsp' in runtmp.last_result.out)) + assert "found 12 matches total" in runtmp.last_result.out + assert "the recovered matches hit 100.0% of the query" in runtmp.last_result.out + assert all( + ( + "4.9 Mbp 33.2% 100.0%" in runtmp.last_result.out, + "NC_003198.1 Salmonella enterica subsp" in runtmp.last_result.out, + ) + ) + assert all( + ( + "4.5 Mbp 0.1% 0.4%" in runtmp.last_result.out, + "NC_004631.1 Salmonella enterica subsp" in runtmp.last_result.out, + ) + ) # now examine unassigned - assert not os.path.exists(runtmp.output('unassigned.sig')) - assert 'no unassigned hashes to save with --output-unassigned!' in runtmp.last_result.err + assert not os.path.exists(runtmp.output("unassigned.sig")) + assert ( + "no unassigned hashes to save with --output-unassigned!" + in runtmp.last_result.err + ) -def test_gather_metagenome_output_unassigned_nomatches(runtmp, prefetch_gather, linear_gather): +def test_gather_metagenome_output_unassigned_nomatches( + runtmp, prefetch_gather, linear_gather +): c = runtmp # test --output-unassigned when there are no matches - query_sig = utils.get_test_data('2.fa.sig') - against_sig = utils.get_test_data('47.fa.sig') - - c.run_sourmash('gather', query_sig, against_sig, - '--output-unassigned', 'foo.sig', linear_gather, - prefetch_gather) + query_sig = utils.get_test_data("2.fa.sig") + against_sig = utils.get_test_data("47.fa.sig") + + c.run_sourmash( + "gather", + query_sig, + against_sig, + "--output-unassigned", + "foo.sig", + linear_gather, + prefetch_gather, + ) print(c.last_result.out) assert "No matches found for --threshold-bp at 50.0 kbp." 
in c.last_result.err x = sourmash.load_one_signature(query_sig, ksize=31) - y = sourmash.load_one_signature(c.output('foo.sig')) + y = sourmash.load_one_signature(c.output("foo.sig")) assert x.minhash == y.minhash -def test_gather_metagenome_output_unassigned_nomatches_protein(runtmp, linear_gather, prefetch_gather): +def test_gather_metagenome_output_unassigned_nomatches_protein( + runtmp, linear_gather, prefetch_gather +): c = runtmp # test --output-unassigned with protein signatures - query_sig = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - against_sig = utils.get_test_data('prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig') - - c.run_sourmash('gather', query_sig, against_sig, - '--output-unassigned', 'foo.sig', linear_gather, - prefetch_gather) + query_sig = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) + against_sig = utils.get_test_data( + "prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig" + ) + + c.run_sourmash( + "gather", + query_sig, + against_sig, + "--output-unassigned", + "foo.sig", + linear_gather, + prefetch_gather, + ) print(c.last_result.out) assert "No matches found for --threshold-bp at 50.0 kbp." in c.last_result.err - c.run_sourmash('sig', 'describe', c.output('foo.sig')) + c.run_sourmash("sig", "describe", c.output("foo.sig")) print(c.last_result.out) x = sourmash.load_one_signature(query_sig, ksize=57) - y = sourmash.load_one_signature(c.output('foo.sig')) + y = sourmash.load_one_signature(c.output("foo.sig")) assert x.minhash == y.minhash assert y.minhash.moltype == "protein" def test_gather_check_scaled_bounds_negative(runtmp, prefetch_gather, linear_gather): - testdata_glob = utils.get_test_data('gather/GCF*.sig') - testdata_sigs = glob.glob(testdata_glob) + testdata_glob = utils.get_test_data("gather/GCF*.sig") + glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('gather', query_sig, prefetch_gather, linear_gather, 'gcf_all', '-k', '21', '--scaled', '-5', '--threshold-bp', '50000') + runtmp.sourmash( + "gather", + query_sig, + prefetch_gather, + linear_gather, + "gcf_all", + "-k", + "21", + "--scaled", + "-5", + "--threshold-bp", + "50000", + ) assert "ERROR: scaled value must be positive" in runtmp.last_result.err -def test_gather_check_scaled_bounds_less_than_minimum(runtmp, prefetch_gather, linear_gather): - testdata_glob = utils.get_test_data('gather/GCF*.sig') - testdata_sigs = glob.glob(testdata_glob) +def test_gather_check_scaled_bounds_less_than_minimum( + runtmp, prefetch_gather, linear_gather +): + testdata_glob = utils.get_test_data("gather/GCF*.sig") + glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('gather', query_sig, prefetch_gather, linear_gather, 'gcf_all', '-k', '21', '--scaled', '50', '--threshold-bp', '50000') - - assert "WARNING: scaled value should be >= 100. Continuing anyway." 
in runtmp.last_result.err - - -def test_gather_check_scaled_bounds_more_than_maximum(runtmp, prefetch_gather, linear_gather): - testdata_glob = utils.get_test_data('gather/GCF*.sig') - testdata_sigs = glob.glob(testdata_glob) - - query_sig = utils.get_test_data('gather/combined.sig') + runtmp.sourmash( + "gather", + query_sig, + prefetch_gather, + linear_gather, + "gcf_all", + "-k", + "21", + "--scaled", + "50", + "--threshold-bp", + "50000", + ) + + assert ( + "WARNING: scaled value should be >= 100. Continuing anyway." + in runtmp.last_result.err + ) + + +def test_gather_check_scaled_bounds_more_than_maximum( + runtmp, prefetch_gather, linear_gather +): + testdata_glob = utils.get_test_data("gather/GCF*.sig") + glob.glob(testdata_glob) + + query_sig = utils.get_test_data("gather/combined.sig") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('gather', query_sig, prefetch_gather, linear_gather, '-k', '21', '--scaled', '1e9', '--threshold-bp', '50000') - - assert "WARNING: scaled value should be <= 1e6. Continuing anyway." in runtmp.last_result.err + runtmp.sourmash( + "gather", + query_sig, + prefetch_gather, + linear_gather, + "-k", + "21", + "--scaled", + "1e9", + "--threshold-bp", + "50000", + ) + + assert ( + "WARNING: scaled value should be <= 1e6. Continuing anyway." + in runtmp.last_result.err + ) def test_gather_metagenome_downsample(runtmp, prefetch_gather, linear_gather): # downsample w/scaled of 100,000 - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) runtmp.sourmash(*cmd) - assert os.path.exists(runtmp.output('gcf_all.sbt.zip')) - - runtmp.sourmash('gather', query_sig, 'gcf_all', '-k', '21', '--scaled', '100000', prefetch_gather, linear_gather, '--threshold-bp', '50000') + assert os.path.exists(runtmp.output("gcf_all.sbt.zip")) + + runtmp.sourmash( + "gather", + query_sig, + "gcf_all", + "-k", + "21", + "--scaled", + "100000", + prefetch_gather, + linear_gather, + "--threshold-bp", + "50000", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'found 11 matches total' in runtmp.last_result.out - assert 'the recovered matches hit 100.0% of the query' in runtmp.last_result.out - assert all(('5.2 Mbp 32.9% 100.0%' in runtmp.last_result.out, - 'NC_003198.1' in runtmp.last_result.out)) - assert all(('4.1 Mbp 0.6% 2.4%' in runtmp.last_result.out, - '4.1 Mbp 4.4% 17.1%' in runtmp.last_result.out)) + assert "found 11 matches total" in runtmp.last_result.out + assert "the recovered matches hit 100.0% of the query" in runtmp.last_result.out + assert all( + ( + "5.2 Mbp 32.9% 100.0%" in runtmp.last_result.out, + "NC_003198.1" in runtmp.last_result.out, + ) + ) + assert all( + ( + "4.1 Mbp 0.6% 2.4%" in runtmp.last_result.out, + "4.1 Mbp 4.4% 17.1%" in runtmp.last_result.out, + ) + ) def test_gather_query_downsample(runtmp, linear_gather, prefetch_gather): - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) print(testdata_sigs) - query_sig = utils.get_test_data('GCF_000006945.2-s500.sig') + query_sig = utils.get_test_data("GCF_000006945.2-s500.sig") - runtmp.sourmash('gather', '-k', '31', linear_gather, 
prefetch_gather, query_sig, *testdata_sigs) + runtmp.sourmash( + "gather", "-k", "31", linear_gather, prefetch_gather, query_sig, *testdata_sigs + ) print(runtmp.last_result.out) print(runtmp.last_result.err) err = runtmp.last_result.err - assert 'loaded 36 total signatures from 12 locations.' in err - assert 'after selecting signatures compatible with search, 12 remain.' in err + assert "loaded 36 total signatures from 12 locations." in err + assert "after selecting signatures compatible with search, 12 remain." in err - assert all(('4.9 Mbp 100.0% 100.0%' in runtmp.last_result.out, - 'NC_003197.2' in runtmp.last_result.out)) + assert all( + ( + "4.9 Mbp 100.0% 100.0%" in runtmp.last_result.out, + "NC_003197.2" in runtmp.last_result.out, + ) + ) - assert 'WARNING: final scaled was 10000, vs query scaled of 500' in runtmp.last_result.out + assert ( + "WARNING: final scaled was 10000, vs query scaled of 500" + in runtmp.last_result.out + ) def test_gather_query_downsample_explicit(runtmp, linear_gather, prefetch_gather): # do an explicit downsampling to fix `test_gather_query_downsample` - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('GCF_000006945.2-s500.sig') + query_sig = utils.get_test_data("GCF_000006945.2-s500.sig") - runtmp.sourmash('gather', '-k', '31', '--scaled', '10000', linear_gather, prefetch_gather, query_sig, *testdata_sigs) + runtmp.sourmash( + "gather", + "-k", + "31", + "--scaled", + "10000", + linear_gather, + prefetch_gather, + query_sig, + *testdata_sigs, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) err = runtmp.last_result.err - assert 'loaded 36 total signatures from 12 locations.' in err - assert 'after selecting signatures compatible with search, 12 remain.' in err + assert "loaded 36 total signatures from 12 locations." in err + assert "after selecting signatures compatible with search, 12 remain." in err - assert all(('4.9 Mbp 100.0% 100.0%' in runtmp.last_result.out, - 'NC_003197.2' in runtmp.last_result.out)) + assert all( + ( + "4.9 Mbp 100.0% 100.0%" in runtmp.last_result.out, + "NC_003197.2" in runtmp.last_result.out, + ) + ) def test_gather_downsample_multiple(runtmp, linear_gather, prefetch_gather): # test multiple different downsamplings in gather code - query_sig = utils.get_test_data('GCF_000006945.2-s500.sig') + query_sig = utils.get_test_data("GCF_000006945.2-s500.sig") # load in the hashes and split them into four bins, randomly.
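# (Aside -- a sketch, not part of the patch, of the downsampling this test
# forces: the query is scaled=500, the search side ends up coarser, and
# gather must unify them. Assumes sourmash's MinHash.downsample API; the
# path is illustrative.)

import sourmash

sig = sourmash.load_one_signature("GCF_000006945.2-s500.sig")  # scaled=500 query
mh_coarse = sig.minhash.downsample(scaled=1000)  # keep only hashes below the coarser cutoff
assert set(mh_coarse.hashes).issubset(set(sig.minhash.hashes))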
ss = sourmash.load_one_signature(query_sig) hashes = list(ss.minhash.hashes) - random.seed(a=1) # fix seed so test is reproducible + random.seed(a=1) # fix seed so test is reproducible random.shuffle(hashes) # split into 4 bins: - mh_bins = [ ss.minhash.copy_and_clear() for i in range(4) ] + mh_bins = [ss.minhash.copy_and_clear() for i in range(4)] for i, hashval in enumerate(hashes): mh_bins[i % 4].add_hash(hashval) @@ -4706,23 +5723,37 @@ def test_gather_downsample_multiple(runtmp, linear_gather, prefetch_gather): gathersigs.append(f"bin{i}.sig") - runtmp.sourmash('gather', '-k', '31', linear_gather, prefetch_gather, query_sig, *gathersigs) + runtmp.sourmash( + "gather", "-k", "31", linear_gather, prefetch_gather, query_sig, *gathersigs + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert "WARNING: final scaled was 1000, vs query scaled of 500" in runtmp.last_result.out + assert ( + "WARNING: final scaled was 1000, vs query scaled of 500" + in runtmp.last_result.out + ) def test_gather_with_picklist(runtmp, linear_gather, prefetch_gather): # test 'sourmash gather' with picklists - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') - - runtmp.sourmash('gather', metag_sig, *gcf_sigs, '--threshold-bp=0', - '-k', '21', '--picklist', f"{picklist}:md5:md5", - linear_gather, prefetch_gather) + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") + + runtmp.sourmash( + "gather", + metag_sig, + *gcf_sigs, + "--threshold-bp=0", + "-k", + "21", + "--picklist", + f"{picklist}:md5:md5", + linear_gather, + prefetch_gather, + ) err = runtmp.last_result.err print(err) @@ -4740,13 +5771,22 @@ def test_gather_with_picklist(runtmp, linear_gather, prefetch_gather): def test_gather_with_picklist_exclude(runtmp, linear_gather, prefetch_gather): # test 'sourmash gather' with picklists - exclude - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') - - runtmp.sourmash('gather', metag_sig, *gcf_sigs, '--threshold-bp=0', - '-k', '21', '--picklist', f"{picklist}:md5:md5:exclude", - linear_gather, prefetch_gather) + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") + + runtmp.sourmash( + "gather", + metag_sig, + *gcf_sigs, + "--threshold-bp=0", + "-k", + "21", + "--picklist", + f"{picklist}:md5:md5:exclude", + linear_gather, + prefetch_gather, + ) err = runtmp.last_result.err print(err) @@ -4769,12 +5809,21 @@ def test_gather_with_picklist_exclude(runtmp, linear_gather, prefetch_gather): def test_gather_with_pattern_include(runtmp, linear_gather, prefetch_gather): # test 'sourmash gather' with --include-db-pattern - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - - runtmp.sourmash('gather', metag_sig, *gcf_sigs, '--threshold-bp=0', - '-k', '21', '--include', "thermotoga", - linear_gather, prefetch_gather) + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + + runtmp.sourmash( + "gather", + metag_sig, + *gcf_sigs, + 
"--threshold-bp=0", + "-k", + "21", + "--include", + "thermotoga", + linear_gather, + prefetch_gather, + ) err = runtmp.last_result.err print(err) @@ -4789,12 +5838,21 @@ def test_gather_with_pattern_include(runtmp, linear_gather, prefetch_gather): def test_gather_with_pattern_exclude(runtmp, linear_gather, prefetch_gather): # test 'sourmash gather' with --exclude - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - - runtmp.sourmash('gather', metag_sig, *gcf_sigs, '--threshold-bp=0', - '-k', '21', '--exclude', "thermotoga", - linear_gather, prefetch_gather) + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + + runtmp.sourmash( + "gather", + metag_sig, + *gcf_sigs, + "--threshold-bp=0", + "-k", + "21", + "--exclude", + "thermotoga", + linear_gather, + prefetch_gather, + ) err = runtmp.last_result.err print(err) @@ -4814,53 +5872,78 @@ def test_gather_with_pattern_exclude(runtmp, linear_gather, prefetch_gather): def test_gather_save_matches(runtmp, linear_gather, prefetch_gather): - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) runtmp.sourmash(*cmd) - assert os.path.exists(runtmp.output('gcf_all.sbt.zip')) - - runtmp.sourmash('gather', query_sig, 'gcf_all', '-k', '21', '--save-matches', 'save.sigs', linear_gather, prefetch_gather, '--threshold-bp', '0') + assert os.path.exists(runtmp.output("gcf_all.sbt.zip")) + + runtmp.sourmash( + "gather", + query_sig, + "gcf_all", + "-k", + "21", + "--save-matches", + "save.sigs", + linear_gather, + prefetch_gather, + "--threshold-bp", + "0", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'found 12 matches total' in runtmp.last_result.out - assert 'the recovered matches hit 100.0% of the query' in runtmp.last_result.out - assert os.path.exists(runtmp.output('save.sigs')) + assert "found 12 matches total" in runtmp.last_result.out + assert "the recovered matches hit 100.0% of the query" in runtmp.last_result.out + assert os.path.exists(runtmp.output("save.sigs")) def test_gather_save_matches_and_save_prefetch(runtmp, linear_gather): - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - query_sig = utils.get_test_data('gather/combined.sig') + query_sig = utils.get_test_data("gather/combined.sig") - cmd = ['index', 'gcf_all'] + cmd = ["index", "gcf_all"] cmd.extend(testdata_sigs) - cmd.extend(['-k', '21']) + cmd.extend(["-k", "21"]) runtmp.sourmash(*cmd) - assert os.path.exists(runtmp.output('gcf_all.sbt.zip')) - - runtmp.sourmash('gather', query_sig, 'gcf_all', '-k', '21', '--save-matches', 'save.sigs', '--save-prefetch', 'save2.sigs', linear_gather, '--threshold-bp', '0') + assert os.path.exists(runtmp.output("gcf_all.sbt.zip")) + + runtmp.sourmash( + "gather", + query_sig, + "gcf_all", + "-k", + "21", + "--save-matches", + "save.sigs", + "--save-prefetch", + "save2.sigs", + linear_gather, + "--threshold-bp", + "0", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'found 12 matches total' in runtmp.last_result.out - 
assert 'the recovered matches hit 100.0% of the query' in runtmp.last_result.out + assert "found 12 matches total" in runtmp.last_result.out + assert "the recovered matches hit 100.0% of the query" in runtmp.last_result.out - matches_save = runtmp.output('save.sigs') - prefetch_save = runtmp.output('save2.sigs') + matches_save = runtmp.output("save.sigs") + prefetch_save = runtmp.output("save2.sigs") assert os.path.exists(matches_save) assert os.path.exists(prefetch_save) @@ -4873,12 +5956,14 @@ def test_gather_save_matches_and_save_prefetch(runtmp, linear_gather): @utils.in_tempdir def test_gather_error_no_sigs_traverse(c): # test gather applied to a directory - query = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') + query = utils.get_test_data( + "prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig" + ) - emptydir = c.output('') + emptydir = c.output("") - with pytest.raises(SourmashCommandFailed) as e: - c.run_sourmash('gather', query, emptydir) + with pytest.raises(SourmashCommandFailed): + c.run_sourmash("gather", query, emptydir) err = c.last_result.err print(err) @@ -4886,65 +5971,85 @@ def test_gather_error_no_sigs_traverse(c): def test_gather_error_no_cardinality_query(runtmp, linear_gather, prefetch_gather): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'dna', '-p', 'k=31,num=500', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=31,num=500", testdata1, testdata2) - testdata3 = utils.get_test_data('short3.fa') + testdata3 = utils.get_test_data("short3.fa") - runtmp.sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata3) + runtmp.sourmash("sketch", "translate", "-p", "k=31,num=500", testdata3) - runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("index", "zzz", "short.fa.sig", "short2.fa.sig") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('gather', 'short3.fa.sig', 'zzz', linear_gather, prefetch_gather) + runtmp.sourmash( + "gather", "short3.fa.sig", "zzz", linear_gather, prefetch_gather + ) assert runtmp.last_result.status == -1 assert "query signature needs to be created with --scaled" in runtmp.last_result.err def test_gather_deduce_ksize(runtmp, prefetch_gather, linear_gather): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'dna', '-p', 'k=23,scaled=10', testdata1, testdata2) + runtmp.sourmash("sketch", "dna", "-p", "k=23,scaled=10", testdata1, testdata2) - runtmp.sourmash('sketch','dna','-p','k=23,scaled=10', '-o', 'query.fa.sig', testdata2) + runtmp.sourmash( + "sketch", "dna", "-p", "k=23,scaled=10", "-o", "query.fa.sig", testdata2 + ) - runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("index", "zzz", "short.fa.sig", "short2.fa.sig") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('gather', 'query.fa.sig', 'zzz', prefetch_gather, linear_gather, '--threshold-bp=1') + runtmp.sourmash( + "gather", + "query.fa.sig", + "zzz", + prefetch_gather, + linear_gather, + "--threshold-bp=1", + ) print(runtmp.last_result.out) 
print(runtmp.last_result.err) - assert '0.9 kbp 100.0% 100.0%' in runtmp.last_result.out + assert "0.9 kbp 100.0% 100.0%" in runtmp.last_result.out def test_gather_deduce_moltype(runtmp, linear_gather, prefetch_gather): # gather should automatically figure out ksize - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") - runtmp.sourmash('sketch', 'translate', '-p', 'k=10,scaled=10', testdata1,testdata2) + runtmp.sourmash("sketch", "translate", "-p", "k=10,scaled=10", testdata1, testdata2) - runtmp.sourmash('sketch', 'translate', '-p', 'k=10,scaled=10', '-o', 'query.fa.sig',testdata2) + runtmp.sourmash( + "sketch", "translate", "-p", "k=10,scaled=10", "-o", "query.fa.sig", testdata2 + ) - runtmp.sourmash('index', 'zzz', 'short.fa.sig', 'short2.fa.sig') + runtmp.sourmash("index", "zzz", "short.fa.sig", "short2.fa.sig") - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('gather', 'query.fa.sig', 'zzz', linear_gather, prefetch_gather, '--threshold-bp=1') + runtmp.sourmash( + "gather", + "query.fa.sig", + "zzz", + linear_gather, + prefetch_gather, + "--threshold-bp=1", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert '1.9 kbp 100.0% 100.0%' in runtmp.last_result.out + assert "1.9 kbp 100.0% 100.0%" in runtmp.last_result.out def test_gather_abund_1_1(runtmp, linear_gather, prefetch_gather): @@ -4966,14 +6071,14 @@ def test_gather_abund_1_1(runtmp, linear_gather, prefetch_gather): # ./sourmash compute -k 21 --scaled 1000 --merge=1-1 -o reads-s10-s11.sig r[13].fa --track-abundance # ./sourmash compute -k 21 --scaled 1000 --merge=10-1 -o reads-s10x10-s11.sig r[23].fa --track-abundance - query = utils.get_test_data('gather-abund/reads-s10-s11.sig') - against_list = ['genome-s10', 'genome-s11', 'genome-s12'] - against_list = ['gather-abund/' + i + '.fa.gz.sig' - for i in against_list] + query = utils.get_test_data("gather-abund/reads-s10-s11.sig") + against_list = ["genome-s10", "genome-s11", "genome-s12"] + against_list = ["gather-abund/" + i + ".fa.gz.sig" for i in against_list] against_list = [utils.get_test_data(i) for i in against_list] - status, out, err = c.run_sourmash('gather', query, *against_list, - linear_gather, prefetch_gather) + status, out, err = c.run_sourmash( + "gather", query, *against_list, linear_gather, prefetch_gather + ) print(out) print(err) @@ -4985,9 +6090,9 @@ def test_gather_abund_1_1(runtmp, linear_gather, prefetch_gather): # (this is due to the low coverage of 2 used to build queries) # * approximately 2.0 abundance (third column, avg_abund) - assert '49.6% 78.5% 1.8 tests/test-data/genome-s10.fa.gz' in out - assert '50.4% 80.0% 1.9 tests/test-data/genome-s11.fa.gz' in out - assert 'genome-s12.fa.gz' not in out + assert "49.6% 78.5% 1.8 tests/test-data/genome-s10.fa.gz" in out + assert "50.4% 80.0% 1.9 tests/test-data/genome-s11.fa.gz" in out + assert "genome-s12.fa.gz" not in out assert "the recovered matches hit 100.0% of the abundance-weighted query" in out assert "the recovered matches hit 100.0% of the query k-mers (unweighted)" in out @@ -5003,15 +6108,14 @@ def test_gather_abund_10_1(runtmp, prefetch_gather, linear_gather): # ./sourmash compute -k 21 --scaled 1000 --merge=1-1 -o reads-s10-s11.sig r[13].fa --track-abundance # ./sourmash compute -k 21 --scaled 1000 --merge=10-1 -o reads-s10x10-s11.sig r[23].fa --track-abundance - query = 
utils.get_test_data('gather-abund/reads-s10x10-s11.sig') - against_list = ['genome-s10', 'genome-s11', 'genome-s12'] - against_list = ['gather-abund/' + i + '.fa.gz.sig' - for i in against_list] + query = utils.get_test_data("gather-abund/reads-s10x10-s11.sig") + against_list = ["genome-s10", "genome-s11", "genome-s12"] + against_list = ["gather-abund/" + i + ".fa.gz.sig" for i in against_list] against_list = [utils.get_test_data(i) for i in against_list] - status, out, err = c.run_sourmash('gather', query, '-o', 'xxx.csv', - *against_list, linear_gather, - prefetch_gather) + status, out, err = c.run_sourmash( + "gather", query, "-o", "xxx.csv", *against_list, linear_gather, prefetch_gather + ) print(out) print(err) @@ -5025,14 +6129,14 @@ def test_gather_abund_10_1(runtmp, prefetch_gather, linear_gather): # * approximately 2.0 abundance (third column, avg_abund) for s11, # and (very) approximately 20x abundance for genome s10. - assert '91.0% 100.0% 14.5 tests/test-data/genome-s10.fa.gz' in out - assert '9.0% 80.0% 1.9 tests/test-data/genome-s11.fa.gz' in out - assert 'genome-s12.fa.gz' not in out + assert "91.0% 100.0% 14.5 tests/test-data/genome-s10.fa.gz" in out + assert "9.0% 80.0% 1.9 tests/test-data/genome-s11.fa.gz" in out + assert "genome-s12.fa.gz" not in out assert "the recovered matches hit 100.0% of the abundance-weighted query" in out # check the calculations behind the above output by looking into # the CSV. - with open(c.output('xxx.csv'), 'rt') as fp: + with open(c.output("xxx.csv")) as fp: r = csv.DictReader(fp) overlaps = [] @@ -5046,14 +6150,14 @@ def test_gather_abund_10_1(runtmp, prefetch_gather, linear_gather): total_weighted_list = [] for n, row in enumerate(r): - assert int(row['gather_result_rank']) == n + assert int(row["gather_result_rank"]) == n # other than f_weighted, these are all 'flat' numbers - no abunds. - overlap = float(row['intersect_bp']) - remaining_bp = float(row['remaining_bp']) - unique_overlap = float(row['unique_intersect_bp']) - f_weighted = float(row['f_unique_weighted']) - average_abund = float(row['average_abund']) + overlap = float(row["intersect_bp"]) + remaining_bp = float(row["remaining_bp"]) + unique_overlap = float(row["unique_intersect_bp"]) + f_weighted = float(row["f_unique_weighted"]) + average_abund = float(row["average_abund"]) overlaps.append(overlap) unique_overlaps.append(unique_overlap) @@ -5062,14 +6166,14 @@ def test_gather_abund_10_1(runtmp, prefetch_gather, linear_gather): remaining_bps.append(remaining_bp) # also track weighted calculations - n_weighted_list.append(float(row['n_unique_weighted_found'])) - sum_weighted_list.append(float(row['sum_weighted_found'])) - total_weighted_list.append(float(row['total_weighted_hashes'])) + n_weighted_list.append(float(row["n_unique_weighted_found"])) + sum_weighted_list.append(float(row["sum_weighted_found"])) + total_weighted_list.append(float(row["total_weighted_hashes"])) weighted_calc = [] - for (overlap, average_abund) in zip(overlaps, average_abunds): - prod = overlap*average_abund - weighted_calc.append(prod) # @CTB redundant terms with below? + for overlap, average_abund in zip(overlaps, average_abunds): + prod = overlap * average_abund + weighted_calc.append(prod) # @CTB redundant terms with below? 
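    # Each prod above is intersect_bp * average_abund, an estimate of the
    # abundance-weighted bases covered by that match; the loop below compares
    # these products, normalized by their sum, against the reported
    # f_unique_weighted values (hence the question about redundant terms).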
total_weighted = sum(weighted_calc) for prod, f_weighted in zip(weighted_calc, f_weighted_list): @@ -5103,6 +6207,7 @@ def test_gather_abund_10_1(runtmp, prefetch_gather, linear_gather): f_weighted = f_weighted_list[i] assert f_weighted == n_weighted / 7986 + def test_gather_abund_10_1_ignore_abundance(runtmp, linear_gather, prefetch_gather): # check gather with an abundance-weighted query, then flattened with # --ignore-abund @@ -5115,18 +6220,21 @@ def test_gather_abund_10_1_ignore_abundance(runtmp, linear_gather, prefetch_gath # ./sourmash compute -k 21 --scaled 1000 --merge=1-1 -o reads-s10-s11.sig r[13].fa --track-abundance # ./sourmash compute -k 21 --scaled 1000 --merge=10-1 -o reads-s10x10-s11.sig r[23].fa --track-abundance - query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig') - against_list = ['genome-s10', 'genome-s11', 'genome-s12'] - against_list = ['gather-abund/' + i + '.fa.gz.sig' - for i in against_list] + query = utils.get_test_data("gather-abund/reads-s10x10-s11.sig") + against_list = ["genome-s10", "genome-s11", "genome-s12"] + against_list = ["gather-abund/" + i + ".fa.gz.sig" for i in against_list] against_list = [utils.get_test_data(i) for i in against_list] - status, out, err = c.run_sourmash('gather', query, - '--ignore-abundance', - *against_list, - linear_gather, prefetch_gather, - '-o', c.output('results.csv')) - + status, out, err = c.run_sourmash( + "gather", + query, + "--ignore-abundance", + *against_list, + linear_gather, + prefetch_gather, + "-o", + c.output("results.csv"), + ) print(out) print(err) @@ -5139,38 +6247,47 @@ def test_gather_abund_10_1_ignore_abundance(runtmp, linear_gather, prefetch_gath # * approximately 100% of the high coverage genome being matched, # with only 80% of the low coverage genome - assert all(('57.2% 100.0%', 'tests/test-data/genome-s10.fa.gz' in out)) - assert all(('42.8% 80.0%', 'tests/test-data/genome-s11.fa.gz' in out)) - assert 'genome-s12.fa.gz' not in out + assert all(("57.2% 100.0%", "tests/test-data/genome-s10.fa.gz" in out)) + assert all(("42.8% 80.0%", "tests/test-data/genome-s11.fa.gz" in out)) + assert "genome-s12.fa.gz" not in out - with open(c.output('results.csv'), 'rt') as fp: + with open(c.output("results.csv")) as fp: r = csv.DictReader(fp) some_results = False for row in r: some_results = True - assert row['average_abund'] == '' - assert row['median_abund'] == '' - assert row['std_abund'] == '' + assert row["average_abund"] == "" + assert row["median_abund"] == "" + assert row["std_abund"] == "" - assert row['query_abundance'] == 'False', row['query_abundance'] - assert row['n_unique_weighted_found'] == '' + assert row["query_abundance"] == "False", row["query_abundance"] + assert row["n_unique_weighted_found"] == "" assert some_results -def test_gather_output_unassigned_with_abundance(runtmp, prefetch_gather, linear_gather): +def test_gather_output_unassigned_with_abundance( + runtmp, prefetch_gather, linear_gather +): # check --output-unassigned with an abund query # @CTB: could add check on sum weighted etc. 
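    # One possible shape for that check, as a hypothetical sketch (not run
    # here): it assumes adding "-o", c.output("results.csv") to the gather
    # call below, and reuses the CSV columns exercised in
    # test_gather_abund_10_1 plus the module-level csv import:
    #
    #   with open(c.output("results.csv")) as fp:
    #       rows = list(csv.DictReader(fp))
    #   # the cumulative weighted hashes found can never exceed the total:
    #   assert all(
    #       float(row["sum_weighted_found"]) <= float(row["total_weighted_hashes"])
    #       for row in rows
    #   )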
c = runtmp - query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig') - against = utils.get_test_data('gather-abund/genome-s10.fa.gz.sig') - - c.run_sourmash('gather', query, against, '--output-unassigned', - c.output('unassigned.sig'), linear_gather, prefetch_gather) - - assert os.path.exists(c.output('unassigned.sig')) - - nomatch = sourmash.load_one_signature(c.output('unassigned.sig')) + query = utils.get_test_data("gather-abund/reads-s10x10-s11.sig") + against = utils.get_test_data("gather-abund/genome-s10.fa.gz.sig") + + c.run_sourmash( + "gather", + query, + against, + "--output-unassigned", + c.output("unassigned.sig"), + linear_gather, + prefetch_gather, + ) + + assert os.path.exists(c.output("unassigned.sig")) + + nomatch = sourmash.load_one_signature(c.output("unassigned.sig")) assert nomatch.minhash.track_abundance query_ss = sourmash.load_one_signature(query) @@ -5189,14 +6306,21 @@ def test_gather_output_unassigned_with_abundance(runtmp, prefetch_gather, linear def test_gather_empty_db_fail(runtmp, linear_gather, prefetch_gather): # gather should fail on empty db with --fail-on-empty-database - query = utils.get_test_data('2.fa.sig') - against = utils.get_test_data('47.fa.sig') - against2 = utils.get_test_data('lca/47+63.lca.json') + query = utils.get_test_data("2.fa.sig") + against = utils.get_test_data("47.fa.sig") + against2 = utils.get_test_data("lca/47+63.lca.json") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('gather', query, against, against2, '-k', '51', - linear_gather, prefetch_gather) - + runtmp.sourmash( + "gather", + query, + against, + against2, + "-k", + "51", + linear_gather, + prefetch_gather, + ) err = runtmp.last_result.err assert "no compatible signatures found in " in err @@ -5204,13 +6328,21 @@ def test_gather_empty_db_fail(runtmp, linear_gather, prefetch_gather): def test_gather_empty_db_nofail(runtmp, prefetch_gather, linear_gather): # gather should not fail on empty db with --no-fail-on-empty-database - query = utils.get_test_data('2.fa.sig') - against = utils.get_test_data('47.fa.sig') - against2 = utils.get_test_data('lca/47+63.lca.json') - - runtmp.sourmash('gather', query, against, against2, '-k', '51', - '--no-fail-on-empty-data', - linear_gather, prefetch_gather) + query = utils.get_test_data("2.fa.sig") + against = utils.get_test_data("47.fa.sig") + against2 = utils.get_test_data("lca/47+63.lca.json") + + runtmp.sourmash( + "gather", + query, + against, + against2, + "-k", + "51", + "--no-fail-on-empty-data", + linear_gather, + prefetch_gather, + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -5218,16 +6350,20 @@ def test_gather_empty_db_nofail(runtmp, prefetch_gather, linear_gather): print(err) assert "no compatible signatures found in " in err - assert "ksize on this database is 31; this is different from requested ksize of 51" in err + assert ( + "ksize on this database is 31; this is different from requested ksize of 51" + in err + ) assert "loaded 50 total signatures from 2 locations" in err assert "after selecting signatures compatible with search, 0 remain." 
in err + def test_multigather_output_unassigned_with_abundance(runtmp): c = runtmp - query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig') - against = utils.get_test_data('gather-abund/genome-s10.fa.gz.sig') + query = utils.get_test_data("gather-abund/reads-s10x10-s11.sig") + against = utils.get_test_data("gather-abund/genome-s10.fa.gz.sig") - cmd = 'multigather --query {} --db {}'.format(query, against).split() + cmd = f"multigather --query {query} --db {against}".split() c.run_sourmash(*cmd) print(c.last_result.out) @@ -5237,9 +6373,9 @@ def test_multigather_output_unassigned_with_abundance(runtmp): assert "the recovered matches hit 91.0% of the abundance-weighted query." in out assert "the recovered matches hit 57.2% of the query k-mers (unweighted)." in out - assert os.path.exists(c.output('r3.fa.unassigned.sig')) + assert os.path.exists(c.output("r3.fa.unassigned.sig")) - nomatch = sourmash.load_one_signature(c.output('r3.fa.unassigned.sig')) + nomatch = sourmash.load_one_signature(c.output("r3.fa.unassigned.sig")) assert nomatch.minhash.track_abundance query_ss = sourmash.load_one_signature(query) @@ -5258,13 +6394,14 @@ def test_multigather_output_unassigned_with_abundance(runtmp): def test_multigather_empty_db_fail(runtmp): # multigather should fail on empty db with --fail-on-empty-database - query = utils.get_test_data('2.fa.sig') - against = utils.get_test_data('47.fa.sig') - against2 = utils.get_test_data('lca/47+63.lca.json') + query = utils.get_test_data("2.fa.sig") + against = utils.get_test_data("47.fa.sig") + against2 = utils.get_test_data("lca/47+63.lca.json") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('multigather', '--query', query, - '--db', against, against2, '-k', '51') + runtmp.sourmash( + "multigather", "--query", query, "--db", against, against2, "-k", "51" + ) err = runtmp.last_result.err assert "no compatible signatures found in " in err @@ -5272,13 +6409,21 @@ def test_multigather_empty_db_fail(runtmp): def test_multigather_empty_db_nofail(runtmp): # multigather should not fail on empty db with --no-fail-on-empty-database - query = utils.get_test_data('2.fa.sig') - against = utils.get_test_data('47.fa.sig') - against2 = utils.get_test_data('lca/47+63.lca.json') - - runtmp.sourmash('multigather', '--query', query, - '--db', against, against2, '-k', '51', - '--no-fail-on-empty-data') + query = utils.get_test_data("2.fa.sig") + against = utils.get_test_data("47.fa.sig") + against2 = utils.get_test_data("lca/47+63.lca.json") + + runtmp.sourmash( + "multigather", + "--query", + query, + "--db", + against, + against2, + "-k", + "51", + "--no-fail-on-empty-data", + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -5286,7 +6431,10 @@ def test_multigather_empty_db_nofail(runtmp): print(err) assert "no compatible signatures found in " in err - assert "ksize on this database is 31; this is different from requested ksize of 51" in err + assert ( + "ksize on this database is 31; this is different from requested ksize of 51" + in err + ) assert "conducted gather searches on 0 signatures" in err assert "loaded 50 total signatures from 2 locations" in err assert "after selecting signatures compatible with search, 0 remain." 
in err @@ -5294,53 +6442,54 @@ def test_multigather_empty_db_nofail(runtmp): def test_multigather_nomatch(runtmp): testdata_query = utils.get_test_data( - 'gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig') - testdata_match = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') + "gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig" + ) + testdata_match = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") - runtmp.sourmash('multigather', '--query', testdata_query, - '--db', testdata_match, '-k', '31') + runtmp.sourmash( + "multigather", "--query", testdata_query, "--db", testdata_match, "-k", "31" + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'found 0 matches total' in runtmp.last_result.out - assert 'the recovered matches hit 0.0% of the query' in runtmp.last_result.out + assert "found 0 matches total" in runtmp.last_result.out + assert "the recovered matches hit 0.0% of the query" in runtmp.last_result.out def test_multigather_abund_nomatch(runtmp): - testdata_query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig') - testdata_match = utils.get_test_data('gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig') + testdata_query = utils.get_test_data("gather-abund/reads-s10x10-s11.sig") + testdata_match = utils.get_test_data( + "gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig" + ) - runtmp.sourmash('multigather', '--query', testdata_query, - '--db', testdata_match) + runtmp.sourmash("multigather", "--query", testdata_query, "--db", testdata_match) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'found 0 matches total' in runtmp.last_result.out - assert 'the recovered matches hit 0.0% of the query' in runtmp.last_result.out + assert "found 0 matches total" in runtmp.last_result.out + assert "the recovered matches hit 0.0% of the query" in runtmp.last_result.out def test_sbt_categorize(runtmp): - testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') - testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') - testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') - testdata4 = utils.get_test_data('genome-s10+s11.sig') + testdata1 = utils.get_test_data("genome-s10.fa.gz.sig") + testdata2 = utils.get_test_data("genome-s11.fa.gz.sig") + testdata3 = utils.get_test_data("genome-s12.fa.gz.sig") + testdata4 = utils.get_test_data("genome-s10+s11.sig") # all four in the current directory for categorize . 
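    # ("categorize zzz ." below classifies every signature found under the
    # given path, here the temp dir, against the index; that is why the four
    # test signatures are first copied in as 1.sig through 4.sig.)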
- shutil.copyfile(testdata1, runtmp.output('1.sig')) - shutil.copyfile(testdata2, runtmp.output('2.sig')) - shutil.copyfile(testdata3, runtmp.output('3.sig')) - shutil.copyfile(testdata4, runtmp.output('4.sig')) + shutil.copyfile(testdata1, runtmp.output("1.sig")) + shutil.copyfile(testdata2, runtmp.output("2.sig")) + shutil.copyfile(testdata3, runtmp.output("3.sig")) + shutil.copyfile(testdata4, runtmp.output("4.sig")) # omit 3 - args = ['index', '--dna', '-k', '21', 'zzz', '1.sig', '2.sig'] + args = ["index", "--dna", "-k", "21", "zzz", "1.sig", "2.sig"] runtmp.sourmash(*args) - # categorize all of the ones that were copied to 'location' - args = ['categorize', 'zzz', '.', - '--ksize', '21', '--dna', '--csv', 'out.csv'] + args = ["categorize", "zzz", ".", "--ksize", "21", "--dna", "--csv", "out.csv"] runtmp.sourmash(*args) print(runtmp.last_result.out) @@ -5348,27 +6497,34 @@ def test_sbt_categorize(runtmp): # mash dist genome-s10.fa.gz genome-s10+s11.fa.gz # yields 521/1000 ==> ~0.5 - assert 'for genome-s10+s11, found: 0.50 genome-s10' in runtmp.last_result.err + assert "for genome-s10+s11, found: 0.50 genome-s10" in runtmp.last_result.err - out_csv = Path(runtmp.output('out.csv')).read_text() + out_csv = Path(runtmp.output("out.csv")).read_text() print(out_csv) - assert '4.sig,genome-s10+s11,genome-s10,0.504' in out_csv + assert "4.sig,genome-s10+s11,genome-s10,0.504" in out_csv def test_sbt_categorize_ignore_abundance_1(runtmp): # --- Categorize without ignoring abundance --- - query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig') - against_list = ['reads-s10-s11'] - against_list = ['gather-abund/' + i + '.sig' - for i in against_list] + query = utils.get_test_data("gather-abund/reads-s10x10-s11.sig") + against_list = ["reads-s10-s11"] + against_list = ["gather-abund/" + i + ".sig" for i in against_list] against_list = [utils.get_test_data(i) for i in against_list] # omit 3 - args = ['index', '--dna', '-k', '21', 'thebestdatabase'] + against_list + args = ["index", "--dna", "-k", "21", "thebestdatabase"] + against_list runtmp.sourmash(*args) - args = ['categorize', 'thebestdatabase', - '--ksize', '21', '--dna', '--csv', 'out3.csv', query] + args = [ + "categorize", + "thebestdatabase", + "--ksize", + "21", + "--dna", + "--csv", + "out3.csv", + query, + ] with pytest.raises(SourmashCommandFailed): runtmp.sourmash(*args) @@ -5377,120 +6533,138 @@ def test_sbt_categorize_ignore_abundance_1(runtmp): print(runtmp.last_result.out) print(runtmp.last_result.err) - assert "ERROR: this search cannot be done on signatures calculated with abundance." in runtmp.last_result.err + assert ( + "ERROR: this search cannot be done on signatures calculated with abundance." + in runtmp.last_result.err + ) assert "ERROR: please specify --ignore-abundance." 
in runtmp.last_result.err def test_sbt_categorize_ignore_abundance_3(runtmp): # --- Now categorize with ignored abundance --- - query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig') - against_list = ['reads-s10-s11'] - against_list = ['gather-abund/' + i + '.sig' - for i in against_list] + query = utils.get_test_data("gather-abund/reads-s10x10-s11.sig") + against_list = ["reads-s10-s11"] + against_list = ["gather-abund/" + i + ".sig" for i in against_list] against_list = [utils.get_test_data(i) for i in against_list] # omit 3 - args = ['index', '--dna', '-k', '21', 'thebestdatabase'] + against_list + args = ["index", "--dna", "-k", "21", "thebestdatabase"] + against_list runtmp.sourmash(*args) - args = ['categorize', '--ignore-abundance', - '--ksize', '21', '--dna', '--csv', 'out4.csv', - 'thebestdatabase', query] + args = [ + "categorize", + "--ignore-abundance", + "--ksize", + "21", + "--dna", + "--csv", + "out4.csv", + "thebestdatabase", + query, + ] runtmp.sourmash(*args) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'for 1-1, found: 0.88 1-1' in runtmp.last_result.err + assert "for 1-1, found: 0.88 1-1" in runtmp.last_result.err - out_csv4 = Path(runtmp.output('out4.csv')).read_text() - assert 'reads-s10x10-s11.sig,1-1,1-1,0.87699' in out_csv4 + out_csv4 = Path(runtmp.output("out4.csv")).read_text() + assert "reads-s10x10-s11.sig,1-1,1-1,0.87699" in out_csv4 def test_sbt_categorize_already_done(runtmp): - testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') - testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') - testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') - testdata4 = utils.get_test_data('genome-s10+s11.sig') + testdata1 = utils.get_test_data("genome-s10.fa.gz.sig") + testdata2 = utils.get_test_data("genome-s11.fa.gz.sig") + testdata3 = utils.get_test_data("genome-s12.fa.gz.sig") + testdata4 = utils.get_test_data("genome-s10+s11.sig") - shutil.copyfile(testdata1, runtmp.output('1.sig')) - shutil.copyfile(testdata2, runtmp.output('2.sig')) - shutil.copyfile(testdata3, runtmp.output('3.sig')) - shutil.copyfile(testdata4, runtmp.output('4.sig')) + shutil.copyfile(testdata1, runtmp.output("1.sig")) + shutil.copyfile(testdata2, runtmp.output("2.sig")) + shutil.copyfile(testdata3, runtmp.output("3.sig")) + shutil.copyfile(testdata4, runtmp.output("4.sig")) # omit 3 - args = ['index', '--dna', '-k', '21', 'zzz', '1.sig', '2.sig'] + args = ["index", "--dna", "-k", "21", "zzz", "1.sig", "2.sig"] runtmp.sourmash(*args) - with open(runtmp.output('in.csv'), 'wt') as fp: - fp.write('./4.sig,genome-s10.fa.gz,0.50') - - args = ['categorize', 'zzz', './2.sig', './4.sig', - '--ksize', '21', '--dna', '--load-csv', 'in.csv'] + with open(runtmp.output("in.csv"), "w") as fp: + fp.write("./4.sig,genome-s10.fa.gz,0.50") + + args = [ + "categorize", + "zzz", + "./2.sig", + "./4.sig", + "--ksize", + "21", + "--dna", + "--load-csv", + "in.csv", + ] runtmp.sourmash(*args) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'for genome-s11.fa.gz, no match found' - assert not 'for s10+s11, found: 0.50 genome-s10.fa.gz' in runtmp.last_result.err + assert "for genome-s11.fa.gz, no match found" + assert "for s10+s11, found: 0.50 genome-s10.fa.gz" not in runtmp.last_result.err def test_sbt_categorize_already_done_traverse(runtmp): - testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') - testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') - testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') - testdata4 = 
utils.get_test_data('genome-s10+s11.sig') + testdata1 = utils.get_test_data("genome-s10.fa.gz.sig") + testdata2 = utils.get_test_data("genome-s11.fa.gz.sig") + testdata3 = utils.get_test_data("genome-s12.fa.gz.sig") + testdata4 = utils.get_test_data("genome-s10+s11.sig") - shutil.copyfile(testdata1, runtmp.output('1.sig')) - shutil.copyfile(testdata2, runtmp.output('2.sig')) - shutil.copyfile(testdata3, runtmp.output('3.sig')) - shutil.copyfile(testdata4, runtmp.output('4.sig')) + shutil.copyfile(testdata1, runtmp.output("1.sig")) + shutil.copyfile(testdata2, runtmp.output("2.sig")) + shutil.copyfile(testdata3, runtmp.output("3.sig")) + shutil.copyfile(testdata4, runtmp.output("4.sig")) # omit 3 - args = ['index', '--dna', '-k', '21', 'zzz', '1.sig', '2.sig'] + args = ["index", "--dna", "-k", "21", "zzz", "1.sig", "2.sig"] runtmp.sourmash(*args) - with open(runtmp.output('in.csv'), 'wt') as fp: - fp.write('./4.sig,genome-s10.fa.gz,0.50') + with open(runtmp.output("in.csv"), "w") as fp: + fp.write("./4.sig,genome-s10.fa.gz,0.50") - args = ['categorize', 'zzz', '.', - '--ksize', '21', '--dna', '--load-csv', 'in.csv'] + args = ["categorize", "zzz", ".", "--ksize", "21", "--dna", "--load-csv", "in.csv"] runtmp.sourmash(*args) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'for genome-s11.fa.gz, no match found' - assert not 'for s10+s11, found: 0.50 genome-s10.fa.gz' in runtmp.last_result.err + assert "for genome-s11.fa.gz, no match found" + assert "for s10+s11, found: 0.50 genome-s10.fa.gz" not in runtmp.last_result.err def test_sbt_categorize_multiple_ksizes_moltypes(runtmp): # 'categorize' works fine with multiple moltypes/ksizes - testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') - testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') - testdata3 = utils.get_test_data('genome-s12.fa.gz.sig') + testdata1 = utils.get_test_data("genome-s10.fa.gz.sig") + testdata2 = utils.get_test_data("genome-s11.fa.gz.sig") + testdata3 = utils.get_test_data("genome-s12.fa.gz.sig") - shutil.copyfile(testdata1, runtmp.output('1.sig')) - shutil.copyfile(testdata2, runtmp.output('2.sig')) - shutil.copyfile(testdata3, runtmp.output('3.sig')) + shutil.copyfile(testdata1, runtmp.output("1.sig")) + shutil.copyfile(testdata2, runtmp.output("2.sig")) + shutil.copyfile(testdata3, runtmp.output("3.sig")) - args = ['index', '--dna', '-k', '21', 'zzz', '1.sig', '2.sig'] + args = ["index", "--dna", "-k", "21", "zzz", "1.sig", "2.sig"] runtmp.sourmash(*args) - args = ['categorize', 'zzz', '.'] + args = ["categorize", "zzz", "."] runtmp.sourmash(*args) def test_watch_check_num_bounds_negative(runtmp): # check that watch properly outputs error on negative num c = runtmp - testdata0 = utils.get_test_data('genome-s10.fa.gz') - testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') - shutil.copyfile(testdata1, c.output('1.sig')) + testdata0 = utils.get_test_data("genome-s10.fa.gz") + testdata1 = utils.get_test_data("genome-s10.fa.gz.sig") + shutil.copyfile(testdata1, c.output("1.sig")) - c.run_sourmash('index', '--dna', '-k', '21', 'zzz', '1.sig') + c.run_sourmash("index", "--dna", "-k", "21", "zzz", "1.sig") - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('watch', '--ksize', '21', '-n', '-5', '--dna', 'zzz', testdata0) + with pytest.raises(SourmashCommandFailed): + c.run_sourmash("watch", "--ksize", "21", "-n", "-5", "--dna", "zzz", testdata0) assert "ERROR: num value must be positive" in c.last_result.err @@ -5498,13 +6672,13 @@ def 
test_watch_check_num_bounds_negative(runtmp): def test_watch_check_num_bounds_less_than_minimum(runtmp): # check that watch properly outputs warnings on small num c = runtmp - testdata0 = utils.get_test_data('genome-s10.fa.gz') - testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') - shutil.copyfile(testdata1, c.output('1.sig')) + testdata0 = utils.get_test_data("genome-s10.fa.gz") + testdata1 = utils.get_test_data("genome-s10.fa.gz.sig") + shutil.copyfile(testdata1, c.output("1.sig")) - c.run_sourmash('index', '--dna', '-k', '21', 'zzz', '1.sig') + c.run_sourmash("index", "--dna", "-k", "21", "zzz", "1.sig") - c.run_sourmash('watch', '--ksize', '21', '-n', '25', '--dna', 'zzz', testdata0) + c.run_sourmash("watch", "--ksize", "21", "-n", "25", "--dna", "zzz", testdata0) assert "WARNING: num value should be >= 50. Continuing anyway." in c.last_result.err @@ -5512,113 +6686,124 @@ def test_watch_check_num_bounds_less_than_minimum(runtmp): def test_watch_check_num_bounds_more_than_maximum(runtmp): # check that watch properly outputs warnings on large num c = runtmp - testdata0 = utils.get_test_data('genome-s10.fa.gz') - testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') - shutil.copyfile(testdata1, c.output('1.sig')) + testdata0 = utils.get_test_data("genome-s10.fa.gz") + testdata1 = utils.get_test_data("genome-s10.fa.gz.sig") + shutil.copyfile(testdata1, c.output("1.sig")) - c.run_sourmash('index', '--dna', '-k', '21', 'zzz', '1.sig') + c.run_sourmash("index", "--dna", "-k", "21", "zzz", "1.sig") - c.run_sourmash('watch', '--ksize', '21', '-n', '100000', '--dna', 'zzz', testdata0) + c.run_sourmash("watch", "--ksize", "21", "-n", "100000", "--dna", "zzz", testdata0) - assert "WARNING: num value should be <= 50000. Continuing anyway." in c.last_result.err + assert ( + "WARNING: num value should be <= 50000. Continuing anyway." 
in c.last_result.err + ) def test_watch(runtmp): # check basic watch functionality c = runtmp - testdata0 = utils.get_test_data('genome-s10.fa.gz') - testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') - shutil.copyfile(testdata1, c.output('1.sig')) + testdata0 = utils.get_test_data("genome-s10.fa.gz") + testdata1 = utils.get_test_data("genome-s10.fa.gz.sig") + shutil.copyfile(testdata1, c.output("1.sig")) - c.run_sourmash('index', '--dna', '-k', '21', 'zzz', '1.sig') + c.run_sourmash("index", "--dna", "-k", "21", "zzz", "1.sig") - c.run_sourmash('watch', '--ksize', '21', '--dna', 'zzz', testdata0) + c.run_sourmash("watch", "--ksize", "21", "--dna", "zzz", testdata0) print(c.last_result.out) print(c.last_result.err) - assert 'FOUND: genome-s10, at 1.000' in c.last_result.out + assert "FOUND: genome-s10, at 1.000" in c.last_result.out def test_watch_deduce_ksize(runtmp): # check that watch guesses ksize automatically from database c = runtmp - testdata0 = utils.get_test_data('genome-s10.fa.gz') - c.run_sourmash('sketch','dna','-p','k=29,num=500', '-o', '1.sig', testdata0) + testdata0 = utils.get_test_data("genome-s10.fa.gz") + c.run_sourmash("sketch", "dna", "-p", "k=29,num=500", "-o", "1.sig", testdata0) - c.run_sourmash('index', '--dna', '-k', '29', 'zzz', '1.sig') + c.run_sourmash("index", "--dna", "-k", "29", "zzz", "1.sig") - c.run_sourmash('watch', '--dna', 'zzz', testdata0) + c.run_sourmash("watch", "--dna", "zzz", testdata0) print(c.last_result.out) print(c.last_result.err) - assert 'Computing signature for k=29' in c.last_result.err - assert 'genome-s10.fa.gz, at 1.000' in c.last_result.out + assert "Computing signature for k=29" in c.last_result.err + assert "genome-s10.fa.gz, at 1.000" in c.last_result.out def test_watch_coverage(runtmp): # check output details/coverage of found - testdata0 = utils.get_test_data('genome-s10.fa.gz') - testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') - shutil.copyfile(testdata1, runtmp.output('1.sig')) + testdata0 = utils.get_test_data("genome-s10.fa.gz") + testdata1 = utils.get_test_data("genome-s10.fa.gz.sig") + shutil.copyfile(testdata1, runtmp.output("1.sig")) - args = ['index', '--dna', '-k', '21', 'zzz', '1.sig'] + args = ["index", "--dna", "-k", "21", "zzz", "1.sig"] runtmp.sourmash(*args) - with open(runtmp.output('query.fa'), 'wt') as fp: + with open(runtmp.output("query.fa"), "w") as fp: record = list(screed.open(testdata0))[0] for start in range(0, len(record), 100): - fp.write('>{}\n{}\n'.format(start, - record.sequence[start:start+500])) + fp.write(f">{start}\n{record.sequence[start : start + 500]}\n") - args = ['watch', '--ksize', '21', '--dna', 'zzz', 'query.fa'] + args = ["watch", "--ksize", "21", "--dna", "zzz", "query.fa"] runtmp.sourmash(*args) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'FOUND: genome-s10, at 1.000' in runtmp.last_result.out + assert "FOUND: genome-s10, at 1.000" in runtmp.last_result.out def test_watch_output_sig(runtmp): # test watch --output - testdata0 = utils.get_test_data('genome-s10.fa.gz') - testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') - shutil.copyfile(testdata1, runtmp.output('1.sig')) + testdata0 = utils.get_test_data("genome-s10.fa.gz") + testdata1 = utils.get_test_data("genome-s10.fa.gz.sig") + shutil.copyfile(testdata1, runtmp.output("1.sig")) - args = ['index', '--dna', '-k', '21', 'zzz', '1.sig'] + args = ["index", "--dna", "-k", "21", "zzz", "1.sig"] runtmp.sourmash(*args) - with open(runtmp.output('query.fa'), 'wt') as fp: + with 
open(runtmp.output("query.fa"), "w") as fp: record = list(screed.open(testdata0))[0] for start in range(0, len(record), 100): - fp.write('>{}\n{}\n'.format(start, - record.sequence[start:start+500])) - - args = ['watch', '--ksize', '21', '--dna', 'zzz', 'query.fa', - '-o', 'out.sig', '--name', 'xyzfoo'] + fp.write(f">{start}\n{record.sequence[start : start + 500]}\n") + + args = [ + "watch", + "--ksize", + "21", + "--dna", + "zzz", + "query.fa", + "-o", + "out.sig", + "--name", + "xyzfoo", + ] runtmp.sourmash(*args) print(runtmp.last_result.out) print(runtmp.last_result.err) - out_sig = runtmp.output('out.sig') + out_sig = runtmp.output("out.sig") assert os.path.exists(out_sig) siglist = list(sourmash.load_file_as_signatures(out_sig)) assert len(siglist) == 1 - assert siglist[0].filename == 'stdin' - assert siglist[0].name == 'xyzfoo' + assert siglist[0].filename == "stdin" + assert siglist[0].name == "xyzfoo" def test_storage_convert(runtmp): - testdata = utils.get_test_data('v2.sbt.json') - shutil.copyfile(testdata, runtmp.output('v2.sbt.json')) - shutil.copytree(os.path.join(os.path.dirname(testdata), '.sbt.v2'), - runtmp.output('.sbt.v2')) - testsbt = runtmp.output('v2.sbt.json') + testdata = utils.get_test_data("v2.sbt.json") + shutil.copyfile(testdata, runtmp.output("v2.sbt.json")) + shutil.copytree( + os.path.join(os.path.dirname(testdata), ".sbt.v2"), runtmp.output(".sbt.v2") + ) + testsbt = runtmp.output("v2.sbt.json") original = SBT.load(testsbt, leaf_loader=SigLeaf.load) - args = ['storage', 'convert', '-b', 'ipfs', testsbt] + args = ["storage", "convert", "-b", "ipfs", testsbt] try: runtmp.sourmash(*args) except SourmashCommandFailed: @@ -5626,151 +6811,165 @@ def test_storage_convert(runtmp): if runtmp.last_result.status: if "ipfshttpclient.ConnectionError" in runtmp.last_result.err: - raise pytest.xfail('ipfs probably not running') + raise pytest.xfail("ipfs probably not running") if "No module named 'ipfshttpclient'" in runtmp.last_result.err: - raise pytest.xfail('ipfshttpclient module not installed') + raise pytest.xfail("ipfshttpclient module not installed") print("NO FAIL; KEEP ON GOING!") - ipfs = SBT.load(testsbt, leaf_loader=SigLeaf.load) assert len(original) == len(ipfs) - assert all(n1[1].name == n2[1].name - for (n1, n2) in zip(sorted(original), sorted(ipfs))) - - args = ['storage', 'convert', - '-b', """'ZipStorage("{}")'""".format( - runtmp.output('v2.sbt.zip')), - testsbt] + assert all( + n1[1].name == n2[1].name for (n1, n2) in zip(sorted(original), sorted(ipfs)) + ) + + args = [ + "storage", + "convert", + "-b", + """'ZipStorage("{}")'""".format(runtmp.output("v2.sbt.zip")), + testsbt, + ] runtmp.sourmash(*args) tar = SBT.load(testsbt, leaf_loader=SigLeaf.load) assert len(original) == len(tar) - assert all(n1[1].name == n2[1].name - for (n1, n2) in zip(sorted(original), sorted(tar))) + assert all( + n1[1].name == n2[1].name for (n1, n2) in zip(sorted(original), sorted(tar)) + ) print("it all worked!!") def test_storage_convert_identity(runtmp): - testdata = utils.get_test_data('v2.sbt.json') - shutil.copyfile(testdata, runtmp.output('v2.sbt.json')) - shutil.copytree(os.path.join(os.path.dirname(testdata), '.sbt.v2'), - runtmp.output('.sbt.v2')) - testsbt = runtmp.output('v2.sbt.json') + testdata = utils.get_test_data("v2.sbt.json") + shutil.copyfile(testdata, runtmp.output("v2.sbt.json")) + shutil.copytree( + os.path.join(os.path.dirname(testdata), ".sbt.v2"), runtmp.output(".sbt.v2") + ) + testsbt = runtmp.output("v2.sbt.json") original = SBT.load(testsbt, 
leaf_loader=SigLeaf.load) - args = ['storage', 'convert', '-b', 'fsstorage', testsbt] + args = ["storage", "convert", "-b", "fsstorage", testsbt] runtmp.sourmash(*args) identity = SBT.load(testsbt, leaf_loader=SigLeaf.load) assert len(original) == len(identity) - assert all(n1[1].name == n2[1].name - for (n1, n2) in zip(sorted(original), sorted(identity))) + assert all( + n1[1].name == n2[1].name for (n1, n2) in zip(sorted(original), sorted(identity)) + ) def test_storage_convert_fsstorage_newpath(runtmp): - testdata = utils.get_test_data('v2.sbt.json') - shutil.copyfile(testdata, runtmp.output('v2.sbt.json')) - shutil.copytree(os.path.join(os.path.dirname(testdata), '.sbt.v2'), - runtmp.output('.sbt.v2')) - testsbt = runtmp.output('v2.sbt.json') + testdata = utils.get_test_data("v2.sbt.json") + shutil.copyfile(testdata, runtmp.output("v2.sbt.json")) + shutil.copytree( + os.path.join(os.path.dirname(testdata), ".sbt.v2"), runtmp.output(".sbt.v2") + ) + testsbt = runtmp.output("v2.sbt.json") original = SBT.load(testsbt, leaf_loader=SigLeaf.load) - args = ['storage', 'convert', - '-b', 'fsstorage({})'.format(runtmp.output('v3')), - testsbt] + args = [ + "storage", + "convert", + "-b", + "fsstorage({})".format(runtmp.output("v3")), + testsbt, + ] runtmp.sourmash(*args) identity = SBT.load(testsbt, leaf_loader=SigLeaf.load) assert len(original) == len(identity) - assert all(n1[1].name == n2[1].name - for (n1, n2) in zip(sorted(original), sorted(identity))) + assert all( + n1[1].name == n2[1].name for (n1, n2) in zip(sorted(original), sorted(identity)) + ) def test_migrate(runtmp): - testdata = utils.get_test_data('v3.sbt.json') - shutil.copyfile(testdata, runtmp.output('v3.sbt.json')) - shutil.copytree(os.path.join(os.path.dirname(testdata), '.sbt.v3'), - runtmp.output('.sbt.v3')) - testsbt = runtmp.output('v3.sbt.json') + testdata = utils.get_test_data("v3.sbt.json") + shutil.copyfile(testdata, runtmp.output("v3.sbt.json")) + shutil.copytree( + os.path.join(os.path.dirname(testdata), ".sbt.v3"), runtmp.output(".sbt.v3") + ) + testsbt = runtmp.output("v3.sbt.json") original = SBT.load(testsbt, leaf_loader=SigLeaf.load) - runtmp.sourmash('migrate', testsbt) + runtmp.sourmash("migrate", testsbt) identity = SBT.load(testsbt, leaf_loader=SigLeaf.load) assert len(original) == len(identity) - assert all(n1[1].name == n2[1].name - for (n1, n2) in zip(sorted(original), - sorted(identity))) + assert all( + n1[1].name == n2[1].name for (n1, n2) in zip(sorted(original), sorted(identity)) + ) assert "this is an old index version" not in runtmp.last_result.err - assert all('min_n_below' in node.metadata - for node in identity - if isinstance(node, Node)) + assert all( + "min_n_below" in node.metadata for node in identity if isinstance(node, Node) + ) def test_license_cc0(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch','translate', '-p', 'k=31', testdata1) + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash("sketch", "translate", "-p", "k=31", testdata1) - sigfile = runtmp.output('short.fa.sig') + sigfile = runtmp.output("short.fa.sig") assert os.path.exists(sigfile) sig = next(signature.load_signatures(sigfile)) - assert str(sig).endswith('short.fa') + assert str(sig).endswith("short.fa") - assert sig.license == 'CC0' + assert sig.license == "CC0" def test_license_non_cc0(runtmp): - testdata1 = utils.get_test_data('short.fa') + testdata1 = utils.get_test_data("short.fa") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'translate', 
'-p','k=31', '--license', 'GPL', testdata1) + runtmp.sourmash( + "sketch", "translate", "-p", "k=31", "--license", "GPL", testdata1 + ) assert runtmp.last_result.status != 0 print(runtmp.last_result.out) print(runtmp.last_result.err) - assert 'sourmash only supports CC0' in runtmp.last_result.err + assert "sourmash only supports CC0" in runtmp.last_result.err def test_license_load_non_cc0(): - sigfile = utils.get_test_data('bad-license.sig') + sigfile = utils.get_test_data("bad-license.sig") try: - sig = next(signature.load_signatures(sigfile, do_raise=True)) + next(signature.load_signatures(sigfile, do_raise=True)) except Exception as e: assert "sourmash only supports CC0-licensed signatures" in str(e) @utils.in_tempdir def test_do_sourmash_index_zipfile(c): - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) - c.run_sourmash('index', '-k', '31', 'zzz.sbt.zip', - *testdata_sigs) + c.run_sourmash("index", "-k", "31", "zzz.sbt.zip", *testdata_sigs) - outfile = c.output('zzz.sbt.zip') + outfile = c.output("zzz.sbt.zip") assert os.path.exists(outfile) print(c) assert c.last_result.status == 0 - assert 'Finished saving SBT index, available at' in c.last_result.err + assert "Finished saving SBT index, available at" in c.last_result.err # look internally at the zip file with zipfile.ZipFile(outfile) as zf: content = zf.namelist() assert len(content) == 26 - assert len([c for c in content if 'internal' in c]) == 11 + assert len([c for c in content if "internal" in c]) == 11 assert ".sbt.zzz/" in content sbts = [c for c in content if c.endswith(".sbt.json")] assert len(sbts) == 1 @@ -5779,7 +6978,7 @@ def test_do_sourmash_index_zipfile(c): @utils.in_tempdir def test_do_sourmash_index_zipfile_append(c): - testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_glob = utils.get_test_data("gather/GCF*.sig") testdata_sigs = glob.glob(testdata_glob) half_point = int(len(testdata_sigs) / 2) first_half = testdata_sigs[:half_point] @@ -5792,35 +6991,33 @@ def test_do_sourmash_index_zipfile_append(c): assert not set(first_half).intersection(set(second_half)) with warnings.catch_warnings(record=True) as record: - c.run_sourmash('index', '-k', '31', 'zzz.sbt.zip', - *first_half) + c.run_sourmash("index", "-k", "31", "zzz.sbt.zip", *first_half) # UserWarning is raised when there are duplicated entries in the zipfile assert not record, record - outfile = c.output('zzz.sbt.zip') + outfile = c.output("zzz.sbt.zip") assert os.path.exists(outfile) print(c) assert c.last_result.status == 0 - assert 'Finished saving SBT index, available at' in c.last_result.err + assert "Finished saving SBT index, available at" in c.last_result.err with warnings.catch_warnings(record=True) as record: - c.run_sourmash('index', "--append", '-k', '31', 'zzz.sbt.zip', - *second_half) + c.run_sourmash("index", "--append", "-k", "31", "zzz.sbt.zip", *second_half) # UserWarning is raised when there are duplicated entries in the zipfile print(record) - #assert not record, record + # assert not record, record print(c) assert c.last_result.status == 0 - assert 'Finished saving SBT index, available at' in c.last_result.err + assert "Finished saving SBT index, available at" in c.last_result.err # look internally at the zip file with zipfile.ZipFile(outfile) as zf: content = zf.namelist() print(content) assert len(content) == 26 - assert len([c for c in content if 'internal' in c]) == 11 + assert len([c for c in content if "internal" 
in c]) == 11 assert ".sbt.zzz/" in content sbts = [c for c in content if c.endswith(".sbt.json")] assert len(sbts) == 1 @@ -5829,13 +7026,14 @@ def test_do_sourmash_index_zipfile_append(c): def test_index_with_picklist(runtmp): # test 'sourmash index' with picklists - gcf_sig_dir = utils.get_test_data('gather/') - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + gcf_sig_dir = utils.get_test_data("gather/") + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") - output_db = runtmp.output('thermo.sbt.zip') + output_db = runtmp.output("thermo.sbt.zip") - runtmp.sourmash('index', output_db, gcf_sig_dir, - '-k', '31', '--picklist', f"{picklist}:md5:md5") + runtmp.sourmash( + "index", output_db, gcf_sig_dir, "-k", "31", "--picklist", f"{picklist}:md5:md5" + ) err = runtmp.last_result.err print(err) @@ -5848,18 +7046,25 @@ def test_index_with_picklist(runtmp): siglist = list(sourmash.load_file_as_signatures(output_db)) assert len(siglist) == 3 for ss in siglist: - assert 'Thermotoga' in ss.name + assert "Thermotoga" in ss.name def test_index_with_picklist_exclude(runtmp): # test 'sourmash index' with picklists - exclude - gcf_sig_dir = utils.get_test_data('gather/') - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + gcf_sig_dir = utils.get_test_data("gather/") + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") - output_db = runtmp.output('thermo-exclude.sbt.zip') + output_db = runtmp.output("thermo-exclude.sbt.zip") - runtmp.sourmash('index', output_db, gcf_sig_dir, - '-k', '31', '--picklist', f"{picklist}:md5:md5:exclude") + runtmp.sourmash( + "index", + output_db, + gcf_sig_dir, + "-k", + "31", + "--picklist", + f"{picklist}:md5:md5:exclude", + ) err = runtmp.last_result.err print(err) @@ -5869,35 +7074,43 @@ def test_index_with_picklist_exclude(runtmp): siglist = list(sourmash.load_file_as_signatures(output_db)) assert len(siglist) == 9 for ss in siglist: - assert 'Thermotoga' not in ss.name + assert "Thermotoga" not in ss.name def test_index_matches_search_with_picklist(runtmp): # test 'sourmash index' with picklists - gcf_sig_dir = utils.get_test_data('gather/') - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') - metag_sig = utils.get_test_data('gather/combined.sig') + gcf_sig_dir = utils.get_test_data("gather/") + glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") + metag_sig = utils.get_test_data("gather/combined.sig") - output_db = runtmp.output('thermo.sbt.zip') + output_db = runtmp.output("thermo.sbt.zip") - runtmp.sourmash('index', output_db, gcf_sig_dir, '-k', '21') + runtmp.sourmash("index", output_db, gcf_sig_dir, "-k", "21") print(runtmp.last_result.out) print(runtmp.last_result.err) # verify: siglist = list(sourmash.load_file_as_signatures(output_db)) - assert len(siglist) > 3 # all signatures included... + assert len(siglist) > 3 # all signatures included... 
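    # The index was built without the picklist, so every GCF signature is
    # present; the picklist is only applied at search time below, which
    # should narrow the reported matches to the three Thermotoga genomes.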
n_thermo = 0 for ss in siglist: - if 'Thermotoga' in ss.name: + if "Thermotoga" in ss.name: n_thermo += 1 assert n_thermo == 3 - runtmp.sourmash('search', metag_sig, output_db, '--containment', - '-k', '21', '--picklist', f"{picklist}:md5:md5") + runtmp.sourmash( + "search", + metag_sig, + output_db, + "--containment", + "-k", + "21", + "--picklist", + f"{picklist}:md5:md5", + ) err = runtmp.last_result.err print(err) @@ -5915,30 +7128,38 @@ def test_index_matches_search_with_picklist(runtmp): def test_index_matches_search_with_picklist_exclude(runtmp): # test 'sourmash index' with picklists - exclude - gcf_sig_dir = utils.get_test_data('gather/') - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - picklist = utils.get_test_data('gather/thermotoga-picklist.csv') - metag_sig = utils.get_test_data('gather/combined.sig') + gcf_sig_dir = utils.get_test_data("gather/") + glob.glob(utils.get_test_data("gather/GCF*.sig")) + picklist = utils.get_test_data("gather/thermotoga-picklist.csv") + metag_sig = utils.get_test_data("gather/combined.sig") - output_db = runtmp.output('thermo-exclude.sbt.zip') + output_db = runtmp.output("thermo-exclude.sbt.zip") - runtmp.sourmash('index', output_db, gcf_sig_dir, '-k', '21') + runtmp.sourmash("index", output_db, gcf_sig_dir, "-k", "21") print(runtmp.last_result.out) print(runtmp.last_result.err) # verify: siglist = list(sourmash.load_file_as_signatures(output_db)) - assert len(siglist) > 3 # all signatures included... + assert len(siglist) > 3 # all signatures included... n_thermo = 0 for ss in siglist: - if 'Thermotoga' in ss.name: + if "Thermotoga" in ss.name: n_thermo += 1 assert n_thermo == 3 - runtmp.sourmash('search', metag_sig, output_db, '--containment', - '-k', '21', '--picklist', f"{picklist}:md5:md5:exclude") + runtmp.sourmash( + "search", + metag_sig, + output_db, + "--containment", + "-k", + "21", + "--picklist", + f"{picklist}:md5:md5:exclude", + ) err = runtmp.last_result.err print(err) @@ -5956,12 +7177,11 @@ def test_index_matches_search_with_picklist_exclude(runtmp): def test_gather_with_prefetch_picklist(runtmp, linear_gather): # test 'gather' using a picklist taken from 'sourmash prefetch' output - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - prefetch_csv = runtmp.output('prefetch-out.csv') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + prefetch_csv = runtmp.output("prefetch-out.csv") - runtmp.sourmash('prefetch', metag_sig, *gcf_sigs, - '-k', '21', '-o', prefetch_csv) + runtmp.sourmash("prefetch", metag_sig, *gcf_sigs, "-k", "21", "-o", prefetch_csv) err = runtmp.last_result.err print(err) @@ -5970,12 +7190,22 @@ def test_gather_with_prefetch_picklist(runtmp, linear_gather): print(out) assert "total of 12 matching signatures." in err - assert "of 1466 distinct query hashes, 1466 were found in matches above threshold." in err + assert ( + "of 1466 distinct query hashes, 1466 were found in matches above threshold." 
+ in err + ) # now, do a gather with the results - runtmp.sourmash('gather', metag_sig, *gcf_sigs, linear_gather, - '-k', '21', '--picklist', - f'{prefetch_csv}:match_md5:md5short') + runtmp.sourmash( + "gather", + metag_sig, + *gcf_sigs, + linear_gather, + "-k", + "21", + "--picklist", + f"{prefetch_csv}:match_md5:md5short", + ) err = runtmp.last_result.err print(err) @@ -5993,12 +7223,11 @@ def test_gather_with_prefetch_picklist(runtmp, linear_gather): def test_gather_with_prefetch_picklist_2_prefetch(runtmp, linear_gather): # test 'gather' using a picklist taken from 'sourmash prefetch' output # using ::prefetch - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - prefetch_csv = runtmp.output('prefetch-out.csv') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + prefetch_csv = runtmp.output("prefetch-out.csv") - runtmp.sourmash('prefetch', metag_sig, *gcf_sigs, - '-k', '21', '-o', prefetch_csv) + runtmp.sourmash("prefetch", metag_sig, *gcf_sigs, "-k", "21", "-o", prefetch_csv) err = runtmp.last_result.err print(err) @@ -6007,12 +7236,22 @@ def test_gather_with_prefetch_picklist_2_prefetch(runtmp, linear_gather): print(out) assert "total of 12 matching signatures." in err - assert "of 1466 distinct query hashes, 1466 were found in matches above threshold." in err + assert ( + "of 1466 distinct query hashes, 1466 were found in matches above threshold." + in err + ) # now, do a gather with the results - runtmp.sourmash('gather', metag_sig, *gcf_sigs, linear_gather, - '-k', '21', '--picklist', - f'{prefetch_csv}::prefetch') + runtmp.sourmash( + "gather", + metag_sig, + *gcf_sigs, + linear_gather, + "-k", + "21", + "--picklist", + f"{prefetch_csv}::prefetch", + ) err = runtmp.last_result.err print(err) @@ -6031,12 +7270,11 @@ def test_gather_with_prefetch_picklist_3_gather(runtmp, linear_gather): # test 'gather' using a picklist taken from 'sourmash gather' output, # using ::gather. # (this doesn't really do anything useful, but it's an ok test :) - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - gather_csv = runtmp.output('gather-out.csv') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + gather_csv = runtmp.output("gather-out.csv") - runtmp.sourmash('gather', metag_sig, *gcf_sigs, - '-k', '21', '-o', gather_csv) + runtmp.sourmash("gather", metag_sig, *gcf_sigs, "-k", "21", "-o", gather_csv) err = runtmp.last_result.err print(err) @@ -6051,9 +7289,16 @@ def test_gather_with_prefetch_picklist_3_gather(runtmp, linear_gather): assert "1.9 Mbp 13.1% 100.0% NC_000853.1 " in out # now, do another gather with the results - runtmp.sourmash('gather', metag_sig, *gcf_sigs, linear_gather, - '-k', '21', '--picklist', - f'{gather_csv}::gather') + runtmp.sourmash( + "gather", + metag_sig, + *gcf_sigs, + linear_gather, + "-k", + "21", + "--picklist", + f"{gather_csv}::gather", + ) err = runtmp.last_result.err print(err) @@ -6072,12 +7317,11 @@ def test_gather_with_prefetch_picklist_3_gather_badcol(runtmp): # test 'gather' using a picklist taken from 'sourmash gather' output, # using ::gather. 
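    # (The empty column in "::gather" selects the predefined gather picklist
    # format, so the standard gather CSV columns are used without naming an
    # explicit column:coltype pair, unlike e.g. ":match_md5:md5short" above.)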
# (this doesn't really do anything useful, but it's an ok test :) - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - gather_csv = runtmp.output('gather-out.csv') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + gather_csv = runtmp.output("gather-out.csv") - runtmp.sourmash('gather', metag_sig, *gcf_sigs, - '-k', '21', '-o', gather_csv) + runtmp.sourmash("gather", metag_sig, *gcf_sigs, "-k", "21", "-o", gather_csv) err = runtmp.last_result.err print(err) @@ -6094,9 +7338,15 @@ def test_gather_with_prefetch_picklist_3_gather_badcol(runtmp): # now, do another gather with the results, but with a bad picklist # parameter with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('gather', metag_sig, *gcf_sigs, - '-k', '21', '--picklist', - f'{gather_csv}:FOO:gather') + runtmp.sourmash( + "gather", + metag_sig, + *gcf_sigs, + "-k", + "21", + "--picklist", + f"{gather_csv}:FOO:gather", + ) err = runtmp.last_result.err print(err) @@ -6112,11 +7362,11 @@ def test_gather_with_prefetch_picklist_4_manifest(runtmp, linear_gather): # test 'gather' using a picklist taken from 'sourmash sig manifest' # output, using ::manifest. # (this doesn't really do anything useful, but it's an ok test :) - gather_dir = utils.get_test_data('gather/') - metag_sig = utils.get_test_data('gather/combined.sig') - manifest_csv = runtmp.output('manifest.csv') + gather_dir = utils.get_test_data("gather/") + metag_sig = utils.get_test_data("gather/combined.sig") + manifest_csv = runtmp.output("manifest.csv") - runtmp.sourmash('sig', 'manifest', gather_dir, '-o', manifest_csv) + runtmp.sourmash("sig", "manifest", gather_dir, "-o", manifest_csv) err = runtmp.last_result.err print(err) @@ -6125,9 +7375,16 @@ def test_gather_with_prefetch_picklist_4_manifest(runtmp, linear_gather): print(out) # now, do a gather on the manifest - runtmp.sourmash('gather', metag_sig, gather_dir, linear_gather, - '-k', '21', '--picklist', - f'{manifest_csv}::manifest') + runtmp.sourmash( + "gather", + metag_sig, + gather_dir, + linear_gather, + "-k", + "21", + "--picklist", + f"{manifest_csv}::manifest", + ) err = runtmp.last_result.err print(err) @@ -6146,11 +7403,11 @@ def test_gather_with_prefetch_picklist_4_manifest_excl(runtmp, linear_gather): # test 'gather' using a picklist taken from 'sourmash sig manifest' # output, using ::manifest. 
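    # (A manifest written by "sourmash sig manifest" can itself serve as a
    # picklist: "::manifest" selects exactly the signatures it lists, and the
    # ":exclude" suffix used in the next test inverts that selection.)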
# (this doesn't really do anything useful, but it's an ok test :) - gather_dir = utils.get_test_data('gather/') - metag_sig = utils.get_test_data('gather/combined.sig') - manifest_csv = runtmp.output('manifest.csv') + gather_dir = utils.get_test_data("gather/") + metag_sig = utils.get_test_data("gather/combined.sig") + manifest_csv = runtmp.output("manifest.csv") - runtmp.sourmash('sig', 'manifest', gather_dir, '-o', manifest_csv) + runtmp.sourmash("sig", "manifest", gather_dir, "-o", manifest_csv) err = runtmp.last_result.err print(err) @@ -6159,9 +7416,16 @@ def test_gather_with_prefetch_picklist_4_manifest_excl(runtmp, linear_gather): print(out) # now, do a gather on the manifest - runtmp.sourmash('gather', metag_sig, gather_dir, linear_gather, - '-k', '21', '--picklist', - f'{manifest_csv}::manifest:exclude') + runtmp.sourmash( + "gather", + metag_sig, + gather_dir, + linear_gather, + "-k", + "21", + "--picklist", + f"{manifest_csv}::manifest:exclude", + ) err = runtmp.last_result.err print(err) @@ -6176,12 +7440,13 @@ def test_gather_with_prefetch_picklist_4_manifest_excl(runtmp, linear_gather): def test_gather_with_prefetch_picklist_5_search(runtmp): # test 'gather' using a picklist taken from 'sourmash prefetch' output # using ::prefetch - gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) - metag_sig = utils.get_test_data('gather/combined.sig') - search_csv = runtmp.output('search-out.csv') + gcf_sigs = glob.glob(utils.get_test_data("gather/GCF*.sig")) + metag_sig = utils.get_test_data("gather/combined.sig") + search_csv = runtmp.output("search-out.csv") - runtmp.sourmash('search', '--containment', metag_sig, *gcf_sigs, - '-k', '21', '-o', search_csv) + runtmp.sourmash( + "search", "--containment", metag_sig, *gcf_sigs, "-k", "21", "-o", search_csv + ) err = runtmp.last_result.err print(err) @@ -6193,9 +7458,15 @@ def test_gather_with_prefetch_picklist_5_search(runtmp): assert " 33.2% NC_003198.1 Salmonella enterica subsp." in out # now, do a gather with the results - runtmp.sourmash('gather', metag_sig, *gcf_sigs, - '-k', '21', '--picklist', - f'{search_csv}::search') + runtmp.sourmash( + "gather", + metag_sig, + *gcf_sigs, + "-k", + "21", + "--picklist", + f"{search_csv}::search", + ) err = runtmp.last_result.err print(err) @@ -6212,17 +7483,17 @@ def test_gather_with_prefetch_picklist_5_search(runtmp): def test_gather_scaled_1(runtmp, linear_gather, prefetch_gather): # test gather on a sig indexed with scaled=1 - inp = utils.get_test_data('short.fa') - outp = runtmp.output('out.sig') + inp = utils.get_test_data("short.fa") + outp = runtmp.output("out.sig") # prepare a signature with a scaled of 1 - runtmp.sourmash('sketch', 'dna', '-p', 'scaled=1,k=31', inp, '-o', outp) + runtmp.sourmash("sketch", "dna", "-p", "scaled=1,k=31", inp, "-o", outp) # run with a low threshold - runtmp.sourmash('gather', outp, outp, '--threshold-bp', '0') + runtmp.sourmash("gather", outp, outp, "--threshold-bp", "0") print(runtmp.last_result.out) - print('---') + print("---") print(runtmp.last_result.err) assert "1.0 kbp 100.0% 100.0%" in runtmp.last_result.out @@ -6231,25 +7502,25 @@ def test_gather_scaled_1(runtmp, linear_gather, prefetch_gather): def test_standalone_manifest_search(runtmp): # test loading/searching a manifest file from the command line. 
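    # (The manifest written below acts as a standalone database for
    # "sourmash search"; as the comment further down notes, it currently has
    # to live inside the top-level directory it describes, presumably so the
    # signature locations it records still resolve.)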
- sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") - dirname = runtmp.output('somedir') + dirname = runtmp.output("somedir") os.mkdir(dirname) - subdir = runtmp.output('somedir/subdir') + subdir = runtmp.output("somedir/subdir") os.mkdir(subdir) - shutil.copyfile(sig47, os.path.join(dirname, '47.fa.sig')) - shutil.copyfile(sig63, os.path.join(subdir, '63.fa.sig')) + shutil.copyfile(sig47, os.path.join(dirname, "47.fa.sig")) + shutil.copyfile(sig63, os.path.join(subdir, "63.fa.sig")) # for now, the output manifest must be within top level dir for # CLI stuff to work properly. - mf = os.path.join(dirname, 'mf.csv') + mf = os.path.join(dirname, "mf.csv") # build manifest... - runtmp.sourmash('sig', 'manifest', dirname, '-o', mf) + runtmp.sourmash("sig", "manifest", dirname, "-o", mf) # ...and now use for a search! - runtmp.sourmash('search', sig47, mf) + runtmp.sourmash("search", sig47, mf) out = runtmp.last_result.out print(out) @@ -6261,95 +7532,100 @@ def test_standalone_manifest_search(runtmp): def test_standalone_manifest_search_fail(runtmp): # test loading/searching a manifest file from the command line; should # fail if manifest is not located within tld. - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") - dirname = runtmp.output('somedir') + dirname = runtmp.output("somedir") os.mkdir(dirname) - subdir = runtmp.output('somedir/subdir') + subdir = runtmp.output("somedir/subdir") os.mkdir(subdir) - shutil.copyfile(sig47, os.path.join(dirname, '47.fa.sig')) - shutil.copyfile(sig63, os.path.join(subdir, '63.fa.sig')) + shutil.copyfile(sig47, os.path.join(dirname, "47.fa.sig")) + shutil.copyfile(sig63, os.path.join(subdir, "63.fa.sig")) # for now, the output manifest must be within top level dir for # CLI stuff to work properly. here we intentionally break this, # for testing purposes. - mf = runtmp.output('mf.csv') + mf = runtmp.output("mf.csv") # build manifest... - runtmp.sourmash('sig', 'manifest', dirname, '-o', mf) + runtmp.sourmash("sig", "manifest", dirname, "-o", mf) # ...and now use for a search! 
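    # Here mf.csv was written outside somedir/, so loading it as a database
    # is expected to fail; presumably the relative locations recorded in the
    # manifest no longer resolve from its new position.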
with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('search', sig47, mf) + runtmp.sourmash("search", sig47, mf) def test_search_ani_jaccard(runtmp): c = runtmp - sig47 = utils.get_test_data('47.fa.sig') - sig4763 = utils.get_test_data('47+63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig4763 = utils.get_test_data("47+63.fa.sig") - c.run_sourmash('search', sig47, sig4763, '-o', 'xxx.csv') + c.run_sourmash("search", sig47, sig4763, "-o", "xxx.csv") print(c.last_result.status, c.last_result.out, c.last_result.err) search_result_names = SearchResult.search_write_cols - csv_file = c.output('xxx.csv') + csv_file = c.output("xxx.csv") with open(csv_file) as fp: reader = csv.DictReader(fp) row = next(reader) print(row) assert search_result_names == list(row.keys()) - assert float(row['similarity']) == 0.6564798376870403 - assert row['filename'].endswith('47+63.fa.sig') - assert row['md5'] == '491c0a81b2cfb0188c0d3b46837c2f42' - assert row['query_filename'].endswith('47.fa') - assert row['query_name'] == 'NC_009665.1 Shewanella baltica OS185, complete genome' - assert row['query_md5'] == '09a08691' - assert row['ani'] == "0.992530907924384" + assert float(row["similarity"]) == 0.6564798376870403 + assert row["filename"].endswith("47+63.fa.sig") + assert row["md5"] == "491c0a81b2cfb0188c0d3b46837c2f42" + assert row["query_filename"].endswith("47.fa") + assert ( + row["query_name"] == "NC_009665.1 Shewanella baltica OS185, complete genome" + ) + assert row["query_md5"] == "09a08691" + assert row["ani"] == "0.992530907924384" def test_search_ani_jaccard_error_too_high(runtmp): c = runtmp - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=1', testdata1, testdata2) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=1", testdata1, testdata2) - c.run_sourmash('search', 'short.fa.sig', 'short2.fa.sig', '-o', 'xxx.csv') + c.run_sourmash("search", "short.fa.sig", "short2.fa.sig", "-o", "xxx.csv") print(c.last_result.status, c.last_result.out, c.last_result.err) search_result_names = SearchResult.search_write_cols - csv_file = c.output('xxx.csv') + csv_file = c.output("xxx.csv") with open(csv_file) as fp: reader = csv.DictReader(fp) row = next(reader) print(row) assert search_result_names == list(row.keys()) - assert float(row['similarity']) == 0.9288577154308617 - assert row['filename'].endswith('short2.fa.sig') - assert row['md5'] == 'bf752903d635b1eb83c53fe4aae951db' - assert row['query_filename'].endswith('short.fa') - assert row['query_name'] == '' - assert row['query_md5'] == '9191284a' - #assert row['ani'] == "0.9987884602947684" - assert row['ani'] == '' + assert float(row["similarity"]) == 0.9288577154308617 + assert row["filename"].endswith("short2.fa.sig") + assert row["md5"] == "bf752903d635b1eb83c53fe4aae951db" + assert row["query_filename"].endswith("short.fa") + assert row["query_name"] == "" + assert row["query_md5"] == "9191284a" + # assert row['ani'] == "0.9987884602947684" + assert row["ani"] == "" - assert "WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons." in c.last_result.err + assert ( + "WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons." 
+ in c.last_result.err + ) def test_searchabund_no_ani(runtmp): c = runtmp - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=10,abund', testdata1, testdata2) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=10,abund", testdata1, testdata2) - c.run_sourmash('search', 'short.fa.sig', 'short2.fa.sig', '-o', 'xxx.csv') + c.run_sourmash("search", "short.fa.sig", "short2.fa.sig", "-o", "xxx.csv") print(c.last_result.status, c.last_result.out, c.last_result.err) - csv_file = c.output('xxx.csv') + csv_file = c.output("xxx.csv") search_result_names = SearchResult.search_write_cols with open(csv_file) as fp: @@ -6357,158 +7633,178 @@ def test_searchabund_no_ani(runtmp): row = next(reader) print(row) assert search_result_names == list(row.keys()) - assert float(row['similarity']) == 0.8224046424612483 - assert row['md5'] == 'c9d5a795eeaaf58e286fb299133e1938' - assert row['filename'].endswith('short2.fa.sig') - assert row['query_filename'].endswith('short.fa') - assert row['query_name'] == '' - assert row['query_md5'] == 'b5cc464c' - assert row['ani'] == "" # do we want empty column to appear?? + assert float(row["similarity"]) == 0.8224046424612483 + assert row["md5"] == "c9d5a795eeaaf58e286fb299133e1938" + assert row["filename"].endswith("short2.fa.sig") + assert row["query_filename"].endswith("short.fa") + assert row["query_name"] == "" + assert row["query_md5"] == "b5cc464c" + assert row["ani"] == "" # do we want empty column to appear?? def test_search_ani_containment(runtmp): c = runtmp - testdata1 = utils.get_test_data('2+63.fa.sig') - testdata2 = utils.get_test_data('47+63.fa.sig') + testdata1 = utils.get_test_data("2+63.fa.sig") + testdata2 = utils.get_test_data("47+63.fa.sig") - c.run_sourmash('search', '--containment', testdata1, testdata2, '-o', 'xxx.csv') + c.run_sourmash("search", "--containment", testdata1, testdata2, "-o", "xxx.csv") print(c.last_result.status, c.last_result.out, c.last_result.err) search_result_names = SearchResult.search_write_cols - csv_file = c.output('xxx.csv') + csv_file = c.output("xxx.csv") with open(csv_file) as fp: reader = csv.DictReader(fp) row = next(reader) print(row) assert search_result_names == list(row.keys()) - assert float(row['similarity']) == 0.6597808288197506 - assert row['filename'].endswith('47+63.fa.sig') - assert row['md5'] == '491c0a81b2cfb0188c0d3b46837c2f42' - assert row['query_name'] == '' - assert row['query_md5'] == '832a45e8' - assert row['ani'] == "0.9866751346467802" + assert float(row["similarity"]) == 0.6597808288197506 + assert row["filename"].endswith("47+63.fa.sig") + assert row["md5"] == "491c0a81b2cfb0188c0d3b46837c2f42" + assert row["query_name"] == "" + assert row["query_md5"] == "832a45e8" + assert row["ani"] == "0.9866751346467802" # search other direction - c.run_sourmash('search', '--containment', testdata2, testdata1, '-o', 'xxxx.csv') + c.run_sourmash("search", "--containment", testdata2, testdata1, "-o", "xxxx.csv") print(c.last_result.status, c.last_result.out, c.last_result.err) - csv_file = c.output('xxxx.csv') + csv_file = c.output("xxxx.csv") with open(csv_file) as fp: reader = csv.DictReader(fp) row = next(reader) print(row) assert search_result_names == list(row.keys()) - assert float(row['similarity']) == 0.6642150646715699 - assert row['filename'].endswith('2+63.fa.sig') - assert row['md5'] == 
'832a45e85bdca6eaef5d73047e3e6321' - assert row['query_name'] == '' - assert row['query_md5'] == '491c0a81' - assert row['ani'] == "0.9868883523107224" + assert float(row["similarity"]) == 0.6642150646715699 + assert row["filename"].endswith("2+63.fa.sig") + assert row["md5"] == "832a45e85bdca6eaef5d73047e3e6321" + assert row["query_name"] == "" + assert row["query_md5"] == "491c0a81" + assert row["ani"] == "0.9868883523107224" def test_search_ani_containment_asymmetry(runtmp): # test contained_by asymmetries, viz #2215 - query_sig = utils.get_test_data('47.fa.sig') - merged_sig = utils.get_test_data('47-63-merge.sig') + query_sig = utils.get_test_data("47.fa.sig") + merged_sig = utils.get_test_data("47-63-merge.sig") - runtmp.sourmash('search', query_sig, merged_sig, '-o', - 'query-in-merged.csv', '--containment') - runtmp.sourmash('search', merged_sig, query_sig, '-o', - 'merged-in-query.csv', '--containment') + runtmp.sourmash( + "search", query_sig, merged_sig, "-o", "query-in-merged.csv", "--containment" + ) + runtmp.sourmash( + "search", merged_sig, query_sig, "-o", "merged-in-query.csv", "--containment" + ) - with sourmash_args.FileInputCSV(runtmp.output('query-in-merged.csv')) as r: + with sourmash_args.FileInputCSV(runtmp.output("query-in-merged.csv")) as r: query_in_merged = list(r)[0] - with sourmash_args.FileInputCSV(runtmp.output('merged-in-query.csv')) as r: + with sourmash_args.FileInputCSV(runtmp.output("merged-in-query.csv")) as r: merged_in_query = list(r)[0] - assert query_in_merged['ani'] == '1.0' - assert merged_in_query['ani'] == '0.9865155060423993' + assert query_in_merged["ani"] == "1.0" + assert merged_in_query["ani"] == "0.9865155060423993" def test_search_ani_containment_fail(runtmp): c = runtmp - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=10', testdata1, testdata2) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=10", testdata1, testdata2) - c.run_sourmash('search', '--containment', 'short.fa.sig', 'short2.fa.sig', '-o', 'xxx.csv') + c.run_sourmash( + "search", "--containment", "short.fa.sig", "short2.fa.sig", "-o", "xxx.csv" + ) print(c.last_result.status, c.last_result.out, c.last_result.err) search_result_names = SearchResult.search_write_cols - csv_file = c.output('xxx.csv') + csv_file = c.output("xxx.csv") with open(csv_file) as fp: reader = csv.DictReader(fp) row = next(reader) print(row) assert search_result_names == list(row.keys()) - assert round(float(row['similarity']), 3) == 0.967 - assert row['ani'] == "0.998906999319701" + assert round(float(row["similarity"]), 3) == 0.967 + assert row["ani"] == "0.998906999319701" # With PR #2268, this error message should not appear - #assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons." in c.last_result.err - + # assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons." 
in c.last_result.err + def test_search_ani_containment_estimate_ci(runtmp): # test ANI confidence intervals, based on (asymmetric) containment c = runtmp - testdata1 = utils.get_test_data('2+63.fa.sig') - testdata2 = utils.get_test_data('47+63.fa.sig') - - c.run_sourmash('search', '--containment', testdata1, testdata2, '-o', 'xxx.csv', '--estimate-ani-ci') + testdata1 = utils.get_test_data("2+63.fa.sig") + testdata2 = utils.get_test_data("47+63.fa.sig") + + c.run_sourmash( + "search", + "--containment", + testdata1, + testdata2, + "-o", + "xxx.csv", + "--estimate-ani-ci", + ) print(c.last_result.status, c.last_result.out, c.last_result.err) search_result_names_ci = SearchResult.search_write_cols_ci - csv_file = c.output('xxx.csv') + csv_file = c.output("xxx.csv") with open(csv_file) as fp: reader = csv.DictReader(fp) row = next(reader) print(row) assert search_result_names_ci == list(row.keys()) - assert float(row['similarity']) == 0.6597808288197506 - assert row['filename'].endswith('47+63.fa.sig') - assert row['md5'] == '491c0a81b2cfb0188c0d3b46837c2f42' - assert row['query_name'] == '' - assert row['query_md5'] == '832a45e8' - assert row['ani'] == "0.9866751346467802" - assert row['ani_low'] == "0.9861576758035308" #"0.9861559138341189" - assert row['ani_high'] == "0.9871770716451368" #"0.9871787293232042" + assert float(row["similarity"]) == 0.6597808288197506 + assert row["filename"].endswith("47+63.fa.sig") + assert row["md5"] == "491c0a81b2cfb0188c0d3b46837c2f42" + assert row["query_name"] == "" + assert row["query_md5"] == "832a45e8" + assert row["ani"] == "0.9866751346467802" + assert row["ani_low"] == "0.9861576758035308" # "0.9861559138341189" + assert row["ani_high"] == "0.9871770716451368" # "0.9871787293232042" # search other direction - c.run_sourmash('search', '--containment', testdata2, testdata1, '-o', 'xxxx.csv', '--estimate-ani-ci') + c.run_sourmash( + "search", + "--containment", + testdata2, + testdata1, + "-o", + "xxxx.csv", + "--estimate-ani-ci", + ) print(c.last_result.status, c.last_result.out, c.last_result.err) - csv_file = c.output('xxxx.csv') + csv_file = c.output("xxxx.csv") with open(csv_file) as fp: reader = csv.DictReader(fp) row = next(reader) print(row) assert search_result_names_ci == list(row.keys()) - assert float(row['similarity']) == 0.6642150646715699 - assert row['filename'].endswith('2+63.fa.sig') - assert row['md5'] == '832a45e85bdca6eaef5d73047e3e6321' - assert row['query_name'] == '' - assert row['query_md5'] == '491c0a81' - assert row['ani'] == "0.9868883523107224" - assert row['ani_low'] == "0.986374049720872" #"0.9863757952722036" - assert row['ani_high'] == "0.9873870188726516" #"0.9873853776786775" + assert float(row["similarity"]) == 0.6642150646715699 + assert row["filename"].endswith("2+63.fa.sig") + assert row["md5"] == "832a45e85bdca6eaef5d73047e3e6321" + assert row["query_name"] == "" + assert row["query_md5"] == "491c0a81" + assert row["ani"] == "0.9868883523107224" + assert row["ani_low"] == "0.986374049720872" # "0.9863757952722036" + assert row["ani_high"] == "0.9873870188726516" # "0.9873853776786775" def test_search_ani_max_containment(runtmp): c = runtmp - testdata1 = utils.get_test_data('2+63.fa.sig') - testdata2 = utils.get_test_data('47+63.fa.sig') + testdata1 = utils.get_test_data("2+63.fa.sig") + testdata2 = utils.get_test_data("47+63.fa.sig") - c.run_sourmash('search', '--max-containment', testdata1, testdata2, '-o', 'xxx.csv') + c.run_sourmash("search", "--max-containment", testdata1, testdata2, "-o", "xxx.csv") 
print(c.last_result.status, c.last_result.out, c.last_result.err) - csv_file = c.output('xxx.csv') + csv_file = c.output("xxx.csv") search_result_names = SearchResult.search_write_cols with open(csv_file) as fp: @@ -6516,25 +7812,33 @@ def test_search_ani_max_containment(runtmp): row = next(reader) print(row) assert search_result_names == list(row.keys()) - assert float(row['similarity']) == 0.6642150646715699 - assert row['filename'].endswith('47+63.fa.sig') - assert row['md5'] == '491c0a81b2cfb0188c0d3b46837c2f42' - assert row['query_name'] == '' - assert row['query_md5'] == '832a45e8' - assert row['ani'] == "0.9868883523107224" + assert float(row["similarity"]) == 0.6642150646715699 + assert row["filename"].endswith("47+63.fa.sig") + assert row["md5"] == "491c0a81b2cfb0188c0d3b46837c2f42" + assert row["query_name"] == "" + assert row["query_md5"] == "832a45e8" + assert row["ani"] == "0.9868883523107224" def test_search_ani_max_containment_estimate_ci(runtmp): # test ANI confidence intervals, based on (symmetric) max-containment c = runtmp - testdata1 = utils.get_test_data('2+63.fa.sig') - testdata2 = utils.get_test_data('47+63.fa.sig') - - c.run_sourmash('search', '--max-containment', testdata1, testdata2, '-o', 'xxx.csv', '--estimate-ani-ci') + testdata1 = utils.get_test_data("2+63.fa.sig") + testdata2 = utils.get_test_data("47+63.fa.sig") + + c.run_sourmash( + "search", + "--max-containment", + testdata1, + testdata2, + "-o", + "xxx.csv", + "--estimate-ani-ci", + ) print(c.last_result.status, c.last_result.out, c.last_result.err) - csv_file = c.output('xxx.csv') + csv_file = c.output("xxx.csv") search_result_names_ci = SearchResult.search_write_cols_ci with open(csv_file) as fp: @@ -6542,32 +7846,32 @@ def test_search_ani_max_containment_estimate_ci(runtmp): row = next(reader) print(row) assert search_result_names_ci == list(row.keys()) - assert float(row['similarity']) == 0.6642150646715699 - assert row['filename'].endswith('47+63.fa.sig') - assert row['md5'] == '491c0a81b2cfb0188c0d3b46837c2f42' - assert row['query_name'] == '' - assert row['query_md5'] == '832a45e8' - assert row['ani'] == "0.9868883523107224" - assert row['ani_low'] == "0.986374049720872" - assert row['ani_high'] == "0.9873870188726516" + assert float(row["similarity"]) == 0.6642150646715699 + assert row["filename"].endswith("47+63.fa.sig") + assert row["md5"] == "491c0a81b2cfb0188c0d3b46837c2f42" + assert row["query_name"] == "" + assert row["query_md5"] == "832a45e8" + assert row["ani"] == "0.9868883523107224" + assert row["ani_low"] == "0.986374049720872" + assert row["ani_high"] == "0.9873870188726516" def test_search_jaccard_ani_downsample(runtmp): c = runtmp - sig47 = utils.get_test_data('47.fa.sig') - sig4763 = utils.get_test_data('47+63.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") + sig4763 = utils.get_test_data("47+63.fa.sig") ss47 = sourmash.load_one_signature(sig47) ss4763 = sourmash.load_one_signature(sig4763) print(f"SCALED: sig1: {ss47.minhash.scaled}, sig2: {ss4763.minhash.scaled}") - c.run_sourmash('search', sig47, sig4763, '-o', 'xxx.csv') + c.run_sourmash("search", sig47, sig4763, "-o", "xxx.csv") print(c.last_result.status, c.last_result.out, c.last_result.err) search_result_names = SearchResult.search_write_cols search_result_names_ci = SearchResult.search_write_cols_ci - csv_file = c.output('xxx.csv') + csv_file = c.output("xxx.csv") with open(csv_file) as fp: reader = csv.DictReader(fp) @@ -6575,50 +7879,61 @@ def test_search_jaccard_ani_downsample(runtmp): print(row) assert 
search_result_names == list(row.keys()) assert search_result_names_ci != list(row.keys()) - assert float(row['similarity']) == 0.6564798376870403 - assert row['filename'].endswith('47+63.fa.sig') - assert row['md5'] == '491c0a81b2cfb0188c0d3b46837c2f42' - assert row['query_filename'].endswith('47.fa') - assert row['query_name'] == 'NC_009665.1 Shewanella baltica OS185, complete genome' - assert row['query_md5'] == '09a08691' - assert row['ani'] == "0.992530907924384" + assert float(row["similarity"]) == 0.6564798376870403 + assert row["filename"].endswith("47+63.fa.sig") + assert row["md5"] == "491c0a81b2cfb0188c0d3b46837c2f42" + assert row["query_filename"].endswith("47.fa") + assert ( + row["query_name"] == "NC_009665.1 Shewanella baltica OS185, complete genome" + ) + assert row["query_md5"] == "09a08691" + assert row["ani"] == "0.992530907924384" # downsample one and check similarity and ANI ds_sig47 = c.output("ds_sig47.sig") - c.run_sourmash('sig', "downsample", sig47, "--scaled", "2000", '-o', ds_sig47) - c.run_sourmash('search', ds_sig47, sig4763, '-o', 'xxx.csv') -# - csv_file = c.output('xxx.csv') + c.run_sourmash("sig", "downsample", sig47, "--scaled", "2000", "-o", ds_sig47) + c.run_sourmash("search", ds_sig47, sig4763, "-o", "xxx.csv") + # + csv_file = c.output("xxx.csv") with open(csv_file) as fp: reader = csv.DictReader(fp) row = next(reader) print(row) - assert round(float(row['similarity']), 3) == round(0.6634517766497462, 3) - assert round(float(row['ani']), 3) == 0.993 + assert round(float(row["similarity"]), 3) == round(0.6634517766497462, 3) + assert round(float(row["ani"]), 3) == 0.993 - #downsample manually and assert same ANI + # downsample manually and assert same ANI ss47_ds = signature.load_one_signature(ds_sig47) print("SCALED:", ss47_ds.minhash.scaled, ss4763.minhash.scaled) ani_info = ss47_ds.jaccard_ani(ss4763, downsample=True) print(ani_info) - assert round(ani_info.ani,3) == 0.993 + assert round(ani_info.ani, 3) == 0.993 assert (1 - round(ani_info.dist, 3)) == 0.993 def test_gather_ani_csv(runtmp, linear_gather, prefetch_gather): - testdata1 = utils.get_test_data('63.fa.sig') - testdata2 = utils.get_test_data('47+63.fa.sig') + testdata1 = utils.get_test_data("63.fa.sig") + testdata2 = utils.get_test_data("47+63.fa.sig") - runtmp.sourmash('index', '-k', '31', 'zzz', testdata2) + runtmp.sourmash("index", "-k", "31", "zzz", testdata2) - assert os.path.exists(runtmp.output('zzz.sbt.zip')) + assert os.path.exists(runtmp.output("zzz.sbt.zip")) - runtmp.sourmash('gather', testdata1, 'zzz', '-o', 'foo.csv', '--threshold-bp=1', linear_gather, prefetch_gather) + runtmp.sourmash( + "gather", + testdata1, + "zzz", + "-o", + "foo.csv", + "--threshold-bp=1", + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - csv_file = runtmp.output('foo.csv') + csv_file = runtmp.output("foo.csv") gather_result_names = GatherResult.gather_write_cols gather_result_names_ci = GatherResult.gather_write_cols_ci @@ -6628,42 +7943,63 @@ def test_gather_ani_csv(runtmp, linear_gather, prefetch_gather): print(row) assert gather_result_names == list(row.keys()) assert gather_result_names_ci != list(row.keys()) - assert float(row['intersect_bp']) == 5238000.0 - assert float(row['unique_intersect_bp']) == 5238000.0 - assert float(row['remaining_bp']) == 0.0 - assert float(row['f_orig_query']) == 1.0 - assert float(row['f_unique_to_query']) == 1.0 - assert float(row['f_match']) == 0.6642150646715699 - assert row['filename'] == 'zzz' - assert 
row['md5'] == '491c0a81b2cfb0188c0d3b46837c2f42' - assert row['gather_result_rank'] == '0' - assert row['query_md5'] == '38729c63' - assert row['query_bp'] == '5238000' - assert row['query_containment_ani']== '1.0' - assert round(float(row['match_containment_ani']), 3) == 0.987 - assert round(float(row['average_containment_ani']), 3) == 0.993 - assert round(float(row['max_containment_ani']),3) == 1.0 - assert row['potential_false_negative'] == 'False' + assert float(row["intersect_bp"]) == 5238000.0 + assert float(row["unique_intersect_bp"]) == 5238000.0 + assert float(row["remaining_bp"]) == 0.0 + assert float(row["f_orig_query"]) == 1.0 + assert float(row["f_unique_to_query"]) == 1.0 + assert float(row["f_match"]) == 0.6642150646715699 + assert row["filename"] == "zzz" + assert row["md5"] == "491c0a81b2cfb0188c0d3b46837c2f42" + assert row["gather_result_rank"] == "0" + assert row["query_md5"] == "38729c63" + assert row["query_bp"] == "5238000" + assert row["query_containment_ani"] == "1.0" + assert round(float(row["match_containment_ani"]), 3) == 0.987 + assert round(float(row["average_containment_ani"]), 3) == 0.993 + assert round(float(row["max_containment_ani"]), 3) == 1.0 + assert row["potential_false_negative"] == "False" def test_gather_ani_csv_estimate_ci(runtmp, linear_gather, prefetch_gather): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - - runtmp.sourmash('sketch','dna','-p','scaled=10', '--name-from-first', testdata1, testdata2) - - runtmp.sourmash('sketch','dna','-p','scaled=10', '-o', 'query.fa.sig', '--name-from-first', testdata2) - - runtmp.sourmash('index', '-k', '31', 'zzz', 'short.fa.sig', 'short2.fa.sig') - - assert os.path.exists(runtmp.output('zzz.sbt.zip')) - - runtmp.sourmash('gather', 'query.fa.sig', 'zzz', '-o', 'foo.csv', '--threshold-bp=1', '--estimate-ani-ci', linear_gather, prefetch_gather) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + + runtmp.sourmash( + "sketch", "dna", "-p", "scaled=10", "--name-from-first", testdata1, testdata2 + ) + + runtmp.sourmash( + "sketch", + "dna", + "-p", + "scaled=10", + "-o", + "query.fa.sig", + "--name-from-first", + testdata2, + ) + + runtmp.sourmash("index", "-k", "31", "zzz", "short.fa.sig", "short2.fa.sig") + + assert os.path.exists(runtmp.output("zzz.sbt.zip")) + + runtmp.sourmash( + "gather", + "query.fa.sig", + "zzz", + "-o", + "foo.csv", + "--threshold-bp=1", + "--estimate-ani-ci", + linear_gather, + prefetch_gather, + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - csv_file = runtmp.output('foo.csv') + csv_file = runtmp.output("foo.csv") gather_result_names = GatherResult.gather_write_cols_ci @@ -6672,29 +8008,29 @@ def test_gather_ani_csv_estimate_ci(runtmp, linear_gather, prefetch_gather): row = next(reader) print(row) assert gather_result_names == list(row.keys()) - assert float(row['intersect_bp']) == 910 - assert float(row['unique_intersect_bp']) == 910 - assert float(row['remaining_bp']) == 0 - assert float(row['f_orig_query']) == 1.0 - assert float(row['f_unique_to_query']) == 1.0 - assert float(row['f_match']) == 1.0 - assert row['filename'] == 'zzz' - assert row['name'] == 'tr1 4' - assert row['md5'] == 'c9d5a795eeaaf58e286fb299133e1938' - assert row['gather_result_rank'] == '0' - assert row['query_filename'].endswith('short2.fa') - assert row['query_name'] == 'tr1 4' - assert row['query_md5'] == 'c9d5a795' - assert row['query_bp'] == '910' - assert row['query_containment_ani'] == '1.0' - assert 
row['query_containment_ani_low'] == '1.0' - assert row['query_containment_ani_high'] == '1.0' - assert row['match_containment_ani'] == '1.0' - assert row['match_containment_ani_low'] == '1.0' - assert row['match_containment_ani_high'] == '1.0' - assert row['average_containment_ani'] == '1.0' - assert row['max_containment_ani'] == '1.0' - assert row['potential_false_negative'] == 'False' + assert float(row["intersect_bp"]) == 910 + assert float(row["unique_intersect_bp"]) == 910 + assert float(row["remaining_bp"]) == 0 + assert float(row["f_orig_query"]) == 1.0 + assert float(row["f_unique_to_query"]) == 1.0 + assert float(row["f_match"]) == 1.0 + assert row["filename"] == "zzz" + assert row["name"] == "tr1 4" + assert row["md5"] == "c9d5a795eeaaf58e286fb299133e1938" + assert row["gather_result_rank"] == "0" + assert row["query_filename"].endswith("short2.fa") + assert row["query_name"] == "tr1 4" + assert row["query_md5"] == "c9d5a795" + assert row["query_bp"] == "910" + assert row["query_containment_ani"] == "1.0" + assert row["query_containment_ani_low"] == "1.0" + assert row["query_containment_ani_high"] == "1.0" + assert row["match_containment_ani"] == "1.0" + assert row["match_containment_ani_low"] == "1.0" + assert row["match_containment_ani_high"] == "1.0" + assert row["average_containment_ani"] == "1.0" + assert row["max_containment_ani"] == "1.0" + assert row["potential_false_negative"] == "False" def test_compare_containment_ani(runtmp): @@ -6704,12 +8040,21 @@ def test_compare_containment_ani(runtmp): sigfiles = ["2.fa.sig", "2+63.fa.sig", "47.fa.sig", "63.fa.sig"] testdata_sigs = [utils.get_test_data(c) for c in sigfiles] - c.run_sourmash('compare', '--containment', '-k', '31', - '--ani', '--csv', 'output.csv', *testdata_sigs) + c.run_sourmash( + "compare", + "--containment", + "-k", + "31", + "--ani", + "--csv", + "output.csv", + *testdata_sigs, + ) # load the matrix output - mat, idx_to_sig = _load_compare_matrix_and_sigs(c.output('output.csv'), - testdata_sigs) + mat, idx_to_sig = _load_compare_matrix_and_sigs( + c.output("output.csv"), testdata_sigs + ) # check explicit containment against output of compare for i in range(len(idx_to_sig)): @@ -6728,11 +8073,14 @@ def test_compare_containment_ani(runtmp): containment_ani = 0.0 mat_val = round(mat[i][j], 3) - assert containment_ani == mat_val #, (i, j) + assert containment_ani == mat_val # , (i, j) print(c.last_result.err) print(c.last_result.out) - assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err + assert ( + "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." 
+ in c.last_result.err + ) def test_compare_containment_ani_asymmetry(runtmp): @@ -6744,11 +8092,19 @@ def test_compare_containment_ani_asymmetry(runtmp): sigfiles = ["47.fa.sig", "47-63-merge.sig"] testdata_sigs = [utils.get_test_data(c) for c in sigfiles] - c.run_sourmash('compare', '--containment', '-k', '31', - '--ani', '--csv', 'output.csv', *testdata_sigs) + c.run_sourmash( + "compare", + "--containment", + "-k", + "31", + "--ani", + "--csv", + "output.csv", + *testdata_sigs, + ) # load the matrix output of compare --containment --estimate-ani - with open(c.output('output.csv'), 'rt') as fp: + with open(c.output("output.csv")) as fp: r = iter(csv.reader(fp)) headers = next(r) @@ -6760,7 +8116,7 @@ def test_compare_containment_ani_asymmetry(runtmp): print(mat) # load in all the input signatures - idx_to_sig = dict() + idx_to_sig = {} for idx, filename in enumerate(testdata_sigs): ss = sourmash.load_one_signature(filename, ksize=31) idx_to_sig[idx] = ss @@ -6782,7 +8138,7 @@ def test_compare_containment_ani_asymmetry(runtmp): containment_ani = 0.0 mat_val = round(mat[i][j], 6) - assert containment_ani == mat_val #, (i, j) + assert containment_ani == mat_val # , (i, j) print(c.last_result.err) print(c.last_result.out) @@ -6794,12 +8150,21 @@ def test_compare_jaccard_ani(runtmp): sigfiles = ["47.fa.sig", "47-63-merge.sig"] testdata_sigs = [utils.get_test_data(c) for c in sigfiles] - c.run_sourmash('compare', '--containment', '-k', '31', - '--ani', '--csv', 'output.csv', *testdata_sigs) + c.run_sourmash( + "compare", + "--containment", + "-k", + "31", + "--ani", + "--csv", + "output.csv", + *testdata_sigs, + ) # load the matrix output - mat, idx_to_sig = _load_compare_matrix_and_sigs(c.output('output.csv'), - testdata_sigs) + mat, idx_to_sig = _load_compare_matrix_and_sigs( + c.output("output.csv"), testdata_sigs + ) # check explicit containment against output of compare for i in range(len(idx_to_sig)): @@ -6818,7 +8183,7 @@ def test_compare_jaccard_ani(runtmp): containment_ani = 0.0 mat_val = round(mat[i][j], 6) - assert containment_ani == mat_val #, (i, j) + assert containment_ani == mat_val # , (i, j) print(c.last_result.err) print(c.last_result.out) @@ -6831,8 +8196,7 @@ def test_compare_jaccard_protein_parallel_ani_bug(runtmp): sigfile = utils.get_test_data("prot/protein.zip") - c.run_sourmash('compare', '--ani', '-p', '2', '--csv', 'output.csv', - sigfile) + c.run_sourmash("compare", "--ani", "-p", "2", "--csv", "output.csv", sigfile) print(c.last_result.err) print(c.last_result.out) @@ -6846,12 +8210,22 @@ def test_compare_containment_ani_asymmetry_distance(runtmp): sigfiles = ["47.fa.sig", "47-63-merge.sig"] testdata_sigs = [utils.get_test_data(c) for c in sigfiles] - c.run_sourmash('compare', '--containment', '-k', '31', '--distance-matrix', - '--ani', '--csv', 'output.csv', *testdata_sigs) + c.run_sourmash( + "compare", + "--containment", + "-k", + "31", + "--distance-matrix", + "--ani", + "--csv", + "output.csv", + *testdata_sigs, + ) # load the matrix output - mat, idx_to_sig = _load_compare_matrix_and_sigs(c.output('output.csv'), - testdata_sigs) + mat, idx_to_sig = _load_compare_matrix_and_sigs( + c.output("output.csv"), testdata_sigs + ) # check explicit containment against output of compare for i in range(len(idx_to_sig)): @@ -6870,7 +8244,7 @@ def test_compare_containment_ani_asymmetry_distance(runtmp): containment_ani = 1 mat_val = round(mat[i][j], 6) - assert containment_ani == mat_val #, (i, j) + assert containment_ani == mat_val # , (i, j) print(c.last_result.err) 
print(c.last_result.out) @@ -6882,12 +8256,14 @@ def test_compare_jaccard_ani(runtmp): sigfiles = ["2.fa.sig", "2+63.fa.sig", "47.fa.sig", "63.fa.sig"] testdata_sigs = [utils.get_test_data(c) for c in sigfiles] - c.run_sourmash('compare', '-k', '31', '--estimate-ani', - '--csv', 'output.csv', *testdata_sigs) + c.run_sourmash( + "compare", "-k", "31", "--estimate-ani", "--csv", "output.csv", *testdata_sigs + ) # load the matrix output - mat, idx_to_sig = _load_compare_matrix_and_sigs(c.output('output.csv'), - testdata_sigs) + mat, idx_to_sig = _load_compare_matrix_and_sigs( + c.output("output.csv"), testdata_sigs + ) # check explicit calculations against output of compare for i in range(len(idx_to_sig)): @@ -6906,30 +8282,43 @@ def test_compare_jaccard_ani(runtmp): jaccard_ani = 0.0 print(jaccard_ani) - assert jaccard_ani == mat_val #, (i, j) + assert jaccard_ani == mat_val # , (i, j) print(c.last_result.err) print(c.last_result.out) - assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err + assert ( + "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." + in c.last_result.err + ) def test_compare_jaccard_ani_jaccard_error_too_high(runtmp): c = runtmp - testdata1 = utils.get_test_data('short.fa') - sig1 = c.output('short.fa.sig') - testdata2 = utils.get_test_data('short2.fa') - sig2 = c.output('short2.fa.sig') - c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=1', '-o', sig1, testdata1) - c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=1', '-o', sig2, testdata2) + testdata1 = utils.get_test_data("short.fa") + sig1 = c.output("short.fa.sig") + testdata2 = utils.get_test_data("short2.fa") + sig2 = c.output("short2.fa.sig") + c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=1", "-o", sig1, testdata1) + c.run_sourmash("sketch", "dna", "-p", "k=31,scaled=1", "-o", sig2, testdata2) testdata_sigs = [sig1, sig2] - c.run_sourmash('compare', '-k', '31', '--estimate-ani', '--csv', 'output.csv', 'short.fa.sig', 'short2.fa.sig') + c.run_sourmash( + "compare", + "-k", + "31", + "--estimate-ani", + "--csv", + "output.csv", + "short.fa.sig", + "short2.fa.sig", + ) print(c.last_result.status, c.last_result.out, c.last_result.err) # load the matrix output - mat, idx_to_sig = _load_compare_matrix_and_sigs(c.output('output.csv'), - testdata_sigs) + mat, idx_to_sig = _load_compare_matrix_and_sigs( + c.output("output.csv"), testdata_sigs + ) # check explicit containment against output of compare for i in range(len(idx_to_sig)): @@ -6948,10 +8337,12 @@ def test_compare_jaccard_ani_jaccard_error_too_high(runtmp): jaccard_ani = 0.0 print(jaccard_ani) - assert jaccard_ani == mat_val #, (i, j) + assert jaccard_ani == mat_val # , (i, j) - - assert "WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons." in c.last_result.err + assert ( + "WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons." 
+ in c.last_result.err + ) def test_compare_max_containment_ani(runtmp): @@ -6960,12 +8351,21 @@ def test_compare_max_containment_ani(runtmp): sigfiles = ["2.fa.sig", "2+63.fa.sig", "47.fa.sig", "63.fa.sig"] testdata_sigs = [utils.get_test_data(c) for c in sigfiles] - c.run_sourmash('compare', '--max-containment', '-k', '31', - '--estimate-ani', '--csv', 'output.csv', *testdata_sigs) + c.run_sourmash( + "compare", + "--max-containment", + "-k", + "31", + "--estimate-ani", + "--csv", + "output.csv", + *testdata_sigs, + ) # load the matrix output - mat, idx_to_sig = _load_compare_matrix_and_sigs(c.output('output.csv'), - testdata_sigs) + mat, idx_to_sig = _load_compare_matrix_and_sigs( + c.output("output.csv"), testdata_sigs + ) # check explicit containment against output of compare for i in range(len(idx_to_sig)): @@ -6987,7 +8387,10 @@ def test_compare_max_containment_ani(runtmp): print(c.last_result.err) print(c.last_result.out) - assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err + assert ( + "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." + in c.last_result.err + ) def test_compare_avg_containment_ani(runtmp): @@ -6997,12 +8400,21 @@ def test_compare_avg_containment_ani(runtmp): sigfiles = ["2.fa.sig", "2+63.fa.sig", "47.fa.sig", "63.fa.sig"] testdata_sigs = [utils.get_test_data(c) for c in sigfiles] - c.run_sourmash('compare', '--avg-containment', '-k', '31', - '--estimate-ani', '--csv', 'output.csv', *testdata_sigs) + c.run_sourmash( + "compare", + "--avg-containment", + "-k", + "31", + "--estimate-ani", + "--csv", + "output.csv", + *testdata_sigs, + ) # load the matrix output - mat, idx_to_sig = _load_compare_matrix_and_sigs(c.output('output.csv'), - testdata_sigs) + mat, idx_to_sig = _load_compare_matrix_and_sigs( + c.output("output.csv"), testdata_sigs + ) # check explicit avg containment against output of compare for i in range(len(idx_to_sig)): @@ -7024,29 +8436,40 @@ def test_compare_avg_containment_ani(runtmp): print(c.last_result.err) print(c.last_result.out) - assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err + assert ( + "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." 
+ in c.last_result.err + ) def test_compare_ANI_require_scaled(runtmp): # check that compare with containment requires scaled sketches c = runtmp - s47 = utils.get_test_data('num/47.fa.sig') - s63 = utils.get_test_data('num/63.fa.sig') + s47 = utils.get_test_data("num/47.fa.sig") + s63 = utils.get_test_data("num/63.fa.sig") # containment and estimate ANI will give this error - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('compare', '--containment', '--estimate-ani', '-k', '31', s47, s63, - fail_ok=True) - assert 'must use scaled signatures with --containment, --max-containment, and --avg-containment' in \ - c.last_result.err + with pytest.raises(SourmashCommandFailed): + c.run_sourmash( + "compare", + "--containment", + "--estimate-ani", + "-k", + "31", + s47, + s63, + fail_ok=True, + ) + assert ( + "must use scaled signatures with --containment, --max-containment, and --avg-containment" + in c.last_result.err + ) assert c.last_result.status != 0 # jaccard + estimate ANI will give this error - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('compare', '--estimate-ani', '-k', '31', s47, s63, - fail_ok=True) + with pytest.raises(SourmashCommandFailed): + c.run_sourmash("compare", "--estimate-ani", "-k", "31", s47, s63, fail_ok=True) - assert 'must use scaled signatures with --estimate-ani' in \ - c.last_result.err + assert "must use scaled signatures with --estimate-ani" in c.last_result.err assert c.last_result.status != 0 diff --git a/tests/test_sourmash_args.py b/tests/test_sourmash_args.py index ae83dc324d..7fcbe2511e 100644 --- a/tests/test_sourmash_args.py +++ b/tests/test_sourmash_args.py @@ -22,9 +22,9 @@ def test_save_signatures_api_none(): # save to sigfile - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) with sourmash_args.SaveSignaturesToLocation(None) as save_sig: @@ -37,12 +37,12 @@ def test_save_signatures_api_none(): def test_save_signatures_to_location_1_sig(runtmp): # save to sigfile.sig - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) - outloc = runtmp.output('foo.sig') + outloc = runtmp.output("foo.sig") with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: print(save_sig) save_sig.add(ss2) @@ -56,9 +56,9 @@ def test_save_signatures_to_location_1_sig(runtmp): def test_save_signatures_to_location_1_stdout(): # save to stdout - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) output_capture = io.StringIO() @@ -77,12 +77,12 @@ def test_save_signatures_to_location_1_stdout(): def test_save_signatures_to_location_1_sig_is_default(runtmp): # save to sigfile.txt - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) - outloc = runtmp.output('foo.txt') + outloc = 
runtmp.output("foo.txt") with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: print(save_sig) save_sig.add(ss2) @@ -96,12 +96,12 @@ def test_save_signatures_to_location_1_sig_is_default(runtmp): def test_save_signatures_to_location_1_sig_gz(runtmp): # save to sigfile.gz - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) - outloc = runtmp.output('foo.sig.gz') + outloc = runtmp.output("foo.sig.gz") with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: print(save_sig) save_sig.add(ss2) @@ -120,12 +120,12 @@ def test_save_signatures_to_location_1_sig_gz(runtmp): def test_save_signatures_to_location_1_zip(runtmp): # save to sigfile.zip - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) - outloc = runtmp.output('foo.zip') + outloc = runtmp.output("foo.zip") with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: print(save_sig) save_sig.add(ss2) @@ -143,33 +143,33 @@ def test_save_signatures_to_location_1_zip(runtmp): def test_save_signatures_to_location_1_zip_bad(runtmp): # try saving to bad sigfile.zip - sig2 = utils.get_test_data('2.fa.sig') - ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') - ss47 = sourmash.load_one_signature(sig47, ksize=31) + sig2 = utils.get_test_data("2.fa.sig") + sourmash.load_one_signature(sig2, ksize=31) + sig47 = utils.get_test_data("47.fa.sig") + sourmash.load_one_signature(sig47, ksize=31) - outloc = runtmp.output('foo.zip') + outloc = runtmp.output("foo.zip") # create bad zip: - with open(outloc, 'wt') as fp: + with open(outloc, "w"): pass # now check for error with pytest.raises(ValueError) as exc: - with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: + with sourmash_args.SaveSignaturesToLocation(outloc): pass - assert 'cannot be opened as a zip file' in str(exc) + assert "cannot be opened as a zip file" in str(exc) def test_save_signatures_to_location_1_zip_dup(runtmp): # save to sigfile.zip - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) - outloc = runtmp.output('foo.zip') + outloc = runtmp.output("foo.zip") with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: print(save_sig) save_sig.add(ss2) @@ -178,11 +178,11 @@ def test_save_signatures_to_location_1_zip_dup(runtmp): # here we have to change the names so the sig content is different; # exact duplicates will not be saved, otherwise. ss2 = ss2.to_mutable() - ss2.name = 'different name for ss2' + ss2.name = "different name for ss2" save_sig.add(ss2) ss47 = ss47.to_mutable() - ss47.name = 'different name for ss47' + ss47.name = "different name for ss47" save_sig.add(ss47) # can we open as a .zip file? @@ -197,13 +197,13 @@ def test_save_signatures_to_location_1_zip_dup(runtmp): def test_save_signatures_to_location_2_zip_add(runtmp): # create sigfile.zip; then, add a new signature. 
- sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) # add only ss2 - outloc = runtmp.output('foo.zip') + outloc = runtmp.output("foo.zip") with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: print(save_sig) save_sig.add(ss2) @@ -230,13 +230,13 @@ def test_save_signatures_to_location_2_zip_add(runtmp): def test_save_signatures_to_location_2_zip_add_dup(runtmp): # create sigfile.zip; then, add a new signature, plus a ~duplicate. - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) # add only ss2 - outloc = runtmp.output('foo.zip') + outloc = runtmp.output("foo.zip") with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: print(save_sig) save_sig.add(ss2) @@ -257,8 +257,9 @@ def test_save_signatures_to_location_2_zip_add_dup(runtmp): # add ss2; here we have to change the names so the sig content is # different exact duplicates will not be saved, otherwise. import copy + ss2copy = ss2.to_mutable() - ss2copy.name = 'different name for ss2' + ss2copy.name = "different name for ss2" save_sig.add(ss2copy) # updated file should contain all three. @@ -271,15 +272,15 @@ def test_save_signatures_to_location_2_zip_add_dup(runtmp): def test_save_signatures_to_location_3_zip_add_fail(runtmp): # create sigfile.zip using zipfile, then try to add to it (& fail) - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') - ss47 = sourmash.load_one_signature(sig47, ksize=31) + sig47 = utils.get_test_data("47.fa.sig") + sourmash.load_one_signature(sig47, ksize=31) # add only ss2, using zipfile API - outloc = runtmp.output('foo.zip') - with zipfile.ZipFile(outloc, 'x') as zf: - with zf.open('xyz.sig', 'w') as fp: + outloc = runtmp.output("foo.zip") + with zipfile.ZipFile(outloc, "x") as zf: + with zf.open("xyz.sig", "w") as fp: sourmash.save_signatures([ss2], fp=fp, compression=1) # verify it can be loaded, yada yada @@ -289,28 +290,29 @@ def test_save_signatures_to_location_3_zip_add_fail(runtmp): # now, try to open existing file with SaveSignaturesToLocation... with pytest.raises(ValueError) as exc: - with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: + with sourmash_args.SaveSignaturesToLocation(outloc): pass - assert 'Cannot add to existing zipfile' in str(exc) + assert "Cannot add to existing zipfile" in str(exc) def test_save_signatures_to_location_3_zip_add_with_manifest(runtmp): # create sigfile.zip using zipfile, then try to add to it (& fail) - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) # add only ss2, using zipfile API; add manifest manually. 
- outloc = runtmp.output('foo.zip') - with zipfile.ZipFile(outloc, 'x') as zf: - with zf.open('xyz.sig', 'w') as fp: + outloc = runtmp.output("foo.zip") + with zipfile.ZipFile(outloc, "x") as zf: + with zf.open("xyz.sig", "w") as fp: sourmash.save_signatures([ss2], fp=fp, compression=1) # make a manifest row... - row = manifest.CollectionManifest.make_manifest_row(ss2, 'xyz.sig', - include_signature=False) + row = manifest.CollectionManifest.make_manifest_row( + ss2, "xyz.sig", include_signature=False + ) # construct & save manifest mf = manifest.CollectionManifest([row]) @@ -320,7 +322,7 @@ def test_save_signatures_to_location_3_zip_add_with_manifest(runtmp): mf.write_to_csv(manifest_fp, write_header=True) manifest_data = manifest_fp.getvalue().encode("utf-8") - with zf.open(mf_name, 'w') as fp: + with zf.open(mf_name, "w") as fp: fp.write(manifest_data) # fini! made our artisanal hand-crafted zipfile. Now... @@ -345,12 +347,12 @@ def test_save_signatures_to_location_3_zip_add_with_manifest(runtmp): def test_save_signatures_to_location_1_dirout(runtmp): # save to sigout/ (directory) - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) - outloc = runtmp.output('sigout/') + outloc = runtmp.output("sigout/") with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: print(save_sig) save_sig.add(ss2) @@ -366,12 +368,12 @@ def test_save_signatures_to_location_1_dirout(runtmp): def test_save_signatures_to_location_1_dirout_bug_2751(runtmp): # check for 2x compressed sig files - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) - outloc = runtmp.output('sigout/') + outloc = runtmp.output("sigout/") with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: print(save_sig) save_sig.add(ss2) @@ -380,7 +382,7 @@ def test_save_signatures_to_location_1_dirout_bug_2751(runtmp): assert os.path.isdir(outloc) print(os.listdir(outloc)) - outloc2 = runtmp.output('sigout/09a08691ce52952152f0e866a59f6261.sig.gz') + outloc2 = runtmp.output("sigout/09a08691ce52952152f0e866a59f6261.sig.gz") with gzip.open(outloc2, "r") as fp: data = fp.read() print(data) @@ -389,12 +391,12 @@ def test_save_signatures_to_location_1_dirout_bug_2751(runtmp): def test_save_signatures_to_location_1_dirout_duplicate(runtmp): # save to sigout/ (directory) - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) - outloc = runtmp.output('sigout/') + outloc = runtmp.output("sigout/") with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: print(save_sig) save_sig.add(ss2) @@ -411,8 +413,8 @@ def test_save_signatures_to_location_1_dirout_duplicate(runtmp): def test_load_empty_zipfile(runtmp): - outloc = runtmp.output('empty.zip') - with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: + outloc = runtmp.output("empty.zip") + with sourmash_args.SaveSignaturesToLocation(outloc): pass sigiter = sourmash.load_file_as_signatures(outloc) @@ -422,15 +424,14 @@ def 
test_load_empty_zipfile(runtmp): def test_load_many_sigs_empty_file(runtmp): # make sure load_many_signatures behaves properly on empty file outloc = runtmp.output("empty.sig") - with open(outloc, "wt") as fp: + with open(outloc, "w"): pass progress = sourmash_args.SignatureLoadingProgress() with contextlib.redirect_stderr(io.StringIO()) as errfp: - with pytest.raises(SystemExit) as exc: - for ss, sigloc in sourmash_args.load_many_signatures([outloc], - progress): + with pytest.raises(SystemExit): + for ss, sigloc in sourmash_args.load_many_signatures([outloc], progress): pass err = errfp.getvalue() @@ -442,15 +443,15 @@ def test_load_many_sigs_empty_file(runtmp): def test_load_many_sigs_empty_file_force(runtmp): # make sure load_many_signatures behaves properly on empty file w/force outloc = runtmp.output("empty.sig") - with open(outloc, "wt") as fp: + with open(outloc, "w"): pass progress = sourmash_args.SignatureLoadingProgress() with contextlib.redirect_stderr(io.StringIO()) as errfp: - for ss, sigloc in sourmash_args.load_many_signatures([outloc], - progress, - force=True): + for ss, sigloc in sourmash_args.load_many_signatures( + [outloc], progress, force=True + ): pass err = errfp.getvalue() @@ -461,7 +462,7 @@ def test_load_many_sigs_empty_file_force(runtmp): def test_get_manifest_1(): # basic get_manifest retrieves a manifest - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") idx = sourmash.load_file_as_index(sig47) manifest = sourmash_args.get_manifest(idx) @@ -470,18 +471,18 @@ def test_get_manifest_1(): def test_get_manifest_2_cannot_build(): # test what happens when get_manifest cannot build manifest - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47) idx = LinearIndex([ss47]) - with pytest.raises(SystemExit) as exc: - m = sourmash_args.get_manifest(idx) + with pytest.raises(SystemExit): + sourmash_args.get_manifest(idx) def test_get_manifest_2_cannot_buildno_require(): # test what happens when get_manifest cannot build manifest - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47) idx = LinearIndex([ss47]) @@ -493,11 +494,12 @@ def test_get_manifest_2_cannot_buildno_require(): def test_get_manifest_3_build(): # check that manifest is building - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47) class FakeIndex(LinearIndex): was_called = 0 + def _signatures_with_internal(self): self.was_called = 1 return [(ss47, "fakeiloc")] @@ -510,12 +512,12 @@ def _signatures_with_internal(self): print(m) assert len(m) == 1 - assert m.rows[0]['internal_location'] == "fakeiloc" + assert m.rows[0]["internal_location"] == "fakeiloc" def test_get_manifest_3_build_2(): # check that manifest is building, but only when asked - sig47 = utils.get_test_data('47.fa.sig') + sig47 = utils.get_test_data("47.fa.sig") ss47 = sourmash.load_one_signature(sig47) class FakeIndex(LinearIndex): @@ -546,7 +548,7 @@ def _signatures_with_internal(self): assert m == m3 -class FakeArgs(object): +class FakeArgs: picklist = None include_db_pattern = None exclude_db_pattern = None @@ -567,63 +569,63 @@ def test_pattern_1(): # test just --include-pattern handling args = FakeArgs() args.picklist = None - args.include_db_pattern = 'foo' + args.include_db_pattern = "foo" args.exclude_db_pattern = None pattern_search = 
sourmash_args.load_include_exclude_db_patterns(args)
-    assert pattern_search(['foo', 'bar', 'baz'])
-    assert not pattern_search(['bar', 'bif'])
+    assert pattern_search(["foo", "bar", "baz"])
+    assert not pattern_search(["bar", "bif"])


 def test_pattern_2():
     # test just --exclude-pattern handling
     args = FakeArgs()
     args.picklist = None
-    args.exclude_db_pattern = 'foo'
+    args.exclude_db_pattern = "foo"
     args.include_db_pattern = None

     pattern_search = sourmash_args.load_include_exclude_db_patterns(args)

-    assert not pattern_search(['foo', 'bar', 'baz'])
-    assert pattern_search(['bar', 'baz', 'bif'])
+    assert not pattern_search(["foo", "bar", "baz"])
+    assert pattern_search(["bar", "baz", "bif"])


 def test_pattern_3():
     # test with --picklist and --exclude: should fail
     args = FakeArgs()
     args.picklist = True
-    args.exclude_db_pattern = 'foo'
+    args.exclude_db_pattern = "foo"
     args.include_db_pattern = None

     with pytest.raises(SystemExit):
-        pattern_search = sourmash_args.load_include_exclude_db_patterns(args)
+        sourmash_args.load_include_exclude_db_patterns(args)


 def test_pattern_4():
     # test with --picklist and --include: should fail
     args = FakeArgs()
     args.picklist = True
-    args.include_db_pattern = 'foo'
+    args.include_db_pattern = "foo"
     args.exclude_db_pattern = None

     with pytest.raises(SystemExit):
-        pattern_search = sourmash_args.load_include_exclude_db_patterns(args)
+        sourmash_args.load_include_exclude_db_patterns(args)


 def test_pattern_5():
     # test with --include and --exclude: should fail
     args = FakeArgs()
     args.picklist = None
-    args.exclude_db_pattern = 'foo'
-    args.include_db_pattern = 'bar'
+    args.exclude_db_pattern = "foo"
+    args.include_db_pattern = "bar"

     with pytest.raises(SystemExit):
-        pattern_search = sourmash_args.load_include_exclude_db_patterns(args)
+        sourmash_args.load_include_exclude_db_patterns(args)


 def test_fileinput_csv_1_plain():
     # test basic CSV input
-    testfile = utils.get_test_data('tax/test.taxonomy.csv')
+    testfile = utils.get_test_data("tax/test.taxonomy.csv")

     with sourmash_args.FileInputCSV(testfile) as r:
         rows = list(r)
@@ -633,21 +635,21 @@ def test_fileinput_csv_1_plain():

 def test_fileinput_csv_1_no_such_file(runtmp):
     # test fail to load file
-    noexistfile = runtmp.output('does-not-exist.csv')
+    noexistfile = runtmp.output("does-not-exist.csv")

     with pytest.raises(FileNotFoundError):
-        with sourmash_args.FileInputCSV(noexistfile) as r:
+        with sourmash_args.FileInputCSV(noexistfile):
             pass


 def test_fileinput_csv_2_gz(runtmp):
     # test basic CSV input from gz file
-    testfile = utils.get_test_data('tax/test.taxonomy.csv')
-    gzfile = runtmp.output('test.csv.gz')
+    testfile = utils.get_test_data("tax/test.taxonomy.csv")
+    gzfile = runtmp.output("test.csv.gz")

-    with gzip.open(gzfile, 'wt') as outfp:
-        with open(testfile, 'rt', newline='') as infp:
+    with gzip.open(gzfile, "wt") as outfp:
+        with open(testfile, newline="") as infp:
             outfp.write(infp.read())

     with sourmash_args.FileInputCSV(gzfile) as r:
@@ -658,42 +660,42 @@ def test_fileinput_csv_2_gz(runtmp):

 def test_fileinput_csv_2_gz_not_csv(runtmp):
     # test basic CSV input from gz file that's not CSV - works
-    gzfile = runtmp.output('test.csv.gz')
+    gzfile = runtmp.output("test.csv.gz")

-    with gzip.open(gzfile, 'wt') as outfp:
+    with gzip.open(gzfile, "wt") as outfp:
         outfp.write("hello world!")

     with sourmash_args.FileInputCSV(gzfile) as r:
-        assert r.fieldnames == ['hello world!']
+        assert r.fieldnames == ["hello world!"]


 def test_fileinput_csv_2_gz_bad_version_header(runtmp):
     # test basic CSV input from gz file with bad version header
     # currently this works; not clear to me how it should fail :grin:
-    gzfile = runtmp.output('test.csv.gz')
+    gzfile = runtmp.output("test.csv.gz")

-    with gzip.open(gzfile, 'wt') as outfp:
+    with gzip.open(gzfile, "wt") as outfp:
         outfp.write("# excelsior\nhello world!")

     with sourmash_args.FileInputCSV(gzfile) as r:
-        assert r.fieldnames == ['hello world!']
+        assert r.fieldnames == ["hello world!"]
         print(r.version_info)
-        assert r.version_info == ['excelsior']
+        assert r.version_info == ["excelsior"]


 def test_fileinput_csv_2_zip(runtmp):
     # test CSV input from zip file, with component filename
-    testfile = utils.get_test_data('tax/test.taxonomy.csv')
-    zf_file = runtmp.output('test.zip')
+    testfile = utils.get_test_data("tax/test.taxonomy.csv")
+    zf_file = runtmp.output("test.zip")

-    with zipfile.ZipFile(zf_file, 'w') as outzip:
-        with open(testfile, 'rb') as infp:
-            with outzip.open('XYZ.csv', 'w') as outfp:
+    with zipfile.ZipFile(zf_file, "w") as outzip:
+        with open(testfile, "rb") as infp:
+            with outzip.open("XYZ.csv", "w") as outfp:
                 outfp.write(infp.read())

-    with sourmash_args.FileInputCSV(zf_file, default_csv_name='XYZ.csv') as r:
+    with sourmash_args.FileInputCSV(zf_file, default_csv_name="XYZ.csv") as r:
         rows = list(r)
         assert len(rows) == 6
         print(rows)
@@ -702,20 +704,21 @@ def test_fileinput_csv_2_zip(runtmp):
 def test_fileinput_csv_3_load_manifest():
     # test loading a manifest from a zipfile collection, using
     # FileInputCSV.
-    testfile = utils.get_test_data('prot/all.zip')
-
-    with sourmash_args.FileInputCSV(testfile, default_csv_name='SOURMASH-MANIFEST.csv') as r:
+    testfile = utils.get_test_data("prot/all.zip")

+    with sourmash_args.FileInputCSV(
+        testfile, default_csv_name="SOURMASH-MANIFEST.csv"
+    ) as r:
         rows = list(r)
         assert len(rows) == 8

-        assert r.version_info == ['SOURMASH-MANIFEST-VERSION', '1.0']
+        assert r.version_info == ["SOURMASH-MANIFEST-VERSION", "1.0"]


 def test_fileinput_csv_3_load_manifest_no_default():
     # test loading a manifest from a zipfile collection, using
     # FileInputCSV, but with no default_csv_name - should fail
-    testfile = utils.get_test_data('prot/all.zip')
+    testfile = utils.get_test_data("prot/all.zip")

     with pytest.raises(csv.Error):
         with sourmash_args.FileInputCSV(testfile) as r:
@@ -725,72 +728,71 @@ def test_fileinput_csv_3_load_manifest_no_default():

 def test_fileinput_csv_3_load_manifest_zipfile_obj():
     # test loading a manifest from an open zipfile obj, using
     # FileInputCSV.
-    testfile = utils.get_test_data('prot/all.zip')
+    testfile = utils.get_test_data("prot/all.zip")

     with zipfile.ZipFile(testfile, "r") as zf:
-        with sourmash_args.FileInputCSV(testfile,
-                                        default_csv_name='SOURMASH-MANIFEST.csv',
-                                        zipfile_obj=zf) as r:
+        with sourmash_args.FileInputCSV(
+            testfile, default_csv_name="SOURMASH-MANIFEST.csv", zipfile_obj=zf
+        ) as r:
             rows = list(r)
             assert len(rows) == 8

-            assert r.version_info == ['SOURMASH-MANIFEST-VERSION', '1.0']
+            assert r.version_info == ["SOURMASH-MANIFEST-VERSION", "1.0"]


 def test_fileinput_csv_3_load_manifest_zipfile_obj_no_defualt():
     # test loading a manifest from an open zipfile obj, using
     # FileInputCSV, but with no default csv name => should fail.
-    testfile = utils.get_test_data('prot/all.zip')
+    testfile = utils.get_test_data("prot/all.zip")

     with zipfile.ZipFile(testfile, "r") as zf:
         with pytest.raises(ValueError):
-            with sourmash_args.FileInputCSV(testfile,
-                                            zipfile_obj=zf) as r:
+            with sourmash_args.FileInputCSV(testfile, zipfile_obj=zf):
                 pass


 def test_fileoutput_csv_1(runtmp):
     # test basic behavior
-    outfile = runtmp.output('xxx.csv')
+    outfile = runtmp.output("xxx.csv")
     with sourmash_args.FileOutputCSV(outfile) as fp:
         w = csv.writer(fp)
-        w.writerow(['a', 'b', 'c'])
-        w.writerow(['x', 'y', 'z'])
+        w.writerow(["a", "b", "c"])
+        w.writerow(["x", "y", "z"])

     with open(outfile, newline="") as fp:
         r = csv.DictReader(fp)
         rows = list(r)
         assert len(rows) == 1
         row = rows[0]
-        assert row['a'] == 'x'
-        assert row['b'] == 'y'
-        assert row['c'] == 'z'
+        assert row["a"] == "x"
+        assert row["b"] == "y"
+        assert row["c"] == "z"


 def test_fileoutput_csv_1_gz(runtmp):
     # test basic behavior => gz
-    outfile = runtmp.output('xxx.csv.gz')
+    outfile = runtmp.output("xxx.csv.gz")
     with sourmash_args.FileOutputCSV(outfile) as fp:
         w = csv.writer(fp)
-        w.writerow(['a', 'b', 'c'])
-        w.writerow(['x', 'y', 'z'])
+        w.writerow(["a", "b", "c"])
+        w.writerow(["x", "y", "z"])

-    with gzip.open(outfile, 'rt') as fp:
+    with gzip.open(outfile, "rt") as fp:
         r = csv.DictReader(fp)
         rows = list(r)
         assert len(rows) == 1
         row = rows[0]
-        assert row['a'] == 'x'
-        assert row['b'] == 'y'
-        assert row['c'] == 'z'
+        assert row["a"] == "x"
+        assert row["b"] == "y"
+        assert row["c"] == "z"


 def test_fileoutput_csv_2_stdout():
     # test '-' and 'None' go to sys.stdout
-    with sourmash_args.FileOutputCSV('-') as fp:
+    with sourmash_args.FileOutputCSV("-") as fp:
         assert fp == sys.stdout

     with sourmash_args.FileOutputCSV(None) as fp:
@@ -802,14 +804,14 @@ def test_add_ksize_arg_no_default():
     p = argparse.ArgumentParser()
     add_ksize_arg(p)
     args = p.parse_args()
-    assert args.ksize == None
+    assert args.ksize is None


 def test_add_ksize_arg_no_default_specify():
     # test behavior of cli.utils.add_ksize_arg
     p = argparse.ArgumentParser()
     add_ksize_arg(p)
-    args = p.parse_args(['-k', '21'])
+    args = p.parse_args(["-k", "21"])
     assert args.ksize == 21

@@ -825,17 +827,17 @@ def test_add_ksize_arg_default_31_specify():
     # test behavior of cli.utils.add_ksize_arg
     p = argparse.ArgumentParser()
     add_ksize_arg(p, default=31)
-    args = p.parse_args(['-k', '21'])
+    args = p.parse_args(["-k", "21"])
     assert args.ksize == 21


 def test_bug_2370(runtmp):
     # bug - manifest loading code does not catch gzip.BadGzipFile
-    sigfile = utils.get_test_data('63.fa.sig')
+    sigfile = utils.get_test_data("63.fa.sig")

     # copy sigfile over to a .gz file without compressing it -
-    shutil.copyfile(sigfile, runtmp.output('not_really_gzipped.gz'))
+    shutil.copyfile(sigfile, runtmp.output("not_really_gzipped.gz"))

     # try running sourmash_args.load_file_as_index
-    #runtmp.sourmash('sig', 'describe', runtmp.output('not_really_gzipped.gz'))
-    sourmash_args.load_file_as_index(runtmp.output('not_really_gzipped.gz'))
+    # runtmp.sourmash('sig', 'describe', runtmp.output('not_really_gzipped.gz'))
+    sourmash_args.load_file_as_index(runtmp.output("not_really_gzipped.gz"))
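[The tests above pin down a small contract for FileInputCSV: it accepts plain, gzipped, or zipped CSV input, stashes an optional '#'-prefixed header line into version_info, and otherwise behaves like csv.DictReader — raising FileNotFoundError for missing files and csv.Error/ValueError when a zip member cannot be resolved. For orientation only, here is a minimal sketch of that contract; this is not sourmash's implementation, and the gzip detection and header parsing below are simplified assumptions.]

import csv
import gzip


class MiniFileInputCSV:
    """Sketch: gzip-aware CSV reader with an optional '#'-prefixed version header."""

    def __init__(self, filename):
        self.filename = filename
        self.version_info = []

    def __enter__(self):
        # try gzip first; fall back to plain text if the magic bytes don't match
        fp = gzip.open(self.filename, "rt", newline="")
        try:
            fp.read(1)
            fp.seek(0)
        except gzip.BadGzipFile:
            fp.close()
            fp = open(self.filename, newline="")
        self.fp = fp

        # capture '# SOURMASH-MANIFEST-VERSION,1.0' style headers into version_info
        pos = fp.tell()
        first = fp.readline()
        if first.startswith("#"):
            self.version_info = first[1:].strip().split(",")
        else:
            fp.seek(pos)

        self.reader = csv.DictReader(fp)
        return self

    def __exit__(self, *exc):
        self.fp.close()
        return False

    @property
    def fieldnames(self):
        return self.reader.fieldnames

    def __iter__(self):
        return iter(self.reader)

[Against the gz tests above, this sketch would report version_info == ["excelsior"] and fieldnames == ["hello world!"] for the "# excelsior\nhello world!" input.]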
diff --git a/tests/test_sourmash_compute.py b/tests/test_sourmash_compute.py
index cb3c48fc32..f6f6370785 100644
--- a/tests/test_sourmash_compute.py
+++ b/tests/test_sourmash_compute.py
@@ -27,155 +27,231 @@ def test_do_sourmash_compute():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '31', testdata1],
-                                           in_directory=location)
+        testdata1 = utils.get_test_data("short.fa")
+        status, out, err = utils.runscript(
+            "sourmash", ["compute", "-k", "31", testdata1], in_directory=location
+        )

-        sigfile = os.path.join(location, 'short.fa.sig')
+        sigfile = os.path.join(location, "short.fa.sig")
         assert os.path.exists(sigfile)

         sig = next(signature.load_signatures(sigfile))
-        assert str(sig).endswith('short.fa')
+        assert str(sig).endswith("short.fa")


 def test_do_sourmash_compute_check_num_bounds_negative(runtmp):
-    c=runtmp
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
-    testdata3 = utils.get_test_data('short3.fa')
-    sigfile = c.output('short.fa.sig')
+    c = runtmp
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")
+    testdata3 = utils.get_test_data("short3.fa")
+    sigfile = c.output("short.fa.sig")

     with pytest.raises(SourmashCommandFailed):
-        c.run_sourmash('compute', '-k', '31', '--num-hashes', '-5', '-o', sigfile, '--merge', '"name"', testdata1, testdata2, testdata3)
-
+        c.run_sourmash(
+            "compute",
+            "-k",
+            "31",
+            "--num-hashes",
+            "-5",
+            "-o",
+            sigfile,
+            "--merge",
+            '"name"',
+            testdata1,
+            testdata2,
+            testdata3,
+        )
+
     assert "ERROR: num value must be positive" in c.last_result.err


 def test_do_sourmash_compute_check_num_bounds_less_than_minimum(runtmp):
-    c=runtmp
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
-    testdata3 = utils.get_test_data('short3.fa')
-    sigfile = c.output('short.fa.sig')
-
-    c.run_sourmash('compute', '-k', '31', '--num-hashes', '25', '-o', sigfile, '--merge', '"name"', testdata1, testdata2, testdata3)
-
+    c = runtmp
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")
+    testdata3 = utils.get_test_data("short3.fa")
+    sigfile = c.output("short.fa.sig")
+
+    c.run_sourmash(
+        "compute",
+        "-k",
+        "31",
+        "--num-hashes",
+        "25",
+        "-o",
+        sigfile,
+        "--merge",
+        '"name"',
+        testdata1,
+        testdata2,
+        testdata3,
+    )
+
     assert "WARNING: num value should be >= 50. Continuing anyway." in c.last_result.err


 def test_do_sourmash_compute_check_num_bounds_more_than_maximum(runtmp):
-    c=runtmp
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
-    testdata3 = utils.get_test_data('short3.fa')
-    sigfile = c.output('short.fa.sig')
-
-    c.run_sourmash('compute', '-k', '31', '--num-hashes', '100000', '-o', sigfile, '--merge', '"name"', testdata1, testdata2, testdata3)
-
-    assert "WARNING: num value should be <= 50000. Continuing anyway." in c.last_result.err
+    c = runtmp
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")
+    testdata3 = utils.get_test_data("short3.fa")
+    sigfile = c.output("short.fa.sig")
+
+    c.run_sourmash(
+        "compute",
+        "-k",
+        "31",
+        "--num-hashes",
+        "100000",
+        "-o",
+        sigfile,
+        "--merge",
+        '"name"',
+        testdata1,
+        testdata2,
+        testdata3,
+    )
+
+    assert (
+        "WARNING: num value should be <= 50000. Continuing anyway."
+        in c.last_result.err
+    )


 @utils.in_tempdir
 def test_do_sourmash_compute_outdir(c):
-    testdata1 = utils.get_test_data('short.fa')
-    status, out, err = utils.runscript('sourmash',
-                                       ['compute', '-k', '31', testdata1,
-                                        '--outdir', c.location])
+    testdata1 = utils.get_test_data("short.fa")
+    status, out, err = utils.runscript(
+        "sourmash", ["compute", "-k", "31", testdata1, "--outdir", c.location]
+    )

-
-    sigfile = os.path.join(c.location, 'short.fa.sig')
+    sigfile = os.path.join(c.location, "short.fa.sig")
     assert os.path.exists(sigfile)

     sig = next(signature.load_signatures(sigfile))
-    assert str(sig).endswith('short.fa')
+    assert str(sig).endswith("short.fa")


 def test_do_sourmash_compute_output_valid_file():
-    """ Trigger bug #123 """
+    """Trigger bug #123"""
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        testdata2 = utils.get_test_data('short2.fa')
-        testdata3 = utils.get_test_data('short3.fa')
-        sigfile = os.path.join(location, 'short.fa.sig')
+        testdata1 = utils.get_test_data("short.fa")
+        testdata2 = utils.get_test_data("short2.fa")
+        testdata3 = utils.get_test_data("short3.fa")
+        sigfile = os.path.join(location, "short.fa.sig")

-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '31', '-o', sigfile,
-                                            testdata1,
-                                            testdata2, testdata3],
-                                           in_directory=location)
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "31", "-o", sigfile, testdata1, testdata2, testdata3],
+            in_directory=location,
+        )

         assert os.path.exists(sigfile)
-        assert not out # stdout should be empty
+        assert not out  # stdout should be empty

         # is it valid json?
-        with open(sigfile, 'r') as f:
+        with open(sigfile) as f:
             data = json.load(f)

-        filesigs = [sig['filename'] for sig in data]
-        assert all(testdata in filesigs
-                   for testdata in (testdata1, testdata2, testdata3))
+        filesigs = [sig["filename"] for sig in data]
+        assert all(
+            testdata in filesigs for testdata in (testdata1, testdata2, testdata3)
+        )


 def test_do_sourmash_compute_output_stdout_valid():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        testdata2 = utils.get_test_data('short2.fa')
-        testdata3 = utils.get_test_data('short3.fa')
+        testdata1 = utils.get_test_data("short.fa")
+        testdata2 = utils.get_test_data("short2.fa")
+        testdata3 = utils.get_test_data("short3.fa")

-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '31', '-o', '-',
-                                            testdata1,
-                                            testdata2, testdata3],
-                                           in_directory=location)
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "31", "-o", "-", testdata1, testdata2, testdata3],
+            in_directory=location,
+        )

         # is it valid json?
         data = json.loads(out)
-        filesigs = [sig['filename'] for sig in data]
-        assert all(testdata in filesigs
-                   for testdata in (testdata1, testdata2, testdata3))
+        filesigs = [sig["filename"] for sig in data]
+        assert all(
+            testdata in filesigs for testdata in (testdata1, testdata2, testdata3)
+        )


 @utils.in_tempdir
 def test_do_sourmash_compute_output_and_name_valid_file(c):
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
-    testdata3 = utils.get_test_data('short3.fa')
-    sigfile = c.output('short.fa.sig')
-
-    c.run_sourmash('compute', '-k', '31', '-o', sigfile, '--merge', '"name"', testdata1, testdata2, testdata3)
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")
+    testdata3 = utils.get_test_data("short3.fa")
+    sigfile = c.output("short.fa.sig")
+
+    c.run_sourmash(
+        "compute",
+        "-k",
+        "31",
+        "-o",
+        sigfile,
+        "--merge",
+        '"name"',
+        testdata1,
+        testdata2,
+        testdata3,
+    )

     assert os.path.exists(sigfile)
-    assert 'calculated 1 signature for 4 sequences taken from 3 files' in c.last_result.err
+    assert (
+        "calculated 1 signature for 4 sequences taken from 3 files" in c.last_result.err
+    )

     # is it valid json?
-    with open(sigfile, 'r') as f:
+    with open(sigfile) as f:
         data = json.load(f)

     assert len(data) == 1

-    sigfile_merged = c.output('short.all.fa.sig')
-    c.run_sourmash('compute', '-k', '31', '-o', sigfile_merged, '--merge', '"name"', testdata1, testdata2, testdata3)
-
-    with open(sigfile_merged, 'r') as f:
+    sigfile_merged = c.output("short.all.fa.sig")
+    c.run_sourmash(
+        "compute",
+        "-k",
+        "31",
+        "-o",
+        sigfile_merged,
+        "--merge",
+        '"name"',
+        testdata1,
+        testdata2,
+        testdata3,
+    )
+
+    with open(sigfile_merged) as f:
         data_merged = json.load(f)

-    assert data[0]['signatures'][0]['mins'] == data_merged[0]['signatures'][0]['mins']
+    assert data[0]["signatures"][0]["mins"] == data_merged[0]["signatures"][0]["mins"]


 @utils.in_tempdir
 def test_do_sourmash_compute_output_and_name_valid_file_outdir(c):
-    testdata1 = utils.get_test_data('short.fa')
-    testdata2 = utils.get_test_data('short2.fa')
-    testdata3 = utils.get_test_data('short3.fa')
-    sigfile = os.path.join(c.location, 'short.fa.sig')
+    testdata1 = utils.get_test_data("short.fa")
+    testdata2 = utils.get_test_data("short2.fa")
+    testdata3 = utils.get_test_data("short3.fa")
+    sigfile = os.path.join(c.location, "short.fa.sig")

-    with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('compute', '-k', '31', '-o', sigfile,
-                       '--merge', '"name"',
-                       testdata1, testdata2, testdata3,
-                       '--outdir', c.location)
+    with pytest.raises(SourmashCommandFailed):
+        c.run_sourmash(
+            "compute",
+            "-k",
+            "31",
+            "-o",
+            sigfile,
+            "--merge",
+            '"name"',
+            testdata1,
+            testdata2,
+            testdata3,
+            "--outdir",
+            c.location,
+        )

     errmsg = c.last_result.err
     assert "ERROR: --output-dir doesn't make sense with -o/--output" in errmsg
@@ -183,103 +259,109 @@ def test_do_sourmash_compute_output_and_name_valid_file_outdir(c):

 def test_do_sourmash_compute_singleton():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '31', '--singleton',
-                                            testdata1],
-                                           in_directory=location)
-
-        sigfile = os.path.join(location, 'short.fa.sig')
+        testdata1 = utils.get_test_data("short.fa")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "31", "--singleton", testdata1],
+            in_directory=location,
+        )
+
+        sigfile = os.path.join(location, "short.fa.sig")
         assert os.path.exists(sigfile)

         sig = next(signature.load_signatures(sigfile))
-        assert sig.name.endswith('shortName')
+        assert sig.name.endswith("shortName")


 def test_do_sourmash_compute_name():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '31', '--merge', 'foo',
-                                            testdata1, '-o', 'foo.sig'],
-                                           in_directory=location)
-
-        sigfile = os.path.join(location, 'foo.sig')
+        testdata1 = utils.get_test_data("short.fa")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "31", "--merge", "foo", testdata1, "-o", "foo.sig"],
+            in_directory=location,
+        )
+
+        sigfile = os.path.join(location, "foo.sig")
         assert os.path.exists(sigfile)

         sig = next(signature.load_signatures(sigfile))
-        assert sig.name == 'foo'
+        assert sig.name == "foo"

-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '31', '--name', 'foo',
-                                            testdata1, '-o', 'foo2.sig'],
-                                           in_directory=location)
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "31", "--name", "foo", testdata1, "-o", "foo2.sig"],
+            in_directory=location,
+        )

-        sigfile2 = os.path.join(location, 'foo2.sig')
+        sigfile2 = os.path.join(location, "foo2.sig")
         assert os.path.exists(sigfile2)

         sig2 = next(signature.load_signatures(sigfile))
-        assert sig2.name == 'foo'
+        assert sig2.name == "foo"
         assert sig.name == sig2.name


 def test_do_sourmash_compute_name_fail_no_output():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '31', '--merge', 'foo',
-                                            testdata1],
-                                           in_directory=location,
-                                           fail_ok=True)
+        testdata1 = utils.get_test_data("short.fa")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "31", "--merge", "foo", testdata1],
+            in_directory=location,
+            fail_ok=True,
+        )
         assert status == -1


 def test_do_sourmash_compute_merge_fail_no_output():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '31', '--merge', 'foo',
-                                            testdata1],
-                                           in_directory=location,
-                                           fail_ok=True)
+        testdata1 = utils.get_test_data("short.fa")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "31", "--merge", "foo", testdata1],
+            in_directory=location,
+            fail_ok=True,
+        )
         assert status == -1

-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '31', '--name', 'foo',
-                                            testdata1],
-                                           in_directory=location,
-                                           fail_ok=True)
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "31", "--name", "foo", testdata1],
+            in_directory=location,
+            fail_ok=True,
+        )
         assert status == -1


 def test_do_sourmash_compute_name_from_first():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short3.fa')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '31', '--name-from-first',
-                                            testdata1],
-                                           in_directory=location)
-
-        sigfile = os.path.join(location, 'short3.fa.sig')
+        testdata1 = utils.get_test_data("short3.fa")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "31", "--name-from-first", testdata1],
+            in_directory=location,
+        )
+
+        sigfile = os.path.join(location, "short3.fa.sig")
         assert os.path.exists(sigfile)

         sig = next(signature.load_signatures(sigfile))
-        assert sig.name == 'firstname'
+        assert sig.name == "firstname"


 def test_do_sourmash_compute_multik():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '21,31',
-                                            testdata1],
-                                           in_directory=location)
-        outfile = os.path.join(location, 'short.fa.sig')
+        testdata1 = utils.get_test_data("short.fa")
+        status, out, err = utils.runscript(
+            "sourmash", ["compute", "-k", "21,31", testdata1], in_directory=location
+        )
+        outfile = os.path.join(location, "short.fa.sig")
         assert os.path.exists(outfile)

         siglist = list(signature.load_signatures(outfile))
         assert len(siglist) == 2

-        ksizes = set([ x.minhash.ksize for x in siglist ])
+        ksizes = set([x.minhash.ksize for x in siglist])
         assert 21 in ksizes
         assert 31 in ksizes
         assert len(ksizes) == 2
@@ -287,20 +369,20 @@ def test_do_sourmash_compute_multik():

 def test_do_sourmash_compute_multik_with_protein():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '21,30',
-                                            '--protein',
-                                            testdata1],
-                                           in_directory=location)
-        outfile = os.path.join(location, 'short.fa.sig')
+        testdata1 = utils.get_test_data("short.fa")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "21,30", "--protein", testdata1],
+            in_directory=location,
+        )
+        outfile = os.path.join(location, "short.fa.sig")
         assert os.path.exists(outfile)

-        with open(outfile, 'rt') as fp:
+        with open(outfile) as fp:
             sigdata = fp.read()
             siglist = list(signature.load_signatures(sigdata))
             assert len(siglist) == 4

-            ksizes = set([ x.minhash.ksize for x in siglist ])
+            ksizes = set([x.minhash.ksize for x in siglist])
             assert 21 in ksizes
             assert 30 in ksizes
             assert 7 in ksizes
@@ -310,22 +392,24 @@ def test_do_sourmash_compute_multik_with_protein():

 def test_do_sourmash_compute_multik_with_dayhoff():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '21,30',
-                                            '--dayhoff', '--no-dna',
-                                            testdata1],
-                                           in_directory=location)
-        assert 'Computing only Dayhoff-encoded protein (and not nucleotide) ' \
-               'signatures.' in err
-        outfile = os.path.join(location, 'short.fa.sig')
+        testdata1 = utils.get_test_data("short.fa")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "21,30", "--dayhoff", "--no-dna", testdata1],
+            in_directory=location,
+        )
+        assert (
+            "Computing only Dayhoff-encoded protein (and not nucleotide) "
+            "signatures." in err
+        )
+        outfile = os.path.join(location, "short.fa.sig")
         assert os.path.exists(outfile)

-        with open(outfile, 'rt') as fp:
+        with open(outfile) as fp:
             sigdata = fp.read()
             siglist = list(signature.load_signatures(sigdata))
             assert len(siglist) == 2

-            ksizes = set([ x.minhash.ksize for x in siglist ])
+            ksizes = set([x.minhash.ksize for x in siglist])
             assert 7 in ksizes
             assert 10 in ksizes
             assert all(x.minhash.dayhoff for x in siglist)
@@ -334,47 +418,49 @@ def test_do_sourmash_compute_multik_with_dayhoff():

 def test_do_sourmash_compute_multik_with_dayhoff_and_dna():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '21,30',
-                                            '--dayhoff',
-                                            testdata1],
-                                           in_directory=location)
-        outfile = os.path.join(location, 'short.fa.sig')
+        testdata1 = utils.get_test_data("short.fa")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "21,30", "--dayhoff", testdata1],
+            in_directory=location,
+        )
+        outfile = os.path.join(location, "short.fa.sig")
         assert os.path.exists(outfile)

-        with open(outfile, 'rt') as fp:
+        with open(outfile) as fp:
             sigdata = fp.read()
             siglist = list(signature.load_signatures(sigdata))
             assert len(siglist) == 4

-            ksizes = set([ x.minhash.ksize for x in siglist ])
+            ksizes = set([x.minhash.ksize for x in siglist])
             assert 21 in ksizes
             assert 30 in ksizes
             assert 7 in ksizes
             assert 10 in ksizes

-            assert sum(x.minhash.moltype == 'DNA' for x in siglist) == 2
-            assert sum(x.minhash.moltype == 'dayhoff' for x in siglist) == 2
+            assert sum(x.minhash.moltype == "DNA" for x in siglist) == 2
+            assert sum(x.minhash.moltype == "dayhoff" for x in siglist) == 2
             assert len(ksizes) == 4


 def test_do_sourmash_compute_multik_with_hp():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '21,30',
-                                            '--hp', '--no-dna',
-                                            testdata1],
-                                           in_directory=location)
-        assert 'Computing only hp-encoded protein (and not nucleotide) ' \
-               'signatures.' in err
-        outfile = os.path.join(location, 'short.fa.sig')
+        testdata1 = utils.get_test_data("short.fa")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "21,30", "--hp", "--no-dna", testdata1],
+            in_directory=location,
+        )
+        assert (
+            "Computing only hp-encoded protein (and not nucleotide) "
+            "signatures." in err
+        )
+        outfile = os.path.join(location, "short.fa.sig")
         assert os.path.exists(outfile)

-        with open(outfile, 'rt') as fp:
+        with open(outfile) as fp:
             sigdata = fp.read()
             siglist = list(signature.load_signatures(sigdata))
             assert len(siglist) == 2

-            ksizes = set([ x.minhash.ksize for x in siglist ])
+            ksizes = set([x.minhash.ksize for x in siglist])
             assert 7 in ksizes
             assert 10 in ksizes
             assert all(x.minhash.hp for x in siglist)
@@ -383,20 +469,20 @@ def test_do_sourmash_compute_multik_with_hp():

 def test_do_sourmash_compute_multik_with_hp_and_dna():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '21,30',
-                                            '--hp',
-                                            testdata1],
-                                           in_directory=location)
-        outfile = os.path.join(location, 'short.fa.sig')
+        testdata1 = utils.get_test_data("short.fa")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "21,30", "--hp", testdata1],
+            in_directory=location,
+        )
+        outfile = os.path.join(location, "short.fa.sig")
         assert os.path.exists(outfile)

-        with open(outfile, 'rt') as fp:
+        with open(outfile) as fp:
             sigdata = fp.read()
             siglist = list(signature.load_signatures(sigdata))
             assert len(siglist) == 4

-            ksizes = set([ x.minhash.ksize for x in siglist ])
+            ksizes = set([x.minhash.ksize for x in siglist])
             assert 7 in ksizes
             assert 10 in ksizes
             assert 21 in ksizes
@@ -406,99 +492,98 @@ def test_do_sourmash_compute_multik_with_hp_and_dna():

 def test_do_sourmash_compute_multik_with_dayhoff_dna_protein():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '21,30',
-                                            '--dayhoff', '--protein',
-                                            testdata1],
-                                           in_directory=location)
-        outfile = os.path.join(location, 'short.fa.sig')
+        testdata1 = utils.get_test_data("short.fa")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "21,30", "--dayhoff", "--protein", testdata1],
+            in_directory=location,
+        )
+        outfile = os.path.join(location, "short.fa.sig")
         assert os.path.exists(outfile)

-        with open(outfile, 'rt') as fp:
+        with open(outfile) as fp:
             sigdata = fp.read()
             siglist = list(signature.load_signatures(sigdata))
             assert len(siglist) == 6

-            ksizes = set([ x.minhash.ksize for x in siglist ])
+            ksizes = set([x.minhash.ksize for x in siglist])
             assert 21 in ksizes
             assert 30 in ksizes
             assert 7 in ksizes
             assert 10 in ksizes

-            assert sum(x.minhash.moltype == 'DNA' for x in siglist) == 2
-            assert sum(x.minhash.moltype == 'dayhoff' for x in siglist) == 2
-            assert sum(x.minhash.moltype == 'protein' for x in siglist) == 2
+            assert sum(x.minhash.moltype == "DNA" for x in siglist) == 2
+            assert sum(x.minhash.moltype == "dayhoff" for x in siglist) == 2
+            assert sum(x.minhash.moltype == "protein" for x in siglist) == 2
             assert len(ksizes) == 4


 def test_do_sourmash_compute_multik_with_dayhoff_hp_dna_protein():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '21,30',
-                                            '--dayhoff', '--hp', '--protein',
-                                            testdata1],
-                                           in_directory=location)
-        outfile = os.path.join(location, 'short.fa.sig')
+        testdata1 = utils.get_test_data("short.fa")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "21,30", "--dayhoff", "--hp", "--protein", testdata1],
+            in_directory=location,
+        )
+        outfile = os.path.join(location, "short.fa.sig")
         assert os.path.exists(outfile)

-        with open(outfile, 'rt') as fp:
+        with open(outfile) as fp:
             sigdata = fp.read()
             siglist = list(signature.load_signatures(sigdata))
             assert len(siglist) == 8

-            ksizes = set([ x.minhash.ksize for x in siglist ])
+            ksizes = set([x.minhash.ksize for x in siglist])
             assert 7 in ksizes
             assert 10 in ksizes
             assert 21 in ksizes
             assert 30 in ksizes

-            assert sum(x.minhash.moltype == 'DNA' for x in siglist) == 2
-            assert sum(x.minhash.moltype == 'dayhoff' for x in siglist) == 2
-            assert sum(x.minhash.moltype == 'hp' for x in siglist) == 2
+            assert sum(x.minhash.moltype == "DNA" for x in siglist) == 2
+            assert sum(x.minhash.moltype == "dayhoff" for x in siglist) == 2
+            assert sum(x.minhash.moltype == "hp" for x in siglist) == 2
             # 2 = dayhoff, 2 = hp = 4 protein
-            assert sum(x.minhash.moltype == 'protein' for x in siglist) == 2
+            assert sum(x.minhash.moltype == "protein" for x in siglist) == 2
             assert len(ksizes) == 4


 def test_do_sourmash_compute_multik_with_nothing():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '21,31',
-                                            '--no-protein', '--no-dna',
-                                            testdata1],
-                                           in_directory=location,
-                                           fail_ok=True)
-        outfile = os.path.join(location, 'short.fa.sig')
+        testdata1 = utils.get_test_data("short.fa")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "21,31", "--no-protein", "--no-dna", testdata1],
+            in_directory=location,
+            fail_ok=True,
+        )
+        outfile = os.path.join(location, "short.fa.sig")
         assert not os.path.exists(outfile)


 def test_do_sourmash_compute_multik_protein_bad_ksize():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '20,32',
-                                            '--protein', '--no-dna',
-                                            testdata1],
-                                           in_directory=location,
-                                           fail_ok=True)
-        outfile = os.path.join(location, 'short.fa.sig')
+        testdata1 = utils.get_test_data("short.fa")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "20,32", "--protein", "--no-dna", testdata1],
+            in_directory=location,
+            fail_ok=True,
+        )
+        outfile = os.path.join(location, "short.fa.sig")
         assert not os.path.exists(outfile)
-        assert 'protein ksizes must be divisible by 3' in err
+        assert "protein ksizes must be divisible by 3" in err


 @utils.in_tempdir
 def test_do_sourmash_compute_multik_only_protein(c):
     # check sourmash compute with only protein, no nucl
-    testdata1 = utils.get_test_data('short.fa')
-    c.run_sourmash('compute', '-k', '21,30',
-                   '--protein', '--no-dna', testdata1)
-    outfile = os.path.join(c.location, 'short.fa.sig')
+    testdata1 = utils.get_test_data("short.fa")
+    c.run_sourmash("compute", "-k", "21,30", "--protein", "--no-dna", testdata1)
+    outfile = os.path.join(c.location, "short.fa.sig")
     assert os.path.exists(outfile)

-    with open(outfile, 'rt') as fp:
+    with open(outfile) as fp:
         sigdata = fp.read()
         siglist = list(signature.load_signatures(sigdata))
         assert len(siglist) == 2

-        ksizes = set([ x.minhash.ksize for x in siglist ])
+        ksizes = set([x.minhash.ksize for x in siglist])
         assert 7 in ksizes
         assert 10 in ksizes
         assert len(ksizes) == 2
@@ -506,34 +591,40 @@ def test_do_sourmash_compute_multik_only_protein(c):

 def test_do_sourmash_compute_multik_protein_input_bad_ksize():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short-protein.fa')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '20,32',
-                                            '--protein', '--no-dna',
-                                            '--input-is-protein',
-                                            testdata1],
-                                           in_directory=location,
-                                           fail_ok=True)
-        outfile = os.path.join(location, 'short-protein.fa.sig')
+        testdata1 = utils.get_test_data("short-protein.fa")
+        status, out, err = utils.runscript(
+            "sourmash",
+            [
+                "compute",
+                "-k",
+                "20,32",
+                "--protein",
+                "--no-dna",
+                "--input-is-protein",
+                testdata1,
+            ],
+            in_directory=location,
+            fail_ok=True,
+        )
+        os.path.join(location, "short-protein.fa.sig")
         assert status != 0
-        assert 'protein ksizes must be divisible by 3' in err
+        assert "protein ksizes must be divisible by 3" in err


 @utils.in_tempdir
 def test_do_sourmash_compute_multik_only_protein_no_rna(c):
     # test --no-rna as well (otherwise identical to previous test)
-    testdata1 = utils.get_test_data('short.fa')
+    testdata1 = utils.get_test_data("short.fa")

-    c.run_sourmash('compute', '-k', '21,30',
-                   '--protein', '--no-rna', testdata1)
-    outfile = os.path.join(c.location, 'short.fa.sig')
+    c.run_sourmash("compute", "-k", "21,30", "--protein", "--no-rna", testdata1)
+    outfile = os.path.join(c.location, "short.fa.sig")
     assert os.path.exists(outfile)

-    with open(outfile, 'rt') as fp:
+    with open(outfile) as fp:
         sigdata = fp.read()
         siglist = list(signature.load_signatures(sigdata))
         assert len(siglist) == 2

-        ksizes = set([ x.minhash.ksize for x in siglist ])
+        ksizes = set([x.minhash.ksize for x in siglist])
         assert 7 in ksizes
         assert 10 in ksizes
         assert len(ksizes) == 2
@@ -542,20 +633,20 @@ def test_do_sourmash_compute_multik_only_protein_no_rna(c):

 def test_do_sourmash_compute_protein_bad_sequences():
     """Proper error handling when Ns in dna sequence"""
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.bad.fa')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '21,30',
-                                            '--protein', '--no-dna',
-                                            testdata1],
-                                           in_directory=location)
-        outfile = os.path.join(location, 'short.bad.fa.sig')
+        testdata1 = utils.get_test_data("short.bad.fa")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "21,30", "--protein", "--no-dna", testdata1],
+            in_directory=location,
+        )
+        outfile = os.path.join(location, "short.bad.fa.sig")
         assert os.path.exists(outfile)

-        with open(outfile, 'rt') as fp:
+        with open(outfile) as fp:
         sigdata = fp.read()
         siglist = list(signature.load_signatures(sigdata))
         assert len(siglist) == 2

-        ksizes = set([ x.minhash.ksize for x in siglist ])
+        ksizes = set([x.minhash.ksize for x in siglist])
         assert 7 in ksizes
         assert 10 in ksizes
         assert len(ksizes) == 2
@@ -563,178 +654,176 @@ def test_do_sourmash_compute_protein_bad_sequences():

 def test_do_sourmash_compute_multik_input_is_protein():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('ecoli.faa')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '21,30',
-                                            '--input-is-protein',
-                                            testdata1],
-                                           in_directory=location)
-        outfile = os.path.join(location, 'ecoli.faa.sig')
+        testdata1 = utils.get_test_data("ecoli.faa")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "21,30", "--input-is-protein", testdata1],
+            in_directory=location,
+        )
+        outfile = os.path.join(location, "ecoli.faa.sig")
         assert os.path.exists(outfile)

-        with open(outfile, 'rt') as fp:
+        with open(outfile) as fp:
             sigdata = fp.read()
             siglist = list(signature.load_signatures(sigdata))
             assert len(siglist) == 2

-            ksizes = set([ x.minhash.ksize for x in siglist ])
+            ksizes = set([x.minhash.ksize for x in siglist])
             assert 7 in ksizes
             assert 10 in ksizes
             assert len(ksizes) == 2

-            moltype = set([ x.minhash.moltype == 'protein'
-                            for x in siglist ])
+            moltype = set([x.minhash.moltype == "protein" for x in siglist])
             assert len(moltype) == 1
             assert True in moltype


 def test_do_sourmash_compute_multik_outfile():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        outfile = os.path.join(location, 'FOO.xxx')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '21,31',
-                                            testdata1, '-o', outfile],
-                                           in_directory=location)
+        testdata1 = utils.get_test_data("short.fa")
+        outfile = os.path.join(location, "FOO.xxx")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "21,31", testdata1, "-o", outfile],
+            in_directory=location,
+        )
         assert os.path.exists(outfile)

         siglist = list(signature.load_signatures(outfile))
         assert len(siglist) == 2

-        ksizes = set([ x.minhash.ksize for x in siglist ])
+        ksizes = set([x.minhash.ksize for x in siglist])
         assert 21 in ksizes
         assert 31 in ksizes


 def test_do_sourmash_compute_with_scaled_1():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        outfile = os.path.join(location, 'FOO.xxx')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '21,31',
-                                            '--scaled', '1',
-                                            testdata1, '-o', outfile],
-                                           in_directory=location)
+        testdata1 = utils.get_test_data("short.fa")
+        outfile = os.path.join(location, "FOO.xxx")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "21,31", "--scaled", "1", testdata1, "-o", outfile],
+            in_directory=location,
+        )
         assert os.path.exists(outfile)

         siglist = list(signature.load_signatures(outfile))
         assert len(siglist) == 2

-        scaled_vals = [ x.minhash.scaled for x in siglist ]
+        scaled_vals = [x.minhash.scaled for x in siglist]
         assert len(scaled_vals) == 2
-        assert set(scaled_vals) == { 1 }
+        assert set(scaled_vals) == {1}


 def test_do_sourmash_compute_with_scaled_2():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        outfile = os.path.join(location, 'FOO.xxx')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '21,31',
-                                            '--scaled', '2',
-                                            testdata1, '-o', outfile],
-                                           in_directory=location)
+        testdata1 = utils.get_test_data("short.fa")
+        outfile = os.path.join(location, "FOO.xxx")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "21,31", "--scaled", "2", testdata1, "-o", outfile],
+            in_directory=location,
+        )
         assert os.path.exists(outfile)

         siglist = list(signature.load_signatures(outfile))
         assert len(siglist) == 2

-        max_hashes = [ x.minhash._max_hash for x in siglist ]
+        max_hashes = [x.minhash._max_hash for x in siglist]
         assert len(max_hashes) == 2
-        assert set(max_hashes) == set([ int(2**64 /2.) ])
+        assert set(max_hashes) == set([int(2**64 / 2.0)])


 def test_do_sourmash_compute_with_scaled():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        outfile = os.path.join(location, 'FOO.xxx')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '21,31',
-                                            '--scaled', '100',
-                                            testdata1, '-o', outfile],
-                                           in_directory=location)
+        testdata1 = utils.get_test_data("short.fa")
+        outfile = os.path.join(location, "FOO.xxx")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "21,31", "--scaled", "100", testdata1, "-o", outfile],
+            in_directory=location,
+        )
         assert os.path.exists(outfile)

         siglist = list(signature.load_signatures(outfile))
         assert len(siglist) == 2

-        max_hashes = [ x.minhash._max_hash for x in siglist ]
+        max_hashes = [x.minhash._max_hash for x in siglist]
         assert len(max_hashes) == 2
-        assert set(max_hashes) == set([ int(2**64 /100.) ])
+        assert set(max_hashes) == set([int(2**64 / 100.0)])


 def test_do_sourmash_compute_with_bad_scaled():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        outfile = os.path.join(location, 'FOO.xxx')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '21,31',
-                                            '--scaled', '-1',
-                                            testdata1, '-o', outfile],
-                                           in_directory=location,
-                                           fail_ok=True)
+        testdata1 = utils.get_test_data("short.fa")
+        outfile = os.path.join(location, "FOO.xxx")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "21,31", "--scaled", "-1", testdata1, "-o", outfile],
+            in_directory=location,
+            fail_ok=True,
+        )

         assert status != 0
-        assert '--scaled value must be >= 1' in err
+        assert "--scaled value must be >= 1" in err

-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '21,31',
-                                            '--scaled', '1000.5',
-                                            testdata1, '-o', outfile],
-                                           in_directory=location,
-                                           fail_ok=True)
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "21,31", "--scaled", "1000.5", testdata1, "-o", outfile],
+            in_directory=location,
+            fail_ok=True,
+        )

         assert status != 0
-        assert '--scaled value must be integer value' in err
+        assert "--scaled value must be integer value" in err

-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '21,31',
-                                            '--scaled', '1e9',
-                                            testdata1, '-o', outfile],
-                                           in_directory=location)
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "21,31", "--scaled", "1e9", testdata1, "-o", outfile],
+            in_directory=location,
+        )

         assert status == 0
-        assert 'WARNING: scaled value is nonsensical!?' in err
+        assert "WARNING: scaled value is nonsensical!?" in err


 def test_do_sourmash_compute_with_seed():
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('short.fa')
-        outfile = os.path.join(location, 'FOO.xxx')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '21,31',
-                                            '--seed', '43',
-                                            testdata1, '-o', outfile],
-                                           in_directory=location)
+        testdata1 = utils.get_test_data("short.fa")
+        outfile = os.path.join(location, "FOO.xxx")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "21,31", "--seed", "43", testdata1, "-o", outfile],
+            in_directory=location,
+        )
         assert os.path.exists(outfile)

         siglist = list(signature.load_signatures(outfile))
         assert len(siglist) == 2

-        seeds = [ x.minhash.seed for x in siglist ]
+        seeds = [x.minhash.seed for x in siglist]
         assert len(seeds) == 2
-        assert set(seeds) == set([ 43 ])
+        assert set(seeds) == set([43])


 def test_do_sourmash_check_protein_comparisons():
     # this test checks 2 x 2 protein comparisons with E. coli genes.
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('ecoli.faa')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '21',
-                                            '--input-is-protein',
-                                            '--singleton',
-                                            testdata1],
-                                           in_directory=location)
-        sig1 = os.path.join(location, 'ecoli.faa.sig')
+        testdata1 = utils.get_test_data("ecoli.faa")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "21", "--input-is-protein", "--singleton", testdata1],
+            in_directory=location,
+        )
+        sig1 = os.path.join(location, "ecoli.faa.sig")
         assert os.path.exists(sig1)

-        testdata2 = utils.get_test_data('ecoli.genes.fna')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '21',
-                                            '--protein', '--no-dna',
-                                            '--singleton',
-                                            testdata2],
-                                           in_directory=location)
-        sig2 = os.path.join(location, 'ecoli.genes.fna.sig')
+        testdata2 = utils.get_test_data("ecoli.genes.fna")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "21", "--protein", "--no-dna", "--singleton", testdata2],
+            in_directory=location,
+        )
+        sig2 = os.path.join(location, "ecoli.genes.fna.sig")
         assert os.path.exists(sig2)

         # I'm not sure why load_signatures is randomizing order, but ok.
@@ -745,13 +834,13 @@ def test_do_sourmash_check_protein_comparisons():
         sig1_trans, sig2_trans = sorted(x, key=lambda x: x.name)

         name1 = sig1_aa.name.split()[0]
-        assert name1 == 'NP_414543.1'
+        assert name1 == "NP_414543.1"
         name2 = sig2_aa.name.split()[0]
-        assert name2 == 'NP_414544.1'
+        assert name2 == "NP_414544.1"
         name3 = sig1_trans.name.split()[0]
-        assert name3 == 'gi|556503834:2801-3733'
+        assert name3 == "gi|556503834:2801-3733"
         name4 = sig2_trans.name.split()[0]
-        assert name4 == 'gi|556503834:337-2799'
+        assert name4 == "gi|556503834:337-2799"

         print(name1, name3, round(sig1_aa.similarity(sig1_trans), 3))
         print(name2, name3, round(sig2_aa.similarity(sig1_trans), 3))
@@ -768,11 +857,9 @@ def test_do_sourmash_check_protein_comparisons():
 def test_do_sourmash_check_knowngood_dna_comparisons(c):
     # this test checks against a known good signature calculated
     # by utils/compute-dna-mh-another-way.py
-    testdata1 = utils.get_test_data('ecoli.genes.fna')
-    c.run_sourmash('compute', '-k', '21',
-                   '--singleton', '--dna',
-                   testdata1)
-    sig1 = c.output('ecoli.genes.fna.sig')
+    testdata1 = utils.get_test_data("ecoli.genes.fna")
+    c.run_sourmash("compute", "-k", "21", "--singleton", "--dna", testdata1)
+    sig1 = c.output("ecoli.genes.fna.sig")
     assert os.path.exists(sig1)

     x = list(signature.load_signatures(sig1))
@@ -780,7 +867,7 @@ def test_do_sourmash_check_knowngood_dna_comparisons(c):
     print(sig1.name)
     print(sig2.name)

-    knowngood = utils.get_test_data('benchmark.dna.sig')
+    knowngood = utils.get_test_data("benchmark.dna.sig")
     good = list(signature.load_signatures(knowngood))[0]

     assert sig2.similarity(good) == 1.0
@@ -789,16 +876,15 @@
 @utils.in_tempdir
 def test_do_sourmash_check_knowngood_dna_comparisons_use_rna(c):
     # check the --rna flag; otherwise identical to previous test.
-    testdata1 = utils.get_test_data('ecoli.genes.fna')
-    c.run_sourmash('compute', '-k', '21', '--singleton', '--rna',
-                   testdata1)
-    sig1 = c.output('ecoli.genes.fna.sig')
+    testdata1 = utils.get_test_data("ecoli.genes.fna")
+    c.run_sourmash("compute", "-k", "21", "--singleton", "--rna", testdata1)
+    sig1 = c.output("ecoli.genes.fna.sig")
     assert os.path.exists(sig1)

     x = list(signature.load_signatures(sig1))
     sig1, sig2 = sorted(x, key=lambda x: x.name)

-    knowngood = utils.get_test_data('benchmark.dna.sig')
+    knowngood = utils.get_test_data("benchmark.dna.sig")
     good = list(signature.load_signatures(knowngood))[0]

     assert sig2.similarity(good) == 1.0
@@ -808,20 +894,19 @@ def test_do_sourmash_check_knowngood_input_protein_comparisons():
     # this test checks against a known good signature calculated
     # by utils/compute-input-prot-another-way.py
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('ecoli.faa')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '21',
-                                            '--input-is-protein',
-                                            '--singleton',
-                                            testdata1],
-                                           in_directory=location)
-        sig1 = os.path.join(location, 'ecoli.faa.sig')
+        testdata1 = utils.get_test_data("ecoli.faa")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "21", "--input-is-protein", "--singleton", testdata1],
+            in_directory=location,
+        )
+        sig1 = os.path.join(location, "ecoli.faa.sig")
         assert os.path.exists(sig1)

         x = list(signature.load_signatures(sig1))
         sig1_aa, sig2_aa = sorted(x, key=lambda x: x.name)

-        knowngood = utils.get_test_data('benchmark.input_prot.sig')
+        knowngood = utils.get_test_data("benchmark.input_prot.sig")
         good_aa = list(signature.load_signatures(knowngood))[0]

         assert sig1_aa.similarity(good_aa) == 1.0
@@ -831,29 +916,36 @@ def test_do_sourmash_check_knowngood_protein_comparisons():
     # this test checks against a known good signature calculated
     # by utils/compute-prot-mh-another-way.py
     with utils.TempDirectory() as location:
-        testdata1 = utils.get_test_data('ecoli.genes.fna')
-        status, out, err = utils.runscript('sourmash',
-                                           ['compute', '-k', '21',
-                                            '--singleton', '--protein',
-                                            '--no-dna',
-                                            testdata1],
-                                           in_directory=location)
-        sig1 = os.path.join(location, 'ecoli.genes.fna.sig')
+        testdata1 = utils.get_test_data("ecoli.genes.fna")
+        status, out, err = utils.runscript(
+            "sourmash",
+            ["compute", "-k", "21", "--singleton", "--protein", "--no-dna", testdata1],
+            in_directory=location,
+        )
+        sig1 = os.path.join(location, "ecoli.genes.fna.sig")
         assert os.path.exists(sig1)

         x = list(signature.load_signatures(sig1))
         sig1_trans, sig2_trans = sorted(x, key=lambda x: x.name)

-        knowngood = utils.get_test_data('benchmark.prot.sig')
+        knowngood = utils.get_test_data("benchmark.prot.sig")
         good_trans = list(signature.load_signatures(knowngood))[0]

         assert sig2_trans.similarity(good_trans) == 1.0


 def test_compute_parameters():
-    args_list = ["compute", "-k", "21,31", "--singleton", "--protein", "--no-dna", "input_file"]
-
-    parser = SourmashParser(prog='sourmash')
+    args_list = [
+        "compute",
+        "-k",
+        "21,31",
+        "--singleton",
+        "--protein",
+        "--no-dna",
+        "input_file",
+    ]
+
+    parser = SourmashParser(prog="sourmash")
     subp = parser.add_subparsers(title="instruction", dest="cmd", metavar="cmd")
     subparser(subp)
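[The test_sourmash_sketch.py changes that follow exercise _signatures_for_sketch_factory, which turns '-p' parameter strings like "k=21,scaled=2000,abund" into compute parameters: comma-separated tokens where k=, num=, scaled=, and seed= require a value, bare moltype words (dna/protein/dayhoff/hp) select an encoding, and anything else raises ValueError. The following is a rough, illustrative tokenizer only — not sourmash's parser, whose defaults, moltype handling, and error messages differ.]

MOLTYPES = {"dna", "protein", "dayhoff", "hp"}
VALUE_KEYS = {"k", "num", "scaled", "seed"}


def parse_param_string(param_str):
    # Sketch only: split "k=21,scaled=2000,abund" into a dict of settings.
    params = {"ksizes": [], "moltype": None, "track_abundance": False}
    for token in param_str.split(","):
        if token in MOLTYPES:
            params["moltype"] = token
        elif token == "abund":
            params["track_abundance"] = True
        elif "=" in token:
            name, _, value = token.partition("=")
            # reject unknown keys ('scaledFOO=...') and empty values ('k=')
            if name not in VALUE_KEYS or not value:
                raise ValueError(f"bad parameter: {token!r}")
            if name == "k":
                params["ksizes"].append(int(value))
            else:
                params[name] = int(value)
        else:
            # bare 'k', 'scaled', etc. without '=value'
            raise ValueError(f"parameter {token!r} requires '=value'")
    return params

[With this sketch, parse_param_string("k=21,scaled"), parse_param_string("k="), and parse_param_string("k=21,scaledFOO=2000,abund") all raise ValueError, matching the *_param_requires_equal and *_override_bad tests below.]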
sourmash @@ -31,55 +30,67 @@ def test_do_sourmash_sketch_check_scaled_bounds_negative(runtmp): - testdata1 = utils.get_test_data('short.fa') + testdata1 = utils.get_test_data("short.fa") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'translate', '-p', 'scaled=-5', testdata1) + runtmp.sourmash("sketch", "translate", "-p", "scaled=-5", testdata1) assert "ERROR: scaled value must be positive" in runtmp.last_result.err def test_do_sourmash_sketch_check_scaled_bounds_less_than_minimum(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch', 'translate', '-p', 'scaled=50', testdata1) - assert "WARNING: scaled value should be >= 100. Continuing anyway." in runtmp.last_result.err + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash("sketch", "translate", "-p", "scaled=50", testdata1) + assert ( + "WARNING: scaled value should be >= 100. Continuing anyway." + in runtmp.last_result.err + ) def test_do_sourmash_sketch_check_scaled_bounds_more_than_maximum(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch', 'translate', '-p', 'scaled=1000000000', testdata1) - assert "WARNING: scaled value should be <= 1e6. Continuing anyway." in runtmp.last_result.err + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash("sketch", "translate", "-p", "scaled=1000000000", testdata1) + assert ( + "WARNING: scaled value should be <= 1e6. Continuing anyway." + in runtmp.last_result.err + ) def test_do_sourmash_sketch_check_num_bounds_negative(runtmp): - testdata1 = utils.get_test_data('short.fa') + testdata1 = utils.get_test_data("short.fa") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'translate', '-p', 'num=-5', testdata1) + runtmp.sourmash("sketch", "translate", "-p", "num=-5", testdata1) assert "ERROR: num value must be positive" in runtmp.last_result.err def test_do_sourmash_sketch_check_num_bounds_less_than_minimum(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch', 'translate', '-p', 'num=25', testdata1) - assert "WARNING: num value should be >= 50. Continuing anyway." in runtmp.last_result.err + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash("sketch", "translate", "-p", "num=25", testdata1) + assert ( + "WARNING: num value should be >= 50. Continuing anyway." + in runtmp.last_result.err + ) def test_do_sourmash_sketch_check_num_bounds_more_than_maximum(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch', 'translate', '-p', 'num=100000', testdata1) - assert "WARNING: num value should be <= 50000. Continuing anyway." in runtmp.last_result.err + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash("sketch", "translate", "-p", "num=100000", testdata1) + assert ( + "WARNING: num value should be <= 50000. Continuing anyway." 
+ in runtmp.last_result.err + ) def test_empty_factory(): with pytest.raises(ValueError): - factory = _signatures_for_sketch_factory([], None) + _signatures_for_sketch_factory([], None) def test_no_default_moltype_factory_nonempty(): with pytest.raises(ValueError): - factory = _signatures_for_sketch_factory(["k=31"], None) + _signatures_for_sketch_factory(["k=31"], None) def test_factory_no_default_moltype_dna(): - factory = _signatures_for_sketch_factory(['dna'], None) + factory = _signatures_for_sketch_factory(["dna"], None) params_list = list(factory.get_compute_params()) assert len(params_list) == 1 @@ -88,7 +99,7 @@ def test_factory_no_default_moltype_dna(): def test_factory_no_default_moltype_protein(): - factory = _signatures_for_sketch_factory(['protein'], None) + factory = _signatures_for_sketch_factory(["protein"], None) params_list = list(factory.get_compute_params()) assert len(params_list) == 1 @@ -97,16 +108,16 @@ def test_factory_no_default_moltype_protein(): def test_factory_dna_nosplit(): - factory = _signatures_for_sketch_factory(['k=31,k=51'], 'dna') + factory = _signatures_for_sketch_factory(["k=31,k=51"], "dna") params_list = list(factory.get_compute_params(split_ksizes=False)) assert len(params_list) == 1 params = params_list[0] - assert params.ksizes == [31,51] + assert params.ksizes == [31, 51] def test_factory_dna_split(): - factory = _signatures_for_sketch_factory(['k=31,k=51'], 'dna') + factory = _signatures_for_sketch_factory(["k=31,k=51"], "dna") params_list = list(factory.get_compute_params(split_ksizes=True)) assert len(params_list) == 2 @@ -117,7 +128,7 @@ def test_factory_dna_split(): def test_factory_protein_nosplit(): - factory = _signatures_for_sketch_factory(['k=10,k=9'], 'protein') + factory = _signatures_for_sketch_factory(["k=10,k=9"], "protein") params_list = list(factory.get_compute_params(split_ksizes=False)) assert len(params_list) == 1 @@ -126,7 +137,7 @@ def test_factory_protein_nosplit(): def test_factory_protein_split(): - factory = _signatures_for_sketch_factory(['k=10,k=9'], 'protein') + factory = _signatures_for_sketch_factory(["k=10,k=9"], "protein") params_list = list(factory.get_compute_params(split_ksizes=True)) assert len(params_list) == 2 @@ -137,12 +148,12 @@ def test_factory_protein_split(): def test_factory_dna_equal(): - factory1 = _signatures_for_sketch_factory(['dna'], None) + factory1 = _signatures_for_sketch_factory(["dna"], None) params_list1 = list(factory1.get_compute_params()) assert len(params_list1) == 1 params1 = params_list1[0] - factory2 = _signatures_for_sketch_factory([], 'dna') + factory2 = _signatures_for_sketch_factory([], "dna") params_list2 = list(factory2.get_compute_params()) assert len(params_list2) == 1 params2 = params_list2[0] @@ -152,12 +163,12 @@ def test_factory_dna_equal(): def test_factory_protein_equal(): - factory1 = _signatures_for_sketch_factory(['protein'], None) + factory1 = _signatures_for_sketch_factory(["protein"], None) params_list1 = list(factory1.get_compute_params()) assert len(params_list1) == 1 params1 = params_list1[0] - factory2 = _signatures_for_sketch_factory([], 'protein') + factory2 = _signatures_for_sketch_factory([], "protein") params_list2 = list(factory2.get_compute_params()) assert len(params_list2) == 1 params2 = params_list2[0] @@ -167,12 +178,12 @@ def test_factory_protein_equal(): def test_factory_dna_multi_ksize_eq(): - factory1 = _signatures_for_sketch_factory(['k=21,k=31,dna'], None) + factory1 = _signatures_for_sketch_factory(["k=21,k=31,dna"], None) params_list1 = 
list(factory1.get_compute_params()) assert len(params_list1) == 1 params1 = params_list1[0] - factory2 = _signatures_for_sketch_factory(['k=21,k=31'], 'dna') + factory2 = _signatures_for_sketch_factory(["k=21,k=31"], "dna") params_list2 = list(factory2.get_compute_params()) assert len(params_list2) == 1 params2 = params_list2[0] @@ -182,12 +193,12 @@ def test_factory_dna_multi_ksize_eq(): def test_factory_protein_multi_ksize_eq(): - factory1 = _signatures_for_sketch_factory(['k=10,k=11,protein'], None) + factory1 = _signatures_for_sketch_factory(["k=10,k=11,protein"], None) params_list1 = list(factory1.get_compute_params()) assert len(params_list1) == 1 params1 = params_list1[0] - factory2 = _signatures_for_sketch_factory(['k=10,k=11'], 'protein') + factory2 = _signatures_for_sketch_factory(["k=10,k=11"], "protein") params_list2 = list(factory2.get_compute_params()) assert len(params_list2) == 1 params2 = params_list2[0] @@ -197,7 +208,7 @@ def test_factory_protein_multi_ksize_eq(): def test_dna_defaults(): - factory = _signatures_for_sketch_factory([], 'dna') + factory = _signatures_for_sketch_factory([], "dna") params_list = list(factory.get_compute_params()) assert len(params_list) == 1 @@ -219,13 +230,13 @@ def test_dna_defaults(): def test_dna_multiple_ksize(): - factory = _signatures_for_sketch_factory(['k=21,k=31,k=51'], 'dna') + factory = _signatures_for_sketch_factory(["k=21,k=31,k=51"], "dna") params_list = list(factory.get_compute_params()) assert len(params_list) == 1 params = params_list[0] - assert params.ksizes == [21,31,51] + assert params.ksizes == [21, 31, 51] assert params.num_hashes == 0 assert params.scaled == 1000 assert not params.track_abundance @@ -246,8 +257,7 @@ def test_dna_multiple_ksize(): def test_dna_override_1(): - factory = _signatures_for_sketch_factory(['k=21,scaled=2000,abund'], - 'dna') + factory = _signatures_for_sketch_factory(["k=21,scaled=2000,abund"], "dna") params_list = list(factory.get_compute_params()) assert len(params_list) == 1 @@ -266,48 +276,47 @@ def test_dna_override_1(): def test_scaled_param_requires_equal(): with pytest.raises(ValueError): - factory = _signatures_for_sketch_factory(['k=21,scaled'], 'dna') + _signatures_for_sketch_factory(["k=21,scaled"], "dna") def test_k_param_requires_equal(): with pytest.raises(ValueError): - factory = _signatures_for_sketch_factory(['k'], 'dna') + _signatures_for_sketch_factory(["k"], "dna") def test_k_param_requires_equal_2(): - with pytest.raises(ValueError) as exc: - factory = _signatures_for_sketch_factory(['k='], 'dna') + with pytest.raises(ValueError): + _signatures_for_sketch_factory(["k="], "dna") def test_seed_param_requires_equal(): - with pytest.raises(ValueError) as exc: - factory = _signatures_for_sketch_factory(['seed='], 'dna') + with pytest.raises(ValueError): + _signatures_for_sketch_factory(["seed="], "dna") def test_num_param_requires_equal(): - with pytest.raises(ValueError) as exc: - factory = _signatures_for_sketch_factory(['num='], 'dna') + with pytest.raises(ValueError): + _signatures_for_sketch_factory(["num="], "dna") def test_dna_override_bad_1(): with pytest.raises(ValueError): - factory = _signatures_for_sketch_factory(['k=21,scaledFOO=2000,abund'], - 'dna') + _signatures_for_sketch_factory(["k=21,scaledFOO=2000,abund"], "dna") def test_dna_override_bad_2(): with pytest.raises(ValueError): - factory = _signatures_for_sketch_factory(['k=21,protein'], 'dna') + _signatures_for_sketch_factory(["k=21,protein"], "dna") def test_protein_defaults(): - factory = 
_signatures_for_sketch_factory([], 'protein') + factory = _signatures_for_sketch_factory([], "protein") params_list = list(factory.get_compute_params()) assert len(params_list) == 1 params = params_list[0] - assert params.ksizes == [30] # x3 for now + assert params.ksizes == [30] # x3 for now assert params.num_hashes == 0 assert params.scaled == 200 assert not params.track_abundance @@ -320,14 +329,15 @@ def test_protein_defaults(): def test_protein_override_bad_2(): with pytest.raises(ValueError): - factory = _signatures_for_sketch_factory(['k=21,dna'], 'protein') + _signatures_for_sketch_factory(["k=21,dna"], "protein") + def test_protein_override_bad_rust_foo(): # mimic 'sourmash sketch protein -p dna' - factory = _signatures_for_sketch_factory([], 'protein') + factory = _signatures_for_sketch_factory([], "protein") # reach in and avoid error checking to construct a bad params_list. - factory.params_list = [('dna', {})] + factory.params_list = [("dna", {})] # now, get sigs... siglist = factory() @@ -335,7 +345,7 @@ def test_protein_override_bad_rust_foo(): sig = siglist[0] # try adding something - testdata1 = utils.get_test_data('ecoli.faa') + testdata1 = utils.get_test_data("ecoli.faa") with screed.open(testdata1) as f: record = next(iter(f)) @@ -346,13 +356,13 @@ def test_protein_override_bad_rust_foo(): def test_dayhoff_defaults(): - factory = _signatures_for_sketch_factory([], 'dayhoff') + factory = _signatures_for_sketch_factory([], "dayhoff") params_list = list(factory.get_compute_params()) assert len(params_list) == 1 params = params_list[0] - assert params.ksizes == [48] # x3 for now + assert params.ksizes == [48] # x3 for now assert params.num_hashes == 0 assert params.scaled == 200 assert not params.track_abundance @@ -365,17 +375,17 @@ def test_dayhoff_defaults(): def test_dayhoff_override_bad_2(): with pytest.raises(ValueError): - factory = _signatures_for_sketch_factory(['k=21,dna'], 'dayhoff') + _signatures_for_sketch_factory(["k=21,dna"], "dayhoff") def test_hp_defaults(): - factory = _signatures_for_sketch_factory([], 'hp') + factory = _signatures_for_sketch_factory([], "hp") params_list = list(factory.get_compute_params()) assert len(params_list) == 1 params = params_list[0] - assert params.ksizes == [126] # x3 for now + assert params.ksizes == [126] # x3 for now assert params.num_hashes == 0 assert params.scaled == 200 assert not params.track_abundance @@ -388,21 +398,23 @@ def test_hp_defaults(): def test_hp_override_bad_2(): with pytest.raises(ValueError): - factory = _signatures_for_sketch_factory(['k=21,dna'], 'hp') + _signatures_for_sketch_factory(["k=21,dna"], "hp") def test_multiple_moltypes(): - params_foo = ['k=20,num=500,protein', - 'k=19,num=400,dayhoff,abund', - 'k=30,scaled=200,hp', - 'k=30,scaled=200,seed=58'] - factory = _signatures_for_sketch_factory(params_foo, 'protein') + params_foo = [ + "k=20,num=500,protein", + "k=19,num=400,dayhoff,abund", + "k=30,scaled=200,hp", + "k=30,scaled=200,seed=58", + ] + factory = _signatures_for_sketch_factory(params_foo, "protein") params_list = list(factory.get_compute_params()) assert len(params_list) == 4 params = params_list[0] - assert params.ksizes == [60] # x3, for now. + assert params.ksizes == [60] # x3, for now. assert params.num_hashes == 500 assert params.scaled == 0 assert not params.track_abundance @@ -413,7 +425,7 @@ def test_multiple_moltypes(): assert params.protein params = params_list[1] - assert params.ksizes == [57] # x3, for now. + assert params.ksizes == [57] # x3, for now. 
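
The repeated `# x3 for now` comments in these hunks refer to sourmash's convention of storing protein-family ksizes in DNA space, i.e. three times the amino-acid k: the CLI's `protein` default of k=10 becomes `ksizes == [30]` internally, and `to_param_str()` converts back to the amino-acid value. A minimal sketch of that round trip, assuming `_signatures_for_sketch_factory` is importable from `sourmash.command_sketch` as in this test module:

```python
# Minimal sketch of the "x3" ksize convention exercised by the
# surrounding tests; assumes sourmash is installed and that
# _signatures_for_sketch_factory lives in sourmash.command_sketch.
from sourmash.command_sketch import _signatures_for_sketch_factory

# Defaults per moltype: protein k=10, dayhoff k=16, hp k=42 -- all
# stored internally as 3x the amino-acid ksize.
for moltype, stored_ksize in [("protein", 30), ("dayhoff", 48), ("hp", 126)]:
    factory = _signatures_for_sketch_factory([], moltype)
    (params,) = factory.get_compute_params()
    assert params.ksizes == [stored_ksize]  # DNA-space (3 * aa ksize)
    print(moltype, params.to_param_str())   # reports the aa-space k
```

The default amino-acid ksizes here (protein k=10, dayhoff k=16, hp k=42) match the expected strings in the `test_compute_parameters_to_param_str` parametrization below.
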
assert params.num_hashes == 400 assert params.scaled == 0 assert params.track_abundance @@ -424,7 +436,7 @@ def test_multiple_moltypes(): assert not params.protein params = params_list[2] - assert params.ksizes == [90] # x3, for now. + assert params.ksizes == [90] # x3, for now. assert params.num_hashes == 0 assert params.scaled == 200 assert not params.track_abundance @@ -435,7 +447,7 @@ def test_multiple_moltypes(): assert not params.protein params = params_list[3] - assert params.ksizes == [90] # x3, for now. + assert params.ksizes == [90] # x3, for now. assert params.num_hashes == 0 assert params.scaled == 200 assert not params.track_abundance @@ -446,16 +458,19 @@ def test_multiple_moltypes(): assert params.protein -@pytest.mark.parametrize("input_param_str, expected_output", - [('protein', 'protein,k=10,scaled=200'), - ('dna', 'dna,k=31,scaled=1000'), - ('hp', 'hp,k=42,scaled=200'), - ('dayhoff', 'dayhoff,k=16,scaled=200'), - ('dna,seed=52', 'dna,k=31,scaled=1000,seed=52'), - ('dna,num=500', 'dna,k=31,num=500'), - ('scaled=1100,dna', 'dna,k=31,scaled=1100'), - ('dna,abund', 'dna,k=31,scaled=1000,abund') - ]) +@pytest.mark.parametrize( + "input_param_str, expected_output", + [ + ("protein", "protein,k=10,scaled=200"), + ("dna", "dna,k=31,scaled=1000"), + ("hp", "hp,k=42,scaled=200"), + ("dayhoff", "dayhoff,k=16,scaled=200"), + ("dna,seed=52", "dna,k=31,scaled=1000,seed=52"), + ("dna,num=500", "dna,k=31,num=500"), + ("scaled=1100,dna", "dna,k=31,scaled=1100"), + ("dna,abund", "dna,k=31,scaled=1000,abund"), + ], +) def test_compute_parameters_to_param_str(input_param_str, expected_output): factory = _signatures_for_sketch_factory([input_param_str], None) params_list = list(factory.get_compute_params()) @@ -464,22 +479,18 @@ def test_compute_parameters_to_param_str(input_param_str, expected_output): actual_output_str = params.to_param_str() - assert actual_output_str == expected_output, (actual_output_str, - expected_output) + assert actual_output_str == expected_output, (actual_output_str, expected_output) def test_manifest_row_to_compute_parameters_1(): # test ComputeParameters.from_manifest_row with moltype 'DNA' - row = dict(moltype='DNA', - ksize=21, - num=0, scaled=1000, - with_abundance=1) + row = dict(moltype="DNA", ksize=21, num=0, scaled=1000, with_abundance=1) p = ComputeParameters.from_manifest_row(row) assert p.dna assert not p.protein assert not p.dayhoff assert not p.hp - assert p.moltype == 'DNA' + assert p.moltype == "DNA" assert p.num_hashes == 0 assert p.scaled == 1000 assert p.ksizes == [21] @@ -489,14 +500,11 @@ def test_manifest_row_to_compute_parameters_1(): def test_manifest_row_to_compute_parameters_2(): # test ComputeParameters.from_manifest_row with moltype 'protein' - row = dict(moltype='protein', - ksize=10, - num=0, scaled=200, - with_abundance=1) + row = dict(moltype="protein", ksize=10, num=0, scaled=200, with_abundance=1) p = ComputeParameters.from_manifest_row(row) assert not p.dna assert p.protein - assert p.moltype == 'protein' + assert p.moltype == "protein" assert not p.dayhoff assert not p.hp assert p.num_hashes == 0 @@ -508,15 +516,12 @@ def test_manifest_row_to_compute_parameters_2(): def test_manifest_row_to_compute_parameters_3(): # test ComputeParameters.from_manifest_row with moltype 'dayhoff' - row = dict(moltype='dayhoff', - ksize=12, - num=0, scaled=200, - with_abundance=0) + row = dict(moltype="dayhoff", ksize=12, num=0, scaled=200, with_abundance=0) p = ComputeParameters.from_manifest_row(row) assert not p.dna assert not p.protein assert 
p.dayhoff - assert p.moltype == 'dayhoff' + assert p.moltype == "dayhoff" assert not p.hp assert p.num_hashes == 0 assert p.scaled == 200 @@ -527,16 +532,13 @@ def test_manifest_row_to_compute_parameters_3(): def test_manifest_row_to_compute_parameters_4(): # test ComputeParameters.from_manifest_row with moltype 'hp' - row = dict(moltype='hp', - ksize=32, - num=0, scaled=200, - with_abundance=0) + row = dict(moltype="hp", ksize=32, num=0, scaled=200, with_abundance=0) p = ComputeParameters.from_manifest_row(row) assert not p.dna assert not p.protein assert not p.dayhoff assert p.hp - assert p.moltype == 'hp' + assert p.moltype == "hp" assert p.num_hashes == 0 assert p.scaled == 200 assert p.ksizes == [96] @@ -545,8 +547,17 @@ def test_manifest_row_to_compute_parameters_4(): def test_bad_compute_parameters(): - p = ComputeParameters(ksizes=[31], seed=42, dna=0, protein=0, dayhoff=0, - hp=0, num_hashes=0, track_abundance=True, scaled=1000) + p = ComputeParameters( + ksizes=[31], + seed=42, + dna=0, + protein=0, + dayhoff=0, + hp=0, + num_hashes=0, + track_abundance=True, + scaled=1000, + ) with pytest.raises(AssertionError): p.moltype @@ -557,434 +568,484 @@ def test_bad_compute_parameters(): @utils.in_thisdir def test_do_sourmash_sketchdna_empty(c): with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sketch', 'dna') - assert 'error: no input filenames provided! nothing to do - exiting.' in c.last_result.err + c.run_sourmash("sketch", "dna") + assert ( + "error: no input filenames provided! nothing to do - exiting." + in c.last_result.err + ) @utils.in_thisdir def test_do_sourmash_sketchprotein_empty(c): with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sketch', 'protein') - assert 'error: no input filenames provided! nothing to do - exiting.' in c.last_result.err + c.run_sourmash("sketch", "protein") + assert ( + "error: no input filenames provided! nothing to do - exiting." + in c.last_result.err + ) @utils.in_thisdir def test_do_sourmash_sketchtranslate_empty(c): with pytest.raises(SourmashCommandFailed): - c.run_sourmash('sketch', 'translate') - assert 'error: no input filenames provided! nothing to do - exiting.' in c.last_result.err + c.run_sourmash("sketch", "translate") + assert ( + "error: no input filenames provided! nothing to do - exiting." 
+ in c.last_result.err + ) def test_do_sourmash_sketchdna(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch', 'dna', testdata1) + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash("sketch", "dna", testdata1) - sigfile = runtmp.output('short.fa.sig') + sigfile = runtmp.output("short.fa.sig") assert os.path.exists(sigfile) sig = next(signature.load_signatures(sigfile)) - assert str(sig).endswith('short.fa') + assert str(sig).endswith("short.fa") def test_do_sourmash_sketchdna_check_sequence_succeed(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch', 'dna', testdata1, '--check-sequence') + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash("sketch", "dna", testdata1, "--check-sequence") - sigfile = runtmp.output('short.fa.sig') + sigfile = runtmp.output("short.fa.sig") assert os.path.exists(sigfile) sig = next(signature.load_signatures(sigfile)) - assert str(sig).endswith('short.fa') + assert str(sig).endswith("short.fa") def test_do_sourmash_sketchdna_check_sequence_fail(runtmp): - testdata1 = utils.get_test_data('shewanella.faa') + testdata1 = utils.get_test_data("shewanella.faa") - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('sketch', 'dna', testdata1, '--check-sequence') + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash("sketch", "dna", testdata1, "--check-sequence") err = runtmp.last_result.err print(err) assert "ERROR when reading from " in err - assert "invalid DNA character in input k-mer: MCGIVGAVAQRDVAEILVEGLRRLEYRGYDS" in err + assert ( + "invalid DNA character in input k-mer: MCGIVGAVAQRDVAEILVEGLRRLEYRGYDS" in err + ) def test_do_sourmash_sketchdna_check_sequence_fail_singleton(runtmp): - testdata1 = utils.get_test_data('shewanella.faa') + testdata1 = utils.get_test_data("shewanella.faa") - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('sketch', 'dna', testdata1, '--check-sequence', - '--singleton') + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash("sketch", "dna", testdata1, "--check-sequence", "--singleton") err = runtmp.last_result.err print(err) assert "ERROR when reading from " in err - assert "invalid DNA character in input k-mer: MCGIVGAVAQRDVAEILVEGLRRLEYRGYDS" in err + assert ( + "invalid DNA character in input k-mer: MCGIVGAVAQRDVAEILVEGLRRLEYRGYDS" in err + ) def test_do_sourmash_sketchdna_from_file(runtmp): - testdata1 = utils.get_test_data('short.fa') + testdata1 = utils.get_test_data("short.fa") file_list = runtmp.output("filelist.txt") - with open(file_list, 'wt') as fp: + with open(file_list, "w") as fp: print(testdata1, file=fp) - runtmp.sourmash('sketch', 'dna', '--from-file', file_list) + runtmp.sourmash("sketch", "dna", "--from-file", file_list) - sigfile = runtmp.output('short.fa.sig') + sigfile = runtmp.output("short.fa.sig") assert os.path.exists(sigfile) sig = next(signature.load_signatures(sigfile)) - assert str(sig).endswith('short.fa') + assert str(sig).endswith("short.fa") @utils.in_tempdir def test_do_sourmash_sketchdna_noinput(c): data = "" - cmd = ['sketch', 'dna', '-', '-o', c.output('xxx.sig')] + cmd = ["sketch", "dna", "-", "-o", c.output("xxx.sig")] c.run_sourmash(*cmd, stdin_data=data) print(c.last_result.out) print(c.last_result.err) - sigfile = c.output('xxx.sig') + sigfile = c.output("xxx.sig") assert not os.path.exists(sigfile) - assert 'no sequences found' in c.last_result.err + assert "no sequences found" in c.last_result.err @utils.in_tempdir def 
test_do_sourmash_sketchdna_noinput_singleton(c): data = "" - cmd = ['sketch', 'dna', '-', '-o', c.output('xxx.sig'), '--singleton'] + cmd = ["sketch", "dna", "-", "-o", c.output("xxx.sig"), "--singleton"] c.run_sourmash(*cmd, stdin_data=data) - sigfile = c.output('xxx.sig') + sigfile = c.output("xxx.sig") assert not os.path.exists(sigfile) - assert 'no sequences found' in c.last_result.err + assert "no sequences found" in c.last_result.err @utils.in_tempdir def test_do_sourmash_sketchdna_noinput_merge(c): data = "" - cmd = ['sketch', 'dna', '-', '-o', c.output('xxx.sig'), '--merge', 'name'] + cmd = ["sketch", "dna", "-", "-o", c.output("xxx.sig"), "--merge", "name"] c.run_sourmash(*cmd, stdin_data=data) - sigfile = c.output('xxx.sig') + sigfile = c.output("xxx.sig") assert not os.path.exists(sigfile) - assert 'no sequences found' in c.last_result.err + assert "no sequences found" in c.last_result.err @utils.in_tempdir def test_do_sourmash_sketchdna_outdir(c): - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['sketch', 'dna', testdata1, - '--outdir', c.location]) + testdata1 = utils.get_test_data("short.fa") + status, out, err = utils.runscript( + "sourmash", ["sketch", "dna", testdata1, "--outdir", c.location] + ) - sigfile = os.path.join(c.location, 'short.fa.sig') + sigfile = os.path.join(c.location, "short.fa.sig") assert os.path.exists(sigfile) sig = next(signature.load_signatures(sigfile)) - assert str(sig).endswith('short.fa') + assert str(sig).endswith("short.fa") @utils.in_tempdir def test_do_sourmash_sketchdna_output_dir(c): # test via --output-dir not --outdir - testdata1 = utils.get_test_data('short.fa') - status, out, err = utils.runscript('sourmash', - ['sketch', 'dna', testdata1, - '--output-dir', c.location]) + testdata1 = utils.get_test_data("short.fa") + status, out, err = utils.runscript( + "sourmash", ["sketch", "dna", testdata1, "--output-dir", c.location] + ) - sigfile = os.path.join(c.location, 'short.fa.sig') + sigfile = os.path.join(c.location, "short.fa.sig") assert os.path.exists(sigfile) sig = next(signature.load_signatures(sigfile)) - assert str(sig).endswith('short.fa') + assert str(sig).endswith("short.fa") def test_do_sourmash_sketchdna_output_valid_file(runtmp): - """ Trigger bug #123 """ - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') - sigfile = runtmp.output('short.fa.sig') + """Trigger bug #123""" + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + testdata3 = utils.get_test_data("short3.fa") + sigfile = runtmp.output("short.fa.sig") - runtmp.sourmash('sketch', 'dna', '-o', sigfile, testdata1, testdata2, testdata3) + runtmp.sourmash("sketch", "dna", "-o", sigfile, testdata1, testdata2, testdata3) assert os.path.exists(sigfile) - assert not runtmp.last_result.out # stdout should be empty + assert not runtmp.last_result.out # stdout should be empty # is it valid json? 
- with open(sigfile, 'r') as f: + with open(sigfile) as f: data = json.load(f) - filesigs = [sig['filename'] for sig in data] - assert all(testdata in filesigs - for testdata in (testdata1, testdata2, testdata3)) + filesigs = [sig["filename"] for sig in data] + assert all(testdata in filesigs for testdata in (testdata1, testdata2, testdata3)) def test_do_sourmash_sketchdna_output_zipfile(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + testdata3 = utils.get_test_data("short3.fa") - outfile = runtmp.output('shorts.zip') + outfile = runtmp.output("shorts.zip") - runtmp.sourmash('sketch', 'dna', '-o', outfile, testdata1, testdata2, testdata3) + runtmp.sourmash("sketch", "dna", "-o", outfile, testdata1, testdata2, testdata3) assert os.path.exists(outfile) - assert not runtmp.last_result.out # stdout should be empty + assert not runtmp.last_result.out # stdout should be empty sigs = list(sourmash.load_file_as_signatures(outfile)) assert len(sigs) == 3 def test_do_sourmash_sketchdna_output_stdout_valid(runtmp): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + testdata3 = utils.get_test_data("short3.fa") - runtmp.sourmash('sketch', 'dna', '-o', '-', testdata1, testdata2, testdata3) + runtmp.sourmash("sketch", "dna", "-o", "-", testdata1, testdata2, testdata3) # is it valid json? data = json.loads(runtmp.last_result.out) - filesigs = [sig['filename'] for sig in data] - assert all(testdata in filesigs - for testdata in (testdata1, testdata2, testdata3)) + filesigs = [sig["filename"] for sig in data] + assert all(testdata in filesigs for testdata in (testdata1, testdata2, testdata3)) @utils.in_tempdir def test_do_sourmash_sketchdna_output_and_name_valid_file(c): # test --merge of multiple input files - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') - sigfile = c.output('short.fa.sig') - - c.run_sourmash('sketch', 'dna', '-p', 'num=500', '-o', sigfile, '--merge', - '"name"', testdata1, testdata2, testdata3) + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + testdata3 = utils.get_test_data("short3.fa") + sigfile = c.output("short.fa.sig") + + c.run_sourmash( + "sketch", + "dna", + "-p", + "num=500", + "-o", + sigfile, + "--merge", + '"name"', + testdata1, + testdata2, + testdata3, + ) assert os.path.exists(sigfile) - assert 'calculated 1 signature for 4 sequences taken from 3 files' in c.last_result.err + assert ( + "calculated 1 signature for 4 sequences taken from 3 files" in c.last_result.err + ) # is it valid json? 
- with open(sigfile, 'r') as f: + with open(sigfile) as f: data = json.load(f) assert len(data) == 1 - sigfile_merged = c.output('short.all.fa.sig') - c.run_sourmash('sketch', 'dna', '-p', 'num=500', '-o', sigfile_merged, - '--merge', '"name"', testdata1, testdata2, testdata3) - - with open(sigfile_merged, 'r') as f: + sigfile_merged = c.output("short.all.fa.sig") + c.run_sourmash( + "sketch", + "dna", + "-p", + "num=500", + "-o", + sigfile_merged, + "--merge", + '"name"', + testdata1, + testdata2, + testdata3, + ) + + with open(sigfile_merged) as f: data_merged = json.load(f) - assert data[0]['signatures'][0]['mins'] == data_merged[0]['signatures'][0]['mins'] + assert data[0]["signatures"][0]["mins"] == data_merged[0]["signatures"][0]["mins"] @utils.in_tempdir def test_do_sourmash_sketchdna_output_and_name_valid_file_outdir(c): - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - testdata3 = utils.get_test_data('short3.fa') - sigfile = os.path.join(c.location, 'short.fa.sig') + testdata1 = utils.get_test_data("short.fa") + testdata2 = utils.get_test_data("short2.fa") + testdata3 = utils.get_test_data("short3.fa") + sigfile = os.path.join(c.location, "short.fa.sig") - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('sketch', 'dna', '-o', sigfile, - '--merge', '"name"', - testdata1, testdata2, testdata3, - '--outdir', c.location) + with pytest.raises(SourmashCommandFailed): + c.run_sourmash( + "sketch", + "dna", + "-o", + sigfile, + "--merge", + '"name"', + testdata1, + testdata2, + testdata3, + "--outdir", + c.location, + ) errmsg = c.last_result.err assert "ERROR: --output-dir doesn't make sense with -o/--output" in errmsg def test_do_sourmash_sketchdna_singleton(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch', 'dna', '--singleton', testdata1) + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash("sketch", "dna", "--singleton", testdata1) - sigfile = runtmp.output('short.fa.sig') + sigfile = runtmp.output("short.fa.sig") assert os.path.exists(sigfile) sig = next(signature.load_signatures(sigfile)) - assert str(sig).endswith('shortName') + assert str(sig).endswith("shortName") def test_do_sourmash_sketchdna_name(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch', 'dna', '--merge', 'foo', testdata1, '-o', 'foo.sig') + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash("sketch", "dna", "--merge", "foo", testdata1, "-o", "foo.sig") - sigfile = runtmp.output('foo.sig') + sigfile = runtmp.output("foo.sig") assert os.path.exists(sigfile) sig = next(signature.load_signatures(sigfile)) - assert sig.name == 'foo' - - runtmp.sourmash('sketch', 'dna', '--name', 'foo', testdata1, '-o', 'foo2.sig') + assert sig.name == "foo" - sigfile2 = runtmp.output('foo2.sig') + runtmp.sourmash("sketch", "dna", "--name", "foo", testdata1, "-o", "foo2.sig") + + sigfile2 = runtmp.output("foo2.sig") assert os.path.exists(sigfile2) sig2 = next(signature.load_signatures(sigfile)) - assert sig2.name == 'foo' + assert sig2.name == "foo" assert sig.name == sig2.name def test_do_sourmash_sketchdna_name_fail_no_output(runtmp): - testdata1 = utils.get_test_data('short.fa') + testdata1 = utils.get_test_data("short.fa") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'dna', '--merge', 'foo', testdata1) + runtmp.sourmash("sketch", "dna", "--merge", "foo", testdata1) assert runtmp.last_result.status == -1 def test_do_sourmash_sketchdna_fail_no_output(runtmp): - 
testdata1 = utils.get_test_data('short.fa') + testdata1 = utils.get_test_data("short.fa") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'dna', '--merge', 'foo', testdata1) + runtmp.sourmash("sketch", "dna", "--merge", "foo", testdata1) assert runtmp.last_result.status == -1 with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'dna', '--name', 'foo', testdata1) + runtmp.sourmash("sketch", "dna", "--name", "foo", testdata1) assert runtmp.last_result.status == -1 def test_do_sourmash_sketchdna_name_from_first(runtmp): - testdata1 = utils.get_test_data('short3.fa') - runtmp.sourmash('sketch', 'dna', '--name-from-first', testdata1) + testdata1 = utils.get_test_data("short3.fa") + runtmp.sourmash("sketch", "dna", "--name-from-first", testdata1) - sigfile = runtmp.output('short3.fa.sig') + sigfile = runtmp.output("short3.fa.sig") assert os.path.exists(sigfile) sig = next(signature.load_signatures(sigfile)) - assert sig.name == 'firstname' + assert sig.name == "firstname" def test_do_sourmash_sketchdna_multik(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch', 'dna', '-p', 'k=31,k=21', testdata1) + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash("sketch", "dna", "-p", "k=31,k=21", testdata1) - outfile = runtmp.output('short.fa.sig') + outfile = runtmp.output("short.fa.sig") assert os.path.exists(outfile) siglist = list(signature.load_signatures(outfile)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 21 in ksizes assert 31 in ksizes def test_do_sourmash_sketchdna_multik_output(runtmp, sig_save_extension): - testdata1 = utils.get_test_data('short.fa') - outfile = runtmp.output(f'out.{sig_save_extension}') - runtmp.sourmash('sketch', 'dna', '-p', 'k=31,k=21', testdata1, - '-o', outfile) + testdata1 = utils.get_test_data("short.fa") + outfile = runtmp.output(f"out.{sig_save_extension}") + runtmp.sourmash("sketch", "dna", "-p", "k=31,k=21", testdata1, "-o", outfile) print("saved to file/path with extension:", outfile) assert os.path.exists(outfile) siglist = list(sourmash.load_file_as_signatures(outfile)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 21 in ksizes assert 31 in ksizes def test_do_sketch_dna_override_protein_fail(runtmp): - testdata1 = utils.get_test_data('short.fa') + testdata1 = utils.get_test_data("short.fa") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'dna', '-p', 'k=7,num=500,protein', testdata1) + runtmp.sourmash("sketch", "dna", "-p", "k=7,num=500,protein", testdata1) assert runtmp.last_result.status != 0 - assert 'Error creating signatures: Incompatible sketch type' in runtmp.last_result.err + assert ( + "Error creating signatures: Incompatible sketch type" in runtmp.last_result.err + ) def test_do_sketch_protein_override_dna_fail(runtmp): - testdata1 = utils.get_test_data('short.fa') + testdata1 = utils.get_test_data("short.fa") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'protein', '-p', 'k=7,num=500,dna', testdata1) + runtmp.sourmash("sketch", "protein", "-p", "k=7,num=500,dna", testdata1) assert runtmp.last_result.status != 0 - assert 'Error creating signatures: Incompatible sketch type' in runtmp.last_result.err + assert ( + "Error creating signatures: Incompatible sketch type" in runtmp.last_result.err + ) def 
test_do_sketch_translate_multik_with_protein(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch', 'translate', '-p', 'k=7,k=10,num=500', testdata1) + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash("sketch", "translate", "-p", "k=7,k=10,num=500", testdata1) - outfile = runtmp.output('short.fa.sig') + outfile = runtmp.output("short.fa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 7 in ksizes assert 10 in ksizes def test_do_sketch_translate_multik_with_protein_from_file(runtmp): - testdata1 = utils.get_test_data('short.fa') + testdata1 = utils.get_test_data("short.fa") file_list = runtmp.output("filelist.txt") - with open(file_list, 'wt') as fp: + with open(file_list, "w") as fp: print(testdata1, file=fp) - runtmp.sourmash('sketch', 'translate', '-p', 'k=7,k=10,num=500', '--from-file', file_list) + runtmp.sourmash( + "sketch", "translate", "-p", "k=7,k=10,num=500", "--from-file", file_list + ) - outfile = runtmp.output('short.fa.sig') + outfile = runtmp.output("short.fa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 7 in ksizes assert 10 in ksizes def test_do_sketch_translate_multik_with_dayhoff(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch', 'translate', '-p', 'k=7,k=10,num=500', '--dayhoff', testdata1) + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash( + "sketch", "translate", "-p", "k=7,k=10,num=500", "--dayhoff", testdata1 + ) - outfile = runtmp.output('short.fa.sig') + outfile = runtmp.output("short.fa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 7 in ksizes assert 10 in ksizes assert all(x.minhash.dayhoff for x in siglist) def test_do_sketch_translate_multik_with_hp(runtmp): - testdata1 = utils.get_test_data('short.fa') - runtmp.sourmash('sketch', 'translate', '-p', 'k=7,k=10,num=500', '--hp', testdata1) + testdata1 = utils.get_test_data("short.fa") + runtmp.sourmash("sketch", "translate", "-p", "k=7,k=10,num=500", "--hp", testdata1) - outfile = runtmp.output('short.fa.sig') + outfile = runtmp.output("short.fa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 7 in ksizes assert 10 in ksizes assert all(x.minhash.hp for x in siglist) @@ -993,202 +1054,220 @@ def test_do_sketch_translate_multik_with_hp(runtmp): @utils.in_tempdir def test_do_sourmash_sketch_translate_multik_only_protein(c): # check sourmash sketch_translate with only protein, no nucl - testdata1 = utils.get_test_data('short.fa') - c.run_sourmash('sketch', 'translate', '-p', 'k=7,k=10,num=500', - testdata1) - outfile 
= os.path.join(c.location, 'short.fa.sig') + testdata1 = utils.get_test_data("short.fa") + c.run_sourmash("sketch", "translate", "-p", "k=7,k=10,num=500", testdata1) + outfile = os.path.join(c.location, "short.fa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 7 in ksizes assert 10 in ksizes def test_do_sourmash_sketch_translate_bad_sequences(runtmp): """Proper error handling when Ns in dna sequence""" - testdata1 = utils.get_test_data('short.bad.fa') - runtmp.sourmash('sketch', 'translate', '-p', 'k=7,k=10,num=500', testdata1) + testdata1 = utils.get_test_data("short.bad.fa") + runtmp.sourmash("sketch", "translate", "-p", "k=7,k=10,num=500", testdata1) - outfile = runtmp.output('short.bad.fa.sig') + outfile = runtmp.output("short.bad.fa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 7 in ksizes assert 10 in ksizes def test_do_sketch_protein_multik_input(runtmp): - testdata1 = utils.get_test_data('ecoli.faa') - runtmp.sourmash('sketch', 'protein', '-p', 'k=7,k=10,num=500', testdata1) + testdata1 = utils.get_test_data("ecoli.faa") + runtmp.sourmash("sketch", "protein", "-p", "k=7,k=10,num=500", testdata1) - outfile = runtmp.output('ecoli.faa.sig') + outfile = runtmp.output("ecoli.faa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 7 in ksizes assert 10 in ksizes - moltype = set([ x.minhash.moltype == 'protein' - for x in siglist ]) + moltype = set([x.minhash.moltype == "protein" for x in siglist]) assert len(moltype) == 1 assert True in moltype def test_do_sketch_protein_multik_input_from_file(runtmp): - testdata1 = utils.get_test_data('ecoli.faa') + testdata1 = utils.get_test_data("ecoli.faa") file_list = runtmp.output("filelist.txt") - with open(file_list, 'wt') as fp: + with open(file_list, "w") as fp: print(testdata1, file=fp) - runtmp.sourmash('sketch', 'protein', '-p', 'k=7,k=10,num=500', '--from-file', file_list) + runtmp.sourmash( + "sketch", "protein", "-p", "k=7,k=10,num=500", "--from-file", file_list + ) - outfile = runtmp.output('ecoli.faa.sig') + outfile = runtmp.output("ecoli.faa.sig") assert os.path.exists(outfile) - with open(outfile, 'rt') as fp: + with open(outfile) as fp: sigdata = fp.read() siglist = list(signature.load_signatures(sigdata)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 7 in ksizes assert 10 in ksizes - moltype = set([ x.minhash.moltype == 'protein' - for x in siglist ]) + moltype = set([x.minhash.moltype == "protein" for x in siglist]) assert len(moltype) == 1 assert True in moltype def test_do_sourmash_sketchdna_multik_outfile(runtmp): - testdata1 = utils.get_test_data('short.fa') - outfile = runtmp.output('FOO.xxx') + testdata1 = utils.get_test_data("short.fa") + outfile = runtmp.output("FOO.xxx") - 
runtmp.sourmash('sketch', 'dna', '-p', 'k=21,k=31', testdata1, '-o', outfile) + runtmp.sourmash("sketch", "dna", "-p", "k=21,k=31", testdata1, "-o", outfile) assert os.path.exists(outfile) siglist = list(signature.load_signatures(outfile)) assert len(siglist) == 2 - ksizes = set([ x.minhash.ksize for x in siglist ]) + ksizes = set([x.minhash.ksize for x in siglist]) assert 21 in ksizes assert 31 in ksizes def test_do_sourmash_sketchdna_with_scaled_1(runtmp): - testdata1 = utils.get_test_data('short.fa') - outfile = runtmp.output('FOO.xxx') + testdata1 = utils.get_test_data("short.fa") + outfile = runtmp.output("FOO.xxx") - runtmp.sourmash('sketch', 'dna', '-p', 'k=21,k=31,scaled=1', testdata1, '-o', outfile) + runtmp.sourmash( + "sketch", "dna", "-p", "k=21,k=31,scaled=1", testdata1, "-o", outfile + ) assert os.path.exists(outfile) siglist = list(signature.load_signatures(outfile)) assert len(siglist) == 2 - scaled_vals = [ x.minhash.scaled for x in siglist ] + scaled_vals = [x.minhash.scaled for x in siglist] assert len(scaled_vals) == 2 - assert set(scaled_vals) == { 1 } + assert set(scaled_vals) == {1} def test_do_sourmash_sketchdna_with_scaled_2(runtmp): - testdata1 = utils.get_test_data('short.fa') - outfile = runtmp.output('FOO.xxx') + testdata1 = utils.get_test_data("short.fa") + outfile = runtmp.output("FOO.xxx") - runtmp.sourmash('sketch', 'dna', '-p', 'k=21,k=31,scaled=2', testdata1, '-o', outfile) + runtmp.sourmash( + "sketch", "dna", "-p", "k=21,k=31,scaled=2", testdata1, "-o", outfile + ) assert os.path.exists(outfile) siglist = list(signature.load_signatures(outfile)) assert len(siglist) == 2 - max_hashes = [ x.minhash._max_hash for x in siglist ] + max_hashes = [x.minhash._max_hash for x in siglist] assert len(max_hashes) == 2 - assert set(max_hashes) == set([ int(2**64 /2.) ]) + assert set(max_hashes) == set([int(2**64 / 2.0)]) def test_do_sourmash_sketchdna_with_scaled(runtmp): - testdata1 = utils.get_test_data('short.fa') - outfile = runtmp.output('FOO.xxx') + testdata1 = utils.get_test_data("short.fa") + outfile = runtmp.output("FOO.xxx") - runtmp.sourmash('sketch', 'dna', '-p', 'k=21,k=31,scaled=100', testdata1, '-o', outfile) + runtmp.sourmash( + "sketch", "dna", "-p", "k=21,k=31,scaled=100", testdata1, "-o", outfile + ) assert os.path.exists(outfile) siglist = list(signature.load_signatures(outfile)) assert len(siglist) == 2 - max_hashes = [ x.minhash._max_hash for x in siglist ] + max_hashes = [x.minhash._max_hash for x in siglist] assert len(max_hashes) == 2 - assert set(max_hashes) == set([ int(2**64 /100.) 
]) + assert set(max_hashes) == set([int(2**64 / 100.0)]) def test_do_sourmash_sketchdna_with_bad_scaled(runtmp): - testdata1 = utils.get_test_data('short.fa') - outfile = runtmp.output('FOO.xxx') + testdata1 = utils.get_test_data("short.fa") + outfile = runtmp.output("FOO.xxx") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'dna', '-p', 'k=21,k=31,scaled=-1', testdata1, '-o', outfile) + runtmp.sourmash( + "sketch", "dna", "-p", "k=21,k=31,scaled=-1", testdata1, "-o", outfile + ) assert runtmp.last_result.status != 0 print(runtmp.last_result.err) - assert 'ERROR: scaled value must be positive' in runtmp.last_result.err + assert "ERROR: scaled value must be positive" in runtmp.last_result.err with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'dna', '-p', 'k=21,k=31,scaled=1000.5', testdata1, '-o', outfile) + runtmp.sourmash( + "sketch", "dna", "-p", "k=21,k=31,scaled=1000.5", testdata1, "-o", outfile + ) assert runtmp.last_result.status != 0 assert "cannot parse scaled='1000.5' as an integer" in runtmp.last_result.err - runtmp.sourmash('sketch', 'dna', '-p', 'k=21,k=31,scaled=1000000000', testdata1, '-o', outfile) + runtmp.sourmash( + "sketch", "dna", "-p", "k=21,k=31,scaled=1000000000", testdata1, "-o", outfile + ) assert runtmp.last_result.status == 0 - print('XXX') + print("XXX") print(runtmp.last_result.err) - assert 'WARNING: scaled value should be <= 1e6. Continuing anyway.' in runtmp.last_result.err + assert ( + "WARNING: scaled value should be <= 1e6. Continuing anyway." + in runtmp.last_result.err + ) def test_do_sketch_with_seed(runtmp): - testdata1 = utils.get_test_data('short.fa') - outfile = runtmp.output('FOO.xxx') + testdata1 = utils.get_test_data("short.fa") + outfile = runtmp.output("FOO.xxx") - runtmp.sourmash('sketch', 'dna', '-p', 'k=21,k=31,seed=43', testdata1, '-o', outfile) + runtmp.sourmash( + "sketch", "dna", "-p", "k=21,k=31,seed=43", testdata1, "-o", outfile + ) assert os.path.exists(outfile) siglist = list(signature.load_signatures(outfile)) assert len(siglist) == 2 - seeds = [ x.minhash.seed for x in siglist ] + seeds = [x.minhash.seed for x in siglist] assert len(seeds) == 2 - assert set(seeds) == set([ 43 ]) + assert set(seeds) == set([43]) def test_do_sourmash_check_protein_comparisons(runtmp): # this test checks 2 x 2 protein comparisons with E. coli genes. - testdata1 = utils.get_test_data('ecoli.faa') + testdata1 = utils.get_test_data("ecoli.faa") - runtmp.sourmash('sketch', 'protein', '-p', 'k=7,num=500', '--singleton', testdata1) + runtmp.sourmash("sketch", "protein", "-p", "k=7,num=500", "--singleton", testdata1) - sig1 = runtmp.output('ecoli.faa.sig') + sig1 = runtmp.output("ecoli.faa.sig") assert os.path.exists(sig1) - testdata2 = utils.get_test_data('ecoli.genes.fna') - runtmp.sourmash('sketch', 'translate', '-p', 'k=7,num=500', '--singleton', testdata2) + testdata2 = utils.get_test_data("ecoli.genes.fna") + runtmp.sourmash( + "sketch", "translate", "-p", "k=7,num=500", "--singleton", testdata2 + ) - sig2 = runtmp.output('ecoli.genes.fna.sig') + sig2 = runtmp.output("ecoli.genes.fna.sig") assert os.path.exists(sig2) # I'm not sure why load_signatures is randomizing order, but ok. 
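
A quick aside on the scaled assertions above: a `scaled=S` sketch keeps only hashes at or below a cutoff of roughly 2**64 / S, so it retains about 1/S of the hash space. The tests pin the cutoff with the same `int(2**64 / S)` expression used in this worked check:

```python
# Worked check of the scaled <-> _max_hash relationship asserted by
# the tests above; purely arithmetic, no sourmash import needed.
for scaled in (1, 2, 100):
    cutoff = int(2**64 / scaled)  # same expression as the tests
    fraction = cutoff / 2**64     # fraction of hash space kept
    print(f"scaled={scaled}: max_hash={cutoff} (~{fraction:.3%} of 2**64)")

# scaled=2 keeps exactly half the hash space, which is what
# test_do_sourmash_sketchdna_with_scaled_2 asserts.
assert int(2**64 / 2) == 2**63
```
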
@@ -1199,13 +1278,13 @@ def test_do_sourmash_check_protein_comparisons(runtmp): sig1_trans, sig2_trans = sorted(x, key=lambda x: x.name) name1 = sig1_aa.name.split()[0] - assert name1 == 'NP_414543.1' + assert name1 == "NP_414543.1" name2 = sig2_aa.name.split()[0] - assert name2 == 'NP_414544.1' + assert name2 == "NP_414544.1" name3 = sig1_trans.name.split()[0] - assert name3 == 'gi|556503834:2801-3733' + assert name3 == "gi|556503834:2801-3733" name4 = sig2_trans.name.split()[0] - assert name4 == 'gi|556503834:337-2799' + assert name4 == "gi|556503834:337-2799" print(name1, name3, round(sig1_aa.similarity(sig1_trans), 3)) print(name2, name3, round(sig2_aa.similarity(sig1_trans), 3)) @@ -1222,10 +1301,9 @@ def test_do_sourmash_check_protein_comparisons(runtmp): def test_do_sourmash_check_knowngood_dna_comparisons(c): # this test checks against a known good signature calculated # by utils/compute-dna-mh-another-way.py - testdata1 = utils.get_test_data('ecoli.genes.fna') - c.run_sourmash('sketch', 'dna', '-p', 'k=21,num=500', - '--singleton', testdata1) - sig1 = c.output('ecoli.genes.fna.sig') + testdata1 = utils.get_test_data("ecoli.genes.fna") + c.run_sourmash("sketch", "dna", "-p", "k=21,num=500", "--singleton", testdata1) + sig1 = c.output("ecoli.genes.fna.sig") assert os.path.exists(sig1) x = list(signature.load_signatures(sig1)) @@ -1234,7 +1312,7 @@ def test_do_sourmash_check_knowngood_dna_comparisons(c): print(sig1.name) print(sig2.name) - knowngood = utils.get_test_data('benchmark.dna.sig') + knowngood = utils.get_test_data("benchmark.dna.sig") good = list(signature.load_signatures(knowngood))[0] assert sig2.similarity(good) == 1.0 @@ -1243,16 +1321,15 @@ def test_do_sourmash_check_knowngood_dna_comparisons(c): @utils.in_tempdir def test_do_sourmash_check_knowngood_dna_comparisons_use_rna(c): # check the rna ; otherwise identical to previous test. 
- testdata1 = utils.get_test_data('ecoli.genes.fna') - c.run_sourmash('sketch', 'rna', '-p', 'k=21,num=500', '--singleton', - testdata1) - sig1 = c.output('ecoli.genes.fna.sig') + testdata1 = utils.get_test_data("ecoli.genes.fna") + c.run_sourmash("sketch", "rna", "-p", "k=21,num=500", "--singleton", testdata1) + sig1 = c.output("ecoli.genes.fna.sig") assert os.path.exists(sig1) x = list(signature.load_signatures(sig1)) sig1, sig2 = sorted(x, key=lambda x: x.name) - knowngood = utils.get_test_data('benchmark.dna.sig') + knowngood = utils.get_test_data("benchmark.dna.sig") good = list(signature.load_signatures(knowngood))[0] assert sig2.similarity(good) == 1.0 @@ -1261,17 +1338,17 @@ def test_do_sourmash_check_knowngood_dna_comparisons_use_rna(c): def test_do_sourmash_check_knowngood_input_protein_comparisons(runtmp): # this test checks against a known good signature calculated # by utils/compute-input-prot-another-way.py - testdata1 = utils.get_test_data('ecoli.faa') + testdata1 = utils.get_test_data("ecoli.faa") - runtmp.sourmash('sketch', 'protein', '-p', 'k=7,num=500', '--singleton', testdata1) + runtmp.sourmash("sketch", "protein", "-p", "k=7,num=500", "--singleton", testdata1) - sig1 = runtmp.output('ecoli.faa.sig') + sig1 = runtmp.output("ecoli.faa.sig") assert os.path.exists(sig1) x = list(signature.load_signatures(sig1)) sig1_aa, sig2_aa = sorted(x, key=lambda x: x.name) - knowngood = utils.get_test_data('benchmark.input_prot.sig') + knowngood = utils.get_test_data("benchmark.input_prot.sig") good_aa = list(signature.load_signatures(knowngood))[0] assert sig1_aa.similarity(good_aa) == 1.0 @@ -1280,17 +1357,19 @@ def test_do_sourmash_check_knowngood_input_protein_comparisons(runtmp): def test_do_sourmash_check_knowngood_protein_comparisons(runtmp): # this test checks against a known good signature calculated # by utils/compute-prot-mh-another-way.py - testdata1 = utils.get_test_data('ecoli.genes.fna') + testdata1 = utils.get_test_data("ecoli.genes.fna") - runtmp.sourmash('sketch', 'translate', '-p', 'k=7,num=500', '--singleton', testdata1) + runtmp.sourmash( + "sketch", "translate", "-p", "k=7,num=500", "--singleton", testdata1 + ) - sig1 = runtmp.output('ecoli.genes.fna.sig') + sig1 = runtmp.output("ecoli.genes.fna.sig") assert os.path.exists(sig1) x = list(signature.load_signatures(sig1)) sig1_trans, sig2_trans = sorted(x, key=lambda x: x.name) - knowngood = utils.get_test_data('benchmark.prot.sig') + knowngood = utils.get_test_data("benchmark.prot.sig") good_trans = list(signature.load_signatures(knowngood))[0] assert sig2_trans.similarity(good_trans) == 1.0 @@ -1298,19 +1377,26 @@ def test_do_sourmash_check_knowngood_protein_comparisons(runtmp): def test_do_sourmash_singleton_multiple_files_no_out_specified(runtmp): # this test checks that --singleton -o works - testdata1 = utils.get_test_data('ecoli.faa') - testdata2 = utils.get_test_data('shewanella.faa') + testdata1 = utils.get_test_data("ecoli.faa") + testdata2 = utils.get_test_data("shewanella.faa") - runtmp.sourmash('sketch', 'protein', '-p', 'k=7', '--singleton', - testdata1, testdata2) + runtmp.sourmash( + "sketch", "protein", "-p", "k=7", "--singleton", testdata1, testdata2 + ) print(runtmp.last_result.err) - assert "saved 2 signature(s) to 'ecoli.faa.sig'. Note: signature license is CC0." in runtmp.last_result.err - assert "saved 2 signature(s) to 'shewanella.faa.sig'. Note: signature license is CC0." 
in runtmp.last_result.err - - sig1 = runtmp.output('ecoli.faa.sig') + assert ( + "saved 2 signature(s) to 'ecoli.faa.sig'. Note: signature license is CC0." + in runtmp.last_result.err + ) + assert ( + "saved 2 signature(s) to 'shewanella.faa.sig'. Note: signature license is CC0." + in runtmp.last_result.err + ) + + sig1 = runtmp.output("ecoli.faa.sig") assert os.path.exists(sig1) - sig2 = runtmp.output('shewanella.faa.sig') + sig2 = runtmp.output("shewanella.faa.sig") assert os.path.exists(sig2) x = list(signature.load_signatures(sig1)) @@ -1324,27 +1410,39 @@ def test_do_sourmash_singleton_multiple_files_no_out_specified(runtmp): assert len(x) == 2 assert len(y) == 2 - idents = [ ss.name.split()[0] for ss in x ] + idents = [ss.name.split()[0] for ss in x] print(idents) - assert set(['NP_414543.1', 'NP_414544.1' ]) == set(idents) + assert set(["NP_414543.1", "NP_414544.1"]) == set(idents) - idents = [ ss.name.split()[0] for ss in y ] + idents = [ss.name.split()[0] for ss in y] print(idents) - assert set(['WP_006079348.1', 'WP_006079351.1']) == set(idents) + assert set(["WP_006079348.1", "WP_006079351.1"]) == set(idents) def test_do_sourmash_singleton_multiple_files_output(runtmp): # this test checks that --singleton -o works - testdata1 = utils.get_test_data('ecoli.faa') - testdata2 = utils.get_test_data('shewanella.faa') - - runtmp.sourmash('sketch', 'protein', '-p', 'k=7', '--singleton', - testdata1, testdata2, '-o', 'output.sig') + testdata1 = utils.get_test_data("ecoli.faa") + testdata2 = utils.get_test_data("shewanella.faa") + + runtmp.sourmash( + "sketch", + "protein", + "-p", + "k=7", + "--singleton", + testdata1, + testdata2, + "-o", + "output.sig", + ) print(runtmp.last_result.err) - assert "saved 4 signature(s) to 'output.sig'. Note: signature license is CC0." in runtmp.last_result.err + assert ( + "saved 4 signature(s) to 'output.sig'. Note: signature license is CC0." + in runtmp.last_result.err + ) - sig1 = runtmp.output('output.sig') + sig1 = runtmp.output("output.sig") assert os.path.exists(sig1) x = list(signature.load_signatures(sig1)) @@ -1353,23 +1451,37 @@ def test_do_sourmash_singleton_multiple_files_output(runtmp): assert len(x) == 4 - idents = [ ss.name.split()[0] for ss in x ] + idents = [ss.name.split()[0] for ss in x] print(idents) - assert set(['NP_414543.1', 'NP_414544.1', 'WP_006079348.1', 'WP_006079351.1']) == set(idents) + assert set( + ["NP_414543.1", "NP_414544.1", "WP_006079348.1", "WP_006079351.1"] + ) == set(idents) def test_do_sourmash_singleton_multiple_files_output_zip(runtmp): # this test checks that --singleton -o works - testdata1 = utils.get_test_data('ecoli.faa') - testdata2 = utils.get_test_data('shewanella.faa') - - runtmp.sourmash('sketch', 'protein', '-p', 'k=7', '--singleton', - testdata1, testdata2, '-o', 'output.zip') + testdata1 = utils.get_test_data("ecoli.faa") + testdata2 = utils.get_test_data("shewanella.faa") + + runtmp.sourmash( + "sketch", + "protein", + "-p", + "k=7", + "--singleton", + testdata1, + testdata2, + "-o", + "output.zip", + ) print(runtmp.last_result.err) - assert "saved 4 signature(s) to 'output.zip'. Note: signature license is CC0." in runtmp.last_result.err + assert ( + "saved 4 signature(s) to 'output.zip'. Note: signature license is CC0." 
+ in runtmp.last_result.err + ) - sig1 = runtmp.output('output.zip') + sig1 = runtmp.output("output.zip") assert os.path.exists(sig1) x = list(sourmash.load_file_as_signatures(sig1)) @@ -1378,35 +1490,37 @@ def test_do_sourmash_singleton_multiple_files_output_zip(runtmp): assert len(x) == 4 - idents = [ ss.name.split()[0] for ss in x ] + idents = [ss.name.split()[0] for ss in x] print(idents) - assert set(['NP_414543.1', 'NP_414544.1', 'WP_006079348.1', 'WP_006079351.1']) == set(idents) + assert set( + ["NP_414543.1", "NP_414544.1", "WP_006079348.1", "WP_006079351.1"] + ) == set(idents) def test_protein_with_stop_codons(runtmp): # compare protein seq with/without stop codons, via cli and also python # apis - testdata1 = utils.get_test_data('ecoli.faa') + testdata1 = utils.get_test_data("ecoli.faa") with screed.open(testdata1) as f: - ecoli_seq = [ record.sequence for record in f ] + ecoli_seq = [record.sequence for record in f] # first, via CLI w/o stop codons - runtmp.sourmash('sketch', 'protein', '-p', 'k=7,scaled=1', testdata1) - sig1 = runtmp.output('ecoli.faa.sig') + runtmp.sourmash("sketch", "protein", "-p", "k=7,scaled=1", testdata1) + sig1 = runtmp.output("ecoli.faa.sig") assert os.path.exists(sig1) x = signature.load_one_signature(sig1) cli_mh1 = x.minhash # second, via CLI w/stop codons - ecoli_stop = runtmp.output('ecoli.stop.faa') - with open(ecoli_stop, 'wt') as fp: + ecoli_stop = runtmp.output("ecoli.stop.faa") + with open(ecoli_stop, "w") as fp: for seq in ecoli_seq: - fp.write(f'>seq\n{seq}*\n') + fp.write(f">seq\n{seq}*\n") - runtmp.sourmash('sketch', 'protein', '-p', 'k=7,scaled=1', ecoli_stop) - sig2 = runtmp.output('ecoli.stop.faa.sig') + runtmp.sourmash("sketch", "protein", "-p", "k=7,scaled=1", ecoli_stop) + sig2 = runtmp.output("ecoli.stop.faa.sig") assert os.path.exists(sig2) x = signature.load_one_signature(sig2) @@ -1420,7 +1534,7 @@ def test_protein_with_stop_codons(runtmp): # now calculate sketch with MinHash and stop codons... py_mh2 = MinHash(n=0, ksize=7, is_protein=True, scaled=1) for seq in ecoli_seq: - py_mh2.add_protein(seq + '*') + py_mh2.add_protein(seq + "*") # and, last, calculate hashes separately with seq_to_hashes h_mh1 = MinHash(n=0, ksize=7, is_protein=True, scaled=1) @@ -1430,7 +1544,7 @@ def test_protein_with_stop_codons(runtmp): h = h_mh1.seq_to_hashes(seq, is_protein=1) h_mh1.add_many(h) - h = h_mh2.seq_to_hashes(seq + '*', is_protein=1) + h = h_mh2.seq_to_hashes(seq + "*", is_protein=1) h_mh2.add_many(h) # check! 
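
The stop-codon tests (this one for `protein`, with `hp` and `dayhoff` variants following) compare three ways of building the same sketch: the CLI, `MinHash.add_protein()`, and hashing first with `MinHash.seq_to_hashes()` followed by `add_many()`. A condensed sketch of the two Python routes, using the same parameters as the tests (k=7, scaled=1) and an arbitrary protein sequence:

```python
# Condensed sketch of the construction routes the stop-codon tests
# compare: building a protein MinHash via add_protein() vs. hashing
# first with seq_to_hashes() and then add_many(). Mirrors the tests'
# parameters (k=7, scaled=1); `seq` is any protein sequence.
from sourmash import MinHash

seq = "MCGIVGAVAQRDVAEILVEGLRRLEYRGYDS"

mh_direct = MinHash(n=0, ksize=7, is_protein=True, scaled=1)
mh_direct.add_protein(seq)

mh_via_hashes = MinHash(n=0, ksize=7, is_protein=True, scaled=1)
mh_via_hashes.add_many(mh_via_hashes.seq_to_hashes(seq, is_protein=1))

# The two routes should agree -- this is the invariant the tests
# above rely on before adding '*' into the mix.
assert set(mh_direct.hashes) == set(mh_via_hashes.hashes)
```
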
@@ -1453,26 +1567,26 @@ def test_hp_with_stop_codons(runtmp): # compare hp seq with/without stop codons, via cli and also python # apis - testdata1 = utils.get_test_data('ecoli.faa') + testdata1 = utils.get_test_data("ecoli.faa") with screed.open(testdata1) as f: - ecoli_seq = [ record.sequence for record in f ] + ecoli_seq = [record.sequence for record in f] # first, via CLI w/o stop codons - runtmp.sourmash('sketch', 'protein', '-p', 'k=7,scaled=1,hp', testdata1) - sig1 = runtmp.output('ecoli.faa.sig') + runtmp.sourmash("sketch", "protein", "-p", "k=7,scaled=1,hp", testdata1) + sig1 = runtmp.output("ecoli.faa.sig") assert os.path.exists(sig1) x = signature.load_one_signature(sig1) cli_mh1 = x.minhash # second, via CLI w/stop codons - ecoli_stop = runtmp.output('ecoli.stop.faa') - with open(ecoli_stop, 'wt') as fp: + ecoli_stop = runtmp.output("ecoli.stop.faa") + with open(ecoli_stop, "w") as fp: for seq in ecoli_seq: - fp.write(f'>seq\n{seq}*\n') + fp.write(f">seq\n{seq}*\n") - runtmp.sourmash('sketch', 'protein', '-p', 'k=7,scaled=1,hp', ecoli_stop) - sig2 = runtmp.output('ecoli.stop.faa.sig') + runtmp.sourmash("sketch", "protein", "-p", "k=7,scaled=1,hp", ecoli_stop) + sig2 = runtmp.output("ecoli.stop.faa.sig") assert os.path.exists(sig2) x = signature.load_one_signature(sig2) @@ -1486,7 +1600,7 @@ def test_hp_with_stop_codons(runtmp): # now calculate sketch with MinHash and stop codons... py_mh2 = MinHash(n=0, ksize=7, hp=True, scaled=1) for seq in ecoli_seq: - py_mh2.add_protein(seq + '*') + py_mh2.add_protein(seq + "*") # and, last, calculate hashes separately with seq_to_hashes h_mh1 = MinHash(n=0, ksize=7, hp=True, scaled=1) @@ -1496,7 +1610,7 @@ def test_hp_with_stop_codons(runtmp): h = h_mh1.seq_to_hashes(seq, is_protein=1) h_mh1.add_many(h) - h = h_mh2.seq_to_hashes(seq + '*', is_protein=1) + h = h_mh2.seq_to_hashes(seq + "*", is_protein=1) h_mh2.add_many(h) # check! @@ -1519,26 +1633,26 @@ def test_dayhoff_with_stop_codons(runtmp): # compare dayhoff seq with/without stop codons, via cli and also python # apis - testdata1 = utils.get_test_data('ecoli.faa') + testdata1 = utils.get_test_data("ecoli.faa") with screed.open(testdata1) as f: - ecoli_seq = [ record.sequence for record in f] + ecoli_seq = [record.sequence for record in f] # first, via CLI w/o stop codons - runtmp.sourmash('sketch', 'protein', '-p', 'k=7,scaled=1,dayhoff', testdata1) - sig1 = runtmp.output('ecoli.faa.sig') + runtmp.sourmash("sketch", "protein", "-p", "k=7,scaled=1,dayhoff", testdata1) + sig1 = runtmp.output("ecoli.faa.sig") assert os.path.exists(sig1) x = signature.load_one_signature(sig1) cli_mh1 = x.minhash # second, via CLI w/stop codons - ecoli_stop = runtmp.output('ecoli.stop.faa') - with open(ecoli_stop, 'wt') as fp: + ecoli_stop = runtmp.output("ecoli.stop.faa") + with open(ecoli_stop, "w") as fp: for seq in ecoli_seq: - fp.write(f'>seq\n{seq}*\n') + fp.write(f">seq\n{seq}*\n") - runtmp.sourmash('sketch', 'protein', '-p', 'k=7,scaled=1,dayhoff', ecoli_stop) - sig2 = runtmp.output('ecoli.stop.faa.sig') + runtmp.sourmash("sketch", "protein", "-p", "k=7,scaled=1,dayhoff", ecoli_stop) + sig2 = runtmp.output("ecoli.stop.faa.sig") assert os.path.exists(sig2) x = signature.load_one_signature(sig2) @@ -1552,7 +1666,7 @@ def test_dayhoff_with_stop_codons(runtmp): # now calculate sketch with MinHash and stop codons... 
py_mh2 = MinHash(n=0, ksize=7, dayhoff=True, scaled=1) for seq in ecoli_seq: - py_mh2.add_protein(seq + '*') + py_mh2.add_protein(seq + "*") # and, last, calculate hashes separately with seq_to_hashes h_mh1 = MinHash(n=0, ksize=7, dayhoff=True, scaled=1) @@ -1562,7 +1676,7 @@ def test_dayhoff_with_stop_codons(runtmp): h = h_mh1.seq_to_hashes(seq, is_protein=1) h_mh1.add_many(h) - h = h_mh2.seq_to_hashes(seq + '*', is_protein=1) + h = h_mh2.seq_to_hashes(seq + "*", is_protein=1) h_mh2.add_many(h) # check! @@ -1586,66 +1700,84 @@ def test_dayhoff_with_stop_codons(runtmp): def test_fromfile_dna(runtmp): # does it run? yes, hopefully. - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) - - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip', '-p', 'dna') + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) + + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-o", + "out.zip", + "-p", + "dna", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert os.path.exists(runtmp.output('out.zip')) - idx = sourmash.load_file_as_index(runtmp.output('out.zip')) + assert os.path.exists(runtmp.output("out.zip")) + idx = sourmash.load_file_as_index(runtmp.output("out.zip")) siglist = list(idx.signatures()) assert len(siglist) == 1 ss = siglist[0] - assert ss.name == 'GCA_903797575 Salmonella enterica' - assert ss.minhash.moltype == 'DNA' + assert ss.name == "GCA_903797575 Salmonella enterica" + assert ss.minhash.moltype == "DNA" assert "** 1 total requested; output 1, skipped 0" in runtmp.last_result.err def test_fromfile_dna_csv_gz(runtmp): # test with a gzipped csv - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) # gzip the CSV file - with open(runtmp.output('sketch_fromfile/salmonella.csv'), 'rb') as infp: - with gzip.open(runtmp.output('salmonella.csv.gz'), 'w') as outfp: + with open(runtmp.output("sketch_fromfile/salmonella.csv"), "rb") as infp: + with gzip.open(runtmp.output("salmonella.csv.gz"), "w") as outfp: outfp.write(infp.read()) - runtmp.sourmash('sketch', 'fromfile', 'salmonella.csv.gz', - '-o', 'out.zip', '-p', 'dna') + runtmp.sourmash( + "sketch", "fromfile", "salmonella.csv.gz", "-o", "out.zip", "-p", "dna" + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert os.path.exists(runtmp.output('out.zip')) - idx = sourmash.load_file_as_index(runtmp.output('out.zip')) + assert os.path.exists(runtmp.output("out.zip")) + idx = sourmash.load_file_as_index(runtmp.output("out.zip")) siglist = list(idx.signatures()) assert len(siglist) == 1 ss = siglist[0] - assert ss.name == 'GCA_903797575 Salmonella enterica' - assert ss.minhash.moltype == 'DNA' + assert ss.name == "GCA_903797575 Salmonella enterica" + assert ss.minhash.moltype == "DNA" assert "** 1 total requested; output 1, skipped 0" in runtmp.last_result.err def test_fromfile_dna_empty(runtmp): # test what happens on empty files. 
- test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) # zero out the file - with gzip.open(runtmp.output('sketch_fromfile/GCA_903797575.1_PARATYPHIC668_genomic.fna.gz'), 'w') as fp: + with gzip.open( + runtmp.output("sketch_fromfile/GCA_903797575.1_PARATYPHIC668_genomic.fna.gz"), + "w", + ): pass # now what happens? with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip', '-p', 'dna') + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-o", + "out.zip", + "-p", + "dna", + ) print(runtmp.last_result.out) err = runtmp.last_result.err @@ -1656,125 +1788,172 @@ def test_fromfile_dna_empty(runtmp): def test_fromfile_dna_check_sequence_succeed(runtmp): # does it run? yes, hopefully. - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) - - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip', '-p', 'dna', '--check-sequence') + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) + + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-o", + "out.zip", + "-p", + "dna", + "--check-sequence", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert os.path.exists(runtmp.output('out.zip')) - idx = sourmash.load_file_as_index(runtmp.output('out.zip')) + assert os.path.exists(runtmp.output("out.zip")) + idx = sourmash.load_file_as_index(runtmp.output("out.zip")) siglist = list(idx.signatures()) assert len(siglist) == 1 ss = siglist[0] - assert ss.name == 'GCA_903797575 Salmonella enterica' - assert ss.minhash.moltype == 'DNA' + assert ss.name == "GCA_903797575 Salmonella enterica" + assert ss.minhash.moltype == "DNA" assert "** 1 total requested; output 1, skipped 0" in runtmp.last_result.err def test_fromfile_dna_check_sequence_fail(runtmp): # does it run? yes, hopefully. - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'fromfile', - 'sketch_fromfile/salmonella-badseq.csv', - '-o', 'out.zip', '-p', 'dna', '--check-sequence') + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella-badseq.csv", + "-o", + "out.zip", + "-p", + "dna", + "--check-sequence", + ) print(runtmp.last_result.out) err = runtmp.last_result.err print(err) assert "ERROR when reading from " in err - assert "invalid DNA character in input k-mer: MTNILKLFSRKAGEPLDSLAVKSVRQHLSGD" in err + assert ( + "invalid DNA character in input k-mer: MTNILKLFSRKAGEPLDSLAVKSVRQHLSGD" in err + ) def test_fromfile_dna_and_protein(runtmp): # does it run and produce DNA _and_ protein signatures? 
- test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) - - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip', '-p', 'dna', '-p', 'protein') + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) + + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-o", + "out.zip", + "-p", + "dna", + "-p", + "protein", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert os.path.exists(runtmp.output('out.zip')) - idx = sourmash.load_file_as_index(runtmp.output('out.zip')) + assert os.path.exists(runtmp.output("out.zip")) + idx = sourmash.load_file_as_index(runtmp.output("out.zip")) siglist = list(idx.signatures()) assert len(siglist) == 2 - prot_sig = [ ss for ss in siglist if ss.minhash.moltype == 'protein' ] + prot_sig = [ss for ss in siglist if ss.minhash.moltype == "protein"] assert len(prot_sig) == 1 prot_sig = prot_sig[0] - assert prot_sig.name == 'GCA_903797575 Salmonella enterica' + assert prot_sig.name == "GCA_903797575 Salmonella enterica" - dna_sig = [ ss for ss in siglist if ss.minhash.moltype == 'DNA' ] + dna_sig = [ss for ss in siglist if ss.minhash.moltype == "DNA"] assert len(dna_sig) == 1 dna_sig = dna_sig[0] - assert dna_sig.name == 'GCA_903797575 Salmonella enterica' + assert dna_sig.name == "GCA_903797575 Salmonella enterica" assert "** 2 total requested; output 2, skipped 0" in runtmp.last_result.err def test_fromfile_dna_and_protein_and_hp_and_dayhoff(runtmp): # does it run and produce DNA _and_ protein signatures? - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) - - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip', '-p', 'dna', '-p', 'dna,k=25', - '-p', 'protein', - '-p', 'hp', '-p', 'dayhoff') + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) + + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-o", + "out.zip", + "-p", + "dna", + "-p", + "dna,k=25", + "-p", + "protein", + "-p", + "hp", + "-p", + "dayhoff", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert os.path.exists(runtmp.output('out.zip')) - idx = sourmash.load_file_as_index(runtmp.output('out.zip')) + assert os.path.exists(runtmp.output("out.zip")) + idx = sourmash.load_file_as_index(runtmp.output("out.zip")) siglist = list(idx.signatures()) assert len(siglist) == 5 - prot_sig = [ ss for ss in siglist if ss.minhash.moltype == 'protein' ] + prot_sig = [ss for ss in siglist if ss.minhash.moltype == "protein"] assert len(prot_sig) == 1 prot_sig = prot_sig[0] - assert prot_sig.name == 'GCA_903797575 Salmonella enterica' + assert prot_sig.name == "GCA_903797575 Salmonella enterica" - prot_sig = [ ss for ss in siglist if ss.minhash.moltype == 'hp' ] + prot_sig = [ss for ss in siglist if ss.minhash.moltype == "hp"] assert len(prot_sig) == 1 prot_sig = prot_sig[0] - assert prot_sig.name == 'GCA_903797575 Salmonella enterica' + assert prot_sig.name == "GCA_903797575 Salmonella enterica" - prot_sig = [ ss for ss in siglist if ss.minhash.moltype == 'dayhoff' ] + prot_sig = [ss for ss in siglist if ss.minhash.moltype == "dayhoff"] assert len(prot_sig) == 1 prot_sig = prot_sig[0] - assert prot_sig.name == 'GCA_903797575 Salmonella enterica' + assert prot_sig.name == "GCA_903797575 Salmonella enterica" - 
dna_sig = [ ss for ss in siglist if ss.minhash.moltype == 'DNA' ] + dna_sig = [ss for ss in siglist if ss.minhash.moltype == "DNA"] assert len(dna_sig) == 2 dna_sig = dna_sig[0] - assert dna_sig.name == 'GCA_903797575 Salmonella enterica' + assert dna_sig.name == "GCA_903797575 Salmonella enterica" assert "** 5 total requested; output 5, skipped 0" in runtmp.last_result.err def test_fromfile_dna_and_protein_noname(runtmp): # nothing in the name column - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'fromfile', - 'sketch_fromfile/salmonella-noname.csv', - '-o', 'out.zip', '-p', 'dna', '-p', 'protein') + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella-noname.csv", + "-o", + "out.zip", + "-p", + "dna", + "-p", + "protein", + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -1786,14 +1965,22 @@ def test_fromfile_dna_and_protein_noname(runtmp): def test_fromfile_dna_and_protein_dup_name(runtmp): # duplicate names - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'fromfile', - 'sketch_fromfile/salmonella.csv', - 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip', '-p', 'dna', '-p', 'protein') + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "sketch_fromfile/salmonella.csv", + "-o", + "out.zip", + "-p", + "dna", + "-p", + "protein", + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -1806,15 +1993,23 @@ def test_fromfile_dna_and_protein_dup_name(runtmp): def test_fromfile_dna_and_protein_dup_name_report(runtmp): # duplicate names - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'fromfile', - 'sketch_fromfile/salmonella.csv', - 'sketch_fromfile/salmonella.csv', - '--report-duplicated', - '-o', 'out.zip', '-p', 'dna', '-p', 'protein') + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "sketch_fromfile/salmonella.csv", + "--report-duplicated", + "-o", + "out.zip", + "-p", + "dna", + "-p", + "protein", + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -1827,13 +2022,19 @@ def test_fromfile_dna_and_protein_dup_name_report(runtmp): def test_fromfile_dna_and_protein_missing(runtmp): # test what happens when missing protein. 
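[Editorial aside, not part of the patch: the per-moltype assertions in the multi-`-p` test above can also be checked compactly by tallying moltypes across the output collection. A minimal sketch, using only calls that already appear in these tests (sourmash.load_file_as_index, Index.signatures, MinHash.moltype):]

    from collections import Counter

    import sourmash

    idx = sourmash.load_file_as_index("out.zip")
    by_moltype = Counter(ss.minhash.moltype for ss in idx.signatures())

    # -p dna and -p dna,k=25 yield two DNA sketches; protein/hp/dayhoff one each.
    assert by_moltype == {"DNA": 2, "protein": 1, "hp": 1, "dayhoff": 1}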
- test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sketch', 'fromfile', - 'sketch_fromfile/salmonella-missing.csv', - '-o', 'out.zip', '-p', 'protein') + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella-missing.csv", + "-o", + "out.zip", + "-p", + "protein", + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -1841,19 +2042,29 @@ def test_fromfile_dna_and_protein_missing(runtmp): print(out) print(err) - assert "WARNING: fromfile entry 'GCA_903797575 Salmonella enterica' is missing a proteome" in err + assert ( + "WARNING: fromfile entry 'GCA_903797575 Salmonella enterica' is missing a proteome" + in err + ) assert "** ERROR: we cannot build some of the requested signatures." in err assert "** 1 total signatures (for 1 names) cannot be built." in err def test_fromfile_dna_and_protein_missing_ignore(runtmp): # test what happens when missing protein + --ignore-missing - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) - - runtmp.sourmash('sketch', 'fromfile', - 'sketch_fromfile/salmonella-missing.csv', - '-o', 'out.zip', '-p', 'protein', '--ignore-missing') + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) + + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella-missing.csv", + "-o", + "out.zip", + "-p", + "protein", + "--ignore-missing", + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -1861,7 +2072,10 @@ def test_fromfile_dna_and_protein_missing_ignore(runtmp): print(out) print(err) - assert "WARNING: fromfile entry 'GCA_903797575 Salmonella enterica' is missing a proteome" in err + assert ( + "WARNING: fromfile entry 'GCA_903797575 Salmonella enterica' is missing a proteome" + in err + ) assert "** ERROR: we cannot build some of the requested signatures." in err assert "** 1 total signatures (for 1 names) cannot be built." 
in err @@ -1872,21 +2086,35 @@ def test_fromfile_dna_and_protein_missing_ignore(runtmp): def test_fromfile_no_overwrite(runtmp): # test --force-output-already-exists - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) - - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip', '-p', 'dna') + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) + + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-o", + "out.zip", + "-p", + "dna", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert os.path.exists(runtmp.output('out.zip')) + assert os.path.exists(runtmp.output("out.zip")) # now run again; will fail since already exists - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip', '-p', 'protein') + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-o", + "out.zip", + "-p", + "protein", + ) err = runtmp.last_result.err @@ -1896,55 +2124,81 @@ def test_fromfile_no_overwrite(runtmp): def test_fromfile_force_overwrite(runtmp): # test --force-output-already-exists - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) - - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip', '-p', 'dna') + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) + + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-o", + "out.zip", + "-p", + "dna", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert os.path.exists(runtmp.output('out.zip')) + assert os.path.exists(runtmp.output("out.zip")) # now run again, with --force - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip', '-p', 'protein', '--force-output') + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-o", + "out.zip", + "-p", + "protein", + "--force-output", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert os.path.exists(runtmp.output('out.zip')) - idx = sourmash.load_file_as_index(runtmp.output('out.zip')) + assert os.path.exists(runtmp.output("out.zip")) + idx = sourmash.load_file_as_index(runtmp.output("out.zip")) siglist = list(idx.signatures()) assert len(siglist) == 2 - names = list(set([ ss.name for ss in siglist ])) - assert names[0] == 'GCA_903797575 Salmonella enterica' + names = list(set([ss.name for ss in siglist])) + assert names[0] == "GCA_903797575 Salmonella enterica" assert "** 1 total requested; output 1, skipped 0" in runtmp.last_result.err def test_fromfile_need_params(runtmp): # check that we need a -p - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip') + runtmp.sourmash( + "sketch", "fromfile", "sketch_fromfile/salmonella.csv", "-o", "out.zip" + ) print(str(exc)) - assert "Error creating signatures: No default moltype and none specified in param string" in str(exc) 
+ assert ( + "Error creating signatures: No default moltype and none specified in param string" + in str(exc) + ) def test_fromfile_seed_not_allowed(runtmp): # check that we cannot adjust 'seed' - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip', '-p', 'dna,seed=43') + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-o", + "out.zip", + "-p", + "dna,seed=43", + ) print(str(exc)) assert "ERROR: cannot set 'seed' in 'sketch fromfile'" in str(exc) @@ -1952,32 +2206,49 @@ def test_fromfile_seed_not_allowed(runtmp): def test_fromfile_license_not_allowed(runtmp): # check that license is CC0 - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-o', 'out.zip', '-p', 'dna', - '--license', 'BSD') + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-o", + "out.zip", + "-p", + "dna", + "--license", + "BSD", + ) print(str(exc)) - assert 'sourmash only supports CC0-licensed signatures' in str(exc) + assert "sourmash only supports CC0-licensed signatures" in str(exc) def test_fromfile_dna_and_protein_csv_output(runtmp): # does it run and produce DNA _and_ protein signatures? - test_inp = utils.get_test_data('sketch_fromfile') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) - - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '--output-csv', 'out.csv', '-p', 'dna', '-p', 'protein') + test_inp = utils.get_test_data("sketch_fromfile") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) + + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "--output-csv", + "out.csv", + "-p", + "dna", + "-p", + "protein", + ) print(runtmp.last_result.out) print(runtmp.last_result.err) - assert os.path.exists(runtmp.output('out.csv')) + assert os.path.exists(runtmp.output("out.csv")) - with open(runtmp.output('out.csv'), newline='') as fp: + with open(runtmp.output("out.csv"), newline="") as fp: r = csv.DictReader(fp) # filename,sketchtype,output_index,name,param_strs @@ -1985,88 +2256,120 @@ def test_fromfile_dna_and_protein_csv_output(runtmp): for row in r: x.append(row) - x.sort(key=lambda x: x['filename']) + x.sort(key=lambda x: x["filename"]) assert len(x) == 2 - assert x[0]['sketchtype'] == 'dna' - assert x[0]['param_strs'] == '-p dna,k=31,scaled=1000' - assert x[0]['filename'] == 'sketch_fromfile/GCA_903797575.1_PARATYPHIC668_genomic.fna.gz' - - assert x[1]['sketchtype'] == 'protein' - assert x[1]['param_strs'] == '-p protein,k=10,scaled=200' - assert x[1]['filename'] == 'sketch_fromfile/GCA_903797575.1_PARATYPHIC668_protein.faa.gz' + assert x[0]["sketchtype"] == "dna" + assert x[0]["param_strs"] == "-p dna,k=31,scaled=1000" + assert ( + x[0]["filename"] + == "sketch_fromfile/GCA_903797575.1_PARATYPHIC668_genomic.fna.gz" + ) + + assert x[1]["sketchtype"] == "protein" + assert x[1]["param_strs"] == "-p protein,k=10,scaled=200" + assert ( + x[1]["filename"] + == 
"sketch_fromfile/GCA_903797575.1_PARATYPHIC668_protein.faa.gz" + ) # same name... - assert x[0]['name'] == x[1]['name'] == "GCA_903797575 Salmonella enterica" + assert x[0]["name"] == x[1]["name"] == "GCA_903797575 Salmonella enterica" # ...different output index. - assert x[1]['output_index'] != x[0]['output_index'] + assert x[1]["output_index"] != x[0]["output_index"] def test_fromfile_dna_and_protein_already_exists(runtmp): # does it properly ignore existing (--already-done) sigs? - test_inp = utils.get_test_data('sketch_fromfile') - already_done = utils.get_test_data('sketch_fromfile/salmonella-dna-protein.zip') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) - - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-p', 'dna', '-p', 'protein', - '--already-done', already_done, - '--output-manifest', 'matching.csv') + test_inp = utils.get_test_data("sketch_fromfile") + already_done = utils.get_test_data("sketch_fromfile/salmonella-dna-protein.zip") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) + + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-p", + "dna", + "-p", + "protein", + "--already-done", + already_done, + "--output-manifest", + "matching.csv", + ) print(runtmp.last_result.out) err = runtmp.last_result.err print(err) - assert 'Loaded 1 pre-existing names from manifest(s)' in err - assert 'Read 1 rows, requesting that 2 signatures be built.' in err - assert '** 0 new signatures to build from 0 files;' in err - assert '** Nothing to build. Exiting!' in err + assert "Loaded 1 pre-existing names from manifest(s)" in err + assert "Read 1 rows, requesting that 2 signatures be built." in err + assert "** 0 new signatures to build from 0 files;" in err + assert "** Nothing to build. Exiting!" in err - assert "output 2 already-done signatures to 'matching.csv' in manifest format." in err - mf = manifest.CollectionManifest.load_from_filename(runtmp.output('matching.csv')) + assert ( + "output 2 already-done signatures to 'matching.csv' in manifest format." in err + ) + mf = manifest.CollectionManifest.load_from_filename(runtmp.output("matching.csv")) assert len(mf) == 2 def test_fromfile_dna_and_protein_partly_already_exists(runtmp): # does it properly ignore existing (--already-done) sigs? - test_inp = utils.get_test_data('sketch_fromfile') - already_done = utils.get_test_data('sketch_fromfile/salmonella-dna-protein.zip') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) - - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella-mult.csv', - '-p', 'dna', '-p', 'protein', - '--already-done', already_done) + test_inp = utils.get_test_data("sketch_fromfile") + already_done = utils.get_test_data("sketch_fromfile/salmonella-dna-protein.zip") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) + + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella-mult.csv", + "-p", + "dna", + "-p", + "protein", + "--already-done", + already_done, + ) print(runtmp.last_result.out) err = runtmp.last_result.err print(err) - assert 'Loaded 1 pre-existing names from manifest(s)' in err - assert 'Read 2 rows, requesting that 4 signatures be built.' in err - assert '** 2 new signatures to build from 2 files;' in err + assert "Loaded 1 pre-existing names from manifest(s)" in err + assert "Read 2 rows, requesting that 4 signatures be built." in err + assert "** 2 new signatures to build from 2 files;" in err assert "** 2 already exist, so skipping those." 
in err assert "** 4 total requested; output 2, skipped 2" in err def test_fromfile_dna_and_protein_already_exists_noname(runtmp): # check that no name in already_exists is handled - test_inp = utils.get_test_data('sketch_fromfile') - already_done = utils.get_test_data('sketch_fromfile/salmonella-dna-protein.zip') - shutil.copytree(test_inp, runtmp.output('sketch_fromfile')) + test_inp = utils.get_test_data("sketch_fromfile") + already_done = utils.get_test_data("sketch_fromfile/salmonella-dna-protein.zip") + shutil.copytree(test_inp, runtmp.output("sketch_fromfile")) # run rename to get rid of names - runtmp.sourmash('sig', 'rename', already_done, '', - '-o', 'already-done.zip') - - runtmp.sourmash('sketch', 'fromfile', 'sketch_fromfile/salmonella.csv', - '-p', 'dna', '-p', 'protein', - '--already-done', 'already-done.zip') + runtmp.sourmash("sig", "rename", already_done, "", "-o", "already-done.zip") + + runtmp.sourmash( + "sketch", + "fromfile", + "sketch_fromfile/salmonella.csv", + "-p", + "dna", + "-p", + "protein", + "--already-done", + "already-done.zip", + ) print(runtmp.last_result.out) err = runtmp.last_result.err print(err) - assert 'Loaded 0 pre-existing names from manifest(s)' in err - assert 'Read 1 rows, requesting that 2 signatures be built.' in err - assert '** 2 new signatures to build from 2 files;' in err - assert '** 2 total requested; output 2, skipped 0' in err + assert "Loaded 0 pre-existing names from manifest(s)" in err + assert "Read 1 rows, requesting that 2 signatures be built." in err + assert "** 2 new signatures to build from 2 files;" in err + assert "** 2 total requested; output 2, skipped 0" in err diff --git a/tests/test_sqlite_index.py b/tests/test_sqlite_index.py index 74c4692c06..816719e602 100644 --- a/tests/test_sqlite_index.py +++ b/tests/test_sqlite_index.py @@ -6,9 +6,12 @@ import sourmash from sourmash.exceptions import IndexNotSupported -from sourmash.index.sqlite_index import (SqliteIndex, load_sqlite_index, - SqliteCollectionManifest, - LCA_SqliteDatabase) +from sourmash.index.sqlite_index import ( + SqliteIndex, + load_sqlite_index, + SqliteCollectionManifest, + LCA_SqliteDatabase, +) from sourmash.index import StandaloneManifestIndex from sourmash import load_one_signature, SourmashSignature @@ -23,7 +26,7 @@ def test_sqlite_index_prefetch_empty(): # check that an exception is raised upon for an empty database - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) sqlidx = SqliteIndex.create(":memory:") @@ -41,26 +44,27 @@ def test_sqlite_index_bad_version(runtmp): # create a sqlite database with a bad index version in the # sourmash_internal table, see what happens :) - dbfile = runtmp.output('xyz.sqldb') + dbfile = runtmp.output("xyz.sqldb") conn = sqlite3.connect(dbfile) c = conn.cursor() SqliteIndex._create_tables(c) # 0.9 doesn't exist/is bad version - c.execute('UPDATE sourmash_internal SET value=? WHERE key=?', - ('0.9', 'SqliteIndex')) + c.execute( + "UPDATE sourmash_internal SET value=? 
WHERE key=?", ("0.9", "SqliteIndex") + ) conn.commit() with pytest.raises(IndexNotSupported): - idx = sourmash.load_file_as_index(dbfile) + sourmash.load_file_as_index(dbfile) def test_sqlite_index_bad_version_unique(runtmp): # try to insert duplicate sqlite index info into sourmash_internal; fail - dbfile = runtmp.output('xyz.sqldb') + dbfile = runtmp.output("xyz.sqldb") conn = sqlite3.connect(dbfile) c = conn.cursor() @@ -68,13 +72,17 @@ def test_sqlite_index_bad_version_unique(runtmp): # can't insert duplicate key with pytest.raises(sqlite3.IntegrityError): - c.execute('INSERT INTO sourmash_internal (value, key) VALUES (?, ?)', - ('1.1', 'SqliteIndex')) + c.execute( + "INSERT INTO sourmash_internal (value, key) VALUES (?, ?)", + ("1.1", "SqliteIndex"), + ) def test_index_search_subj_scaled_is_lower(): # check that subject sketches are appropriately downsampled - sigfile = utils.get_test_data('scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz') + sigfile = utils.get_test_data( + "scaled100/GCF_000005845.2_ASM584v2_genomic.fna.gz.sig.gz" + ) ss = sourmash.load_one_signature(sigfile) # double check :) @@ -95,15 +103,15 @@ def test_index_search_subj_scaled_is_lower(): def test_sqlite_index_save_load(runtmp): - sig2 = utils.get_test_data('2.fa.sig') - sig47 = utils.get_test_data('47.fa.sig') - sig63 = utils.get_test_data('63.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") + sig47 = utils.get_test_data("47.fa.sig") + sig63 = utils.get_test_data("63.fa.sig") ss2 = sourmash.load_one_signature(sig2, ksize=31) ss47 = sourmash.load_one_signature(sig47) ss63 = sourmash.load_one_signature(sig63) - filename = runtmp.output('foo') + filename = runtmp.output("foo") sqlidx = SqliteIndex.create(filename) sqlidx.insert(ss2) sqlidx.insert(ss47) @@ -122,7 +130,7 @@ def test_sqlite_index_save_load(runtmp): def test_sqlite_index_multik_select(): # this loads three ksizes, 21/31/51 - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") siglist = sourmash.load_file_as_signatures(sig2) sqlidx = SqliteIndex.create(":memory:") @@ -130,11 +138,11 @@ def test_sqlite_index_multik_select(): sqlidx.insert(ss) # select most specifically - sqlidx2 = sqlidx.select(ksize=31, moltype='DNA') + sqlidx2 = sqlidx.select(ksize=31, moltype="DNA") assert len(sqlidx2) == 1 # all are DNA: - sqlidx2 = sqlidx.select(moltype='DNA') + sqlidx2 = sqlidx.select(moltype="DNA") assert len(sqlidx2) == 3 @@ -156,7 +164,7 @@ def test_sqlite_index_insert_num_fail(): # cannot insert 'num' signatures sqlidx = SqliteIndex.create(":memory:") - sig47 = utils.get_test_data('num/47.fa.sig') + sig47 = utils.get_test_data("num/47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) assert ss47.minhash.num != 0 @@ -170,7 +178,7 @@ def test_sqlite_index_insert_abund_fail(): # cannot insert 'num' signatures sqlidx = SqliteIndex.create(":memory:") - sig47 = utils.get_test_data('track_abund/47.fa.sig') + sig47 = utils.get_test_data("track_abund/47.fa.sig") ss47 = sourmash.load_one_signature(sig47, ksize=31) with pytest.raises(ValueError) as exc: @@ -183,7 +191,7 @@ def test_sqlite_index_moltype_multi_fail(): # check that we cannot store sigs with multiple scaled values. 
# this loads multiple ksizes (19, 31) and moltypes (DNA, protein, hp, etc) - filename = utils.get_test_data('prot/all.zip') + filename = utils.get_test_data("prot/all.zip") siglist = sourmash.load_file_as_signatures(filename) siglist = list(siglist) @@ -203,7 +211,7 @@ def test_sqlite_index_picklist_select(): # test select with a picklist # this loads three ksizes, 21/31/51 - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") siglist = sourmash.load_file_as_signatures(sig2) sqlidx = SqliteIndex.create(":memory:") @@ -211,22 +219,22 @@ def test_sqlite_index_picklist_select(): sqlidx.insert(ss) # construct a picklist... - picklist = SignaturePicklist('md5prefix8') - picklist.init(['f3a90d4e']) + picklist = SignaturePicklist("md5prefix8") + picklist.init(["f3a90d4e"]) # select on picklist sqlidx2 = sqlidx.select(picklist=picklist) assert len(sqlidx2) == 1 ss = list(sqlidx2.signatures())[0] assert ss.minhash.ksize == 31 - assert ss.md5sum().startswith('f3a90d4e55') + assert ss.md5sum().startswith("f3a90d4e55") def test_sqlite_index_picklist_select_exclude(): # test select with a picklist, but exclude # this loads three ksizes, 21/31/51 - sig2 = utils.get_test_data('2.fa.sig') + sig2 = utils.get_test_data("2.fa.sig") siglist = sourmash.load_file_as_signatures(sig2) sqlidx = SqliteIndex.create(":memory:") @@ -234,8 +242,8 @@ def test_sqlite_index_picklist_select_exclude(): sqlidx.insert(ss) # construct a picklist... - picklist = SignaturePicklist('md5prefix8', pickstyle=PickStyle.EXCLUDE) - picklist.init(['f3a90d4e']) + picklist = SignaturePicklist("md5prefix8", pickstyle=PickStyle.EXCLUDE) + picklist.init(["f3a90d4e"]) # select on picklist sqlidx2 = sqlidx.select(picklist=picklist) @@ -245,8 +253,10 @@ def test_sqlite_index_picklist_select_exclude(): for ss in list(sqlidx2.signatures()): md5s.add(ss.md5sum()) ksizes.add(ss.minhash.ksize) - assert md5s == set(['f372e47893edd349e5956f8b0d8dcbf7','43f3b48e59443092850964d355a20ac0']) - assert ksizes == set([21,51]) + assert md5s == set( + ["f372e47893edd349e5956f8b0d8dcbf7", "43f3b48e59443092850964d355a20ac0"] + ) + assert ksizes == set([21, 51]) def test_sqlite_jaccard_ordering(): @@ -265,10 +275,10 @@ def test_sqlite_jaccard_ordering(): def _intersect(x, y): return x.intersection_and_union_size(y)[0] - print('a intersect b:', _intersect(a, b)) - print('a intersect c:', _intersect(a, c)) - print('a jaccard b:', a.jaccard(b)) - print('a jaccard c:', a.jaccard(c)) + print("a intersect b:", _intersect(a, b)) + print("a intersect c:", _intersect(a, c)) + print("a jaccard b:", a.jaccard(b)) + print("a jaccard c:", a.jaccard(c)) assert _intersect(a, b) > _intersect(a, c) assert a.jaccard(b) < a.jaccard(c) @@ -277,9 +287,9 @@ def _intersect(x, y): assert a.jaccard(c) > 0.15 # now - make signatures, try out :) - ss_a = sourmash.SourmashSignature(a, name='A') - ss_b = sourmash.SourmashSignature(b, name='B') - ss_c = sourmash.SourmashSignature(c, name='C') + ss_a = sourmash.SourmashSignature(a, name="A") + ss_b = sourmash.SourmashSignature(b, name="B") + ss_c = sourmash.SourmashSignature(c, name="C") sqlidx = SqliteIndex.create(":memory:") sqlidx.insert(ss_a) @@ -303,7 +313,7 @@ def test_sqlite_index_scaled1(): mh1.add_hash(2**64 - 1) mh1.add_hash(2**64 - 2) mh1.add_hash(2**64 - 3) - ss1 = sourmash.SourmashSignature(mh1, name='ss 1') + ss1 = sourmash.SourmashSignature(mh1, name="ss 1") mh2 = sourmash.MinHash(0, 31, scaled=1) mh2.add_hash(2**64 - 1) @@ -312,7 +322,7 @@ def test_sqlite_index_scaled1(): mh2.add_hash(0) 
mh2.add_hash(1) mh2.add_hash(2) - ss2 = sourmash.SourmashSignature(mh2, name='ss 2') + ss2 = sourmash.SourmashSignature(mh2, name="ss 2") sqlidx.insert(ss1) sqlidx.insert(ss2) @@ -340,7 +350,7 @@ def test_sqlite_index_scaled1(): def test_sqlite_index_load_existing(): # try loading an existing sqlite index - filename = utils.get_test_data('sqlite/index.sqldb') + filename = utils.get_test_data("sqlite/index.sqldb") sqlidx = sourmash.load_file_as_index(filename) assert isinstance(sqlidx, SqliteIndex) @@ -350,11 +360,11 @@ def test_sqlite_index_load_existing(): def test_sqlite_index_create_load_existing(runtmp): # try creating then loading an existing sqlite index; create from CLI - filename = runtmp.output('idx.sqldb') - sig1 = utils.get_test_data('47.fa.sig') - sig2 = utils.get_test_data('63.fa.sig') + filename = runtmp.output("idx.sqldb") + sig1 = utils.get_test_data("47.fa.sig") + sig2 = utils.get_test_data("63.fa.sig") - runtmp.sourmash('sig', 'cat', sig1, sig2, '-o', filename) + runtmp.sourmash("sig", "cat", sig1, sig2, "-o", filename) sqlidx = sourmash.load_file_as_index(filename) assert isinstance(sqlidx, SqliteIndex) @@ -365,12 +375,12 @@ def test_sqlite_index_create_load_existing(runtmp): def test_sqlite_index_create_load_insert_existing(runtmp): # try creating, loading, inserting into an existing sqlite index - filename = runtmp.output('idx.sqldb') - sig1 = utils.get_test_data('47.fa.sig') - sig2 = utils.get_test_data('63.fa.sig') - sig3 = utils.get_test_data('2.fa.sig') + filename = runtmp.output("idx.sqldb") + sig1 = utils.get_test_data("47.fa.sig") + sig2 = utils.get_test_data("63.fa.sig") + sig3 = utils.get_test_data("2.fa.sig") - runtmp.sourmash('sig', 'cat', sig1, sig2, '-o', filename) + runtmp.sourmash("sig", "cat", sig1, sig2, "-o", filename) sqlidx = sourmash.load_file_as_index(filename) assert isinstance(sqlidx, SqliteIndex) @@ -382,7 +392,7 @@ def test_sqlite_index_create_load_insert_existing(runtmp): sqlidx.insert(ss3) sqlidx.commit() - runtmp.sourmash('sig', 'describe', filename) + runtmp.sourmash("sig", "describe", filename) print(runtmp.last_result.out) assert "md5: f3a90d4e5528864a5bcc8434b0d0c3b1" in runtmp.last_result.out @@ -390,12 +400,12 @@ def test_sqlite_index_create_load_insert_existing(runtmp): def test_sqlite_index_create_load_insert_existing_cli(runtmp): # try creating, loading, inserting into an existing sqlite index from cli # (aka "append" to existing database) - filename = runtmp.output('idx.sqldb') - sig1 = utils.get_test_data('47.fa.sig') - sig2 = utils.get_test_data('63.fa.sig') - sig3 = utils.get_test_data('2.fa.sig') + filename = runtmp.output("idx.sqldb") + sig1 = utils.get_test_data("47.fa.sig") + sig2 = utils.get_test_data("63.fa.sig") + sig3 = utils.get_test_data("2.fa.sig") - runtmp.sourmash('sig', 'cat', sig1, sig2, '-o', filename) + runtmp.sourmash("sig", "cat", sig1, sig2, "-o", filename) sqlidx = sourmash.load_file_as_index(filename) assert isinstance(sqlidx, SqliteIndex) @@ -404,7 +414,7 @@ def test_sqlite_index_create_load_insert_existing_cli(runtmp): assert len(siglist) == 2 # add a third - runtmp.sourmash('sig', 'cat', sig3, '-o', filename, '-k', '31') + runtmp.sourmash("sig", "cat", sig3, "-o", filename, "-k", "31") siglist = list(sqlidx.signatures()) assert len(siglist) == 3 @@ -414,26 +424,27 @@ def test_sqlite_manifest_bad_version(runtmp): # create a sqlite database with a bad manifest version in the # sourmash_internal table, see what happens :) - dbfile = runtmp.output('xyz.sqlmf') + dbfile = runtmp.output("xyz.sqlmf") conn = 
sqlite3.connect(dbfile) c = conn.cursor() SqliteCollectionManifest._create_tables(c) # 0.9 doesn't exist/bad version - c.execute('UPDATE sourmash_internal SET value=? WHERE key=?', - ('0.9', 'SqliteManifest')) + c.execute( + "UPDATE sourmash_internal SET value=? WHERE key=?", ("0.9", "SqliteManifest") + ) conn.commit() with pytest.raises(IndexNotSupported): - mf = CollectionManifest.load_from_filename(dbfile) + CollectionManifest.load_from_filename(dbfile) def test_sqlite_manifest_bad_version_unique(runtmp): # try to insert duplicate sqlite manifest info into sourmash_internal; fail - dbfile = runtmp.output('xyz.sqldb') + dbfile = runtmp.output("xyz.sqldb") conn = sqlite3.connect(dbfile) c = conn.cursor() @@ -441,15 +452,17 @@ def test_sqlite_manifest_bad_version_unique(runtmp): # can't insert duplicate key with pytest.raises(sqlite3.IntegrityError): - c.execute('INSERT INTO sourmash_internal (value, key) VALUES (?, ?)', - ('1.1', 'SqliteManifest')) + c.execute( + "INSERT INTO sourmash_internal (value, key) VALUES (?, ?)", + ("1.1", "SqliteManifest"), + ) def test_sqlite_manifest_basic(): # test some features of the SQLite-based manifest. - sig2 = load_one_signature(utils.get_test_data('2.fa.sig'), ksize=31) - sig47 = load_one_signature(utils.get_test_data('47.fa.sig'), ksize=31) - sig63 = load_one_signature(utils.get_test_data('63.fa.sig'), ksize=31) + sig2 = load_one_signature(utils.get_test_data("2.fa.sig"), ksize=31) + sig47 = load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + sig63 = load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) sqlidx = SqliteIndex.create(":memory:") @@ -479,15 +492,17 @@ def test_sqlite_manifest_basic(): def test_sqlite_manifest_round_trip(): # check that we can go from regular mf -> sqlite mf -> regular again. 
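[Editorial aside on the picklist tests earlier in this file, not part of the patch: a SignaturePicklist restricts select() to matching sketches, and PickStyle.EXCLUDE inverts the match. A minimal self-contained sketch; the sourmash.picklist import path is an assumption here, since the test module's import block is not part of these hunks.]

    import sourmash
    from sourmash.index.sqlite_index import SqliteIndex
    from sourmash.picklist import PickStyle, SignaturePicklist  # assumed path

    sqlidx = SqliteIndex.create(":memory:")
    for ss in sourmash.load_file_as_signatures(utils.get_test_data("2.fa.sig")):
        sqlidx.insert(ss)

    # keep only the sketch whose md5 starts with this 8-char prefix (ksize 31)...
    include = SignaturePicklist("md5prefix8")
    include.init(["f3a90d4e"])
    assert len(sqlidx.select(picklist=include)) == 1

    # ...or keep everything else (the ksize 21 and 51 sketches).
    exclude = SignaturePicklist("md5prefix8", pickstyle=PickStyle.EXCLUDE)
    exclude.init(["f3a90d4e"])
    assert len(sqlidx.select(picklist=exclude)) == 2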
- sig2 = load_one_signature(utils.get_test_data('2.fa.sig'), ksize=31) - sig47 = load_one_signature(utils.get_test_data('47.fa.sig'), ksize=31) - sig63 = load_one_signature(utils.get_test_data('63.fa.sig'), ksize=31) + sig2 = load_one_signature(utils.get_test_data("2.fa.sig"), ksize=31) + sig47 = load_one_signature(utils.get_test_data("47.fa.sig"), ksize=31) + sig63 = load_one_signature(utils.get_test_data("63.fa.sig"), ksize=31) rows = [] - rows.append(CollectionManifest.make_manifest_row(sig47, None, - include_signature=False)) - rows.append(CollectionManifest.make_manifest_row(sig63, None, - include_signature=False)) + rows.append( + CollectionManifest.make_manifest_row(sig47, None, include_signature=False) + ) + rows.append( + CollectionManifest.make_manifest_row(sig63, None, include_signature=False) + ) nosql_mf = CollectionManifest(rows) sqlite_mf = SqliteCollectionManifest.load_from_manifest(nosql_mf) @@ -507,13 +522,12 @@ def test_sqlite_manifest_round_trip(): def test_sqlite_manifest_create(runtmp): # test creation and summarization of a manifest of prot.zip - zipfile = utils.get_test_data('prot/all.zip') + zipfile = utils.get_test_data("prot/all.zip") # create manifest - runtmp.sourmash('sig', 'manifest', '-F', 'sql', zipfile, - '-o', 'mf.sqlmf') + runtmp.sourmash("sig", "manifest", "-F", "sql", zipfile, "-o", "mf.sqlmf") - sqlmf = runtmp.output('mf.sqlmf') + sqlmf = runtmp.output("mf.sqlmf") assert os.path.exists(sqlmf) # verify it's loadable as the right type @@ -521,7 +535,7 @@ def test_sqlite_manifest_create(runtmp): assert isinstance(idx, StandaloneManifestIndex) # summarize - runtmp.sourmash('sig', 'fileinfo', 'mf.sqlmf') + runtmp.sourmash("sig", "fileinfo", "mf.sqlmf") out = runtmp.last_result.out print(out) @@ -540,41 +554,38 @@ def test_sqlite_manifest_create(runtmp): def test_sqlite_manifest_create_noload_sigs(runtmp): # sigs should not be loadable from manifest this way... 
- zipfile = utils.get_test_data('prot/all.zip') + zipfile = utils.get_test_data("prot/all.zip") # create manifest - runtmp.sourmash('sig', 'manifest', '-F', 'sql', zipfile, - '-o', 'mf.sqlmf') + runtmp.sourmash("sig", "manifest", "-F", "sql", zipfile, "-o", "mf.sqlmf") # 'describe' should not be able to load the sqlmf b/c prefix is wrong with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'describe', 'mf.sqlmf') + runtmp.sourmash("sig", "describe", "mf.sqlmf") def test_sqlite_manifest_create_yesload_sigs(runtmp): # should be able to load after copying files - zipfile = utils.get_test_data('prot/all.zip') - shutil.copytree(utils.get_test_data('prot'), runtmp.output('prot')) + zipfile = utils.get_test_data("prot/all.zip") + shutil.copytree(utils.get_test_data("prot"), runtmp.output("prot")) # create manifest - runtmp.sourmash('sig', 'manifest', '-F', 'sql', zipfile, - '-o', 'prot/mf.sqlmf') + runtmp.sourmash("sig", "manifest", "-F", "sql", zipfile, "-o", "prot/mf.sqlmf") # 'describe' should now be able to load the sqlmf, which is cool - runtmp.sourmash('sig', 'describe', 'prot/mf.sqlmf') + runtmp.sourmash("sig", "describe", "prot/mf.sqlmf") print(runtmp.last_result.out) def test_sqlite_manifest_num(runtmp): # should be able to produce sql manifests with 'num' sketches in them - numsig = utils.get_test_data('num/47.fa.sig') + numsig = utils.get_test_data("num/47.fa.sig") # create mf - runtmp.sourmash('sig', 'manifest', '-F', 'sql', numsig, - '-o', 'mf.sqlmf') + runtmp.sourmash("sig", "manifest", "-F", "sql", numsig, "-o", "mf.sqlmf") # do summarize: - runtmp.sourmash('sig', 'summarize', 'mf.sqlmf') + runtmp.sourmash("sig", "summarize", "mf.sqlmf") out = runtmp.last_result.out print(out) @@ -586,14 +597,13 @@ def test_sqlite_manifest_num(runtmp): def test_sqlite_manifest_num_select(runtmp): # should be able to _select_ sql manifests with 'num' sketches in them - numsig = utils.get_test_data('num/47.fa.sig') + numsig = utils.get_test_data("num/47.fa.sig") # create mf - runtmp.sourmash('sig', 'manifest', '-F', 'sql', numsig, - '-o', 'mf.sqlmf') + runtmp.sourmash("sig", "manifest", "-F", "sql", numsig, "-o", "mf.sqlmf") # load as index - idx = sourmash.load_file_as_index(runtmp.output('mf.sqlmf')) + idx = sourmash.load_file_as_index(runtmp.output("mf.sqlmf")) # select print(list(idx.manifest.rows)) @@ -604,25 +614,24 @@ def test_sqlite_manifest_num_select(runtmp): def test_sqlite_manifest_locations(runtmp): # check what locations returns... may return too many, that's ok. - prot = utils.get_test_data('prot') + prot = utils.get_test_data("prot") - runtmp.sourmash('sig', 'manifest', '-F', 'sql', prot, - '-o', 'mf.sqlmf') + runtmp.sourmash("sig", "manifest", "-F", "sql", prot, "-o", "mf.sqlmf") # load as index - idx = sourmash.load_file_as_index(runtmp.output('mf.sqlmf')) + idx = sourmash.load_file_as_index(runtmp.output("mf.sqlmf")) - picklist = SignaturePicklist('identprefix') - picklist.pickset = set(['GCA_001593925']) + picklist = SignaturePicklist("identprefix") + picklist.pickset = set(["GCA_001593925"]) idx = idx.select(picklist=picklist) sql_locations = set(idx.manifest.locations()) - row_locations = set(row['internal_location'] for row in idx.manifest.rows) + row_locations = set(row["internal_location"] for row in idx.manifest.rows) assert sql_locations.issuperset(row_locations) - assert 'dna-sig.sig.gz' in sql_locations # this is unnecessary... 
- assert 'dna-sig.sig.gz' not in row_locations # ...this is correct :) + assert "dna-sig.sig.gz" in sql_locations # this is unnecessary... + assert "dna-sig.sig.gz" not in row_locations # ...this is correct :) def test_sqlite_manifest_create_insert(runtmp): @@ -631,126 +640,125 @@ def test_sqlite_manifest_create_insert(runtmp): mfname = runtmp.output("some.sqlmf") mf = SqliteCollectionManifest.create(mfname) - sigfile = utils.get_test_data('47.fa.sig') + sigfile = utils.get_test_data("47.fa.sig") ss = sourmash.load_one_signature(sigfile) - mf._insert_row(mf.conn.cursor(), mf.make_manifest_row(ss, 'some.sig')) + mf._insert_row(mf.conn.cursor(), mf.make_manifest_row(ss, "some.sig")) mf.conn.commit() # copy sig in since we want it to resolve... - shutil.copyfile(sigfile, runtmp.output('some.sig')) + shutil.copyfile(sigfile, runtmp.output("some.sig")) # 'describe' should work here, to resolve actual sigs. - runtmp.sourmash('sig', 'describe', mfname) + runtmp.sourmash("sig", "describe", mfname) print(runtmp.last_result.out) - assert 'md5: 09a08691ce52952152f0e866a59f6261' in runtmp.last_result.out + assert "md5: 09a08691ce52952152f0e866a59f6261" in runtmp.last_result.out def test_sqlite_manifest_create_insert_2(runtmp): # try out creating a sqlite manifest from cli and then _insert_row into it # copy sig in since we want it to resolve... - sigfile = utils.get_test_data('47.fa.sig') - shutil.copyfile(sigfile, runtmp.output('some.sig')) + sigfile = utils.get_test_data("47.fa.sig") + shutil.copyfile(sigfile, runtmp.output("some.sig")) - runtmp.sourmash('sig', 'manifest', 'some.sig', '-F', 'sql', - '-o', 'some.sqlmf') + runtmp.sourmash("sig", "manifest", "some.sig", "-F", "sql", "-o", "some.sqlmf") mfname = runtmp.output("some.sqlmf") mf = CollectionManifest.load_from_filename(mfname) - ss = sourmash.load_one_signature(runtmp.output('some.sig')) - mf._insert_row(mf.conn.cursor(), mf.make_manifest_row(ss, 'some.sig')) + ss = sourmash.load_one_signature(runtmp.output("some.sig")) + mf._insert_row(mf.conn.cursor(), mf.make_manifest_row(ss, "some.sig")) mf.conn.commit() # 'describe' should work here, to resolve actual sigs. 
- runtmp.sourmash('sig', 'describe', mfname) + runtmp.sourmash("sig", "describe", mfname) print(runtmp.last_result.out) - assert 'md5: 09a08691ce52952152f0e866a59f6261' in runtmp.last_result.out + assert "md5: 09a08691ce52952152f0e866a59f6261" in runtmp.last_result.out def test_sqlite_manifest_existing(runtmp): # try out an existing sqlite manifest - prefix = runtmp.output('protdir') - mf = runtmp.output('protdir/prot.sqlmf') - shutil.copytree(utils.get_test_data('prot'), prefix) - shutil.copyfile(utils.get_test_data('sqlite/prot.sqlmf'), mf) + prefix = runtmp.output("protdir") + mf = runtmp.output("protdir/prot.sqlmf") + shutil.copytree(utils.get_test_data("prot"), prefix) + shutil.copyfile(utils.get_test_data("sqlite/prot.sqlmf"), mf) - runtmp.sourmash('sig', 'describe', mf) + runtmp.sourmash("sig", "describe", mf) print(runtmp.last_result.out) def test_sqlite_manifest_existing_insert(runtmp): # try out an existing sqlite manifest - insert into it - prefix = runtmp.output('protdir') - shutil.copytree(utils.get_test_data('prot'), prefix) + prefix = runtmp.output("protdir") + shutil.copytree(utils.get_test_data("prot"), prefix) - mfname = runtmp.output('protdir/prot.sqlmf') - shutil.copyfile(utils.get_test_data('sqlite/prot.sqlmf'), mfname) + mfname = runtmp.output("protdir/prot.sqlmf") + shutil.copyfile(utils.get_test_data("sqlite/prot.sqlmf"), mfname) mf = CollectionManifest.load_from_filename(mfname) assert isinstance(mf, SqliteCollectionManifest) - sigfile = utils.get_test_data('47.fa.sig') + sigfile = utils.get_test_data("47.fa.sig") ss = sourmash.load_one_signature(sigfile) - mf._insert_row(mf.conn.cursor(), mf.make_manifest_row(ss, 'some.sig')) + mf._insert_row(mf.conn.cursor(), mf.make_manifest_row(ss, "some.sig")) mf.conn.commit() # copy sig in since we want it to resolve... - shutil.copyfile(sigfile, runtmp.output('protdir/some.sig')) + shutil.copyfile(sigfile, runtmp.output("protdir/some.sig")) # 'describe' should work here. - runtmp.sourmash('sig', 'describe', mfname) + runtmp.sourmash("sig", "describe", mfname) print(runtmp.last_result.out) def test_sqlite_manifest_existing_mf_only(runtmp): # try out an existing sqlite manifest, but without underlying files -> fail - mf = runtmp.output('prot.sqlmf') - shutil.copyfile(utils.get_test_data('sqlite/prot.sqlmf'), mf) + mf = runtmp.output("prot.sqlmf") + shutil.copyfile(utils.get_test_data("sqlite/prot.sqlmf"), mf) # 'fileinfo' should work... - runtmp.sourmash('sig', 'fileinfo', mf) + runtmp.sourmash("sig", "fileinfo", mf) print(runtmp.last_result.out) - assert 'num signatures: 7' in runtmp.last_result.out + assert "num signatures: 7" in runtmp.last_result.out # ...but 'describe' should fail, since it needs actual sigs. 
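[Editorial aside, not part of the patch: several manifest tests above share a load-then-extend pattern, and 'describe' only succeeds afterwards if the inserted row's location resolves to a real file next to the manifest. A minimal sketch mirroring the calls in this diff; note _insert_row is private (the tests use it deliberately), and `runtmp`/`utils` are the test fixtures/helpers.]

    import shutil

    import sourmash
    from sourmash import manifest

    mfname = runtmp.output("some.sqlmf")
    mf = manifest.CollectionManifest.load_from_filename(mfname)

    ss = sourmash.load_one_signature(utils.get_test_data("47.fa.sig"))
    mf._insert_row(mf.conn.cursor(), mf.make_manifest_row(ss, "some.sig"))
    mf.conn.commit()

    # the new row only resolves if 'some.sig' actually exists next to the manifest:
    shutil.copyfile(utils.get_test_data("47.fa.sig"), runtmp.output("some.sig"))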
- with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('sig', 'describe', mf) + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash("sig", "describe", mf) print(runtmp.last_result.err) - assert 'ERROR: Error while reading signatures from' in runtmp.last_result.err + assert "ERROR: Error while reading signatures from" in runtmp.last_result.err def test_sqlite_manifest_existing_mfonly_insert(runtmp): # try out an existing sqlite manifest - insert into it, but fail describe - mfname = runtmp.output('prot.sqlmf') - shutil.copyfile(utils.get_test_data('sqlite/prot.sqlmf'), mfname) + mfname = runtmp.output("prot.sqlmf") + shutil.copyfile(utils.get_test_data("sqlite/prot.sqlmf"), mfname) mf = CollectionManifest.load_from_filename(mfname) assert isinstance(mf, SqliteCollectionManifest) - sigfile = utils.get_test_data('47.fa.sig') + sigfile = utils.get_test_data("47.fa.sig") ss = sourmash.load_one_signature(sigfile) mf._insert_row(mf.conn.cursor(), mf.make_manifest_row(ss, sigfile)) mf.conn.commit() # 'fileinfo' should work... - runtmp.sourmash('sig', 'fileinfo', mfname) + runtmp.sourmash("sig", "fileinfo", mfname) print(runtmp.last_result.out) - assert 'num signatures: 8' in runtmp.last_result.out + assert "num signatures: 8" in runtmp.last_result.out # ...but 'describe' should fail, since it needs actual sigs. - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('sig', 'describe', mfname) + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash("sig", "describe", mfname) def test_sqlite_manifest_load_existing_index(): # try loading an existing sqlite index as a manifest - filename = utils.get_test_data('sqlite/index.sqldb') + filename = utils.get_test_data("sqlite/index.sqldb") mf = CollectionManifest.load_from_filename(filename) assert isinstance(mf, SqliteCollectionManifest) @@ -759,14 +767,14 @@ def test_sqlite_manifest_load_existing_index(): def test_sqlite_manifest_load_existing_index_insert_fail(): # try loading an existing sqlite index as a manifest; insert should fail - filename = utils.get_test_data('sqlite/index.sqldb') + filename = utils.get_test_data("sqlite/index.sqldb") mf = CollectionManifest.load_from_filename(filename) assert isinstance(mf, SqliteCollectionManifest) assert len(mf) == 2 # try insert - should fail - sigfile = utils.get_test_data('47.fa.sig') + sigfile = utils.get_test_data("47.fa.sig") ss = sourmash.load_one_signature(sigfile) with pytest.raises(Exception) as exc: @@ -787,7 +795,7 @@ def test_sqlite_manifest_create_load_empty(runtmp): def test_sqlite_lca_db_load_existing(): # try loading an existing sqlite index - filename = utils.get_test_data('sqlite/lca.sqldb') + filename = utils.get_test_data("sqlite/lca.sqldb") sqlidx = sourmash.load_file_as_index(filename) assert isinstance(sqlidx, LCA_SqliteDatabase) @@ -797,27 +805,26 @@ def test_sqlite_lca_db_load_existing(): def test_sqlite_lca_db_select(): # try loading an existing sqlite index - filename = utils.get_test_data('sqlite/lca.sqldb') + filename = utils.get_test_data("sqlite/lca.sqldb") sqlidx = sourmash.load_file_as_index(filename) assert isinstance(sqlidx, LCA_SqliteDatabase) sqlidx2 = sqlidx.select(ksize=31) - x = list(sqlidx2.hashvals) # only on LCA_SqliteDatabase + list(sqlidx2.hashvals) # only on LCA_SqliteDatabase assert isinstance(sqlidx2, LCA_SqliteDatabase) def test_sqlite_lca_db_create_load_existing(runtmp): # try creating (from CLI) then loading (from API) an LCA db - filename = runtmp.output('lca.sqldb') - sig1 = 
utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - sig2 = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig') + filename = runtmp.output("lca.sqldb") + sig1 = utils.get_test_data("lca/TARA_ASE_MAG_00031.sig") + sig2 = utils.get_test_data("lca/TARA_PSW_MAG_00136.sig") - runtmp.sourmash('sig', 'flatten', sig1, sig2, '-o', filename, '-k', '31') + runtmp.sourmash("sig", "flatten", sig1, sig2, "-o", filename, "-k", "31") # load tax - tax_csv = utils.get_test_data('sqlite/delmont-6.csv') - runtmp.sourmash('tax', 'prepare', '-t', tax_csv, - '-o', filename, '-F', 'sql') + tax_csv = utils.get_test_data("sqlite/delmont-6.csv") + runtmp.sourmash("tax", "prepare", "-t", tax_csv, "-o", filename, "-F", "sql") sqlidx = sourmash.load_file_as_index(filename) assert isinstance(sqlidx, LCA_SqliteDatabase) @@ -829,63 +836,62 @@ def test_sqlite_lca_db_create_load_existing(runtmp): def test_sqlite_lca_db_load_empty(runtmp): # try creating then loading an _empty_ LCA_SqliteDatabase - dbname = runtmp.output('empty.sqldb') + dbname = runtmp.output("empty.sqldb") # create empty SqliteIndex... - runtmp.sourmash('sig', 'cat', '-o', dbname) + runtmp.sourmash("sig", "cat", "-o", dbname) assert os.path.exists(dbname) # ...and create empty sourmash_taxonomy tables in there... - empty_tax = utils.get_test_data('scaled/empty-lineage.csv') - runtmp.sourmash('tax', 'prepare', '-F', 'sql', '-t', empty_tax, - '-o', dbname) + empty_tax = utils.get_test_data("scaled/empty-lineage.csv") + runtmp.sourmash("tax", "prepare", "-F", "sql", "-t", empty_tax, "-o", dbname) - runtmp.sourmash('sig', 'describe', dbname) - assert 'loaded 0 signatures' in runtmp.last_result.err + runtmp.sourmash("sig", "describe", dbname) + assert "loaded 0 signatures" in runtmp.last_result.err def test_sqlite_lca_db_create_readonly(runtmp): # try running 'prepare' on a read-only sqlite db, check error message. - dbname = runtmp.output('empty.sqldb') + dbname = runtmp.output("empty.sqldb") # create empty SqliteIndex... - runtmp.sourmash('sig', 'cat', '-o', dbname) + runtmp.sourmash("sig", "cat", "-o", dbname) assert os.path.exists(dbname) # make it read only... from stat import S_IREAD, S_IRGRP, S_IROTH - os.chmod(dbname, S_IREAD|S_IRGRP|S_IROTH) + + os.chmod(dbname, S_IREAD | S_IRGRP | S_IROTH) # ...and try creating empty sourmash_taxonomy tables in there... - empty_tax = utils.get_test_data('scaled/empty-lineage.csv') + empty_tax = utils.get_test_data("scaled/empty-lineage.csv") - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.sourmash('tax', 'prepare', '-F', 'sql', '-t', empty_tax, - '-o', dbname) + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash("tax", "prepare", "-F", "sql", "-t", empty_tax, "-o", dbname) err = runtmp.last_result.err print(err) - assert not "taxonomy table already exists in" in err + assert "taxonomy table already exists in" not in err assert "attempt to write a readonly database" in err def test_sqlite_lca_db_try_load_sqlite_index(): # try loading a SqliteIndex with no tax tables from .load classmethod - dbname = utils.get_test_data('sqlite/index.sqldb') + dbname = utils.get_test_data("sqlite/index.sqldb") with pytest.raises(ValueError) as exc: - db = LCA_SqliteDatabase.load(dbname) + LCA_SqliteDatabase.load(dbname) assert "not a taxonomy database" in str(exc) def test_sqlite_lca_db_supply_lineage_db(): # try creating an LCA_SqliteDatabase object with a separate lineage DB. 
- dbname = utils.get_test_data('sqlite/index.sqldb') + dbname = utils.get_test_data("sqlite/index.sqldb") - tax_csv = utils.get_test_data('sqlite/shewanella-lineage.csv') + tax_csv = utils.get_test_data("sqlite/shewanella-lineage.csv") lineage_db = MultiLineageDB.load([tax_csv]) db = LCA_SqliteDatabase(dbname, lineage_db=lineage_db) @@ -893,21 +899,21 @@ def test_sqlite_lca_db_supply_lineage_db(): hashval = next(iter(db.hashvals)) lineages = db.get_lineage_assignments(hashval) print(lineages) - assert lineages[0][0].rank == 'superkingdom' - assert lineages[0][0].name == 'd__Bacteria' - assert lineages[0][-1].rank == 'species' - assert lineages[0][-1].name == 's__Shewanella baltica' - assert lineages[1][0].rank == 'superkingdom' - assert lineages[1][0].name == 'd__Bacteria' - assert lineages[0][-1].rank == 'species' - assert lineages[0][-1].name == 's__Shewanella baltica' + assert lineages[0][0].rank == "superkingdom" + assert lineages[0][0].name == "d__Bacteria" + assert lineages[0][-1].rank == "species" + assert lineages[0][-1].name == "s__Shewanella baltica" + assert lineages[1][0].rank == "superkingdom" + assert lineages[1][0].name == "d__Bacteria" + assert lineages[0][-1].rank == "species" + assert lineages[0][-1].name == "s__Shewanella baltica" def test_bad_sqlite_internal_version(): # check get_sourmash_internal - dbname = utils.get_test_data('sqlite/index.sqldb') + dbname = utils.get_test_data("sqlite/index.sqldb") conn = sqlite_utils.open_sqlite_db(dbname) c = conn.cursor() with pytest.raises(Exception): - sqlite_utils.add_sourmash_internal(c, 'SqliteIndex', '0.9') + sqlite_utils.add_sourmash_internal(c, "SqliteIndex", "0.9") diff --git a/tests/test_tax.py b/tests/test_tax.py index b37e8eaf6f..3f766f5e37 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -18,99 +18,249 @@ from sourmash.exceptions import IndexNotSupported from sourmash import sourmash_args + ## command line tests def test_run_sourmash_tax(): - status, out, err = utils.runscript('sourmash', ['tax'], fail_ok=True) - assert status != 0 # no args provided, ok ;) + status, out, err = utils.runscript("sourmash", ["tax"], fail_ok=True) + assert status != 0 # no args provided, ok ;) def test_metagenome_stdout_0(runtmp): # test basic metagenome c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax) + c.run_sourmash("tax", "metagenome", "-g", g_csv, "--taxonomy-csv", tax) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000' in c.last_result.out - assert 'test1,superkingdom,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out - assert 'test1,phylum,0.116,d__Bacteria;p__Bacteroidota,md5,test1.sig,0.073,582000' in c.last_result.out - assert 'test1,phylum,0.088,d__Bacteria;p__Proteobacteria,md5,test1.sig,0.058,442000' in c.last_result.out - assert 'test1,phylum,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out - assert 'test1,class,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia,md5,test1.sig,0.073,582000' in c.last_result.out - assert 
-    assert 'test1,class,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out
-    assert 'test1,order,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales,md5,test1.sig,0.073,582000' in c.last_result.out
-    assert 'test1,order,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales,md5,test1.sig,0.058,442000' in c.last_result.out
-    assert 'test1,order,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out
-    assert 'test1,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000' in c.last_result.out
-    assert 'test1,family,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae,md5,test1.sig,0.058,442000' in c.last_result.out
-    assert 'test1,family,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out
-    assert 'test1,genus,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,md5,test1.sig,0.057,444000' in c.last_result.out
-    assert 'test1,genus,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia,md5,test1.sig,0.058,442000' in c.last_result.out
-    assert 'test1,genus,0.028,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola,md5,test1.sig,0.016,138000' in c.last_result.out
-    assert 'test1,genus,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out
-    assert 'test1,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out
-    assert 'test1,species,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli,md5,test1.sig,0.058,442000' in c.last_result.out
-    assert 'test1,species,0.028,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus,md5,test1.sig,0.016,138000' in c.last_result.out
-    assert 'test1,species,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out
+    assert (
+        "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,superkingdom,0.796,unclassified,md5,test1.sig,0.869,3990000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,phylum,0.116,d__Bacteria;p__Bacteroidota,md5,test1.sig,0.073,582000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,phylum,0.088,d__Bacteria;p__Proteobacteria,md5,test1.sig,0.058,442000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,phylum,0.796,unclassified,md5,test1.sig,0.869,3990000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,class,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia,md5,test1.sig,0.073,582000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,class,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria,md5,test1.sig,0.058,442000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,class,0.796,unclassified,md5,test1.sig,0.869,3990000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,order,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales,md5,test1.sig,0.073,582000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,order,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales,md5,test1.sig,0.058,442000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,order,0.796,unclassified,md5,test1.sig,0.869,3990000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,family,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae,md5,test1.sig,0.058,442000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,family,0.796,unclassified,md5,test1.sig,0.869,3990000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,genus,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,md5,test1.sig,0.057,444000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,genus,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia,md5,test1.sig,0.058,442000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,genus,0.028,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola,md5,test1.sig,0.016,138000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,genus,0.796,unclassified,md5,test1.sig,0.869,3990000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,species,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli,md5,test1.sig,0.058,442000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,species,0.028,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus,md5,test1.sig,0.016,138000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,species,0.796,unclassified,md5,test1.sig,0.869,3990000"
+        in c.last_result.out
+    )
 
 
 def test_metagenome_stdout_0_db(runtmp):
     # test basic metagenome with sqlite database
     c = runtmp
 
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.taxonomy.db')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.taxonomy.db")
 
-    c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax)
+    c.run_sourmash("tax", "metagenome", "-g", g_csv, "--taxonomy-csv", tax)
 
     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)
 
     assert c.last_result.status == 0
-    assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
-    assert 'test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000' in c.last_result.out
-    assert 'test1,superkingdom,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out
-    assert 'test1,phylum,0.116,d__Bacteria;p__Bacteroidota,md5,test1.sig,0.073,582000' in c.last_result.out
-    assert 'test1,phylum,0.088,d__Bacteria;p__Proteobacteria,md5,test1.sig,0.058,442000' in c.last_result.out
-    assert 'test1,phylum,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out
-    assert 'test1,class,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia,md5,test1.sig,0.073,582000' in c.last_result.out
-    assert 'test1,class,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria,md5,test1.sig,0.058,442000' in c.last_result.out
-    assert 'test1,class,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out
-    assert 'test1,order,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales,md5,test1.sig,0.073,582000' in c.last_result.out
-    assert 'test1,order,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales,md5,test1.sig,0.058,442000' in c.last_result.out
-    assert 'test1,order,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out
-    assert 'test1,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000' in c.last_result.out
-    assert 'test1,family,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae,md5,test1.sig,0.058,442000' in c.last_result.out
-    assert 'test1,family,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out
-    assert 'test1,genus,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,md5,test1.sig,0.057,444000' in c.last_result.out
-    assert 'test1,genus,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia,md5,test1.sig,0.058,442000' in c.last_result.out
-    assert 'test1,genus,0.028,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola,md5,test1.sig,0.016,138000' in c.last_result.out
-    assert 'test1,genus,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out
-    assert 'test1,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out
-    assert 'test1,species,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli,md5,test1.sig,0.058,442000' in c.last_result.out
-    assert 'test1,species,0.028,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus,md5,test1.sig,0.016,138000' in c.last_result.out
-    assert 'test1,species,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out
+    assert (
+        "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,superkingdom,0.796,unclassified,md5,test1.sig,0.869,3990000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,phylum,0.116,d__Bacteria;p__Bacteroidota,md5,test1.sig,0.073,582000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,phylum,0.088,d__Bacteria;p__Proteobacteria,md5,test1.sig,0.058,442000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,phylum,0.796,unclassified,md5,test1.sig,0.869,3990000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,class,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia,md5,test1.sig,0.073,582000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,class,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria,md5,test1.sig,0.058,442000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,class,0.796,unclassified,md5,test1.sig,0.869,3990000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,order,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales,md5,test1.sig,0.073,582000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,order,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales,md5,test1.sig,0.058,442000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,order,0.796,unclassified,md5,test1.sig,0.869,3990000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,family,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae,md5,test1.sig,0.058,442000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,family,0.796,unclassified,md5,test1.sig,0.869,3990000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,genus,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,md5,test1.sig,0.057,444000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,genus,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia,md5,test1.sig,0.058,442000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,genus,0.028,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola,md5,test1.sig,0.016,138000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,genus,0.796,unclassified,md5,test1.sig,0.869,3990000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,species,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli,md5,test1.sig,0.058,442000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,species,0.028,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus,md5,test1.sig,0.016,138000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,species,0.796,unclassified,md5,test1.sig,0.869,3990000"
+        in c.last_result.out
+    )
 
 
 def test_metagenome_summary_csv_out(runtmp):
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.taxonomy.csv")
 
     csv_base = "out"
     sum_csv = csv_base + ".summarized.csv"
     csvout = runtmp.output(sum_csv)
     outdir = os.path.dirname(csvout)
 
-    runtmp.run_sourmash('tax', 'metagenome', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-dir', outdir)
+    runtmp.run_sourmash(
+        "tax",
+        "metagenome",
+        "--gather-csv",
+        g_csv,
+        "--taxonomy-csv",
+        tax,
+        "-o",
+        csv_base,
+        "--output-dir",
+        outdir,
+    )
 
     print(runtmp.last_result.status)
     print(runtmp.last_result.out)
@@ -121,62 +271,164 @@ def test_metagenome_summary_csv_out(runtmp):
     sum_gather_results = [x.rstrip() for x in Path(csvout).read_text().splitlines()]
     assert f"saving 'csv_summary' output to '{csvout}'" in runtmp.last_result.err
-    assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in sum_gather_results[0]
-    assert 'test1,superkingdom,0.2042281611487834,d__Bacteria,md5,test1.sig,0.13080306238801107,1024000' in sum_gather_results[1]
-    assert 'test1,superkingdom,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000' in sum_gather_results[2]
-    assert 'test1,phylum,0.11607499002792182,d__Bacteria;p__Bacteroidota,md5,test1.sig,0.07265026877341586,582000' in sum_gather_results[3]
-    assert 'test1,phylum,0.08815317112086159,d__Bacteria;p__Proteobacteria,md5,test1.sig,0.05815279361459521,442000' in sum_gather_results[4]
-    assert 'test1,phylum,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000' in sum_gather_results[5]
-    assert 'test1,class,0.11607499002792182,d__Bacteria;p__Bacteroidota;c__Bacteroidia,md5,test1.sig,0.07265026877341586,582000' in sum_gather_results[6]
-    assert 'test1,class,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria,md5,test1.sig,0.05815279361459521,442000' in sum_gather_results[7]
-    assert 'test1,class,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000' in sum_gather_results[8]
-    assert 'test1,order,0.11607499002792182,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales,md5,test1.sig,0.07265026877341586,582000' in sum_gather_results[9]
-    assert 'test1,order,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales,md5,test1.sig,0.05815279361459521,442000' in sum_gather_results[10]
-    assert 'test1,order,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000' in sum_gather_results[11]
-    assert 'test1,family,0.11607499002792182,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.07265026877341586,582000' in sum_gather_results[12]
-    assert 'test1,family,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae,md5,test1.sig,0.05815279361459521,442000' in sum_gather_results[13]
-    assert 'test1,family,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000' in sum_gather_results[14]
-    assert 'test1,genus,0.0885520542481053,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,md5,test1.sig,0.05701254275940707,444000' in sum_gather_results[15]
-    assert 'test1,genus,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia,md5,test1.sig,0.05815279361459521,442000' in sum_gather_results[16]
-    assert 'test1,genus,0.027522935779816515,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola,md5,test1.sig,0.015637726014008795,138000' in sum_gather_results[17]
-    assert 'test1,genus,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000' in sum_gather_results[18]
-    assert 'test1,species,0.0885520542481053,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.05701254275940707,444000' in sum_gather_results[19]
-    assert 'test1,species,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli,md5,test1.sig,0.05815279361459521,442000' in sum_gather_results[20]
-    assert 'test1,species,0.027522935779816515,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus,md5,test1.sig,0.015637726014008795,138000' in sum_gather_results[21]
-    assert 'test1,species,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000' in sum_gather_results[22]
+    assert (
+        "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in sum_gather_results[0]
+    )
+    assert (
+        "test1,superkingdom,0.2042281611487834,d__Bacteria,md5,test1.sig,0.13080306238801107,1024000"
+        in sum_gather_results[1]
+    )
+    assert (
+        "test1,superkingdom,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000"
+        in sum_gather_results[2]
+    )
+    assert (
+        "test1,phylum,0.11607499002792182,d__Bacteria;p__Bacteroidota,md5,test1.sig,0.07265026877341586,582000"
+        in sum_gather_results[3]
+    )
+    assert (
+        "test1,phylum,0.08815317112086159,d__Bacteria;p__Proteobacteria,md5,test1.sig,0.05815279361459521,442000"
+        in sum_gather_results[4]
+    )
+    assert (
+        "test1,phylum,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000"
+        in sum_gather_results[5]
+    )
+    assert (
+        "test1,class,0.11607499002792182,d__Bacteria;p__Bacteroidota;c__Bacteroidia,md5,test1.sig,0.07265026877341586,582000"
+        in sum_gather_results[6]
+    )
+    assert (
+        "test1,class,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria,md5,test1.sig,0.05815279361459521,442000"
+        in sum_gather_results[7]
+    )
+    assert (
+        "test1,class,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000"
+        in sum_gather_results[8]
+    )
+    assert (
+        "test1,order,0.11607499002792182,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales,md5,test1.sig,0.07265026877341586,582000"
+        in sum_gather_results[9]
+    )
+    assert (
+        "test1,order,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales,md5,test1.sig,0.05815279361459521,442000"
+        in sum_gather_results[10]
+    )
+    assert (
+        "test1,order,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000"
+        in sum_gather_results[11]
+    )
+    assert (
+        "test1,family,0.11607499002792182,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.07265026877341586,582000"
+        in sum_gather_results[12]
+    )
+    assert (
+        "test1,family,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae,md5,test1.sig,0.05815279361459521,442000"
+        in sum_gather_results[13]
+    )
+    assert (
+        "test1,family,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000"
+        in sum_gather_results[14]
+    )
+    assert (
+        "test1,genus,0.0885520542481053,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,md5,test1.sig,0.05701254275940707,444000"
+        in sum_gather_results[15]
+    )
+    assert (
+        "test1,genus,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia,md5,test1.sig,0.05815279361459521,442000"
+        in sum_gather_results[16]
+    )
+    assert (
+        "test1,genus,0.027522935779816515,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola,md5,test1.sig,0.015637726014008795,138000"
+        in sum_gather_results[17]
+    )
+    assert (
+        "test1,genus,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000"
+        in sum_gather_results[18]
+    )
+    assert (
+        "test1,species,0.0885520542481053,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.05701254275940707,444000"
+        in sum_gather_results[19]
+    )
+    assert (
+        "test1,species,0.08815317112086159,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli,md5,test1.sig,0.05815279361459521,442000"
+        in sum_gather_results[20]
+    )
+    assert (
+        "test1,species,0.027522935779816515,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus,md5,test1.sig,0.015637726014008795,138000"
+        in sum_gather_results[21]
+    )
+    assert (
+        "test1,species,0.7957718388512166,unclassified,md5,test1.sig,0.8691969376119889,3990000"
+        in sum_gather_results[22]
+    )
 
 
 def test_metagenome_summary_csv_out_empty_gather_force(runtmp):
     # test multiple -g, empty -g file, and --force
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.taxonomy.csv")
 
     csv_base = "out"
     sum_csv = csv_base + ".summarized.csv"
     csvout = runtmp.output(sum_csv)
     outdir = os.path.dirname(csvout)
 
-    gather_empty = runtmp.output('g.csv')
+    gather_empty = runtmp.output("g.csv")
     with open(gather_empty, "w") as fp:
         fp.write("")
     print("g_csv: ", gather_empty)
 
-    runtmp.run_sourmash('tax', 'metagenome', '--gather-csv', g_csv, '-g', gather_empty, '--taxonomy-csv', tax, '-o', csv_base, '--output-dir', outdir, '-f')
+    runtmp.run_sourmash(
+        "tax",
+        "metagenome",
+        "--gather-csv",
+        g_csv,
+        "-g",
+        gather_empty,
+        "--taxonomy-csv",
+        tax,
+        "-o",
+        csv_base,
+        "--output-dir",
+        outdir,
+        "-f",
+    )
 
     sum_gather_results = [x.rstrip() for x in Path(csvout).read_text().splitlines()]
     assert f"saving 'csv_summary' output to '{csvout}'" in runtmp.last_result.err
-    assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in sum_gather_results[0]
-    assert 'test1,superkingdom,0.2042281611487834,d__Bacteria,md5,test1.sig,0.13080306238801107,1024000' in sum_gather_results[1]
+    assert (
+        "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in sum_gather_results[0]
+    )
+    assert (
+        "test1,superkingdom,0.2042281611487834,d__Bacteria,md5,test1.sig,0.13080306238801107,1024000"
+        in sum_gather_results[1]
+    )
 
 
 def test_metagenome_kreport_out(runtmp):
     # test 'kreport' kraken output format
-    g_csv = utils.get_test_data('tax/test1.gather.v450.csv')
-    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.v450.csv")
+    tax = utils.get_test_data("tax/test.taxonomy.csv")
 
     csv_base = "out"
     sum_csv = csv_base + ".kreport.txt"
     csvout = runtmp.output(sum_csv)
     outdir = os.path.dirname(csvout)
 
-    runtmp.run_sourmash('tax', 'metagenome', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-dir', outdir, '-F', "kreport")
+    runtmp.run_sourmash(
+        "tax",
+        "metagenome",
+        "--gather-csv",
+        g_csv,
+        "--taxonomy-csv",
+        tax,
+        "-o",
+        csv_base,
+        "--output-dir",
+        outdir,
+        "-F",
+        "kreport",
+    )
 
     print(runtmp.last_result.status)
     print(runtmp.last_result.out)
@@ -185,37 +437,89 @@ def test_metagenome_kreport_out(runtmp):
     assert runtmp.last_result.status == 0
     assert os.path.exists(csvout)
 
-    kreport_results = [x.rstrip().split('\t') for x in Path(csvout).read_text().splitlines()]
+    kreport_results = [
+        x.rstrip().split("\t") for x in Path(csvout).read_text().splitlines()
+    ]
 
     assert f"saving 'kreport' output to '{csvout}'" in runtmp.last_result.err
     print(kreport_results)
-    assert ['13.08', '1605999', '0', 'D', '', 'd__Bacteria'] == kreport_results[0]
-    assert ['86.92', '10672000', '10672000', 'U', '', 'unclassified'] == kreport_results[1]
-    assert ['7.27', '892000', '0', 'P', '', 'p__Bacteroidota'] == kreport_results[2]
-    assert ['5.82', '714000', '0', 'P', '', 'p__Proteobacteria'] == kreport_results[3]
-    assert ['7.27', '892000', '0', 'C', '', 'c__Bacteroidia'] == kreport_results[4]
-    assert ['5.82', '714000', '0', 'C', '', 'c__Gammaproteobacteria'] == kreport_results[5]
-    assert ['7.27', '892000', '0', 'O', '', 'o__Bacteroidales'] == kreport_results[6]
-    assert ['5.82', '714000', '0', 'O', '', 'o__Enterobacterales'] == kreport_results[7]
-    assert ['7.27', '892000', '0', 'F', '', 'f__Bacteroidaceae'] == kreport_results[8]
-    assert ['5.82', '714000', '0', 'F', '', 'f__Enterobacteriaceae'] == kreport_results[9]
-    assert ['5.70', '700000', '0', 'G', '', 'g__Prevotella'] == kreport_results[10]
-    assert ['5.82', '714000', '0', 'G', '', 'g__Escherichia'] == kreport_results[11]
-    assert ['1.56', '192000', '0', 'G', '', 'g__Phocaeicola'] == kreport_results[12]
-    assert ['5.70', '700000', '700000', 'S', '', 's__Prevotella copri'] == kreport_results[13]
-    assert ['5.82', '714000', '714000', 'S', '', 's__Escherichia coli']== kreport_results[14]
-    assert ['1.56', '192000', '192000', 'S', '', 's__Phocaeicola vulgatus'] == kreport_results[15]
+    assert ["13.08", "1605999", "0", "D", "", "d__Bacteria"] == kreport_results[0]
+    assert [
+        "86.92",
+        "10672000",
+        "10672000",
+        "U",
+        "",
+        "unclassified",
+    ] == kreport_results[1]
+    assert ["7.27", "892000", "0", "P", "", "p__Bacteroidota"] == kreport_results[2]
+    assert ["5.82", "714000", "0", "P", "", "p__Proteobacteria"] == kreport_results[3]
+    assert ["7.27", "892000", "0", "C", "", "c__Bacteroidia"] == kreport_results[4]
+    assert [
+        "5.82",
+        "714000",
+        "0",
+        "C",
+        "",
+        "c__Gammaproteobacteria",
+    ] == kreport_results[5]
+    assert ["7.27", "892000", "0", "O", "", "o__Bacteroidales"] == kreport_results[6]
+    assert ["5.82", "714000", "0", "O", "", "o__Enterobacterales"] == kreport_results[7]
+    assert ["7.27", "892000", "0", "F", "", "f__Bacteroidaceae"] == kreport_results[8]
+    assert ["5.82", "714000", "0", "F", "", "f__Enterobacteriaceae"] == kreport_results[
+        9
+    ]
+    assert ["5.70", "700000", "0", "G", "", "g__Prevotella"] == kreport_results[10]
+    assert ["5.82", "714000", "0", "G", "", "g__Escherichia"] == kreport_results[11]
+    assert ["1.56", "192000", "0", "G", "", "g__Phocaeicola"] == kreport_results[12]
+    assert [
+        "5.70",
+        "700000",
+        "700000",
+        "S",
+        "",
+        "s__Prevotella copri",
+    ] == kreport_results[13]
+    assert [
+        "5.82",
+        "714000",
+        "714000",
+        "S",
+        "",
+        "s__Escherichia coli",
+    ] == kreport_results[14]
+    assert [
+        "1.56",
+        "192000",
+        "192000",
+        "S",
+        "",
+        "s__Phocaeicola vulgatus",
+    ] == kreport_results[15]
 
 
 def test_metagenome_kreport_ncbi_taxid_out(runtmp):
     # test NCBI taxid output from kreport
-    g_csv = utils.get_test_data('tax/test1.gather.v450.csv')
-    tax = utils.get_test_data('tax/test.ncbi-taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.v450.csv")
+    tax = utils.get_test_data("tax/test.ncbi-taxonomy.csv")
 
     csv_base = "out"
     sum_csv = csv_base + ".kreport.txt"
     csvout = runtmp.output(sum_csv)
     outdir = os.path.dirname(csvout)
 
-    runtmp.run_sourmash('tax', 'metagenome', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-dir', outdir, '-F', "kreport")
+    runtmp.run_sourmash(
+        "tax",
+        "metagenome",
+        "--gather-csv",
+        g_csv,
+        "--taxonomy-csv",
+        tax,
+        "-o",
+        csv_base,
+        "--output-dir",
+        outdir,
+        "-F",
+        "kreport",
+    )
 
     print(runtmp.last_result.status)
    print(runtmp.last_result.out)
@@ -224,38 +528,94 @@ def test_metagenome_kreport_ncbi_taxid_out(runtmp):
     assert runtmp.last_result.status == 0
     assert os.path.exists(csvout)
 
-    kreport_results = [x.rstrip().split('\t') for x in Path(csvout).read_text().splitlines()]
+    kreport_results = [
+        x.rstrip().split("\t") for x in Path(csvout).read_text().splitlines()
+    ]
 
     assert f"saving 'kreport' output to '{csvout}'" in runtmp.last_result.err
     print(kreport_results)
-    assert ['13.08', '1605999', '0', 'D', '2', 'Bacteria'] == kreport_results[0]
-    assert ['86.92', '10672000', '10672000', 'U', '', 'unclassified'] == kreport_results[1]
-    assert ['7.27', '892000', '0', 'P', '976', 'Bacteroidota'] == kreport_results[2]
-    assert ['5.82', '714000', '0', 'P', '1224', 'Pseudomonadota'] == kreport_results[3]
-    assert ['7.27', '892000', '0', 'C', '200643', 'Bacteroidia'] == kreport_results[4]
-    assert ['5.82', '714000', '0', 'C', '1236', 'Gammaproteobacteria'] == kreport_results[5]
-    assert ['7.27', '892000', '0', 'O', '171549', 'Bacteroidales'] == kreport_results[6]
-    assert ['5.82', '714000', '0', 'O', '91347', 'Enterobacterales'] == kreport_results[7]
-    assert ['5.70', '700000', '0', 'F', '171552', 'Prevotellaceae'] == kreport_results[8]
-    assert ['5.82', '714000', '0', 'F', '543', 'Enterobacteriaceae'] == kreport_results[9]
-    assert ['1.56', '192000', '0', 'F', '815', 'Bacteroidaceae'] == kreport_results[10]
-    assert ['5.70', '700000', '0', 'G', '838', 'Prevotella'] == kreport_results[11]
-    assert ['5.82', '714000', '0', 'G', '561', 'Escherichia'] == kreport_results[12]
-    assert ['1.56', '192000', '0', 'G', '909656', 'Phocaeicola'] == kreport_results[13]
-    assert ['5.70', '700000', '700000', 'S', '165179', 'Prevotella copri'] == kreport_results[14]
-    assert ['5.82', '714000', '714000', 'S', '562', 'Escherichia coli'] == kreport_results[15]
-    assert ['1.56', '192000', '192000', 'S', '821', 'Phocaeicola vulgatus'] == kreport_results[16]
+    assert ["13.08", "1605999", "0", "D", "2", "Bacteria"] == kreport_results[0]
+    assert [
+        "86.92",
+        "10672000",
+        "10672000",
+        "U",
+        "",
+        "unclassified",
+    ] == kreport_results[1]
+    assert ["7.27", "892000", "0", "P", "976", "Bacteroidota"] == kreport_results[2]
+    assert ["5.82", "714000", "0", "P", "1224", "Pseudomonadota"] == kreport_results[3]
+    assert ["7.27", "892000", "0", "C", "200643", "Bacteroidia"] == kreport_results[4]
+    assert [
+        "5.82",
+        "714000",
+        "0",
+        "C",
+        "1236",
+        "Gammaproteobacteria",
+    ] == kreport_results[5]
+    assert ["7.27", "892000", "0", "O", "171549", "Bacteroidales"] == kreport_results[6]
+    assert ["5.82", "714000", "0", "O", "91347", "Enterobacterales"] == kreport_results[
+        7
+    ]
+    assert ["5.70", "700000", "0", "F", "171552", "Prevotellaceae"] == kreport_results[
+        8
+    ]
+    assert ["5.82", "714000", "0", "F", "543", "Enterobacteriaceae"] == kreport_results[
+        9
+    ]
+    assert ["1.56", "192000", "0", "F", "815", "Bacteroidaceae"] == kreport_results[10]
+    assert ["5.70", "700000", "0", "G", "838", "Prevotella"] == kreport_results[11]
+    assert ["5.82", "714000", "0", "G", "561", "Escherichia"] == kreport_results[12]
+    assert ["1.56", "192000", "0", "G", "909656", "Phocaeicola"] == kreport_results[13]
+    assert [
+        "5.70",
+        "700000",
+        "700000",
+        "S",
+        "165179",
+        "Prevotella copri",
+    ] == kreport_results[14]
+    assert [
+        "5.82",
+        "714000",
+        "714000",
+        "S",
+        "562",
+        "Escherichia coli",
+    ] == kreport_results[15]
+    assert [
+        "1.56",
+        "192000",
+        "192000",
+        "S",
+        "821",
+        "Phocaeicola vulgatus",
+    ] == kreport_results[16]
 
 
 def test_metagenome_kreport_out_lemonade(runtmp):
     # test 'kreport' kraken output format against lemonade output
-    g_csv = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.csv')
-    tax = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.tax.csv')
+    g_csv = utils.get_test_data("tax/lemonade-MAG3.x.gtdb.csv")
+    tax = utils.get_test_data("tax/lemonade-MAG3.x.gtdb.matches.tax.csv")
 
     csv_base = "out"
     sum_csv = csv_base + ".kreport.txt"
     csvout = runtmp.output(sum_csv)
     outdir = os.path.dirname(csvout)
 
-    runtmp.run_sourmash('tax', 'metagenome', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-dir', outdir, '-F', "kreport")
+    runtmp.run_sourmash(
+        "tax",
+        "metagenome",
+        "--gather-csv",
+        g_csv,
+        "--taxonomy-csv",
+        tax,
+        "-o",
+        csv_base,
+        "--output-dir",
+        outdir,
+        "-F",
+        "kreport",
+    )
 
     print(runtmp.last_result.status)
     print(runtmp.last_result.out)
@@ -264,44 +624,80 @@ def test_metagenome_kreport_out_lemonade(runtmp):
     assert runtmp.last_result.status == 0
     assert os.path.exists(csvout)
 
-    kreport_results = [x.rstrip().split('\t') for x in Path(csvout).read_text().splitlines()]
+    kreport_results = [
+        x.rstrip().split("\t") for x in Path(csvout).read_text().splitlines()
+    ]
 
     assert f"saving 'kreport' output to '{csvout}'" in runtmp.last_result.err
     print(kreport_results)
-    assert ['5.35', '116000', '0', 'D', '', 'd__Bacteria'] == kreport_results[0]
-    assert ['94.65', '2054000', '2054000', 'U', '', 'unclassified'] == kreport_results[1]
-    assert ['5.35', '116000', '0', 'P', '', 'p__Bacteroidota'] == kreport_results[2]
-    assert ['5.35', '116000', '0', 'C', '', 'c__Chlorobia'] == kreport_results[3]
-    assert ['5.35', '116000', '0', 'O', '', 'o__Chlorobiales'] == kreport_results[4]
-    assert ['5.35', '116000', '0', 'F', '', 'f__Chlorobiaceae'] == kreport_results[5]
-    assert ['5.35', '116000', '0', 'G', '', 'g__Prosthecochloris'] == kreport_results[6]
-    assert ['5.35', '116000', '116000', 'S', '', 's__Prosthecochloris vibrioformis'] == kreport_results[7]
+    assert ["5.35", "116000", "0", "D", "", "d__Bacteria"] == kreport_results[0]
+    assert ["94.65", "2054000", "2054000", "U", "", "unclassified"] == kreport_results[
+        1
+    ]
+    assert ["5.35", "116000", "0", "P", "", "p__Bacteroidota"] == kreport_results[2]
+    assert ["5.35", "116000", "0", "C", "", "c__Chlorobia"] == kreport_results[3]
+    assert ["5.35", "116000", "0", "O", "", "o__Chlorobiales"] == kreport_results[4]
+    assert ["5.35", "116000", "0", "F", "", "f__Chlorobiaceae"] == kreport_results[5]
+    assert ["5.35", "116000", "0", "G", "", "g__Prosthecochloris"] == kreport_results[6]
+    assert [
+        "5.35",
+        "116000",
+        "116000",
+        "S",
+        "",
+        "s__Prosthecochloris vibrioformis",
+    ] == kreport_results[7]
 
 
 def test_metagenome_kreport_out_fail(runtmp):
     # kreport cannot be generated with gather results from < v4.5.0
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.taxonomy.csv")
 
     csv_base = "out"
     sum_csv = csv_base + ".kreport.txt"
     csvout = runtmp.output(sum_csv)
     outdir = os.path.dirname(csvout)
 
     with pytest.raises(SourmashCommandFailed):
-        runtmp.run_sourmash('tax', 'metagenome', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-dir', outdir, '-F', "kreport")
+        runtmp.run_sourmash(
+            "tax",
+            "metagenome",
+            "--gather-csv",
+            g_csv,
+            "--taxonomy-csv",
+            tax,
+            "-o",
+            csv_base,
+            "--output-dir",
+            outdir,
+            "-F",
+            "kreport",
+        )
 
     print(runtmp.last_result.status)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
 
-    assert "ERROR: cannot produce 'kreport' format from gather results before sourmash v4.5.0" in runtmp.last_result.err
+    assert (
+        "ERROR: cannot produce 'kreport' format from gather results before sourmash v4.5.0"
+        in runtmp.last_result.err
+    )
 
 
 def test_metagenome_bioboxes_stdout(runtmp):
     # test CAMI bioboxes format output
-    g_csv = utils.get_test_data('tax/test1.gather.v450.csv')
-    tax = utils.get_test_data('tax/test.ncbi-taxonomy.csv')
-
-    runtmp.run_sourmash('tax', 'metagenome', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-F', "bioboxes")
+    g_csv = utils.get_test_data("tax/test1.gather.v450.csv")
+    tax = utils.get_test_data("tax/test.ncbi-taxonomy.csv")
+
+    runtmp.run_sourmash(
+        "tax",
+        "metagenome",
+        "--gather-csv",
+        g_csv,
+        "--taxonomy-csv",
+        tax,
+        "-F",
+        "bioboxes",
+    )
 
     print(runtmp.last_result.status)
     print(runtmp.last_result.out)
@@ -312,36 +708,97 @@ def test_metagenome_bioboxes_stdout(runtmp):
     assert "# Taxonomic Profiling Output" in runtmp.last_result.out
     assert "@SampleID:test1" in runtmp.last_result.out
     assert "@Version:0.10.0" in runtmp.last_result.out
-    assert "@Ranks:superkingdom|phylum|class|order|family|genus|species|strain" in runtmp.last_result.out
+    assert (
+        "@Ranks:superkingdom|phylum|class|order|family|genus|species|strain"
+        in runtmp.last_result.out
+    )
     assert "@__program__:sourmash" in runtmp.last_result.out
     assert "2 superkingdom 2 Bacteria 13.08" in runtmp.last_result.out
-    assert "976 phylum 2|976 Bacteria|Bacteroidota 7.27" in runtmp.last_result.out
-    assert "1224 phylum 2|1224 Bacteria|Pseudomonadota 5.82" in runtmp.last_result.out
-    assert "200643 class 2|976|200643 Bacteria|Bacteroidota|Bacteroidia 7.27" in runtmp.last_result.out
-    assert "1236 class 2|1224|1236 Bacteria|Pseudomonadota|Gammaproteobacteria 5.82" in runtmp.last_result.out
-    assert "171549 order 2|976|200643|171549 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales 7.27" in runtmp.last_result.out
-    assert "91347 order 2|1224|1236|91347 Bacteria|Pseudomonadota|Gammaproteobacteria|Enterobacterales 5.82" in runtmp.last_result.out
-    assert "171552 family 2|976|200643|171549|171552 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae 5.70" in runtmp.last_result.out
-    assert "543 family 2|1224|1236|91347|543 Bacteria|Pseudomonadota|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae 5.82" in runtmp.last_result.out
-    assert "815 family 2|976|200643|171549|815 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Bacteroidaceae 1.56" in runtmp.last_result.out
-    assert "838 genus 2|976|200643|171549|171552|838 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae|Prevotella 5.70" in runtmp.last_result.out
-    assert "561 genus 2|1224|1236|91347|543|561 Bacteria|Pseudomonadota|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Escherichia 5.82" in runtmp.last_result.out
-    assert "909656 genus 2|976|200643|171549|815|909656 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Bacteroidaceae|Phocaeicola 1.56" in runtmp.last_result.out
-    assert "165179 species 2|976|200643|171549|171552|838|165179 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae|Prevotella|Prevotella copri 5.70" in runtmp.last_result.out
-    assert "562 species 2|1224|1236|91347|543|561|562 Bacteria|Pseudomonadota|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Escherichia|Escherichia coli 5.82" in runtmp.last_result.out
-    assert "821 species 2|976|200643|171549|815|909656|821 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Bacteroidaceae|Phocaeicola|Phocaeicola vulgatus 1.56" in runtmp.last_result.out
+    assert (
+        "976 phylum 2|976 Bacteria|Bacteroidota 7.27"
+        in runtmp.last_result.out
+    )
+    assert (
+        "1224 phylum 2|1224 Bacteria|Pseudomonadota 5.82"
+        in runtmp.last_result.out
+    )
+    assert (
+        "200643 class 2|976|200643 Bacteria|Bacteroidota|Bacteroidia 7.27"
+        in runtmp.last_result.out
+    )
+    assert (
+        "1236 class 2|1224|1236 Bacteria|Pseudomonadota|Gammaproteobacteria 5.82"
+        in runtmp.last_result.out
+    )
+    assert (
+        "171549 order 2|976|200643|171549 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales 7.27"
+        in runtmp.last_result.out
+    )
+    assert (
+        "91347 order 2|1224|1236|91347 Bacteria|Pseudomonadota|Gammaproteobacteria|Enterobacterales 5.82"
+        in runtmp.last_result.out
+    )
+    assert (
+        "171552 family 2|976|200643|171549|171552 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae 5.70"
+        in runtmp.last_result.out
+    )
+    assert (
+        "543 family 2|1224|1236|91347|543 Bacteria|Pseudomonadota|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae 5.82"
+        in runtmp.last_result.out
+    )
+    assert (
+        "815 family 2|976|200643|171549|815 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Bacteroidaceae 1.56"
+        in runtmp.last_result.out
+    )
+    assert (
+        "838 genus 2|976|200643|171549|171552|838 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae|Prevotella 5.70"
+        in runtmp.last_result.out
+    )
+    assert (
+        "561 genus 2|1224|1236|91347|543|561 Bacteria|Pseudomonadota|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Escherichia 5.82"
+        in runtmp.last_result.out
+    )
+    assert (
+        "909656 genus 2|976|200643|171549|815|909656 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Bacteroidaceae|Phocaeicola 1.56"
+        in runtmp.last_result.out
+    )
+    assert (
+        "165179 species 2|976|200643|171549|171552|838|165179 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae|Prevotella|Prevotella copri 5.70"
+        in runtmp.last_result.out
+    )
+    assert (
+        "562 species 2|1224|1236|91347|543|561|562 Bacteria|Pseudomonadota|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Escherichia|Escherichia coli 5.82"
+        in runtmp.last_result.out
+    )
+    assert (
+        "821 species 2|976|200643|171549|815|909656|821 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Bacteroidaceae|Phocaeicola|Phocaeicola vulgatus 1.56"
+        in runtmp.last_result.out
+    )
 
 
 def test_metagenome_bioboxes_outfile(runtmp):
     # test CAMI bioboxes format output
-    g_csv = utils.get_test_data('tax/test1.gather.v450.csv')
-    tax = utils.get_test_data('tax/test.ncbi-taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.v450.csv")
+    tax = utils.get_test_data("tax/test.ncbi-taxonomy.csv")
 
     csv_base = "out"
     sum_csv = csv_base + ".bioboxes.profile"
     csvout = runtmp.output(sum_csv)
     outdir = os.path.dirname(csvout)
 
-    runtmp.run_sourmash('tax', 'metagenome', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-F', "bioboxes", '-o', csv_base, '--output-dir', outdir,)
+    runtmp.run_sourmash(
+        "tax",
+        "metagenome",
+        "--gather-csv",
+        g_csv,
+        "--taxonomy-csv",
+        tax,
+        "-F",
+        "bioboxes",
+        "-o",
+        csv_base,
+        "--output-dir",
+        outdir,
+    )
 
     print(runtmp.last_result.status)
     print(runtmp.last_result.out)
@@ -349,26 +806,46 @@ def test_metagenome_bioboxes_outfile(runtmp):
 
     assert runtmp.last_result.status == 0
 
-    bb_results = [x.rstrip().split('\t') for x in Path(csvout).read_text().splitlines()]
+    bb_results = [x.rstrip().split("\t") for x in Path(csvout).read_text().splitlines()]
 
     assert f"saving 'bioboxes' output to '{csvout}'" in runtmp.last_result.err
     print(bb_results)
-    assert ['# Taxonomic Profiling Output'] == bb_results[0]
-    assert ['@SampleID:test1'] == bb_results[1]
-    assert ['2', 'superkingdom', '2', 'Bacteria', '13.08'] == bb_results[6]
-    assert ['838', 'genus', '2|976|200643|171549|171552|838', 'Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae|Prevotella', '5.70'] == bb_results[16]
+    assert ["# Taxonomic Profiling Output"] == bb_results[0]
+    assert ["@SampleID:test1"] == bb_results[1]
+    assert ["2", "superkingdom", "2", "Bacteria", "13.08"] == bb_results[6]
+    assert [
+        "838",
+        "genus",
+        "2|976|200643|171549|171552|838",
+        "Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae|Prevotella",
+        "5.70",
+    ] == bb_results[16]
 
 
 def test_metagenome_krona_tsv_out(runtmp):
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.taxonomy.csv")
 
     csv_base = "out"
     kr_csv = csv_base + ".krona.tsv"
    csvout = runtmp.output(kr_csv)
     outdir = os.path.dirname(csvout)
     print("csvout: ", csvout)
 
-    runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '-o', csv_base,
-                        '--output-format', 'krona', '--rank', 'genus', '--output-dir', outdir)
+    runtmp.run_sourmash(
+        "tax",
+        "metagenome",
+        "-g",
+        g_csv,
+        "--taxonomy-csv",
+        tax,
+        "-o",
+        csv_base,
+        "--output-format",
+        "krona",
+        "--rank",
+        "genus",
+        "--output-dir",
+        outdir,
+    )
 
     print(runtmp.last_result.status)
     print(runtmp.last_result.out)
@@ -378,27 +855,82 @@ def test_metagenome_krona_tsv_out(runtmp):
     assert os.path.exists(csvout)
     assert f"saving 'krona' output to '{csvout}'" in runtmp.last_result.err
 
-    gn_krona_results = [x.rstrip().split('\t') for x in Path(csvout).read_text().splitlines()]
+    gn_krona_results = [
+        x.rstrip().split("\t") for x in Path(csvout).read_text().splitlines()
+    ]
     print("species krona results: \n", gn_krona_results)
-    assert ['fraction', 'superkingdom', 'phylum', 'class', 'order', 'family', 'genus'] == gn_krona_results[0]
-    assert ['0.0885520542481053', 'd__Bacteria', 'p__Bacteroidota', 'c__Bacteroidia', 'o__Bacteroidales', 'f__Bacteroidaceae', 'g__Prevotella'] == gn_krona_results[1]
-    assert ['0.08815317112086159', 'd__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacterales', 'f__Enterobacteriaceae', 'g__Escherichia'] == gn_krona_results[2]
-    assert ['0.027522935779816515', 'd__Bacteria', 'p__Bacteroidota', 'c__Bacteroidia', 'o__Bacteroidales', 'f__Bacteroidaceae', 'g__Phocaeicola'] == gn_krona_results[3]
-    assert ['0.7957718388512166', 'unclassified', 'unclassified', 'unclassified', 'unclassified', 'unclassified', 'unclassified'] == gn_krona_results[4]
+    assert [
+        "fraction",
+        "superkingdom",
+        "phylum",
+        "class",
+        "order",
+        "family",
+        "genus",
+    ] == gn_krona_results[0]
+    assert [
+        "0.0885520542481053",
+        "d__Bacteria",
+        "p__Bacteroidota",
+        "c__Bacteroidia",
+        "o__Bacteroidales",
+        "f__Bacteroidaceae",
+        "g__Prevotella",
+    ] == gn_krona_results[1]
+    assert [
+        "0.08815317112086159",
+        "d__Bacteria",
+        "p__Proteobacteria",
+        "c__Gammaproteobacteria",
+        "o__Enterobacterales",
+        "f__Enterobacteriaceae",
+        "g__Escherichia",
+    ] == gn_krona_results[2]
+    assert [
+        "0.027522935779816515",
+        "d__Bacteria",
+        "p__Bacteroidota",
+        "c__Bacteroidia",
+        "o__Bacteroidales",
+        "f__Bacteroidaceae",
+        "g__Phocaeicola",
+    ] == gn_krona_results[3]
+    assert [
+        "0.7957718388512166",
+        "unclassified",
+        "unclassified",
+        "unclassified",
+        "unclassified",
+        "unclassified",
+        "unclassified",
+    ] == gn_krona_results[4]
 
 
 def test_metagenome_lineage_summary_out(runtmp):
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.taxonomy.csv")
 
     csv_base = "out"
     lin_csv = csv_base + ".lineage_summary.tsv"
     csvout = runtmp.output(lin_csv)
     outdir = os.path.dirname(csvout)
     print("csvout: ", csvout)
 
-    runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax,
-                        '-o', csv_base, '--output-format', 'lineage_summary', '--rank',
-                        'genus', '--output-dir', outdir)
+    runtmp.run_sourmash(
+        "tax",
+        "metagenome",
+        "-g",
+        g_csv,
+        "--taxonomy-csv",
+        tax,
+        "-o",
+        csv_base,
+        "--output-format",
+        "lineage_summary",
+        "--rank",
+        "genus",
+        "--output-dir",
+        outdir,
+    )
 
     print(runtmp.last_result.status)
     print(runtmp.last_result.out)
@@ -408,26 +940,50 @@ def test_metagenome_lineage_summary_out(runtmp):
     assert os.path.exists(csvout)
     assert f"saving 'lineage_summary' output to '{csvout}'" in runtmp.last_result.err
 
-    gn_lineage_summary = [x.rstrip().split('\t') for x in Path(csvout).read_text().splitlines()]
+    gn_lineage_summary = [
+        x.rstrip().split("\t") for x in Path(csvout).read_text().splitlines()
+    ]
     print("species lineage summary results: \n", gn_lineage_summary)
-    assert ['lineage', 'test1'] == gn_lineage_summary[0]
-    assert ['d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola', '0.027522935779816515'] == gn_lineage_summary[1]
-    assert ['d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella', '0.0885520542481053'] == gn_lineage_summary[2]
-    assert ['d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia', '0.08815317112086159'] == gn_lineage_summary[3]
-    assert ['unclassified', '0.7957718388512166'] == gn_lineage_summary[4]
+    assert ["lineage", "test1"] == gn_lineage_summary[0]
+    assert [
+        "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola",
+        "0.027522935779816515",
+    ] == gn_lineage_summary[1]
+    assert [
+        "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella",
+        "0.0885520542481053",
+    ] == gn_lineage_summary[2]
+    assert [
+        "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia",
+        "0.08815317112086159",
+    ] == gn_lineage_summary[3]
+    assert ["unclassified", "0.7957718388512166"] == gn_lineage_summary[4]
 
 
 def test_metagenome_human_format_out(runtmp):
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.taxonomy.csv")
 
     csv_base = "out"
-    csvout = runtmp.output(csv_base + '.human.txt')
+    csvout = runtmp.output(csv_base + ".human.txt")
     outdir = os.path.dirname(csvout)
     print("csvout: ", csvout)
 
-    runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax,
-                        '-o', csv_base, '--output-format', 'human', '--rank',
-                        'genus', '--output-dir', outdir)
+    runtmp.run_sourmash(
+        "tax",
+        "metagenome",
+        "-g",
+        g_csv,
+        "--taxonomy-csv",
+        tax,
+        "-o",
+        csv_base,
+        "--output-format",
+        "human",
+        "--rank",
+        "genus",
+        "--output-dir",
+        outdir,
+    )
 
     print(runtmp.last_result.status)
     print(runtmp.last_result.out)
@@ -441,104 +997,192 @@ def test_metagenome_human_format_out(runtmp):
         outp = fp.readlines()
 
     assert len(outp) == 6
-    outp = [ x.strip() for x in outp ]
+    outp = [x.strip() for x in outp]
     print(outp)
 
-    assert outp[0] == 'sample name proportion cANI lineage'
-    assert outp[1] == '----------- ---------- ---- -------'
-    assert outp[2] == 'test1 86.9% - unclassified'
-    assert outp[3] == 'test1 5.8% 92.5% d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia'
-    assert outp[4] == 'test1 5.7% 92.5% d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella'
-    assert outp[5] == 'test1 1.6% 89.1% d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola'
+    assert outp[0] == "sample name proportion cANI lineage"
+    assert outp[1] == "----------- ---------- ---- -------"
+    assert outp[2] == "test1 86.9% - unclassified"
+    assert (
+        outp[3]
+        == "test1 5.8% 92.5% d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia"
+    )
+    assert (
+        outp[4]
+        == "test1 5.7% 92.5% d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella"
+    )
+    assert (
+        outp[5]
+        == "test1 1.6% 89.1% d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola"
+    )
 
 
 def test_metagenome_no_taxonomy_fail(runtmp):
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
 
     with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('tax', 'metagenome', '-g', g_csv)
-    assert "error: the following arguments are required: -t/--taxonomy-csv" in str(exc.value)
+        c.run_sourmash("tax", "metagenome", "-g", g_csv)
+    assert "error: the following arguments are required: -t/--taxonomy-csv" in str(
+        exc.value
+    )
 
 
 def test_metagenome_no_rank_lineage_summary(runtmp):
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.taxonomy.csv")
     csv_base = "out"
 
     with pytest.raises(SourmashCommandFailed) as exc:
-        runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'lineage_summary')
+        runtmp.run_sourmash(
+            "tax",
+            "metagenome",
+            "-g",
+            g_csv,
+            "--taxonomy-csv",
+            tax,
+            "-o",
+            csv_base,
+            "--output-format",
+            "lineage_summary",
+        )
 
     print(str(exc.value))
-    assert "Rank (--rank) is required for krona, lineage_summary output formats." in str(exc.value)
+    assert (
+        "Rank (--rank) is required for krona, lineage_summary output formats."
+        in str(exc.value)
+    )
 
 
 def test_metagenome_no_rank_krona(runtmp):
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.taxonomy.csv")
    csv_base = "out"
 
     with pytest.raises(SourmashCommandFailed) as exc:
-        runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'krona')
+        runtmp.run_sourmash(
+            "tax",
+            "metagenome",
+            "-g",
+            g_csv,
+            "--taxonomy-csv",
+            tax,
+            "-o",
+            csv_base,
+            "--output-format",
+            "krona",
+        )
 
     print(str(exc.value))
-    assert "Rank (--rank) is required for krona, lineage_summary output formats." in str(exc.value)
+    assert (
+        "Rank (--rank) is required for krona, lineage_summary output formats."
+        in str(exc.value)
+    )
 
 
 def test_metagenome_bad_rank_krona(runtmp):
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.taxonomy.csv")
     csv_base = "out"
 
     with pytest.raises(SourmashCommandFailed) as exc:
-        runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'krona', '--rank', 'NotARank')
+        runtmp.run_sourmash(
+            "tax",
+            "metagenome",
+            "-g",
+            g_csv,
+            "--taxonomy-csv",
+            tax,
+            "-o",
+            csv_base,
+            "--output-format",
+            "krona",
+            "--rank",
+            "NotARank",
+        )
 
     print(str(exc.value))
-    assert "Invalid '--rank'/'--position' input: 'NotARank'. Please choose: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'" in runtmp.last_result.err
+    assert (
+        "Invalid '--rank'/'--position' input: 'NotARank'. Please choose: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'"
+        in runtmp.last_result.err
+    )
 
     with pytest.raises(SourmashCommandFailed) as exc:
-        runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'krona', '--rank', '5')
+        runtmp.run_sourmash(
+            "tax",
+            "metagenome",
+            "-g",
+            g_csv,
+            "--taxonomy-csv",
+            tax,
+            "-o",
+            csv_base,
+            "--output-format",
+            "krona",
+            "--rank",
+            "5",
+        )
 
     print(str(exc.value))
-    assert "Invalid '--rank'/'--position' input: '5'. Please choose: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'" in runtmp.last_result.err
+    assert (
+        "Invalid '--rank'/'--position' input: '5'. Please choose: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'"
+        in runtmp.last_result.err
+    )
 
 
 def test_genome_no_rank_krona(runtmp):
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.taxonomy.csv")
     csv_base = "out"
 
     with pytest.raises(SourmashCommandFailed) as exc:
-        runtmp.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'krona')
+        runtmp.run_sourmash(
+            "tax",
+            "genome",
+            "-g",
+            g_csv,
+            "--taxonomy-csv",
+            tax,
+            "-o",
+            csv_base,
+            "--output-format",
+            "krona",
+        )
 
     assert "ERROR: Rank (--rank) is required for krona output formats" in str(exc.value)
 
 
 def test_metagenome_rank_not_available(runtmp):
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.taxonomy.csv")
 
     with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax,
-                       '--rank', 'strain')
+        c.run_sourmash(
+            "tax", "metagenome", "-g", g_csv, "--taxonomy-csv", tax, "--rank", "strain"
+        )
 
     print(str(exc.value))
 
     assert c.last_result.status == -1
-    assert "No taxonomic information provided for rank strain: cannot summarize at this rank" in str(exc.value)
+    assert (
+        "No taxonomic information provided for rank strain: cannot summarize at this rank"
+        in str(exc.value)
+    )
 
 
 def test_metagenome_duplicated_taxonomy_fail(runtmp):
     c = runtmp
     # write temp taxonomy with duplicates
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
     duplicated_csv = runtmp.output("duplicated_taxonomy.csv")
-    with open(duplicated_csv, 'w') as dup:
+    with open(duplicated_csv, "w") as dup:
         tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()]
-        tax.append(tax[1] + 'FOO') # add first tax_assign again
+        tax.append(tax[1] + "FOO")  # add first tax_assign again
         dup.write("\n".join(tax))
 
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
 
     with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', duplicated_csv)
+        c.run_sourmash(
+            "tax", "metagenome", "-g", g_csv, "--taxonomy-csv", duplicated_csv
+        )
 
     assert "cannot read taxonomy" in str(exc.value)
     assert "multiple lineages for identifier GCF_001881345" in str(exc.value)
@@ -547,16 +1191,18 @@ def test_metagenome_duplicated_taxonomy_fail(runtmp):
 def test_metagenome_duplicated_taxonomy_force(runtmp):
     c = runtmp
     # write temp taxonomy with duplicates
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
     duplicated_csv = runtmp.output("duplicated_taxonomy.csv")
-    with open(duplicated_csv, 'w') as dup:
+    with open(duplicated_csv, "w") as dup:
         tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()]
-        tax.append(tax[1]) # add first tax_assign again
+        tax.append(tax[1])  # add first tax_assign again
        dup.write("\n".join(tax))
 
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
 
-    c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', duplicated_csv, '--force')
+    c.run_sourmash(
+        "tax", "metagenome", "-g", g_csv, "--taxonomy-csv", duplicated_csv, "--force"
+    )
 
     print(c.last_result.status)
     print(c.last_result.out)
@@ -564,55 +1210,105 @@ def test_metagenome_duplicated_taxonomy_force(runtmp):
 
     # same as stdout test - just check the first few lines
     assert c.last_result.status == 0
-    assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
-    assert 'test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000' in c.last_result.out
-    assert 'test1,superkingdom,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out
-    assert 'test1,phylum,0.116,d__Bacteria;p__Bacteroidota,md5,test1.sig,0.073,582000' in c.last_result.out
-    assert 'test1,phylum,0.088,d__Bacteria;p__Proteobacteria,md5,test1.sig,0.058,442000' in c.last_result.out
-    assert 'test1,phylum,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out
+    assert (
+        "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,superkingdom,0.796,unclassified,md5,test1.sig,0.869,3990000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,phylum,0.116,d__Bacteria;p__Bacteroidota,md5,test1.sig,0.073,582000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,phylum,0.088,d__Bacteria;p__Proteobacteria,md5,test1.sig,0.058,442000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,phylum,0.796,unclassified,md5,test1.sig,0.869,3990000"
+        in c.last_result.out
+    )
 
 
 def test_metagenome_missing_taxonomy(runtmp):
     c = runtmp
     # write temp taxonomy with missing entry
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
     subset_csv = runtmp.output("subset_taxonomy.csv")
-    with open(subset_csv, 'w') as subset:
+    with open(subset_csv, "w") as subset:
        tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()]
         subset.write("\n".join(tax[:4]))
 
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
 
-    c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', subset_csv)
+    c.run_sourmash("tax", "metagenome", "-g", g_csv, "--taxonomy-csv", subset_csv)
 
     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)
 
     assert c.last_result.status == 0
-    assert "The following are missing from the taxonomy information: GCF_003471795" in c.last_result.err
-
-    assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
-    assert 'test1,superkingdom,0.193,d__Bacteria,md5,test1.sig,0.124,970000'in c.last_result.out
-    assert 'test1,superkingdom,0.807,unclassified,md5,test1.sig,0.876,4044000' in c.last_result.out
-    assert 'test1,phylum,0.105,d__Bacteria;p__Bacteroidota,md5,test1.sig,0.066,528000' in c.last_result.out
-    assert 'test1,phylum,0.088,d__Bacteria;p__Proteobacteria,md5,test1.sig,0.058,442000' in c.last_result.out
-    assert 'test1,phylum,0.807,unclassified,md5,test1.sig,0.876,4044000' in c.last_result.out
-    assert 'test1,class,0.105,d__Bacteria;p__Bacteroidota;c__Bacteroidia,md5,test1.sig,0.066,528000' in c.last_result.out
+    assert (
+        "The following are missing from the taxonomy information: GCF_003471795"
+        in c.last_result.err
+    )
+
+    assert (
+        "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,superkingdom,0.193,d__Bacteria,md5,test1.sig,0.124,970000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,superkingdom,0.807,unclassified,md5,test1.sig,0.876,4044000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,phylum,0.105,d__Bacteria;p__Bacteroidota,md5,test1.sig,0.066,528000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,phylum,0.088,d__Bacteria;p__Proteobacteria,md5,test1.sig,0.058,442000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,phylum,0.807,unclassified,md5,test1.sig,0.876,4044000"
+        in c.last_result.out
+    )
+    assert (
+        "test1,class,0.105,d__Bacteria;p__Bacteroidota;c__Bacteroidia,md5,test1.sig,0.066,528000"
+        in c.last_result.out
+    )
 
 
 def test_metagenome_missing_fail_taxonomy(runtmp):
     c = runtmp
     # write temp taxonomy with missing entry
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
     subset_csv = runtmp.output("subset_taxonomy.csv")
-    with open(subset_csv, 'w') as subset:
+    with open(subset_csv, "w") as subset:
         tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()]
         subset.write("\n".join(tax[:4]))
 
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
 
     with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', subset_csv, '--fail-on-missing-taxonomy')
+        c.run_sourmash(
+            "tax",
+            "metagenome",
+            "-g",
+            g_csv,
+            "--taxonomy-csv",
+            subset_csv,
+            "--fail-on-missing-taxonomy",
+        )
 
     print(str(exc.value))
@@ -624,162 +1320,315 @@ def test_metagenome_missing_fail_taxonomy(runtmp):
 def test_metagenome_multiple_taxonomy_files_missing(runtmp):
     c = runtmp
     # write temp taxonomy with duplicates
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
 
     # gather against mult databases
-    g_csv = utils.get_test_data('tax/test1_x_gtdbrs202_genbank_euks.gather.csv')
+    g_csv = utils.get_test_data("tax/test1_x_gtdbrs202_genbank_euks.gather.csv")
 
-    c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', taxonomy_csv, '--force')
+    c.run_sourmash(
+        "tax", "metagenome", "-g", g_csv, "--taxonomy-csv", taxonomy_csv, "--force"
+    )
 
     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)
 
-    assert "of 6 gather results, lineage assignments for 2 results were missed" in c.last_result.err
-    assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
-    assert 'multtest,superkingdom,0.204,d__Bacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.131,1024000' in c.last_result.out
-    assert 'multtest,superkingdom,0.796,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.869,3990000' in c.last_result.out
-    assert 'multtest,phylum,0.116,d__Bacteria;p__Bacteroidota,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000' in c.last_result.out
-    assert 'multtest,phylum,0.088,d__Bacteria;p__Proteobacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.058,442000' in c.last_result.out
-    assert 'multtest,phylum,0.796,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.869,3990000' in c.last_result.out
-    assert 'multtest,class,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000' in c.last_result.out
-    assert 'multtest,class,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.058,442000' in c.last_result.out
-    assert 'multtest,class,0.796,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.869,3990000' in c.last_result.out
+    assert (
+        "of 6 gather results, lineage assignments for 2 results were missed"
+        in c.last_result.err
+    )
+    assert (
+        "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "multtest,superkingdom,0.204,d__Bacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.131,1024000"
+        in c.last_result.out
+    )
+    assert (
+        "multtest,superkingdom,0.796,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.869,3990000"
+        in c.last_result.out
+    )
+    assert (
+        "multtest,phylum,0.116,d__Bacteria;p__Bacteroidota,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000"
+        in c.last_result.out
+    )
+    assert (
+        "multtest,phylum,0.088,d__Bacteria;p__Proteobacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.058,442000"
+        in c.last_result.out
+    )
+    assert (
+        "multtest,phylum,0.796,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.869,3990000"
+        in c.last_result.out
+    )
+    assert (
+        "multtest,class,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000"
+        in c.last_result.out
+    )
+    assert (
+        "multtest,class,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.058,442000"
+        in c.last_result.out
+    )
+    assert (
+        "multtest,class,0.796,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.869,3990000"
+        in c.last_result.out
+    )
 
 
 def test_metagenome_multiple_taxonomy_files(runtmp):
     c = runtmp
     # write temp taxonomy with duplicates
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
-    protozoa_genbank = utils.get_test_data('tax/protozoa_genbank_lineage.csv')
-    bacteria_refseq = utils.get_test_data('tax/bacteria_refseq_lineage.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
+    protozoa_genbank = utils.get_test_data("tax/protozoa_genbank_lineage.csv")
+    bacteria_refseq = utils.get_test_data("tax/bacteria_refseq_lineage.csv")
 
     # gather against mult databases
-    g_csv = utils.get_test_data('tax/test1_x_gtdbrs202_genbank_euks.gather.csv')
-
-    c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', taxonomy_csv, protozoa_genbank, bacteria_refseq)
+    g_csv = utils.get_test_data("tax/test1_x_gtdbrs202_genbank_euks.gather.csv")
+
+    c.run_sourmash(
+        "tax",
+        "metagenome",
+        "-g",
+        g_csv,
+        "--taxonomy-csv",
+        taxonomy_csv,
+        protozoa_genbank,
+        bacteria_refseq,
+    )
 
     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)
 
-    assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
-    assert 'multtest,superkingdom,0.204,Bacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.131,1024000' in c.last_result.out
-    assert 'multtest,superkingdom,0.051,Eukaryota,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000' in c.last_result.out
-    assert 'multtest,superkingdom,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000' in c.last_result.out
-    assert 'multtest,phylum,0.116,Bacteria;Bacteroidetes,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000' in c.last_result.out
-    assert 'multtest,phylum,0.088,Bacteria;Proteobacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.058,442000' in c.last_result.out
-    assert 'multtest,phylum,0.051,Eukaryota;Apicomplexa,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000' in c.last_result.out
-    assert 'multtest,phylum,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000' in c.last_result.out
-    assert 'multtest,class,0.116,Bacteria;Bacteroidetes;Bacteroidia,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000' in c.last_result.out
+    assert (
+        "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "multtest,superkingdom,0.204,Bacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.131,1024000"
+        in c.last_result.out
+    )
+    assert (
+        "multtest,superkingdom,0.051,Eukaryota,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000"
+        in c.last_result.out
+    )
+    assert (
+        "multtest,superkingdom,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000"
+        in c.last_result.out
+    )
+    assert (
+        "multtest,phylum,0.116,Bacteria;Bacteroidetes,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000"
+        in c.last_result.out
+    )
+    assert (
+        "multtest,phylum,0.088,Bacteria;Proteobacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.058,442000"
+        in c.last_result.out
+    )
+    assert (
+        "multtest,phylum,0.051,Eukaryota;Apicomplexa,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000"
+        in c.last_result.out
+    )
+    assert (
+        "multtest,phylum,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000"
+        in c.last_result.out
+    )
+    assert (
+        "multtest,class,0.116,Bacteria;Bacteroidetes;Bacteroidia,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000"
+        in c.last_result.out
+    )
 
 
 def test_metagenome_multiple_taxonomy_files_multiple_taxonomy_args(runtmp):
     c = runtmp
     # pass in
mult tax files using mult tax arguments - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - protozoa_genbank = utils.get_test_data('tax/protozoa_genbank_lineage.csv') - bacteria_refseq = utils.get_test_data('tax/bacteria_refseq_lineage.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + protozoa_genbank = utils.get_test_data("tax/protozoa_genbank_lineage.csv") + bacteria_refseq = utils.get_test_data("tax/bacteria_refseq_lineage.csv") # gather against mult databases - g_csv = utils.get_test_data('tax/test1_x_gtdbrs202_genbank_euks.gather.csv') - - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', taxonomy_csv, '-t', protozoa_genbank, '-t', bacteria_refseq) + g_csv = utils.get_test_data("tax/test1_x_gtdbrs202_genbank_euks.gather.csv") + + c.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + taxonomy_csv, + "-t", + protozoa_genbank, + "-t", + bacteria_refseq, + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) - assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'multtest,superkingdom,0.204,Bacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.131,1024000' in c.last_result.out - assert 'multtest,superkingdom,0.051,Eukaryota,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000' in c.last_result.out - assert 'multtest,superkingdom,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000' in c.last_result.out - assert 'multtest,phylum,0.116,Bacteria;Bacteroidetes,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000' in c.last_result.out - assert 'multtest,phylum,0.088,Bacteria;Proteobacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.058,442000' in c.last_result.out - assert 'multtest,phylum,0.051,Eukaryota;Apicomplexa,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000' in c.last_result.out - assert 'multtest,phylum,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000' in c.last_result.out - assert 'multtest,class,0.116,Bacteria;Bacteroidetes;Bacteroidia,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000' in c.last_result.out + assert ( + "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "multtest,superkingdom,0.204,Bacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.131,1024000" + in c.last_result.out + ) + assert ( + "multtest,superkingdom,0.051,Eukaryota,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000" + in c.last_result.out + ) + assert ( + "multtest,superkingdom,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000" + in c.last_result.out + ) + assert ( + "multtest,phylum,0.116,Bacteria;Bacteroidetes,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000" + in c.last_result.out + ) + assert ( + "multtest,phylum,0.088,Bacteria;Proteobacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.058,442000" + in c.last_result.out + ) + assert ( + "multtest,phylum,0.051,Eukaryota;Apicomplexa,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000" + in c.last_result.out + ) + assert ( + "multtest,phylum,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000" + in c.last_result.out + ) + assert ( + 
"multtest,class,0.116,Bacteria;Bacteroidetes;Bacteroidia,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000" + in c.last_result.out + ) def test_metagenome_multiple_taxonomy_files_multiple_taxonomy_args_empty_force(runtmp): # pass in mult tax files using mult tax arguments, with one empty, # and use --force c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - protozoa_genbank = utils.get_test_data('tax/protozoa_genbank_lineage.csv') - bacteria_refseq = utils.get_test_data('tax/bacteria_refseq_lineage.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + protozoa_genbank = utils.get_test_data("tax/protozoa_genbank_lineage.csv") + bacteria_refseq = utils.get_test_data("tax/bacteria_refseq_lineage.csv") - tax_empty = runtmp.output('t.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax_empty = runtmp.output("t.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") with open(tax_empty, "w") as fp: fp.write("") print("t_csv: ", tax_empty) # gather against mult databases - g_csv = utils.get_test_data('tax/test1_x_gtdbrs202_genbank_euks.gather.csv') - - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', taxonomy_csv, '-t', protozoa_genbank, '-t', bacteria_refseq, '-t', tax_empty, '--force') + g_csv = utils.get_test_data("tax/test1_x_gtdbrs202_genbank_euks.gather.csv") + + c.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + taxonomy_csv, + "-t", + protozoa_genbank, + "-t", + bacteria_refseq, + "-t", + tax_empty, + "--force", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) - assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'multtest,superkingdom,0.204,Bacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.131,1024000' in c.last_result.out - assert 'multtest,superkingdom,0.051,Eukaryota,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000' in c.last_result.out - assert 'multtest,superkingdom,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000' in c.last_result.out - assert 'multtest,phylum,0.116,Bacteria;Bacteroidetes,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000' in c.last_result.out - assert 'multtest,phylum,0.088,Bacteria;Proteobacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.058,442000' in c.last_result.out - assert 'multtest,phylum,0.051,Eukaryota;Apicomplexa,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000' in c.last_result.out - assert 'multtest,phylum,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000' in c.last_result.out - assert 'multtest,class,0.116,Bacteria;Bacteroidetes;Bacteroidia,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000' in c.last_result.out + assert ( + "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "multtest,superkingdom,0.204,Bacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.131,1024000" + in c.last_result.out + ) + assert ( + "multtest,superkingdom,0.051,Eukaryota,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000" + in c.last_result.out + ) + assert ( + "multtest,superkingdom,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000" + in c.last_result.out + ) + assert ( + 
"multtest,phylum,0.116,Bacteria;Bacteroidetes,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000" + in c.last_result.out + ) + assert ( + "multtest,phylum,0.088,Bacteria;Proteobacteria,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.058,442000" + in c.last_result.out + ) + assert ( + "multtest,phylum,0.051,Eukaryota;Apicomplexa,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.245,258000" + in c.last_result.out + ) + assert ( + "multtest,phylum,0.744,unclassified,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.624,3732000" + in c.last_result.out + ) + assert ( + "multtest,class,0.116,Bacteria;Bacteroidetes;Bacteroidia,9687eeed,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,0.073,582000" + in c.last_result.out + ) def test_metagenome_empty_gather_results(runtmp): - tax = utils.get_test_data('tax/test.taxonomy.csv') + tax = utils.get_test_data("tax/test.taxonomy.csv") - #creates empty gather result - g_csv = runtmp.output('g.csv') + # creates empty gather result + g_csv = runtmp.output("g.csv") with open(g_csv, "w") as fp: fp.write("") print("g_csv: ", g_csv) with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax) + runtmp.run_sourmash("tax", "metagenome", "-g", g_csv, "--taxonomy-csv", tax) - assert f"Cannot read gather results from '{g_csv}'. Is file empty?" in str(exc.value) + assert f"Cannot read gather results from '{g_csv}'. Is file empty?" in str( + exc.value + ) assert runtmp.last_result.status == -1 def test_metagenome_bad_gather_header(runtmp): - tax = utils.get_test_data('tax/test.taxonomy.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data("tax/test.taxonomy.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") - bad_g_csv = runtmp.output('g.csv') + bad_g_csv = runtmp.output("g.csv") - #creates bad gather result - bad_g = [x.replace("query_bp", "nope") + "\n" for x in Path(g_csv).read_text().splitlines()] - with open(bad_g_csv, 'w') as fp: + # creates bad gather result + bad_g = [ + x.replace("query_bp", "nope") + "\n" + for x in Path(g_csv).read_text().splitlines() + ] + with open(bad_g_csv, "w") as fp: fp.writelines(bad_g) print("bad_gather_results: \n", bad_g) with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'metagenome', '-g', bad_g_csv, '--taxonomy-csv', tax) + runtmp.run_sourmash("tax", "metagenome", "-g", bad_g_csv, "--taxonomy-csv", tax) print(str(exc.value)) - assert 'is missing columns needed for taxonomic summarization.' in str(exc.value) + assert "is missing columns needed for taxonomic summarization." 
in str(exc.value) assert runtmp.last_result.status == -1 def test_metagenome_empty_tax_lineage_input(runtmp): # test an empty tax CSV - tax_empty = runtmp.output('t.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax_empty = runtmp.output("t.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") with open(tax_empty, "w") as fp: fp.write("") print("t_csv: ", tax_empty) - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax_empty) + runtmp.run_sourmash( + "tax", "metagenome", "-g", g_csv, "--taxonomy-csv", tax_empty + ) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -791,16 +1640,17 @@ def test_metagenome_empty_tax_lineage_input(runtmp): def test_metagenome_empty_tax_lineage_input_force(runtmp): # test an empty tax CSV with --force - tax_empty = runtmp.output('t.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax_empty = runtmp.output("t.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") with open(tax_empty, "w") as fp: fp.write("") print("t_csv: ", tax_empty) - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax_empty, '--force') + runtmp.run_sourmash( + "tax", "metagenome", "-g", g_csv, "--taxonomy-csv", tax_empty, "--force" + ) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -811,17 +1661,17 @@ def test_metagenome_empty_tax_lineage_input_force(runtmp): def test_metagenome_perfect_match_warning(runtmp): - tax = utils.get_test_data('tax/test.taxonomy.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data("tax/test.taxonomy.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") - perfect_g_csv = runtmp.output('g.csv') + perfect_g_csv = runtmp.output("g.csv") - #create a perfect gather result - with open(g_csv, 'r') as fp: - r = csv.DictReader(fp, delimiter=',') + # create a perfect gather result + with open(g_csv) as fp: + r = csv.DictReader(fp, delimiter=",") header = r.fieldnames print(header) - with open(perfect_g_csv, 'w') as out_fp: + with open(perfect_g_csv, "w") as out_fp: w = csv.DictWriter(out_fp, header) w.writeheader() for n, row in enumerate(r): @@ -834,28 +1684,31 @@ def test_metagenome_perfect_match_warning(runtmp): w.writerow(row) print(row) - runtmp.run_sourmash('tax', 'metagenome', '-g', perfect_g_csv, '--taxonomy-csv', tax) + runtmp.run_sourmash("tax", "metagenome", "-g", perfect_g_csv, "--taxonomy-csv", tax) print(runtmp.last_result.status) print(runtmp.last_result.out) print(runtmp.last_result.err) assert runtmp.last_result.status == 0 - assert "WARNING: 100% match! Is query 'test1' identical to its database match, 'GCF_001881345'?" in runtmp.last_result.err + assert ( + "WARNING: 100% match! Is query 'test1' identical to its database match, 'GCF_001881345'?" 
+ in runtmp.last_result.err + ) def test_metagenome_over100percent_error(runtmp): - tax = utils.get_test_data('tax/test.taxonomy.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data("tax/test.taxonomy.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") - perfect_g_csv = runtmp.output('g.csv') + perfect_g_csv = runtmp.output("g.csv") - #create a perfect gather result - with open(g_csv, 'r') as fp: - r = csv.DictReader(fp, delimiter=',') + # create a perfect gather result + with open(g_csv) as fp: + r = csv.DictReader(fp, delimiter=",") header = r.fieldnames print(header) - with open(perfect_g_csv, 'w') as out_fp: + with open(perfect_g_csv, "w") as out_fp: w = csv.DictWriter(out_fp, header) w.writeheader() for n, row in enumerate(r): @@ -866,49 +1719,72 @@ def test_metagenome_over100percent_error(runtmp): print(row) with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('tax', 'metagenome', '-g', perfect_g_csv, '--taxonomy-csv', tax) + runtmp.run_sourmash( + "tax", "metagenome", "-g", perfect_g_csv, "--taxonomy-csv", tax + ) print(runtmp.last_result.status) print(runtmp.last_result.out) print(runtmp.last_result.err) assert runtmp.last_result.status == -1 - assert "fraction is > 100% of the query! This should not be possible." in runtmp.last_result.err + assert ( + "fraction is > 100% of the query! This should not be possible." + in runtmp.last_result.err + ) def test_metagenome_gather_duplicate_query(runtmp): c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") # different filename, contents identical to test1 g_res2 = runtmp.output("test2.gather.csv") - with open(g_res2, 'w') as fp: + with open(g_res2, "w") as fp: fp.write(Path(g_res).read_text()) with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'metagenome', '--gather-csv', g_res, g_res2, - '--taxonomy-csv', taxonomy_csv) + c.run_sourmash( + "tax", + "metagenome", + "--gather-csv", + g_res, + g_res2, + "--taxonomy-csv", + taxonomy_csv, + ) assert c.last_result.status == -1 print(str(exc.value)) - assert "Gather query test1 was found in more than one CSV. Cannot load from " in str(exc.value) + assert ( + "Gather query test1 was found in more than one CSV. Cannot load from " + in str(exc.value) + ) def test_metagenome_gather_duplicate_query_force(runtmp): # do not load same query from multiple files. 
c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") # different filename, contents identical to test1 g_res2 = runtmp.output("test2.gather.csv") - with open(g_res2, 'w') as fp: + with open(g_res2, "w") as fp: fp.write(Path(g_res).read_text()) - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'metagenome', '--gather-csv', g_res, g_res2, - '--taxonomy-csv', taxonomy_csv, '--force') + with pytest.raises(SourmashCommandFailed): + c.run_sourmash( + "tax", + "metagenome", + "--gather-csv", + g_res, + g_res2, + "--taxonomy-csv", + taxonomy_csv, + "--force", + ) print(c.last_result.status) print(c.last_result.out) @@ -923,18 +1799,27 @@ def test_metagenome_gather_duplicate_query_force(runtmp): def test_metagenome_two_queries_human_output(runtmp): # do not load same query from multiple files. c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") # make a second query with same output g_res2 = runtmp.output("test2.gather.csv") - with open(g_res2, 'w') as fp: + with open(g_res2, "w") as fp: for line in Path(g_res).read_text().splitlines(): - line = line.replace('test1', 'test2') + "\n" + line = line.replace("test1", "test2") + "\n" fp.write(line) - c.run_sourmash('tax', 'metagenome', '--gather-csv', g_res, g_res2, - '--taxonomy-csv', taxonomy_csv, '-F', "human") + c.run_sourmash( + "tax", + "metagenome", + "--gather-csv", + g_res, + g_res2, + "--taxonomy-csv", + taxonomy_csv, + "-F", + "human", + ) print(c.last_result.status) print(c.last_result.out) @@ -942,9 +1827,15 @@ def test_metagenome_two_queries_human_output(runtmp): assert c.last_result.status == 0 assert "test1 86.9% - unclassified" in c.last_result.out - assert "test1 5.8% 92.5% d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert ( + "test1 5.8% 92.5% d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" + in c.last_result.out + ) assert "test2 86.9% - unclassified" in c.last_result.out - assert "test2 5.8% 92.5% d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert ( + "test2 5.8% 92.5% d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" + in c.last_result.out + ) assert "test2 5.7% 92.5% d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" assert "test2 1.6% 89.1% d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" @@ -952,22 +1843,36 @@ def test_metagenome_two_queries_human_output(runtmp): def test_metagenome_two_queries_with_single_query_output_formats_fail(runtmp): # fail on multiple queries with single query output formats c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = 
utils.get_test_data("tax/test1.gather.csv") # make a second query with same output g_res2 = runtmp.output("test2.gather.csv") - with open(g_res2, 'w') as fp: + with open(g_res2, "w") as fp: for line in Path(g_res).read_text().splitlines(): - line = line.replace('test1', 'test2') + "\n" + line = line.replace("test1", "test2") + "\n" fp.write(line) csv_summary_out = runtmp.output("tst.summarized.csv") kreport_out = runtmp.output("tst.kreport.txt") with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'metagenome', '--gather-csv', g_res, g_res2, - '--taxonomy-csv', taxonomy_csv, '-F', "csv_summary", "kreport", "--rank", "phylum", "-o", "tst") + c.run_sourmash( + "tax", + "metagenome", + "--gather-csv", + g_res, + g_res2, + "--taxonomy-csv", + taxonomy_csv, + "-F", + "csv_summary", + "kreport", + "--rank", + "phylum", + "-o", + "tst", + ) print(str(exc.value)) assert not os.path.exists(csv_summary_out) @@ -975,29 +1880,47 @@ def test_metagenome_two_queries_with_single_query_output_formats_fail(runtmp): assert c.last_result.status == -1 assert "loaded results for 2 queries from 2 gather CSVs" in c.last_result.err - assert "WARNING: found results for multiple gather queries. Can only output multi-query result formats: skipping csv_summary, kreport" in c.last_result.err + assert ( + "WARNING: found results for multiple gather queries. Can only output multi-query result formats: skipping csv_summary, kreport" + in c.last_result.err + ) assert "ERROR: No output formats remaining." in c.last_result.err def test_metagenome_two_queries_skip_single_query_output_formats(runtmp): # remove single-query outputs when working with multiple queries c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") # make a second query with same output g_res2 = runtmp.output("test2.gather.csv") - with open(g_res2, 'w') as fp: + with open(g_res2, "w") as fp: for line in Path(g_res).read_text().splitlines(): - line = line.replace('test1', 'test2') + "\n" + line = line.replace("test1", "test2") + "\n" fp.write(line) csv_summary_out = runtmp.output("tst.summarized.csv") kreport_out = runtmp.output("tst.kreport.txt") lineage_summary_out = runtmp.output("tst.lineage_summary.tsv") - c.run_sourmash('tax', 'metagenome', '--gather-csv', g_res, g_res2, - '--taxonomy-csv', taxonomy_csv, '-F', "csv_summary", "kreport", "lineage_summary", "--rank", "phylum", "-o", "tst") + c.run_sourmash( + "tax", + "metagenome", + "--gather-csv", + g_res, + g_res2, + "--taxonomy-csv", + taxonomy_csv, + "-F", + "csv_summary", + "kreport", + "lineage_summary", + "--rank", + "phylum", + "-o", + "tst", + ) assert not os.path.exists(csv_summary_out) assert not os.path.exists(kreport_out) @@ -1005,32 +1928,52 @@ def test_metagenome_two_queries_skip_single_query_output_formats(runtmp): assert c.last_result.status == 0 assert "loaded results for 2 queries from 2 gather CSVs" in c.last_result.err - assert "WARNING: found results for multiple gather queries. Can only output multi-query result formats: skipping csv_summary, kreport" in c.last_result.err + assert ( + "WARNING: found results for multiple gather queries. Can only output multi-query result formats: skipping csv_summary, kreport" + in c.last_result.err + ) def test_metagenome_two_queries_krona(runtmp): # for now, we enable multi-query krona. Is this desired? 
c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") # make a second query with same output g_res2 = runtmp.output("test2.gather.csv") - with open(g_res2, 'w') as fp: + with open(g_res2, "w") as fp: for line in Path(g_res).read_text().splitlines(): - line = line.replace('test1', 'test2') + "\n" + line = line.replace("test1", "test2") + "\n" fp.write(line) - c.run_sourmash('tax', 'metagenome', '--gather-csv', g_res, g_res2, - '--taxonomy-csv', taxonomy_csv, '-F', "krona", '--rank', 'superkingdom') + c.run_sourmash( + "tax", + "metagenome", + "--gather-csv", + g_res, + g_res2, + "--taxonomy-csv", + taxonomy_csv, + "-F", + "krona", + "--rank", + "superkingdom", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "WARNING: results from more than one query found. Krona summarization not recommended." in c.last_result.err - assert "Percentage assignment will be normalized by the number of queries to maintain range 0-100%" in c.last_result.err + assert ( + "WARNING: results from more than one query found. Krona summarization not recommended." + in c.last_result.err + ) + assert ( + "Percentage assignment will be normalized by the number of queries to maintain range 0-100%" + in c.last_result.err + ) assert "fraction superkingdom" in c.last_result.out assert "0.2042281611487834 d__Bacteria" in c.last_result.out assert "0.7957718388512166 unclassified" in c.last_result.out @@ -1040,108 +1983,150 @@ def test_metagenome_gather_duplicate_filename(runtmp): # test that a duplicate filename is properly flagged, when passed in # twice to a single -g argument. 
c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') - - c.run_sourmash('tax', 'metagenome', '--gather-csv', g_res, g_res, '--taxonomy-csv', taxonomy_csv) + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") + + c.run_sourmash( + "tax", + "metagenome", + "--gather-csv", + g_res, + g_res, + "--taxonomy-csv", + taxonomy_csv, + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert f'ignoring duplicated reference to file: {g_res}' - assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000' in c.last_result.out + assert f"ignoring duplicated reference to file: {g_res}" in c.last_result.err + assert ( + "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000" + in c.last_result.out + ) def test_metagenome_gather_duplicate_filename_2(runtmp): # test that a duplicate filename is properly flagged, with -g a -g b c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') - - c.run_sourmash('tax', 'metagenome', '--gather-csv', g_res, '-g', g_res, '--taxonomy-csv', taxonomy_csv) + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") + + c.run_sourmash( + "tax", + "metagenome", + "--gather-csv", + g_res, + "-g", + g_res, + "--taxonomy-csv", + taxonomy_csv, + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert f'ignoring duplicated reference to file: {g_res}' - assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000' in c.last_result.out + assert f"ignoring duplicated reference to file: {g_res}" in c.last_result.err + assert ( + "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000" + in c.last_result.out + ) def test_metagenome_gather_duplicate_filename_from_file(runtmp): c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") g_from_file = runtmp.output("tmp-from-file.txt") - with open(g_from_file, 'w') as f_csv: + with open(g_from_file, "w") as f_csv: f_csv.write(f"{g_res}\n") f_csv.write(f"{g_res}\n") - c.run_sourmash('tax', 'metagenome', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv) + c.run_sourmash( + "tax", "metagenome", "--from-file", g_from_file, "--taxonomy-csv", taxonomy_csv + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert f'ignoring duplicated reference to file: {g_res}' - assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000' in c.last_result.out + assert 
f"ignoring duplicated reference to file: {g_res}" + assert ( + "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,superkingdom,0.204,d__Bacteria,md5,test1.sig,0.131,1024000" + in c.last_result.out + ) def test_genome_empty_gather_results(runtmp): - tax = utils.get_test_data('tax/test.taxonomy.csv') + tax = utils.get_test_data("tax/test.taxonomy.csv") - #creates empty gather result - g_csv = runtmp.output('g.csv') + # creates empty gather result + g_csv = runtmp.output("g.csv") with open(g_csv, "w") as fp: fp.write("") print("g_csv: ", g_csv) with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax) + runtmp.run_sourmash("tax", "genome", "-g", g_csv, "--taxonomy-csv", tax) assert runtmp.last_result.status == -1 print(runtmp.last_result.err) print(runtmp.last_result.out) - assert f"Cannot read gather results from '{g_csv}'. Is file empty?" in str(exc.value) + assert f"Cannot read gather results from '{g_csv}'. Is file empty?" in str( + exc.value + ) def test_genome_bad_gather_header(runtmp): - tax = utils.get_test_data('tax/test.taxonomy.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data("tax/test.taxonomy.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") - bad_g_csv = runtmp.output('g.csv') + bad_g_csv = runtmp.output("g.csv") - #creates bad gather result - bad_g = [x.replace("f_unique_to_query", "nope") + "\n" for x in Path(g_csv).read_text().splitlines()] - with open(bad_g_csv, 'w') as fp: + # creates bad gather result + bad_g = [ + x.replace("f_unique_to_query", "nope") + "\n" + for x in Path(g_csv).read_text().splitlines() + ] + with open(bad_g_csv, "w") as fp: fp.writelines(bad_g) print("bad_gather_results: \n", bad_g) with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'genome', '-g', bad_g_csv, '--taxonomy-csv', tax) + runtmp.run_sourmash("tax", "genome", "-g", bad_g_csv, "--taxonomy-csv", tax) - assert 'is missing columns needed for taxonomic summarization.' in str(exc.value) + assert "is missing columns needed for taxonomic summarization." 
in str(exc.value) assert runtmp.last_result.status == -1 def test_genome_empty_tax_lineage_input(runtmp): # test an empty tax csv - tax_empty = runtmp.output('t.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax_empty = runtmp.output("t.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") with open(tax_empty, "w") as fp: fp.write("") print("t_csv: ", tax_empty) with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax_empty) + runtmp.run_sourmash("tax", "genome", "-g", g_csv, "--taxonomy-csv", tax_empty) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -1155,66 +2140,124 @@ def test_genome_rank_stdout_0(runtmp): # test basic genome c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') - - c.run_sourmash('tax', 'genome', '--gather-csv', g_csv, '--taxonomy-csv', tax, - '--rank', 'species', '--containment-threshold', '0') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") + + c.run_sourmash( + "tax", + "genome", + "--gather-csv", + g_csv, + "--taxonomy-csv", + tax, + "--rank", + "species", + "--containment-threshold", + "0", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) def test_genome_rank_stdout_0_db(runtmp): # test basic genome with sqlite database c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.db') - - c.run_sourmash('tax', 'genome', '--gather-csv', g_csv, '--taxonomy-csv', - tax, '--rank', 'species', '--containment-threshold', '0') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.db") + + c.run_sourmash( + "tax", + "genome", + "--gather-csv", + g_csv, + "--taxonomy-csv", + tax, + "--rank", + "species", + "--containment-threshold", + "0", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) # too stringent of containment threshold: - c.run_sourmash('tax', 'genome', '--gather-csv', g_csv, 
'--taxonomy-csv', - tax, '--rank', 'species', '--containment-threshold', '1.0') + c.run_sourmash( + "tax", + "genome", + "--gather-csv", + g_csv, + "--taxonomy-csv", + tax, + "--rank", + "species", + "--containment-threshold", + "1.0", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000," in c.last_result.out + assert ( + "test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000," + in c.last_result.out + ) def test_genome_rank_csv_0(runtmp): # test basic genome - output csv c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") csv_base = "out" cl_csv = csv_base + ".classifications.csv" csvout = runtmp.output(cl_csv) outdir = os.path.dirname(csvout) print("csvout: ", csvout) - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, - '--rank', 'species', '-o', csv_base, '--containment-threshold', '0', - '--output-dir', outdir) + c.run_sourmash( + "tax", + "genome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--rank", + "species", + "-o", + csv_base, + "--containment-threshold", + "0", + "--output-dir", + outdir, + ) print(c.last_result.status) print(c.last_result.out) @@ -1223,25 +2266,46 @@ def test_genome_rank_csv_0(runtmp): assert f"saving 'classification' output to '{csvout}'" in runtmp.last_result.err assert c.last_result.status == 0 cl_results = [x.rstrip() for x in Path(csvout).read_text().splitlines()] - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in cl_results[0] - assert 'test1,match,species,0.0885520542481053,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.05701254275940707,444000' in cl_results[1] + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in cl_results[0] + ) + assert ( + "test1,match,species,0.0885520542481053,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.05701254275940707,444000" + in cl_results[1] + ) def test_genome_rank_krona(runtmp): # test basic genome - output csv c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") csv_base = "out" cl_csv = csv_base + ".krona.tsv" csvout = runtmp.output(cl_csv) outdir = os.path.dirname(csvout) print("csvout: ", csvout) - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, - '--rank', 'species', '-o', csv_base, '--containment-threshold', '0', - '--output-format', 'krona', '--output-dir', outdir) + c.run_sourmash( + "tax", + "genome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--rank", + "species", + "-o", + csv_base, + "--containment-threshold", + "0", + "--output-format", + "krona", + "--output-dir", + outdir, + ) print(c.last_result.status) print(c.last_result.out) @@ -1249,26 +2313,59 @@ def 
test_genome_rank_krona(runtmp): assert f"saving 'krona' output to '{csvout}'" in runtmp.last_result.err assert c.last_result.status == 0 - kr_results = [x.rstrip().split('\t') for x in Path(csvout).read_text().splitlines()] + kr_results = [x.rstrip().split("\t") for x in Path(csvout).read_text().splitlines()] print(kr_results) - assert ['fraction', 'superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'] == kr_results[0] - assert ['0.0885520542481053', 'd__Bacteria', 'p__Bacteroidota', 'c__Bacteroidia', 'o__Bacteroidales', 'f__Bacteroidaceae', 'g__Prevotella', 's__Prevotella copri'] == kr_results[1] + assert [ + "fraction", + "superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + ] == kr_results[0] + assert [ + "0.0885520542481053", + "d__Bacteria", + "p__Bacteroidota", + "c__Bacteroidia", + "o__Bacteroidales", + "f__Bacteroidaceae", + "g__Prevotella", + "s__Prevotella copri", + ] == kr_results[1] def test_genome_rank_human_output(runtmp): # test basic genome - output csv c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") csv_base = "out" - csvout = runtmp.output(csv_base + '.human.txt') + csvout = runtmp.output(csv_base + ".human.txt") outdir = os.path.dirname(csvout) print("csvout: ", csvout) - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, - '--rank', 'species', '-o', csv_base, '--containment-threshold', '0', - '--output-format', 'human', '--output-dir', outdir) + c.run_sourmash( + "tax", + "genome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--rank", + "species", + "-o", + csv_base, + "--containment-threshold", + "0", + "--output-format", + "human", + "--output-dir", + outdir, + ) print(c.last_result.status) print(c.last_result.out) @@ -1282,27 +2379,45 @@ def test_genome_rank_human_output(runtmp): print(outp) assert len(outp) == 3 - outp = [ x.strip() for x in outp ] + outp = [x.strip() for x in outp] - assert outp[0] == 'sample name status proportion cANI lineage' - assert outp[1] == '----------- ------ ---------- ---- -------' - assert outp[2] == 'test1 match 5.7% 92.5% d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri' + assert outp[0] == "sample name status proportion cANI lineage" + assert outp[1] == "----------- ------ ---------- ---- -------" + assert ( + outp[2] + == "test1 match 5.7% 92.5% d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" + ) def test_genome_rank_lineage_csv_output(runtmp): # test basic genome - output csv c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") csv_base = "out" - csvout = runtmp.output(csv_base + '.lineage.csv') + csvout = runtmp.output(csv_base + ".lineage.csv") outdir = os.path.dirname(csvout) print("csvout: ", csvout) - c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, - '--rank', 'species', '-o', csv_base, '--containment-threshold', '0', - '--output-format', 'lineage_csv', '--output-dir', outdir) + c.run_sourmash( + "tax", + "genome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--rank", + "species", + "-o", + csv_base, + "--containment-threshold", + "0", + "--output-format", + 
"lineage_csv", + "--output-dir", + outdir, + ) print(c.last_result.status) print(c.last_result.out) @@ -1314,169 +2429,291 @@ def test_genome_rank_lineage_csv_output(runtmp): outp = fp.readlines() assert len(outp) == 2 - outp = [ x.strip() for x in outp ] + outp = [x.strip() for x in outp] - assert outp[0] == 'ident,superkingdom,phylum,class,order,family,genus,species' - assert outp[1] == 'test1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri' + assert outp[0] == "ident,superkingdom,phylum,class,order,family,genus,species" + assert ( + outp[1] + == "test1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri" + ) def test_genome_gather_from_file_rank(runtmp): c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") g_from_file = runtmp.output("tmp-from-file.txt") - with open(g_from_file, 'w') as f_csv: + with open(g_from_file, "w") as f_csv: f_csv.write(f"{g_res}\n") - c.run_sourmash('tax', 'genome', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, - '--rank', 'species', '--containment-threshold', '0') + c.run_sourmash( + "tax", + "genome", + "--from-file", + g_from_file, + "--taxonomy-csv", + taxonomy_csv, + "--rank", + "species", + "--containment-threshold", + "0", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) def test_genome_gather_two_files(runtmp): c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") # make test2 results (identical to test1 except query_name and filename) g_res2 = runtmp.output("test2.gather.csv") - test2_results = [x.replace("test1", "test2") + "\n" for x in Path(g_res).read_text().splitlines()] - with open(g_res2, 'w') as fp: + test2_results = [ + x.replace("test1", "test2") + "\n" for x in Path(g_res).read_text().splitlines() + ] + with open(g_res2, "w") as fp: fp.writelines(test2_results) - c.run_sourmash('tax', 'genome', '-g', g_res, g_res2, '--taxonomy-csv', taxonomy_csv, - '--rank', 'species', '--containment-threshold', '0') + c.run_sourmash( + "tax", + "genome", + "-g", + g_res, + g_res2, + "--taxonomy-csv", + taxonomy_csv, + "--rank", + "species", + "--containment-threshold", + "0", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out 
- assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out - assert 'test2,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test2.sig,0.057,444000' in c.last_result.out + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) + assert ( + "test2,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test2.sig,0.057,444000" + in c.last_result.out + ) def test_genome_gather_two_files_empty_force(runtmp): # make test2 results (identical to test1 except query_name and filename) # add an empty file too, with --force -> should work c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") - g_empty_csv = runtmp.output('g_empty.csv') + g_empty_csv = runtmp.output("g_empty.csv") with open(g_empty_csv, "w") as fp: fp.write("") print("g_csv: ", g_empty_csv) g_res2 = runtmp.output("test2.gather.csv") - test2_results = [x.replace("test1", "test2") + "\n" for x in Path(g_res).read_text().splitlines()] - with open(g_res2, 'w') as fp: + test2_results = [ + x.replace("test1", "test2") + "\n" for x in Path(g_res).read_text().splitlines() + ] + with open(g_res2, "w") as fp: fp.writelines(test2_results) - c.run_sourmash('tax', 'genome', '-g', g_res, g_res2, '-g', g_empty_csv, - '--taxonomy-csv', taxonomy_csv, - '--rank', 'species', '--containment-threshold', '0', - '--force') + c.run_sourmash( + "tax", + "genome", + "-g", + g_res, + g_res2, + "-g", + g_empty_csv, + "--taxonomy-csv", + taxonomy_csv, + "--rank", + "species", + "--containment-threshold", + "0", + "--force", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out - assert 'test2,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test2.sig,0.057,444000' in c.last_result.out + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) + assert ( + "test2,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test2.sig,0.057,444000" + in c.last_result.out + ) def test_genome_gather_duplicate_filename(runtmp): c = runtmp - taxonomy_csv = 
utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') - - c.run_sourmash('tax', 'genome', '--gather-csv', g_res, '-g', g_res, '--taxonomy-csv', taxonomy_csv, - '--rank', 'species', '--containment-threshold', '0') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") + + c.run_sourmash( + "tax", + "genome", + "--gather-csv", + g_res, + "-g", + g_res, + "--taxonomy-csv", + taxonomy_csv, + "--rank", + "species", + "--containment-threshold", + "0", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert f'ignoring duplicated reference to file: {g_res}' - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out + assert f"ignoring duplicated reference to file: {g_res}" in c.last_result.err + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) def test_genome_gather_from_file_duplicate_filename(runtmp): c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") g_from_file = runtmp.output("tmp-from-file.txt") - with open(g_from_file, 'w') as f_csv: + with open(g_from_file, "w") as f_csv: f_csv.write(f"{g_res}\n") f_csv.write(f"{g_res}\n") - c.run_sourmash('tax', 'genome', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, - '--rank', 'species', '--containment-threshold', '0') + c.run_sourmash( + "tax", + "genome", + "--from-file", + g_from_file, + "--taxonomy-csv", + taxonomy_csv, + "--rank", + "species", + "--containment-threshold", + "0", + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert f'ignoring duplicated reference to file: {g_res}' - assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out - assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out + assert f"ignoring duplicated reference to file: {g_res}" in c.last_result.err + assert ( + "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000" + in c.last_result.out + ) def test_genome_gather_from_file_duplicate_query(runtmp): c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") # different filename, contents identical to test1 g_res2 = 
runtmp.output("test2.gather.csv") - with open(g_res2, 'w') as fp: + with open(g_res2, "w") as fp: fp.write(Path(g_res).read_text()) g_from_file = runtmp.output("tmp-from-file.txt") - with open(g_from_file, 'w') as f_csv: + with open(g_from_file, "w") as f_csv: f_csv.write(f"{g_res}\n") f_csv.write(f"{g_res2}\n") with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'genome', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, - '--rank', 'species', '--containment-threshold', '0') + c.run_sourmash( + "tax", + "genome", + "--from-file", + g_from_file, + "--taxonomy-csv", + taxonomy_csv, + "--rank", + "species", + "--containment-threshold", + "0", + ) assert c.last_result.status == -1 print(str(exc.value)) - assert "Gather query test1 was found in more than one CSV. Cannot load from " in str(exc.value) + assert ( + "Gather query test1 was found in more than one CSV. Cannot load from " + in str(exc.value) + ) def test_genome_gather_from_file_duplicate_query_force(runtmp): c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") # different filename, contents identical to test1 g_res2 = runtmp.output("test2.gather.csv") - with open(g_res2, 'w') as fp: + with open(g_res2, "w") as fp: fp.write(Path(g_res).read_text()) g_from_file = runtmp.output("tmp-from-file.txt") - with open(g_from_file, 'w') as f_csv: + with open(g_from_file, "w") as f_csv: f_csv.write(f"{g_res}\n") f_csv.write(f"{g_res2}\n") - with pytest.raises(SourmashCommandFailed) as exc: - c.run_sourmash('tax', 'genome', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, - '--rank', 'species', '--containment-threshold', '0', '--force') + with pytest.raises(SourmashCommandFailed): + c.run_sourmash( + "tax", + "genome", + "--from-file", + g_from_file, + "--taxonomy-csv", + taxonomy_csv, + "--rank", + "species", + "--containment-threshold", + "0", + "--force", + ) print(c.last_result.status) print(c.last_result.out) @@ -1490,70 +2727,119 @@ def test_genome_gather_from_file_duplicate_query_force(runtmp): def test_genome_gather_cli_and_from_file(runtmp): c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + g_res = utils.get_test_data("tax/test1.gather.csv") g_from_file = runtmp.output("tmp-from-file.txt") # make test2 results (identical to test1 except query_name) g_res2 = runtmp.output("test2.gather.csv") - test2_results = [x.replace("test1", "test2") + "\n" for x in Path(g_res).read_text().splitlines()] - with open(g_res2, 'w') as fp: + test2_results = [ + x.replace("test1", "test2") + "\n" for x in Path(g_res).read_text().splitlines() + ] + with open(g_res2, "w") as fp: fp.writelines(test2_results) # write test2 csv to a text file for input g_from_file = runtmp.output("tmp-from-file.txt") - with open(g_from_file, 'w') as f_csv: + with open(g_from_file, "w") as f_csv: f_csv.write(f"{g_res2}\n") - c.run_sourmash('tax', 'genome', '-g', g_res, '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, - '--rank', 'species', '--containment-threshold', '0') + c.run_sourmash( + "tax", + "genome", + "-g", + g_res, + "--from-file", + g_from_file, + "--taxonomy-csv", + taxonomy_csv, + "--rank", + "species", + "--containment-threshold", + "0", + ) print(c.last_result.status) 
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
-    assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out
-    assert 'test2,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test2.sig,0.057,444000' in c.last_result.out
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000"
+        in c.last_result.out
+    )
+    assert (
+        "test2,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test2.sig,0.057,444000"
+        in c.last_result.out
+    )


 def test_genome_gather_cli_and_from_file_duplicate_filename(runtmp):
     c = runtmp
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
-    g_res = utils.get_test_data('tax/test1.gather.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
+    g_res = utils.get_test_data("tax/test1.gather.csv")
     g_from_file = runtmp.output("tmp-from-file.txt")

     # also write test1 csv to a text file for input
     g_from_file = runtmp.output("tmp-from-file.txt")
-    with open(g_from_file, 'w') as f_csv:
+    with open(g_from_file, "w") as f_csv:
         f_csv.write(f"{g_res}\n")

-    c.run_sourmash('tax', 'genome', '-g', g_res, '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv,
-                   '--rank', 'species', '--containment-threshold', '0')
+    c.run_sourmash(
+        "tax",
+        "genome",
+        "-g",
+        g_res,
+        "--from-file",
+        g_from_file,
+        "--taxonomy-csv",
+        taxonomy_csv,
+        "--rank",
+        "species",
+        "--containment-threshold",
+        "0",
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert f'ignoring duplicated reference to file: {g_res}' in c.last_result.err
-    assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
-    assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out
+    assert f"ignoring duplicated reference to file: {g_res}" in c.last_result.err
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000"
+        in c.last_result.out
+    )


 def test_genome_gather_from_file_below_threshold(runtmp):
     # What do we want the results from this to be? I think I initially thought we shouldn't report anything,
     # but wouldn't a "below_threshold" + superkingdom result (here, 0.204) be helpful information?
     c = runtmp
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
-    g_res = utils.get_test_data('tax/test1.gather.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
+    g_res = utils.get_test_data("tax/test1.gather.csv")

     g_from_file = runtmp.output("tmp-from-file.txt")
-    with open(g_from_file, 'w') as f_csv:
+    with open(g_from_file, "w") as f_csv:
         f_csv.write(f"{g_res}\n")

-    c.run_sourmash('tax', 'genome', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv,
-                   '--containment-threshold', '1')
+    c.run_sourmash(
+        "tax",
+        "genome",
+        "--from-file",
+        g_from_file,
+        "--taxonomy-csv",
+        taxonomy_csv,
+        "--containment-threshold",
+        "1",
+    )

     print(c.last_result.status)
     print(c.last_result.out)
@@ -1565,53 +2851,75 @@ def test_genome_gather_from_file_below_threshold(runtmp):


 def test_genome_gather_two_queries(runtmp):
-    '''
+    """
     This checks for initial bug where classification would only happen for one
     genome per rank when doing --containment-threshold classification
-    '''
+    """
     c = runtmp
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
-    g_res = utils.get_test_data('tax/47+63_x_gtdb-rs202.gather.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
+    g_res = utils.get_test_data("tax/47+63_x_gtdb-rs202.gather.csv")

     # split 47+63 into two fake queries: q47, q63
     g_res2 = runtmp.output("two-queries.gather.csv")
     q2_results = [x + "\n" for x in Path(g_res).read_text().splitlines()]
     # rename queries
-    q2_results[1] = q2_results[1].replace('47+63', 'q47')
-    q2_results[2] = q2_results[2].replace('47+63', 'q63')
-    with open(g_res2, 'w') as fp:
+    q2_results[1] = q2_results[1].replace("47+63", "q47")
+    q2_results[2] = q2_results[2].replace("47+63", "q63")
+    with open(g_res2, "w") as fp:
         for line in q2_results:
             print(line)
             fp.write(line)

-    c.run_sourmash('tax', 'genome', '-g', g_res2, '--taxonomy-csv', taxonomy_csv,
-                   '--containment-threshold', '0')
+    c.run_sourmash(
+        "tax",
+        "genome",
+        "-g",
+        g_res2,
+        "--taxonomy-csv",
+        taxonomy_csv,
+        "--containment-threshold",
+        "0",
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
     assert "query_name,status,rank,fraction,lineage" in c.last_result.out
-    assert "q63,match,species,0.336,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Shewanellaceae;g__Shewanella;s__Shewanella baltica,491c0a81," in c.last_result.out
-    assert "q47,match,species,0.664,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Shewanellaceae;g__Shewanella;s__Shewanella baltica," in c.last_result.out
+    assert (
+        "q63,match,species,0.336,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Shewanellaceae;g__Shewanella;s__Shewanella baltica,491c0a81,"
+        in c.last_result.out
+    )
+    assert (
+        "q47,match,species,0.664,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Shewanellaceae;g__Shewanella;s__Shewanella baltica,"
+        in c.last_result.out
+    )


 def test_genome_rank_duplicated_taxonomy_fail(runtmp):
     c = runtmp
     # write temp taxonomy with duplicates
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
     duplicated_csv = runtmp.output("duplicated_taxonomy.csv")
-    with open(duplicated_csv, 'w') as dup:
+    with open(duplicated_csv, "w") as dup:
         tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()]
-        tax.append(tax[1] + 'FOO') # add first tax_assign again
+        tax.append(tax[1] + "FOO")  # add first tax_assign again
         dup.write("\n".join(tax))

-    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")

     with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', duplicated_csv,
-                       '--rank', 'species')
+        c.run_sourmash(
+            "tax",
+            "genome",
+            "-g",
+            g_csv,
+            "--taxonomy-csv",
+            duplicated_csv,
+            "--rank",
+            "species",
+        )

     assert "cannot read taxonomy assignments" in str(exc.value)
     assert "multiple lineages for identifier GCF_001881345" in str(exc.value)

@@ -1620,16 +2928,16 @@ def test_genome_rank_duplicated_taxonomy_fail_lineages(runtmp):
     # write temp taxonomy with duplicates => lineages-style file
     c = runtmp

-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
     taxdb = tax_utils.LineageDB.load(taxonomy_csv)
     for k, v in taxdb.items():
         print(k, v)

-    lineage_csv = runtmp.output('lin.csv')
-    with open(lineage_csv, 'w', newline="") as fp:
+    lineage_csv = runtmp.output("lin.csv")
+    with open(lineage_csv, "w", newline="") as fp:
         w = csv.writer(fp)
-        w.writerow(['name', 'lineage'])
+        w.writerow(["name", "lineage"])
         for k, v in taxdb.items():
             linstr = lca_utils.display_lineage(v)
             w.writerow([k, linstr])
@@ -1640,7 +2948,7 @@ def test_genome_rank_duplicated_taxonomy_fail_lineages(runtmp):
             w.writerow([k, linstr])

     with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('tax', 'summarize', lineage_csv)
+        c.run_sourmash("tax", "summarize", lineage_csv)

     print(c.last_result.out)
     print(c.last_result.err)
@@ -1651,174 +2959,292 @@ def test_genome_rank_duplicated_taxonomy_fail_lineages(runtmp):


 def test_genome_rank_duplicated_taxonomy_force(runtmp):
     # write temp taxonomy with duplicates
     c = runtmp
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
     duplicated_csv = runtmp.output("duplicated_taxonomy.csv")
-    with open(duplicated_csv, 'w') as dup:
+    with open(duplicated_csv, "w") as dup:
         tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()]
-        tax.append(tax[1]) # add first tax_assign again
+        tax.append(tax[1])  # add first tax_assign again
         dup.write("\n".join(tax))

-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-
-    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', duplicated_csv,
-                   '--rank', 'species', '--force', '--containment-threshold', '0')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+
+    c.run_sourmash(
+        "tax",
+        "genome",
+        "-g",
+        g_csv,
+        "--taxonomy-csv",
+        duplicated_csv,
+        "--rank",
+        "species",
+        "--force",
+        "--containment-threshold",
+        "0",
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
-    assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000"
+        in c.last_result.out
+    )


 def test_genome_missing_taxonomy_ignore_threshold(runtmp):
     c = runtmp
     # write temp taxonomy with missing entry
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
     subset_csv = runtmp.output("subset_taxonomy.csv")
-    with open(subset_csv, 'w') as subset:
+    with open(subset_csv, "w") as subset:
         tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()]
-        tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry)
+        tax = [tax[0]] + tax[2:]  # remove the best match (1st tax entry)
         subset.write("\n".join(tax))

-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-
-    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', subset_csv, '--containment-threshold', '0')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+
+    c.run_sourmash(
+        "tax",
+        "genome",
+        "-g",
+        g_csv,
+        "--taxonomy-csv",
+        subset_csv,
+        "--containment-threshold",
+        "0",
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert "The following are missing from the taxonomy information: GCF_001881345" in c.last_result.err
-    assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
-    assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out
+    assert (
+        "The following are missing from the taxonomy information: GCF_001881345"
+        in c.last_result.err
+    )
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000"
+        in c.last_result.out
+    )


 def test_genome_missing_taxonomy_recover_with_second_tax_file(runtmp):
     c = runtmp
     # write temp taxonomy with missing entry
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
     subset_csv = runtmp.output("subset_taxonomy.csv")
-    with open(subset_csv, 'w') as subset:
+    with open(subset_csv, "w") as subset:
        tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()]
-        tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry)
+        tax = [tax[0]] + tax[2:]  # remove the best match (1st tax entry)
         subset.write("\n".join(tax))

-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-
-    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', subset_csv, '-t', taxonomy_csv, '--containment-threshold', '0')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+
+    c.run_sourmash(
+        "tax",
+        "genome",
+        "-g",
+        g_csv,
+        "--taxonomy-csv",
+        subset_csv,
+        "-t",
+        taxonomy_csv,
+        "--containment-threshold",
+        "0",
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert "The following are missing from the taxonomy information: GCF_001881345" not in c.last_result.err
-    assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
-    assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out
+    assert (
+        "The following are missing from the taxonomy information: GCF_001881345"
+        not in c.last_result.err
+    )
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000"
+        in c.last_result.out
+    )


 def test_genome_missing_taxonomy_ignore_rank(runtmp):
     c = runtmp
     # write temp taxonomy with missing entry
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
     subset_csv = runtmp.output("subset_taxonomy.csv")
-    with open(subset_csv, 'w') as subset:
+    with open(subset_csv, "w") as subset:
         tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()]
-        tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry)
+        tax = [tax[0]] + tax[2:]  # remove the best match (1st tax entry)
         subset.write("\n".join(tax))

-    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")

-    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', subset_csv, '--rank', 'species')
+    c.run_sourmash(
+        "tax", "genome", "-g", g_csv, "--taxonomy-csv", subset_csv, "--rank", "species"
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert "The following are missing from the taxonomy information: GCF_001881345" in c.last_result.err
-    assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
-    assert 'test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out
+    assert (
+        "The following are missing from the taxonomy information: GCF_001881345"
+        in c.last_result.err
+    )
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000"
+        in c.last_result.out
+    )


 def test_genome_multiple_taxonomy_files(runtmp):
     c = runtmp
     # write temp taxonomy with missing entry
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
     subset_csv = runtmp.output("subset_taxonomy.csv")
-    with open(subset_csv, 'w') as subset:
+    with open(subset_csv, "w") as subset:
         tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()]
-        tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry)
+        tax = [tax[0]] + tax[2:]  # remove the best match (1st tax entry)
         subset.write("\n".join(tax))

-    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")

     # using mult -t args
-    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', subset_csv, '-t', taxonomy_csv)
+    c.run_sourmash(
+        "tax", "genome", "-g", g_csv, "--taxonomy-csv", subset_csv, "-t", taxonomy_csv
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert "The following are missing from the taxonomy information: GCF_001881345" not in c.last_result.err
-    assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
-    assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000,' in c.last_result.out
+    assert (
+        "The following are missing from the taxonomy information: GCF_001881345"
+        not in c.last_result.err
+    )
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000,"
+        in c.last_result.out
+    )

     # using single -t arg
-    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', subset_csv, taxonomy_csv)
+    c.run_sourmash(
+        "tax", "genome", "-g", g_csv, "--taxonomy-csv", subset_csv, taxonomy_csv
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert "The following are missing from the taxonomy information: GCF_001881345" not in c.last_result.err
-    assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
-    assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000,' in c.last_result.out
+    assert (
+        "The following are missing from the taxonomy information: GCF_001881345"
+        not in c.last_result.err
+    )
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000,"
+        in c.last_result.out
+    )


 def test_genome_multiple_taxonomy_files_empty_force(runtmp):
     c = runtmp
     # write temp taxonomy with missing entry, as well as an empty file,
     # and use force
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
     subset_csv = runtmp.output("subset_taxonomy.csv")
-    with open(subset_csv, 'w') as subset:
+    with open(subset_csv, "w") as subset:
         tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()]
-        tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry)
+        tax = [tax[0]] + tax[2:]  # remove the best match (1st tax entry)
         subset.write("\n".join(tax))

-    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")

-    empty_tax = runtmp.output('tax_empty.txt')
+    empty_tax = runtmp.output("tax_empty.txt")
     with open(empty_tax, "w") as fp:
         fp.write("")
-
+
     # using mult -t args
-    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', subset_csv, '-t', taxonomy_csv, '-t', empty_tax, '--force')
+    c.run_sourmash(
+        "tax",
+        "genome",
+        "-g",
+        g_csv,
+        "--taxonomy-csv",
+        subset_csv,
+        "-t",
+        taxonomy_csv,
+        "-t",
+        empty_tax,
+        "--force",
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert "The following are missing from the taxonomy information: GCF_001881345" not in c.last_result.err
-    assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
-    assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000,' in c.last_result.out
+    assert (
+        "The following are missing from the taxonomy information: GCF_001881345"
+        not in c.last_result.err
+    )
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000,"
+        in c.last_result.out
+    )


 def test_genome_missing_taxonomy_fail_threshold(runtmp):
     c = runtmp
     # write temp taxonomy with missing entry
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
     subset_csv = runtmp.output("subset_taxonomy.csv")
-    with open(subset_csv, 'w') as subset:
+    with open(subset_csv, "w") as subset:
         tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()]
-        tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry)
+        tax = [tax[0]] + tax[2:]  # remove the best match (1st tax entry)
         subset.write("\n".join(tax))

-    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")

     with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', subset_csv,
-                       '--fail-on-missing-taxonomy', '--containment-threshold', '0')
+        c.run_sourmash(
+            "tax",
+            "genome",
+            "-g",
+            g_csv,
+            "--taxonomy-csv",
+            subset_csv,
+            "--fail-on-missing-taxonomy",
+            "--containment-threshold",
+            "0",
+        )

     print(str(exc.value))
     print(c.last_result.status)
@@ -1833,18 +3259,27 @@ def test_genome_missing_taxonomy_fail_threshold(runtmp):


 def test_genome_missing_taxonomy_fail_rank(runtmp):
     c = runtmp
     # write temp taxonomy with missing entry
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
     subset_csv = runtmp.output("subset_taxonomy.csv")
-    with open(subset_csv, 'w') as subset:
+    with open(subset_csv, "w") as subset:
         tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()]
-        tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry)
+        tax = [tax[0]] + tax[2:]  # remove the best match (1st tax entry)
         subset.write("\n".join(tax))

-    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")

     with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', subset_csv,
-                       '--fail-on-missing-taxonomy', '--rank', 'species')
+        c.run_sourmash(
+            "tax",
+            "genome",
+            "-g",
+            g_csv,
+            "--taxonomy-csv",
+            subset_csv,
+            "--fail-on-missing-taxonomy",
+            "--rank",
+            "species",
+        )

     print(str(exc.value))
     print(c.last_result.status)
@@ -1859,12 +3294,22 @@ def test_genome_missing_taxonomy_fail_rank(runtmp):


 def test_genome_rank_not_available(runtmp):
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.taxonomy.csv")

     with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax,
-                       '--rank', 'strain', '--containment-threshold', '0')
+        c.run_sourmash(
+            "tax",
+            "genome",
+            "-g",
+            g_csv,
+            "--taxonomy-csv",
+            tax,
+            "--rank",
+            "strain",
+            "--containment-threshold",
+            "0",
+        )

     print(str(exc.value))
     print(c.last_result.status)
@@ -1872,22 +3317,32 @@ def test_genome_rank_not_available(runtmp):
     print(c.last_result.err)

     assert c.last_result.status == -1
-    assert "No taxonomic information provided for rank strain: cannot classify at this rank" in str(exc.value)
+    assert (
+        "No taxonomic information provided for rank strain: cannot classify at this rank"
+        in str(exc.value)
+    )


 def test_genome_empty_gather_results_with_header_single(runtmp):
     c = runtmp
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")

-    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
     gather_results = [x for x in Path(g_csv).read_text().splitlines()]
-    empty_gather_with_header = runtmp.output('g_header.csv')
+    empty_gather_with_header = runtmp.output("g_header.csv")

     # write temp empty gather results (header only)
     with open(empty_gather_with_header, "w") as fp:
         fp.write(gather_results[0])

     with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('tax', 'genome', '-g', empty_gather_with_header, '--taxonomy-csv', taxonomy_csv)
+        c.run_sourmash(
+            "tax",
+            "genome",
+            "-g",
+            empty_gather_with_header,
+            "--taxonomy-csv",
+            taxonomy_csv,
+        )

     print(str(exc.value))
     print(c.last_result.status)
@@ -1895,44 +3350,48 @@ def test_genome_empty_gather_results_with_header_single(runtmp):
     print(c.last_result.err)

     assert c.last_result.status == -1
-    assert f'No gather results loaded from {empty_gather_with_header}.' in str(exc.value)
-    assert 'Exiting.' in str(exc.value)
+    assert f"No gather results loaded from {empty_gather_with_header}." in str(
+        exc.value
+    )
+    assert "Exiting." in str(exc.value)


 def test_genome_empty_gather_results_single(runtmp):
     c = runtmp
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")

     # write temp empty gather results
-    empty_tax = runtmp.output('tax_header.csv')
+    empty_tax = runtmp.output("tax_header.csv")
     with open(empty_tax, "w") as fp:
         fp.write("")

     with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('tax', 'genome', '-g', empty_tax, '--taxonomy-csv', taxonomy_csv)
-
+        c.run_sourmash("tax", "genome", "-g", empty_tax, "--taxonomy-csv", taxonomy_csv)

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == -1
-    assert f"Cannot read gather results from '{empty_tax}'. Is file empty?" in str(exc.value)
-    assert 'Exiting.' in c.last_result.err
+    assert f"Cannot read gather results from '{empty_tax}'. Is file empty?" in str(
+        exc.value
+    )
+    assert "Exiting." in c.last_result.err


 def test_genome_empty_gather_results_single_force(runtmp):
     c = runtmp
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")

     # write temp empty gather results (header only)
-    empty_tax = runtmp.output('tax_header.csv')
+    empty_tax = runtmp.output("tax_header.csv")
     with open(empty_tax, "w") as fp:
         fp.write("")

     with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('tax', 'genome', '-g', empty_tax, '--taxonomy-csv', taxonomy_csv,
-                       '--force')
+        c.run_sourmash(
+            "tax", "genome", "-g", empty_tax, "--taxonomy-csv", taxonomy_csv, "--force"
+        )

     print(str(exc.value))
     print(c.last_result.status)
@@ -1940,26 +3399,40 @@ def test_genome_empty_gather_results_single_force(runtmp):
     print(c.last_result.err)

     assert c.last_result.status == -1
-    assert '--force is set. Attempting to continue to next set of gather results.' in str(exc.value)
-    assert 'No results for classification. Exiting.' in str(exc.value)
+    assert (
+        "--force is set. Attempting to continue to next set of gather results."
+        in str(exc.value)
+    )
+    assert "No results for classification. Exiting." in str(exc.value)


 def test_genome_empty_gather_results_with_empty_csv_force(runtmp):
     c = runtmp
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")

     # write temp empty gather results
-    empty_tax = runtmp.output('tax_empty.txt')
+    empty_tax = runtmp.output("tax_empty.txt")
     with open(empty_tax, "w") as fp:
         fp.write("")

     g_from_file = runtmp.output("tmp-from-csv.csv")
-    with open(g_from_file, 'w') as f_csv:
+    with open(g_from_file, "w") as f_csv:
         f_csv.write(f"{empty_tax}\n")

     with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('tax', 'genome', '-g', empty_tax, '--from-file', g_from_file,
-                       '--taxonomy-csv', taxonomy_csv, '--rank', 'species', '--force')
+        c.run_sourmash(
+            "tax",
+            "genome",
+            "-g",
+            empty_tax,
+            "--from-file",
+            g_from_file,
+            "--taxonomy-csv",
+            taxonomy_csv,
+            "--rank",
+            "species",
+            "--force",
+        )

     print(str(exc.value))
     print(c.last_result.status)
@@ -1967,48 +3440,80 @@ def test_genome_empty_gather_results_with_empty_csv_force(runtmp):
     print(c.last_result.err)

     assert c.last_result.status == -1
-    assert '--force is set. Attempting to continue to next set of gather results.' in str(exc.value)
-    assert 'No results for classification. Exiting.' in str(exc.value)
+    assert (
+        "--force is set. Attempting to continue to next set of gather results."
+        in str(exc.value)
+    )
+    assert "No results for classification. Exiting." in str(exc.value)


 def test_genome_empty_gather_results_with_csv_force(runtmp):
     c = runtmp
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")

-    g_res = utils.get_test_data('tax/test1.gather.csv')
+    g_res = utils.get_test_data("tax/test1.gather.csv")
     g_from_file = runtmp.output("tmp-from-file.txt")
-    with open(g_from_file, 'w') as f_csv:
+    with open(g_from_file, "w") as f_csv:
         f_csv.write(f"{g_res}\n")

     # write temp empty gather results
-    empty_tax = runtmp.output('tax_empty.csv')
+    empty_tax = runtmp.output("tax_empty.csv")
     with open(empty_tax, "w") as fp:
         fp.write("")

-    c.run_sourmash('tax', 'genome', '-g', empty_tax, '--from-file', g_from_file,
-                   '--taxonomy-csv', taxonomy_csv, '--rank', 'species',
-                   '--containment-threshold', '0', '--force')
+    c.run_sourmash(
+        "tax",
+        "genome",
+        "-g",
+        empty_tax,
+        "--from-file",
+        g_from_file,
+        "--taxonomy-csv",
+        taxonomy_csv,
+        "--rank",
+        "species",
+        "--containment-threshold",
+        "0",
+        "--force",
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert '--force is set. Attempting to continue to next set of gather results.' in c.last_result.err
-    assert 'loaded results for 1 queries from 1 gather CSVs' in c.last_result.err
-    assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
-    assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out
+    assert (
+        "--force is set. Attempting to continue to next set of gather results."
+        in c.last_result.err
+    )
+    assert "loaded results for 1 queries from 1 gather CSVs" in c.last_result.err
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000"
+        in c.last_result.out
+    )


 def test_genome_containment_threshold_bounds(runtmp):
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.taxonomy.csv")

     below_threshold = "-1"
     with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('tax', 'genome', '-g', tax, '--taxonomy-csv', tax,
-                       '--containment-threshold', below_threshold)
+        c.run_sourmash(
+            "tax",
+            "genome",
+            "-g",
+            tax,
+            "--taxonomy-csv",
+            tax,
+            "--containment-threshold",
+            below_threshold,
+        )

     print(c.last_result.status)
     print(c.last_result.out)
@@ -2017,8 +3522,16 @@ def test_genome_containment_threshold_bounds(runtmp):

     above_threshold = "1.1"
     with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax,
-                       '--containment-threshold', above_threshold)
+        c.run_sourmash(
+            "tax",
+            "genome",
+            "-g",
+            g_csv,
+            "--taxonomy-csv",
+            tax,
+            "--containment-threshold",
+            above_threshold,
+        )

     print(c.last_result.status)
     print(c.last_result.out)
@@ -2028,13 +3541,21 @@ def test_genome_containment_threshold_bounds(runtmp):

 def test_genome_containment_threshold_type(runtmp):
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.taxonomy.csv")

     not_a_float = "str"
     with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax,
-                       '--containment-threshold', not_a_float)
+        c.run_sourmash(
+            "tax",
+            "genome",
+            "-g",
+            g_csv,
+            "--taxonomy-csv",
+            tax,
+            "--containment-threshold",
+            not_a_float,
+        )

     print(c.last_result.status)
     print(c.last_result.out)
@@ -2043,17 +3564,17 @@ def test_genome_containment_threshold_type(runtmp):


 def test_genome_over100percent_error(runtmp):
-    tax = utils.get_test_data('tax/test.taxonomy.csv')
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    tax = utils.get_test_data("tax/test.taxonomy.csv")
+    g_csv = utils.get_test_data("tax/test1.gather.csv")

-    perfect_g_csv = runtmp.output('g.csv')
+    perfect_g_csv = runtmp.output("g.csv")

-    #create an impossible gather result
-    with open(g_csv, 'r') as fp:
-        r = csv.DictReader(fp, delimiter=',')
+    # create an impossible gather result
+    with open(g_csv) as fp:
+        r = csv.DictReader(fp, delimiter=",")
         header = r.fieldnames
         print(header)
-        with open(perfect_g_csv, 'w') as out_fp:
+        with open(perfect_g_csv, "w") as out_fp:
             w = csv.DictWriter(out_fp, header)
             w.writeheader()
             for n, row in enumerate(r):
@@ -2063,25 +3584,36 @@ def test_genome_over100percent_error(runtmp):
                 print(row)

     with pytest.raises(SourmashCommandFailed):
-        runtmp.run_sourmash('tax', 'genome', '-g', perfect_g_csv, '--taxonomy-csv', tax)
+        runtmp.run_sourmash("tax", "genome", "-g", perfect_g_csv, "--taxonomy-csv", tax)

     print(runtmp.last_result.status)
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)

     assert runtmp.last_result.status == -1
-    assert "fraction is > 100% of the query! This should not be possible." in runtmp.last_result.err
+    assert (
+        "fraction is > 100% of the query! This should not be possible."
+        in runtmp.last_result.err
+    )


 def test_genome_ani_threshold_input_errors(runtmp):
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather_old.csv')
-    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather_old.csv")
+    tax = utils.get_test_data("tax/test.taxonomy.csv")

     below_threshold = "-1"
     with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('tax', 'genome', '-g', tax, '--taxonomy-csv', tax,
-                       '--ani-threshold', below_threshold)
+        c.run_sourmash(
+            "tax",
+            "genome",
+            "-g",
+            tax,
+            "--taxonomy-csv",
+            tax,
+            "--ani-threshold",
+            below_threshold,
+        )

     print(c.last_result.status)
     print(c.last_result.out)
@@ -2090,8 +3622,16 @@ def test_genome_ani_threshold_input_errors(runtmp):

     above_threshold = "1.1"
     with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax,
-                       '--ani-threshold', above_threshold)
+        c.run_sourmash(
+            "tax",
+            "genome",
+            "-g",
+            g_csv,
+            "--taxonomy-csv",
+            tax,
+            "--ani-threshold",
+            above_threshold,
+        )

     print(c.last_result.status)
     print(c.last_result.out)
@@ -2101,8 +3641,16 @@ def test_genome_ani_threshold_input_errors(runtmp):

     not_a_float = "str"
     with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax,
-                       '--ani-threshold', not_a_float)
+        c.run_sourmash(
+            "tax",
+            "genome",
+            "-g",
+            g_csv,
+            "--taxonomy-csv",
+            tax,
+            "--ani-threshold",
+            not_a_float,
+        )

     print(c.last_result.status)
     print(c.last_result.out)
@@ -2112,49 +3660,76 @@ def test_genome_ani_threshold_input_errors(runtmp):


 def test_genome_ani_threshold(runtmp):
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.taxonomy.csv")

-    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax,
-                   '--ani-threshold', "0.93")  # note: I think this was previously a bug, if 0.95 produced the result below...
+    c.run_sourmash(
+        "tax", "genome", "-g", g_csv, "--taxonomy-csv", tax, "--ani-threshold", "0.93"
+    )  # note: I think this was previously a bug, if 0.95 produced the result below...

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
-    assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000,0.93' in c.last_result.out
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000,0.93"
+        in c.last_result.out
+    )

     # more lax threshold
-    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax,
-                   '--ani-threshold', "0.9")
+    c.run_sourmash(
+        "tax", "genome", "-g", g_csv, "--taxonomy-csv", tax, "--ani-threshold", "0.9"
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out
+    assert (
+        "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000"
+        in c.last_result.out
+    )

     # too stringent of threshold (using rank)
-    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax,
-                   '--ani-threshold', "1.0", '--rank', 'species')
+    c.run_sourmash(
+        "tax",
+        "genome",
+        "-g",
+        g_csv,
+        "--taxonomy-csv",
+        tax,
+        "--ani-threshold",
+        "1.0",
+        "--rank",
+        "species",
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)
-    assert "test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000,0.92" in c.last_result.out
+    assert (
+        "test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000,0.92"
+        in c.last_result.out
+    )


 def test_genome_ani_oldgather(runtmp):
     # now fail if using gather <4.4
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather_old.csv')
-    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather_old.csv")
+    tax = utils.get_test_data("tax/test.taxonomy.csv")

     with pytest.raises(SourmashCommandFailed) as exc:
-        c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax)
-    assert "is missing columns needed for taxonomic summarization. Please run gather with sourmash >= 4.4." in str(exc.value)
+        c.run_sourmash("tax", "genome", "-g", g_csv, "--taxonomy-csv", tax)
+    assert (
+        "is missing columns needed for taxonomic summarization. Please run gather with sourmash >= 4.4."
+        in str(exc.value)
+    )

     assert c.last_result.status == -1

@@ -2164,11 +3739,10 @@ def test_genome_ani_lemonade_classify(runtmp):
     c = runtmp

     ## first run gather
-    genome = utils.get_test_data('tax/lemonade-MAG3.sig.gz')
-    matches = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.zip')
+    genome = utils.get_test_data("tax/lemonade-MAG3.sig.gz")
+    matches = utils.get_test_data("tax/lemonade-MAG3.x.gtdb.matches.zip")

-    c.run_sourmash('gather', genome, matches,
-                   '--threshold-bp=5000', '-o', 'gather.csv')
+    c.run_sourmash("gather", genome, matches, "--threshold-bp=5000", "-o", "gather.csv")

     print(c.last_result.status)
     print(c.last_result.out)
@@ -2176,29 +3750,55 @@ def test_genome_ani_lemonade_classify(runtmp):

     assert c.last_result.status == 0

-    this_gather_file = c.output('gather.csv')
+    this_gather_file = c.output("gather.csv")
     this_gather = Path(this_gather_file).read_text().splitlines()
     assert len(this_gather) == 4

     ## now run 'tax genome' with human output
-    taxonomy_file = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.tax.csv')
-    c.run_sourmash('tax', 'genome', '-g', this_gather_file, '-t', taxonomy_file,
-                   '--ani', '0.8', '-F', 'human')
+    taxonomy_file = utils.get_test_data("tax/lemonade-MAG3.x.gtdb.matches.tax.csv")
+    c.run_sourmash(
+        "tax",
+        "genome",
+        "-g",
+        this_gather_file,
+        "-t",
+        taxonomy_file,
+        "--ani",
+        "0.8",
+        "-F",
+        "human",
+    )

     output = c.last_result.out
-    assert 'MAG3_1 match 5.3% 91.0% d__Bacteria;p__Bacteroidota;c__Chlorobia;o__Chlorobiales;f__Chlorobiaceae;g__Prosthecochloris;s__Prosthecochloris vibrioformis' in output
+    assert (
+        "MAG3_1 match 5.3% 91.0% d__Bacteria;p__Bacteroidota;c__Chlorobia;o__Chlorobiales;f__Chlorobiaceae;g__Prosthecochloris;s__Prosthecochloris vibrioformis"
+        in output
+    )

     # aaand classify to lineage_csv
-    c.run_sourmash('tax', 'genome', '-g', this_gather_file, '-t', taxonomy_file,
-                   '--ani', '0.8', '-F', 'lineage_csv')
+    c.run_sourmash(
+        "tax",
+        "genome",
+        "-g",
+        this_gather_file,
+        "-t",
+        taxonomy_file,
+        "--ani",
+        "0.8",
+        "-F",
+        "lineage_csv",
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     output = c.last_result.out
-    assert 'ident,superkingdom,phylum,class,order,family,genus,species' in output
-    assert 'MAG3_1,d__Bacteria,p__Bacteroidota,c__Chlorobia,o__Chlorobiales,f__Chlorobiaceae,g__Prosthecochloris,s__Prosthecochloris vibrioformis' in output
+    assert "ident,superkingdom,phylum,class,order,family,genus,species" in output
+    assert (
+        "MAG3_1,d__Bacteria,p__Bacteroidota,c__Chlorobia,o__Chlorobiales,f__Chlorobiaceae,g__Prosthecochloris,s__Prosthecochloris vibrioformis"
+        in output
+    )


 def test_genome_ani_lemonade_classify_estimate_ani_ci(runtmp):
@@ -2207,11 +3807,18 @@
     c = runtmp

     ## first run gather
-    genome = utils.get_test_data('tax/lemonade-MAG3.sig.gz')
-    matches = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.zip')
-
-    c.run_sourmash('gather', genome, matches,
-                   '--threshold-bp=5000', '-o', 'gather.csv', '--estimate-ani')
+    genome = utils.get_test_data("tax/lemonade-MAG3.sig.gz")
+    matches = utils.get_test_data("tax/lemonade-MAG3.x.gtdb.matches.zip")
+
+    c.run_sourmash(
+        "gather",
+        genome,
+        matches,
+        "--threshold-bp=5000",
+        "-o",
+        "gather.csv",
+        "--estimate-ani",
+    )

     print(c.last_result.status)
     print(c.last_result.out)
@@ -2219,36 +3826,62 @@ def test_genome_ani_lemonade_classify_estimate_ani_ci(runtmp):

     assert c.last_result.status == 0

-    this_gather_file = c.output('gather.csv')
+    this_gather_file = c.output("gather.csv")
     this_gather = Path(this_gather_file).read_text().splitlines()
     assert len(this_gather) == 4

     ## now run 'tax genome' with human output
-    taxonomy_file = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.tax.csv')
-    c.run_sourmash('tax', 'genome', '-g', this_gather_file, '-t', taxonomy_file,
-                   '--ani', '0.8', '-F', 'human')
+    taxonomy_file = utils.get_test_data("tax/lemonade-MAG3.x.gtdb.matches.tax.csv")
+    c.run_sourmash(
+        "tax",
+        "genome",
+        "-g",
+        this_gather_file,
+        "-t",
+        taxonomy_file,
+        "--ani",
+        "0.8",
+        "-F",
+        "human",
+    )

     output = c.last_result.out
-    assert 'MAG3_1 match 5.3% 91.0% d__Bacteria;p__Bacteroidota;c__Chlorobia;o__Chlorobiales;f__Chlorobiaceae;g__Prosthecochloris;s__Prosthecochloris vibrioformis' in output
+    assert (
+        "MAG3_1 match 5.3% 91.0% d__Bacteria;p__Bacteroidota;c__Chlorobia;o__Chlorobiales;f__Chlorobiaceae;g__Prosthecochloris;s__Prosthecochloris vibrioformis"
+        in output
+    )

     # aaand classify to lineage_csv
-    c.run_sourmash('tax', 'genome', '-g', this_gather_file, '-t', taxonomy_file,
-                   '--ani', '0.8', '-F', 'lineage_csv')
+    c.run_sourmash(
+        "tax",
+        "genome",
+        "-g",
+        this_gather_file,
+        "-t",
+        taxonomy_file,
+        "--ani",
+        "0.8",
+        "-F",
+        "lineage_csv",
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     output = c.last_result.out
-    assert 'ident,superkingdom,phylum,class,order,family,genus,species' in output
-    assert 'MAG3_1,d__Bacteria,p__Bacteroidota,c__Chlorobia,o__Chlorobiales,f__Chlorobiaceae,g__Prosthecochloris,s__Prosthecochloris vibrioformis' in output
+    assert "ident,superkingdom,phylum,class,order,family,genus,species" in output
+    assert (
+        "MAG3_1,d__Bacteria,p__Bacteroidota,c__Chlorobia,o__Chlorobiales,f__Chlorobiaceae,g__Prosthecochloris,s__Prosthecochloris vibrioformis"
+        in output
+    )


 def test_metagenome_no_gather_csv(runtmp):
     # test tax metagenome with no -g
-    taxonomy_file = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.tax.csv')
-    with pytest.raises(SourmashCommandFailed) as exc:
-        runtmp.run_sourmash('tax', 'metagenome', '-t', taxonomy_file)
+    taxonomy_file = utils.get_test_data("tax/lemonade-MAG3.x.gtdb.matches.tax.csv")
+    with pytest.raises(SourmashCommandFailed):
+        runtmp.run_sourmash("tax", "metagenome", "-t", taxonomy_file)

     print(runtmp.last_result.status)
     print(runtmp.last_result.out)
@@ -2257,9 +3890,9 @@ def test_metagenome_no_gather_csv(runtmp):


 def test_genome_no_gather_csv(runtmp):
     # test tax genome with no -g
-    taxonomy_file = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.tax.csv')
-    with pytest.raises(SourmashCommandFailed) as exc:
-        runtmp.run_sourmash('tax', 'genome', '-t', taxonomy_file)
+    taxonomy_file = utils.get_test_data("tax/lemonade-MAG3.x.gtdb.matches.tax.csv")
+    with pytest.raises(SourmashCommandFailed):
+        runtmp.run_sourmash("tax", "genome", "-t", taxonomy_file)

     print(runtmp.last_result.status)
     print(runtmp.last_result.out)
@@ -2268,9 +3901,9 @@ def test_genome_no_gather_csv(runtmp):


 def test_annotate_no_gather_csv(runtmp):
     # test tax annotate with no -g
-    taxonomy_file = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.tax.csv')
-    with pytest.raises(SourmashCommandFailed) as exc:
-        runtmp.run_sourmash('tax', 'annotate', '-t', taxonomy_file)
+    taxonomy_file = utils.get_test_data("tax/lemonade-MAG3.x.gtdb.matches.tax.csv")
+    with pytest.raises(SourmashCommandFailed):
+        runtmp.run_sourmash("tax", "annotate", "-t", taxonomy_file)

     print(runtmp.last_result.status)
     print(runtmp.last_result.out)
@@ -2281,89 +3914,165 @@ def test_genome_LIN(runtmp):
     # test basic genome with LIN taxonomy
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.LIN-taxonomy.csv')
-
-    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--ani-threshold', '0.93')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.LIN-taxonomy.csv")
+
+    c.run_sourmash(
+        "tax",
+        "genome",
+        "-g",
+        g_csv,
+        "--taxonomy-csv",
+        tax,
+        "--lins",
+        "--ani-threshold",
+        "0.93",
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out
-    assert "test1,below_threshold,0,0.089,1,md5,test1.sig,0.057,444000,0.925" in c.last_result.out
-
-    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--ani-threshold', '0.924')
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,below_threshold,0,0.089,1,md5,test1.sig,0.057,444000,0.925"
+        in c.last_result.out
+    )
+
+    c.run_sourmash(
+        "tax",
+        "genome",
+        "-g",
+        g_csv,
+        "--taxonomy-csv",
+        tax,
+        "--lins",
+        "--ani-threshold",
+        "0.924",
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out
-    assert "test1,match,19,0.088,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.058,442000,0.925" in c.last_result.out
-
-    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--rank', '4')
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,match,19,0.088,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.058,442000,0.925"
+        in c.last_result.out
+    )
+
+    c.run_sourmash(
+        "tax", "genome", "-g", g_csv, "--taxonomy-csv", tax, "--lins", "--rank", "4"
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out
-    assert "test1,below_threshold,4,0.088,0;0;0;0;0,md5,test1.sig,0.058,442000,0.925" in c.last_result.out
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,below_threshold,4,0.088,0;0;0;0;0,md5,test1.sig,0.058,442000,0.925"
+        in c.last_result.out
+    )


 def test_genome_LIN_lingroups(runtmp):
     # test basic genome with LIN taxonomy
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.LIN-taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.LIN-taxonomy.csv")

     lg_file = runtmp.output("test.lg.csv")
-    with open(lg_file, 'w') as out:
-        out.write('lin,name\n')
-        out.write('0;0;0,lg1\n')
-        out.write('1;0;0,lg2\n')
-        out.write('2;0;0,lg3\n')
-        out.write('1;0;1,lg3\n')
+    with open(lg_file, "w") as out:
+        out.write("lin,name\n")
+        out.write("0;0;0,lg1\n")
+        out.write("1;0;0,lg2\n")
+        out.write("2;0;0,lg3\n")
+        out.write("1;0;1,lg3\n")
         # write a 19 so we can check the end
-        out.write('0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,lg4\n')
-
-    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--lingroup', lg_file)
+        out.write("0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,lg4\n")
+
+    c.run_sourmash(
+        "tax",
+        "genome",
+        "-g",
+        g_csv,
+        "--taxonomy-csv",
+        tax,
+        "--lins",
+        "--lingroup",
+        lg_file,
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out
-    assert "test1,below_threshold,2,0.088,0;0;0,md5,test1.sig,0.058,442000,0.925" in c.last_result.out
-
-    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--lingroup', lg_file, '--ani-threshold', '0.924')
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,below_threshold,2,0.088,0;0;0,md5,test1.sig,0.058,442000,0.925"
+        in c.last_result.out
+    )
+
+    c.run_sourmash(
+        "tax",
+        "genome",
+        "-g",
+        g_csv,
+        "--taxonomy-csv",
+        tax,
+        "--lins",
+        "--lingroup",
+        lg_file,
+        "--ani-threshold",
+        "0.924",
+    )

     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)

     assert c.last_result.status == 0
-    assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out
-    assert "test1,match,19,0.088,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.058,442000,0.925" in c.last_result.out
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,match,19,0.088,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.058,442000,0.925"
+        in c.last_result.out
+    )


 def test_annotate_0(runtmp):
     # test annotate basics
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.taxonomy.csv")
     csvout = runtmp.output("test1.gather.with-lineages.csv")
     out_dir = os.path.dirname(csvout)

-    c.run_sourmash('tax', 'annotate', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', out_dir)
+    c.run_sourmash(
+        "tax", "annotate", "--gather-csv", g_csv, "--taxonomy-csv", tax, "-o", out_dir
+    )

     print(c.last_result.status)
     print(c.last_result.out)
@@ -2377,27 +4086,48 @@ def test_annotate_0(runtmp):

     assert f"saving 'annotate' output to '{csvout}'" in runtmp.last_result.err
     assert "lineage" in lin_gather_results[0]
-    assert "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in lin_gather_results[1]
-    assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[2]
-    assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in lin_gather_results[3]
-    assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[4]
+    assert (
+        "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli"
+        in lin_gather_results[1]
+    )
+    assert (
+        "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri"
+        in lin_gather_results[2]
+    )
+    assert (
+        "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus"
+        in lin_gather_results[3]
+    )
+    assert (
+        "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri"
+        in lin_gather_results[4]
+    )


 def test_annotate_gzipped_gather(runtmp):
     # test annotate basics
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")

     # rewrite gather_csv as gzipped csv
-    gz_gather = runtmp.output('test1.gather.csv.gz')
-    with open(g_csv, 'rb') as f_in, gzip.open(gz_gather, 'wb') as f_out:
+    gz_gather = runtmp.output("test1.gather.csv.gz")
+    with open(g_csv, "rb") as f_in, gzip.open(gz_gather, "wb") as f_out:
         f_out.writelines(f_in)

-    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    tax = utils.get_test_data("tax/test.taxonomy.csv")
     csvout = runtmp.output("test1.gather.with-lineages.csv")
     out_dir = os.path.dirname(csvout)

-    c.run_sourmash('tax', 'annotate', '--gather-csv', gz_gather, '--taxonomy-csv', tax, '-o', out_dir)
+    c.run_sourmash(
+        "tax",
+        "annotate",
+        "--gather-csv",
+        gz_gather,
+        "--taxonomy-csv",
+        tax,
+        "-o",
+        out_dir,
+    )

     print(c.last_result.status)
     print(c.last_result.out)
@@ -2411,22 +4141,44 @@ def test_annotate_gzipped_gather(runtmp):

     assert f"saving 'annotate' output to '{csvout}'" in runtmp.last_result.err
     assert "lineage" in lin_gather_results[0]
-    assert "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in lin_gather_results[1]
-    assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[2]
-    assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in lin_gather_results[3]
-    assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[4]
+    assert (
+        "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli"
+        in lin_gather_results[1]
+    )
+    assert (
+        "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri"
+        in lin_gather_results[2]
+    )
+    assert (
+        "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus"
+        in lin_gather_results[3]
+    )
+    assert (
+        "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri"
+        in lin_gather_results[4]
+    )


 def test_annotate_0_LIN(runtmp):
     # test annotate basics
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.LIN-taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.LIN-taxonomy.csv")
     csvout = runtmp.output("test1.gather.with-lineages.csv")
     out_dir = os.path.dirname(csvout)

-    c.run_sourmash('tax', 'annotate', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', out_dir, "--lins")
+    c.run_sourmash(
+        "tax",
+        "annotate",
+        "--gather-csv",
+        g_csv,
+        "--taxonomy-csv",
+        tax,
+        "-o",
+        out_dir,
+        "--lins",
+    )

     print(c.last_result.status)
     print(c.last_result.out)
@@ -2451,19 +4203,29 @@ def test_annotate_gather_argparse(runtmp):
     # this tests argparse handling w/extend.
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.taxonomy.csv")
     csvout = runtmp.output("test1.gather.with-lineages.csv")
     out_dir = os.path.dirname(csvout)

-    g_empty_csv = runtmp.output('g_empty.csv')
+    g_empty_csv = runtmp.output("g_empty.csv")
     with open(g_empty_csv, "w") as fp:
         fp.write("")
     print("g_csv: ", g_empty_csv)

-    c.run_sourmash('tax', 'annotate', '--gather-csv', g_csv,
-                   '-g', g_empty_csv, '--taxonomy-csv', tax, '-o', out_dir,
-                   '--force')
+    c.run_sourmash(
+        "tax",
+        "annotate",
+        "--gather-csv",
+        g_csv,
+        "-g",
+        g_empty_csv,
+        "--taxonomy-csv",
+        tax,
+        "-o",
+        out_dir,
+        "--force",
+    )

     print(c.last_result.status)
     print(c.last_result.out)
@@ -2477,19 +4239,24 @@ def test_annotate_gather_argparse(runtmp):

     assert f"saving 'annotate' output to '{csvout}'" in runtmp.last_result.err
     assert "lineage" in lin_gather_results[0]
-    assert "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in lin_gather_results[1]
+    assert (
+        "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli"
+        in lin_gather_results[1]
+    )


 def test_annotate_0_db(runtmp):
     # test annotate with sqlite db
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-    tax = utils.get_test_data('tax/test.taxonomy.db')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+    tax = utils.get_test_data("tax/test.taxonomy.db")
     csvout = runtmp.output("test1.gather.with-lineages.csv")
     out_dir = os.path.dirname(csvout)

-    c.run_sourmash('tax', 'annotate', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', out_dir)
+    c.run_sourmash(
+        "tax", "annotate", "--gather-csv", g_csv, "--taxonomy-csv", tax, "-o", out_dir
+    )

     print(c.last_result.status)
     print(c.last_result.out)
@@ -2502,105 +4269,134 @@ def test_annotate_0_db(runtmp):

     assert f"saving 'annotate' output to '{csvout}'" in runtmp.last_result.err
     assert "lineage" in lin_gather_results[0]
-    assert "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in lin_gather_results[1]
-    assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[2]
-    assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in lin_gather_results[3]
-    assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[4]
+    assert (
+        "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli"
+        in lin_gather_results[1]
+    )
+    assert (
+        "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri"
+        in lin_gather_results[2]
+    )
+    assert (
+        "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus"
+        in lin_gather_results[3]
+    )
+ assert ( + "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" + in lin_gather_results[4] + ) def test_annotate_empty_gather_results(runtmp): - tax = utils.get_test_data('tax/test.taxonomy.csv') + tax = utils.get_test_data("tax/test.taxonomy.csv") - #creates empty gather result - g_csv = runtmp.output('g.csv') + # creates empty gather result + g_csv = runtmp.output("g.csv") with open(g_csv, "w") as fp: fp.write("") print("g_csv: ", g_csv) with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'annotate', '-g', g_csv, '--taxonomy-csv', tax) + runtmp.run_sourmash("tax", "annotate", "-g", g_csv, "--taxonomy-csv", tax) assert f"Cannot read from '{g_csv}'. Is file empty?" in str(exc.value) assert runtmp.last_result.status == -1 def test_annotate_prefetch_or_other_header(runtmp): - tax = utils.get_test_data('tax/test.taxonomy.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') - - alt_csv = runtmp.output('g.csv') - for alt_col in ['match_name', 'ident', 'accession']: - #modify 'name' to other acceptable id_columns result - alt_g = [x.replace("name", alt_col) + "\n" for x in Path(g_csv).read_text().splitlines()] - with open(alt_csv, 'w') as fp: + tax = utils.get_test_data("tax/test.taxonomy.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") + + alt_csv = runtmp.output("g.csv") + for alt_col in ["match_name", "ident", "accession"]: + # modify 'name' to other acceptable id_columns result + alt_g = [ + x.replace("name", alt_col) + "\n" + for x in Path(g_csv).read_text().splitlines() + ] + with open(alt_csv, "w") as fp: fp.writelines(alt_g) - runtmp.run_sourmash('tax', 'annotate', '-g', alt_csv, '--taxonomy-csv', tax) + runtmp.run_sourmash("tax", "annotate", "-g", alt_csv, "--taxonomy-csv", tax) assert runtmp.last_result.status == 0 print(runtmp.last_result.out) print(runtmp.last_result.err) - assert f"Starting annotation on '{alt_csv}'. Using ID column: '{alt_col}'" in runtmp.last_result.err + assert ( + f"Starting annotation on '{alt_csv}'. Using ID column: '{alt_col}'" + in runtmp.last_result.err + ) assert f"Annotated 4 of 4 total rows from '{alt_csv}'" in runtmp.last_result.err def test_annotate_bad_header(runtmp): - tax = utils.get_test_data('tax/test.taxonomy.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data("tax/test.taxonomy.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") - bad_g_csv = runtmp.output('g.csv') + bad_g_csv = runtmp.output("g.csv") - #creates bad gather result - bad_g = [x.replace("name", "nope") + "\n" for x in Path(g_csv).read_text().splitlines()] - with open(bad_g_csv, 'w') as fp: + # creates bad gather result + bad_g = [ + x.replace("name", "nope") + "\n" for x in Path(g_csv).read_text().splitlines() + ] + with open(bad_g_csv, "w") as fp: fp.writelines(bad_g) # print("bad_gather_results: \n", bad_g) with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'annotate', '-g', bad_g_csv, '--taxonomy-csv', tax) + runtmp.run_sourmash("tax", "annotate", "-g", bad_g_csv, "--taxonomy-csv", tax) - assert f"ERROR: Cannot find taxonomic identifier column in '{bad_g_csv}'. Tried: name, match_name, ident, accession" in str(exc.value) + assert ( + f"ERROR: Cannot find taxonomic identifier column in '{bad_g_csv}'. 
Tried: name, match_name, ident, accession" + in str(exc.value) + ) assert runtmp.last_result.status == -1 print(runtmp.last_result.out) print(runtmp.last_result.err) def test_annotate_no_tax_matches(runtmp): - tax = utils.get_test_data('tax/test.taxonomy.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data("tax/test.taxonomy.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") - bad_g_csv = runtmp.output('g.csv') + bad_g_csv = runtmp.output("g.csv") - #mess up tax idents - bad_g = [x.replace("GCF_", "GGG_") + "\n" for x in Path(g_csv).read_text().splitlines()] - with open(bad_g_csv, 'w') as fp: + # mess up tax idents + bad_g = [ + x.replace("GCF_", "GGG_") + "\n" for x in Path(g_csv).read_text().splitlines() + ] + with open(bad_g_csv, "w") as fp: fp.writelines(bad_g) # print("bad_gather_results: \n", bad_g) with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'annotate', '-g', bad_g_csv, '--taxonomy-csv', tax) + runtmp.run_sourmash("tax", "annotate", "-g", bad_g_csv, "--taxonomy-csv", tax) assert f"ERROR: Could not annotate any rows from '{bad_g_csv}'" in str(exc.value) assert runtmp.last_result.status == -1 print(runtmp.last_result.out) print(runtmp.last_result.err) - runtmp.run_sourmash('tax', 'annotate', '-g', bad_g_csv, '--taxonomy-csv', tax, '--force') + runtmp.run_sourmash( + "tax", "annotate", "-g", bad_g_csv, "--taxonomy-csv", tax, "--force" + ) assert runtmp.last_result.status == 0 assert f"Could not annotate any rows from '{bad_g_csv}'" in runtmp.last_result.err - assert f"--force is set. Attempting to continue to next file." in runtmp.last_result.err + assert ( + "--force is set. Attempting to continue to next file." in runtmp.last_result.err + ) print(runtmp.last_result.out) print(runtmp.last_result.err) def test_annotate_missed_tax_matches(runtmp): - tax = utils.get_test_data('tax/test.taxonomy.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data("tax/test.taxonomy.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") - bad_g_csv = runtmp.output('g.csv') + bad_g_csv = runtmp.output("g.csv") - with open(g_csv, 'r') as gather_lines, open(bad_g_csv, 'w') as fp: + with open(g_csv) as gather_lines, open(bad_g_csv, "w") as fp: for n, line in enumerate(gather_lines): if n > 2: # mess up tax idents of lines 3, 4 @@ -2608,7 +4404,7 @@ def test_annotate_missed_tax_matches(runtmp): fp.write(line) # print("bad_gather_results: \n", bad_g) - runtmp.run_sourmash('tax', 'annotate', '-g', bad_g_csv, '--taxonomy-csv', tax) + runtmp.run_sourmash("tax", "annotate", "-g", bad_g_csv, "--taxonomy-csv", tax) print(runtmp.last_result.out) print(runtmp.last_result.err) @@ -2618,16 +4414,15 @@ def test_annotate_missed_tax_matches(runtmp): def test_annotate_empty_tax_lineage_input(runtmp): - tax_empty = runtmp.output('t.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax_empty = runtmp.output("t.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") with open(tax_empty, "w") as fp: fp.write("") print("t_csv: ", tax_empty) - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'annotate', '-g', g_csv, '--taxonomy-csv', tax_empty) + runtmp.run_sourmash("tax", "annotate", "-g", g_csv, "--taxonomy-csv", tax_empty) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -2638,15 +4433,25 @@ def test_annotate_empty_tax_lineage_input(runtmp): def test_annotate_empty_tax_lineage_input_recover_with_second_taxfile(runtmp): - tax_empty = 
runtmp.output('t.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax_empty = runtmp.output("t.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") with open(tax_empty, "w") as fp: fp.write("") print("t_csv: ", tax_empty) - runtmp.run_sourmash('tax', 'annotate', '-g', g_csv, '-t', tax_empty, '--taxonomy-csv', tax, '--force') + runtmp.run_sourmash( + "tax", + "annotate", + "-g", + g_csv, + "-t", + tax_empty, + "--taxonomy-csv", + tax, + "--force", + ) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -2657,16 +4462,25 @@ def test_annotate_empty_tax_lineage_input_recover_with_second_taxfile(runtmp): def test_annotate_empty_tax_lineage_input_recover_with_second_taxfile_2(runtmp): # test with empty tax second, to check on argparse handling - tax_empty = runtmp.output('t.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') + tax_empty = runtmp.output("t.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") + g_csv = utils.get_test_data("tax/test1.gather.csv") with open(tax_empty, "w") as fp: fp.write("") print("t_csv: ", tax_empty) - runtmp.run_sourmash('tax', 'annotate', '-g', g_csv, - '--taxonomy-csv', tax, '-t', tax_empty, '--force') + runtmp.run_sourmash( + "tax", + "annotate", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "-t", + tax_empty, + "--force", + ) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -2677,29 +4491,31 @@ def test_annotate_empty_tax_lineage_input_recover_with_second_taxfile_2(runtmp): def test_tax_prepare_1_csv_to_csv(runtmp, keep_identifiers, keep_versions): # CSV -> CSV; same assignments - tax = utils.get_test_data('tax/test.taxonomy.csv') - taxout = runtmp.output('out.csv') + tax = utils.get_test_data("tax/test.taxonomy.csv") + taxout = runtmp.output("out.csv") args = [] if keep_identifiers: - args.append('--keep-full-identifiers') + args.append("--keep-full-identifiers") if keep_versions: - args.append('--keep-identifier-versions') + args.append("--keep-identifier-versions") # this is an error - can't strip versions if not splitting identifiers if keep_identifiers and not keep_versions: with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', - taxout, '-F', 'csv', *args) + runtmp.run_sourmash( + "tax", "prepare", "-t", tax, "-o", taxout, "-F", "csv", *args + ) return - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', - taxout, '-F', 'csv', *args) + runtmp.run_sourmash("tax", "prepare", "-t", tax, "-o", taxout, "-F", "csv", *args) assert os.path.exists(taxout) - db1 = tax_utils.MultiLineageDB.load([tax], - keep_full_identifiers=keep_identifiers, - keep_identifier_versions=keep_versions) + db1 = tax_utils.MultiLineageDB.load( + [tax], + keep_full_identifiers=keep_identifiers, + keep_identifier_versions=keep_versions, + ) db2 = tax_utils.MultiLineageDB.load([taxout]) @@ -2708,13 +4524,12 @@ def test_tax_prepare_1_csv_to_csv(runtmp, keep_identifiers, keep_versions): def test_tax_prepare_1_combine_csv(runtmp): # multiple CSVs to a single combined CSV - tax1 = utils.get_test_data('tax/test.taxonomy.csv') - tax2 = utils.get_test_data('tax/protozoa_genbank_lineage.csv') + tax1 = utils.get_test_data("tax/test.taxonomy.csv") + tax2 = utils.get_test_data("tax/protozoa_genbank_lineage.csv") - taxout = runtmp.output('out.csv') + taxout = runtmp.output("out.csv") - runtmp.sourmash('tax', 'prepare', '-t', tax1, 
tax2, '-F', 'csv', - '-o', taxout) + runtmp.sourmash("tax", "prepare", "-t", tax1, tax2, "-F", "csv", "-o", taxout) out = runtmp.last_result.out err = runtmp.last_result.err @@ -2728,29 +4543,31 @@ def test_tax_prepare_1_combine_csv(runtmp): def test_tax_prepare_1_csv_to_csv_empty_ranks(runtmp, keep_identifiers, keep_versions): # CSV -> CSV; same assignments, even when trailing ranks are empty - tax = utils.get_test_data('tax/test-empty-ranks.taxonomy.csv') - taxout = runtmp.output('out.csv') + tax = utils.get_test_data("tax/test-empty-ranks.taxonomy.csv") + taxout = runtmp.output("out.csv") args = [] if keep_identifiers: - args.append('--keep-full-identifiers') + args.append("--keep-full-identifiers") if keep_versions: - args.append('--keep-identifier-versions') + args.append("--keep-identifier-versions") # this is an error - can't strip versions if not splitting identifiers if keep_identifiers and not keep_versions: with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', - taxout, '-F', 'csv', *args) + runtmp.run_sourmash( + "tax", "prepare", "-t", tax, "-o", taxout, "-F", "csv", *args + ) return - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', - taxout, '-F', 'csv', *args) + runtmp.run_sourmash("tax", "prepare", "-t", tax, "-o", taxout, "-F", "csv", *args) assert os.path.exists(taxout) - db1 = tax_utils.MultiLineageDB.load([tax], - keep_full_identifiers=keep_identifiers, - keep_identifier_versions=keep_versions) + db1 = tax_utils.MultiLineageDB.load( + [tax], + keep_full_identifiers=keep_identifiers, + keep_identifier_versions=keep_versions, + ) db2 = tax_utils.MultiLineageDB.load([taxout]) @@ -2760,9 +4577,9 @@ def test_tax_prepare_1_csv_to_csv_empty_ranks(runtmp, keep_identifiers, keep_ver def test_tax_prepare_1_csv_to_csv_empty_file(runtmp, keep_identifiers, keep_versions): # CSV -> CSV with an empty input file and --force # tests argparse extend - tax = utils.get_test_data('tax/test-empty-ranks.taxonomy.csv') - tax_empty = runtmp.output('t.csv') - taxout = runtmp.output('out.csv') + tax = utils.get_test_data("tax/test-empty-ranks.taxonomy.csv") + tax_empty = runtmp.output("t.csv") + taxout = runtmp.output("out.csv") with open(tax_empty, "w") as fp: fp.write("") @@ -2770,86 +4587,109 @@ def test_tax_prepare_1_csv_to_csv_empty_file(runtmp, keep_identifiers, keep_vers args = [] if keep_identifiers: - args.append('--keep-full-identifiers') + args.append("--keep-full-identifiers") if keep_versions: - args.append('--keep-identifier-versions') + args.append("--keep-identifier-versions") # this is an error - can't strip versions if not splitting identifiers if keep_identifiers and not keep_versions: with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', - taxout, '-F', 'csv', *args) + runtmp.run_sourmash( + "tax", "prepare", "-t", tax, "-o", taxout, "-F", "csv", *args + ) return - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-t', tax_empty, '-o', - taxout, '-F', 'csv', *args, '--force') + runtmp.run_sourmash( + "tax", + "prepare", + "-t", + tax, + "-t", + tax_empty, + "-o", + taxout, + "-F", + "csv", + *args, + "--force", + ) assert os.path.exists(taxout) - db1 = tax_utils.MultiLineageDB.load([tax], - keep_full_identifiers=keep_identifiers, - keep_identifier_versions=keep_versions) + db1 = tax_utils.MultiLineageDB.load( + [tax], + keep_full_identifiers=keep_identifiers, + keep_identifier_versions=keep_versions, + ) db2 = tax_utils.MultiLineageDB.load([taxout]) assert set(db1) == set(db2) 
-def test_tax_prepare_1_csv_to_csv_empty_ranks_2(runtmp, keep_identifiers, keep_versions): +def test_tax_prepare_1_csv_to_csv_empty_ranks_2( + runtmp, keep_identifiers, keep_versions +): # CSV -> CSV; same assignments for situations with empty internal ranks - tax = utils.get_test_data('tax/test-empty-ranks-2.taxonomy.csv') - taxout = runtmp.output('out.csv') + tax = utils.get_test_data("tax/test-empty-ranks-2.taxonomy.csv") + taxout = runtmp.output("out.csv") args = [] if keep_identifiers: - args.append('--keep-full-identifiers') + args.append("--keep-full-identifiers") if keep_versions: - args.append('--keep-identifier-versions') + args.append("--keep-identifier-versions") # this is an error - can't strip versions if not splitting identifiers if keep_identifiers and not keep_versions: with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', - taxout, '-F', 'csv', *args) + runtmp.run_sourmash( + "tax", "prepare", "-t", tax, "-o", taxout, "-F", "csv", *args + ) return - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', - taxout, '-F', 'csv', *args) + runtmp.run_sourmash("tax", "prepare", "-t", tax, "-o", taxout, "-F", "csv", *args) assert os.path.exists(taxout) - db1 = tax_utils.MultiLineageDB.load([tax], - keep_full_identifiers=keep_identifiers, - keep_identifier_versions=keep_versions) + db1 = tax_utils.MultiLineageDB.load( + [tax], + keep_full_identifiers=keep_identifiers, + keep_identifier_versions=keep_versions, + ) db2 = tax_utils.MultiLineageDB.load([taxout]) assert set(db1) == set(db2) -def test_tax_prepare_1_csv_to_csv_empty_ranks_3(runtmp, keep_identifiers, keep_versions): +def test_tax_prepare_1_csv_to_csv_empty_ranks_3( + runtmp, keep_identifiers, keep_versions +): # CSV -> CSV; same assignments for situations with empty internal ranks - tax = utils.get_test_data('tax/test-empty-ranks-3.taxonomy.csv') - taxout = runtmp.output('out.csv') + tax = utils.get_test_data("tax/test-empty-ranks-3.taxonomy.csv") + taxout = runtmp.output("out.csv") args = [] if keep_identifiers: - args.append('--keep-full-identifiers') + args.append("--keep-full-identifiers") if keep_versions: - args.append('--keep-identifier-versions') + args.append("--keep-identifier-versions") # this is an error - can't strip versions if not splitting identifiers if keep_identifiers and not keep_versions: with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', - taxout, '-F', 'csv', *args) + runtmp.run_sourmash( + "tax", "prepare", "-t", tax, "-o", taxout, "-F", "csv", *args + ) return - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', - taxout, '-F', 'csv', *args) + runtmp.run_sourmash("tax", "prepare", "-t", tax, "-o", taxout, "-F", "csv", *args) assert os.path.exists(taxout) - db1 = tax_utils.MultiLineageDB.load([tax], - keep_full_identifiers=keep_identifiers, - keep_identifier_versions=keep_versions) + db1 = tax_utils.MultiLineageDB.load( + [tax], + keep_full_identifiers=keep_identifiers, + keep_identifier_versions=keep_versions, + ) db2 = tax_utils.MultiLineageDB.load([taxout]) @@ -2858,65 +4698,70 @@ def test_tax_prepare_1_csv_to_csv_empty_ranks_3(runtmp, keep_identifiers, keep_v def test_tax_prepare_2_csv_to_sql(runtmp, keep_identifiers, keep_versions): # CSV -> SQL; same assignments? 
- tax = utils.get_test_data('tax/test.taxonomy.csv') - taxout = runtmp.output('out.db') + tax = utils.get_test_data("tax/test.taxonomy.csv") + taxout = runtmp.output("out.db") args = [] if keep_identifiers: - args.append('--keep-full-identifiers') + args.append("--keep-full-identifiers") if keep_versions: - args.append('--keep-identifier-versions') + args.append("--keep-identifier-versions") # this is an error - can't strip versions if not splitting identifiers if keep_identifiers and not keep_versions: with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, - '-F', 'sql', *args) + runtmp.run_sourmash( + "tax", "prepare", "-t", tax, "-o", taxout, "-F", "sql", *args + ) return - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, - '-F', 'sql', *args) + runtmp.run_sourmash("tax", "prepare", "-t", tax, "-o", taxout, "-F", "sql", *args) assert os.path.exists(taxout) - db1 = tax_utils.MultiLineageDB.load([tax], - keep_full_identifiers=keep_identifiers, - keep_identifier_versions=keep_versions) + db1 = tax_utils.MultiLineageDB.load( + [tax], + keep_full_identifiers=keep_identifiers, + keep_identifier_versions=keep_versions, + ) db2 = tax_utils.MultiLineageDB.load([taxout]) assert set(db1) == set(db2) # cannot overwrite - with pytest.raises(SourmashCommandFailed) as exc: - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, - '-F', 'sql', *args) - assert 'taxonomy table already exists' in str(exc.value) + runtmp.run_sourmash( + "tax", "prepare", "-t", tax, "-o", taxout, "-F", "sql", *args + ) + assert "taxonomy table already exists" in str(exc.value) def test_tax_prepare_2_csv_to_sql_empty_ranks(runtmp, keep_identifiers, keep_versions): # CSV -> SQL with some empty ranks in the taxonomy file - tax = utils.get_test_data('tax/test-empty-ranks.taxonomy.csv') - taxout = runtmp.output('out.db') + tax = utils.get_test_data("tax/test-empty-ranks.taxonomy.csv") + taxout = runtmp.output("out.db") args = [] if keep_identifiers: - args.append('--keep-full-identifiers') + args.append("--keep-full-identifiers") if keep_versions: - args.append('--keep-identifier-versions') + args.append("--keep-identifier-versions") # this is an error - can't strip versions if not splitting identifiers if keep_identifiers and not keep_versions: with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, - '-F', 'sql', *args) + runtmp.run_sourmash( + "tax", "prepare", "-t", tax, "-o", taxout, "-F", "sql", *args + ) return - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, - '-F', 'sql', *args) + runtmp.run_sourmash("tax", "prepare", "-t", tax, "-o", taxout, "-F", "sql", *args) assert os.path.exists(taxout) - db1 = tax_utils.MultiLineageDB.load([tax], - keep_full_identifiers=keep_identifiers, - keep_identifier_versions=keep_versions) + db1 = tax_utils.MultiLineageDB.load( + [tax], + keep_full_identifiers=keep_identifiers, + keep_identifier_versions=keep_versions, + ) db2 = tax_utils.MultiLineageDB.load([taxout]) assert set(db1) == set(db2) @@ -2924,107 +4769,113 @@ def test_tax_prepare_2_csv_to_sql_empty_ranks(runtmp, keep_identifiers, keep_ver def test_tax_prepare_3_db_to_csv(runtmp): # SQL -> CSV; same assignments - taxcsv = utils.get_test_data('tax/test.taxonomy.csv') - taxdb = utils.get_test_data('tax/test.taxonomy.db') - taxout = runtmp.output('out.csv') + taxcsv = utils.get_test_data("tax/test.taxonomy.csv") + taxdb = utils.get_test_data("tax/test.taxonomy.db") + taxout = 
runtmp.output("out.csv") - runtmp.run_sourmash('tax', 'prepare', '-t', taxdb, - '-o', taxout, '-F', 'csv') + runtmp.run_sourmash("tax", "prepare", "-t", taxdb, "-o", taxout, "-F", "csv") assert os.path.exists(taxout) with open(taxout) as fp: print(fp.read()) - db1 = tax_utils.MultiLineageDB.load([taxcsv], - keep_full_identifiers=False, - keep_identifier_versions=False) + db1 = tax_utils.MultiLineageDB.load( + [taxcsv], keep_full_identifiers=False, keep_identifier_versions=False + ) db2 = tax_utils.MultiLineageDB.load([taxout]) - db3 = tax_utils.MultiLineageDB.load([taxdb], - keep_full_identifiers=False, - keep_identifier_versions=False) + db3 = tax_utils.MultiLineageDB.load( + [taxdb], keep_full_identifiers=False, keep_identifier_versions=False + ) assert set(db1) == set(db2) assert set(db1) == set(db3) def test_tax_prepare_3_db_to_csv_gz(runtmp): # SQL -> CSV; same assignments - taxcsv = utils.get_test_data('tax/test.taxonomy.csv') - taxdb = utils.get_test_data('tax/test.taxonomy.db') - taxout = runtmp.output('out.csv.gz') + taxcsv = utils.get_test_data("tax/test.taxonomy.csv") + taxdb = utils.get_test_data("tax/test.taxonomy.db") + taxout = runtmp.output("out.csv.gz") - runtmp.run_sourmash('tax', 'prepare', '-t', taxdb, - '-o', taxout, '-F', 'csv') + runtmp.run_sourmash("tax", "prepare", "-t", taxdb, "-o", taxout, "-F", "csv") assert os.path.exists(taxout) - with gzip.open(taxout, 'rt') as fp: + with gzip.open(taxout, "rt") as fp: print(fp.read()) - db1 = tax_utils.MultiLineageDB.load([taxcsv], - keep_full_identifiers=False, - keep_identifier_versions=False) + db1 = tax_utils.MultiLineageDB.load( + [taxcsv], keep_full_identifiers=False, keep_identifier_versions=False + ) db2 = tax_utils.MultiLineageDB.load([taxout]) - db3 = tax_utils.MultiLineageDB.load([taxdb], - keep_full_identifiers=False, - keep_identifier_versions=False) + db3 = tax_utils.MultiLineageDB.load( + [taxdb], keep_full_identifiers=False, keep_identifier_versions=False + ) assert set(db1) == set(db2) assert set(db1) == set(db3) -def test_tax_prepare_2_csv_to_sql_empty_ranks_2(runtmp, keep_identifiers, keep_versions): +def test_tax_prepare_2_csv_to_sql_empty_ranks_2( + runtmp, keep_identifiers, keep_versions +): # CSV -> SQL with some empty internal ranks in the taxonomy file - tax = utils.get_test_data('tax/test-empty-ranks-2.taxonomy.csv') - taxout = runtmp.output('out.db') + tax = utils.get_test_data("tax/test-empty-ranks-2.taxonomy.csv") + taxout = runtmp.output("out.db") args = [] if keep_identifiers: - args.append('--keep-full-identifiers') + args.append("--keep-full-identifiers") if keep_versions: - args.append('--keep-identifier-versions') + args.append("--keep-identifier-versions") # this is an error - can't strip versions if not splitting identifiers if keep_identifiers and not keep_versions: with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, - '-F', 'sql', *args) + runtmp.run_sourmash( + "tax", "prepare", "-t", tax, "-o", taxout, "-F", "sql", *args + ) return - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, - '-F', 'sql', *args) + runtmp.run_sourmash("tax", "prepare", "-t", tax, "-o", taxout, "-F", "sql", *args) assert os.path.exists(taxout) - db1 = tax_utils.MultiLineageDB.load([tax], - keep_full_identifiers=keep_identifiers, - keep_identifier_versions=keep_versions) + db1 = tax_utils.MultiLineageDB.load( + [tax], + keep_full_identifiers=keep_identifiers, + keep_identifier_versions=keep_versions, + ) db2 = 
tax_utils.MultiLineageDB.load([taxout]) assert set(db1) == set(db2) -def test_tax_prepare_2_csv_to_sql_empty_ranks_3(runtmp, keep_identifiers, keep_versions): +def test_tax_prepare_2_csv_to_sql_empty_ranks_3( + runtmp, keep_identifiers, keep_versions +): # CSV -> SQL with some empty internal ranks in the taxonomy file - tax = utils.get_test_data('tax/test-empty-ranks-3.taxonomy.csv') - taxout = runtmp.output('out.db') + tax = utils.get_test_data("tax/test-empty-ranks-3.taxonomy.csv") + taxout = runtmp.output("out.db") args = [] if keep_identifiers: - args.append('--keep-full-identifiers') + args.append("--keep-full-identifiers") if keep_versions: - args.append('--keep-identifier-versions') + args.append("--keep-identifier-versions") # this is an error - can't strip versions if not splitting identifiers if keep_identifiers and not keep_versions: with pytest.raises(SourmashCommandFailed): - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, - '-F', 'sql', *args) + runtmp.run_sourmash( + "tax", "prepare", "-t", tax, "-o", taxout, "-F", "sql", *args + ) return - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, - '-F', 'sql', *args) + runtmp.run_sourmash("tax", "prepare", "-t", tax, "-o", taxout, "-F", "sql", *args) assert os.path.exists(taxout) - db1 = tax_utils.MultiLineageDB.load([tax], - keep_full_identifiers=keep_identifiers, - keep_identifier_versions=keep_versions) + db1 = tax_utils.MultiLineageDB.load( + [tax], + keep_full_identifiers=keep_identifiers, + keep_identifier_versions=keep_versions, + ) db2 = tax_utils.MultiLineageDB.load([taxout]) assert set(db1) == set(db2) @@ -3032,83 +4883,79 @@ def test_tax_prepare_2_csv_to_sql_empty_ranks_3(runtmp, keep_identifiers, keep_v def test_tax_prepare_3_db_to_csv_empty_ranks(runtmp): # SQL -> CSV; same assignments, with empty ranks - taxcsv = utils.get_test_data('tax/test-empty-ranks.taxonomy.csv') - taxdb = utils.get_test_data('tax/test-empty-ranks.taxonomy.db') - taxout = runtmp.output('out.csv') + taxcsv = utils.get_test_data("tax/test-empty-ranks.taxonomy.csv") + taxdb = utils.get_test_data("tax/test-empty-ranks.taxonomy.db") + taxout = runtmp.output("out.csv") - runtmp.run_sourmash('tax', 'prepare', '-t', taxdb, - '-o', taxout, '-F', 'csv') + runtmp.run_sourmash("tax", "prepare", "-t", taxdb, "-o", taxout, "-F", "csv") assert os.path.exists(taxout) with open(taxout) as fp: print(fp.read()) - db1 = tax_utils.MultiLineageDB.load([taxcsv], - keep_full_identifiers=False, - keep_identifier_versions=False) + db1 = tax_utils.MultiLineageDB.load( + [taxcsv], keep_full_identifiers=False, keep_identifier_versions=False + ) db2 = tax_utils.MultiLineageDB.load([taxout]) - db3 = tax_utils.MultiLineageDB.load([taxdb], - keep_full_identifiers=False, - keep_identifier_versions=False) + db3 = tax_utils.MultiLineageDB.load( + [taxdb], keep_full_identifiers=False, keep_identifier_versions=False + ) assert set(db1) == set(db2) assert set(db1) == set(db3) def test_tax_prepare_3_db_to_csv_empty_ranks_2(runtmp): # SQL -> CSV; same assignments, with empty ranks - taxcsv = utils.get_test_data('tax/test-empty-ranks-2.taxonomy.csv') - taxdb = utils.get_test_data('tax/test-empty-ranks-2.taxonomy.db') - taxout = runtmp.output('out.csv') + taxcsv = utils.get_test_data("tax/test-empty-ranks-2.taxonomy.csv") + taxdb = utils.get_test_data("tax/test-empty-ranks-2.taxonomy.db") + taxout = runtmp.output("out.csv") - runtmp.run_sourmash('tax', 'prepare', '-t', taxdb, - '-o', taxout, '-F', 'csv') + runtmp.run_sourmash("tax", "prepare", "-t", taxdb, 
"-o", taxout, "-F", "csv") assert os.path.exists(taxout) with open(taxout) as fp: print(fp.read()) - db1 = tax_utils.MultiLineageDB.load([taxcsv], - keep_full_identifiers=False, - keep_identifier_versions=False) + db1 = tax_utils.MultiLineageDB.load( + [taxcsv], keep_full_identifiers=False, keep_identifier_versions=False + ) db2 = tax_utils.MultiLineageDB.load([taxout]) - db3 = tax_utils.MultiLineageDB.load([taxdb], - keep_full_identifiers=False, - keep_identifier_versions=False) + db3 = tax_utils.MultiLineageDB.load( + [taxdb], keep_full_identifiers=False, keep_identifier_versions=False + ) assert set(db1) == set(db2) assert set(db1) == set(db3) def test_tax_prepare_3_db_to_csv_empty_ranks_3(runtmp): # SQL -> CSV; same assignments, with empty ranks - taxcsv = utils.get_test_data('tax/test-empty-ranks-3.taxonomy.csv') - taxdb = utils.get_test_data('tax/test-empty-ranks-3.taxonomy.db') - taxout = runtmp.output('out.csv') + taxcsv = utils.get_test_data("tax/test-empty-ranks-3.taxonomy.csv") + taxdb = utils.get_test_data("tax/test-empty-ranks-3.taxonomy.db") + taxout = runtmp.output("out.csv") - runtmp.run_sourmash('tax', 'prepare', '-t', taxdb, - '-o', taxout, '-F', 'csv') + runtmp.run_sourmash("tax", "prepare", "-t", taxdb, "-o", taxout, "-F", "csv") assert os.path.exists(taxout) with open(taxout) as fp: print(fp.read()) - db1 = tax_utils.MultiLineageDB.load([taxcsv], - keep_full_identifiers=False, - keep_identifier_versions=False) + db1 = tax_utils.MultiLineageDB.load( + [taxcsv], keep_full_identifiers=False, keep_identifier_versions=False + ) db2 = tax_utils.MultiLineageDB.load([taxout]) - db3 = tax_utils.MultiLineageDB.load([taxdb], - keep_full_identifiers=False, - keep_identifier_versions=False) + db3 = tax_utils.MultiLineageDB.load( + [taxdb], keep_full_identifiers=False, keep_identifier_versions=False + ) assert set(db1) == set(db2) assert set(db1) == set(db3) def test_tax_prepare_sqlite_lineage_version(runtmp): # test bad sourmash_internals version for SqliteLineage - taxcsv = utils.get_test_data('tax/test.taxonomy.csv') - taxout = runtmp.output('out.db') + taxcsv = utils.get_test_data("tax/test.taxonomy.csv") + taxout = runtmp.output("out.db") - runtmp.run_sourmash('tax', 'prepare', '-t', taxcsv, - '-o', taxout, '-F', 'sql') + runtmp.run_sourmash("tax", "prepare", "-t", taxcsv, "-o", taxout, "-F", "sql") assert os.path.exists(taxout) # set bad version @@ -3120,206 +4967,208 @@ def test_tax_prepare_sqlite_lineage_version(runtmp): conn.close() with pytest.raises(IndexNotSupported): - db = tax_utils.MultiLineageDB.load([taxout]) + tax_utils.MultiLineageDB.load([taxout]) def test_tax_prepare_sqlite_no_lineage(): # no lineage table at all - sqldb = utils.get_test_data('sqlite/index.sqldb') + sqldb = utils.get_test_data("sqlite/index.sqldb") with pytest.raises(ValueError): - db = tax_utils.MultiLineageDB.load([sqldb]) + tax_utils.MultiLineageDB.load([sqldb]) def test_tax_grep_exists(runtmp): # test that 'tax grep' exists with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('tax', 'grep') + runtmp.sourmash("tax", "grep") err = runtmp.last_result.err - assert 'usage:' in err + assert "usage:" in err def test_tax_grep_search_shew(runtmp): # test 'tax grep Shew' - taxfile = utils.get_test_data('tax/test.taxonomy.csv') + taxfile = utils.get_test_data("tax/test.taxonomy.csv") - runtmp.sourmash('tax', 'grep', 'Shew', '-t', taxfile) + runtmp.sourmash("tax", "grep", "Shew", "-t", taxfile) out = runtmp.last_result.out err = runtmp.last_result.err - lines = [ x.strip() for x in 
out.splitlines() ] - lines = [ x.split(',') for x in lines ] - assert lines[0][0] == 'ident' - assert lines[1][0] == 'GCF_000017325.1' - assert lines[2][0] == 'GCF_000021665.1' + lines = [x.strip() for x in out.splitlines()] + lines = [x.split(",") for x in lines] + assert lines[0][0] == "ident" + assert lines[1][0] == "GCF_000017325.1" + assert lines[2][0] == "GCF_000021665.1" assert len(lines) == 3 assert "searching 1 taxonomy files for 'Shew'" in err - assert 'found 2 matches; saved identifiers to picklist' in err + assert "found 2 matches; saved identifiers to picklist" in err def test_tax_grep_search_shew_out(runtmp): # test 'tax grep Shew', save result to a file - taxfile = utils.get_test_data('tax/test.taxonomy.csv') + taxfile = utils.get_test_data("tax/test.taxonomy.csv") - runtmp.sourmash('tax', 'grep', 'Shew', '-t', taxfile, '-o', 'pick.csv') + runtmp.sourmash("tax", "grep", "Shew", "-t", taxfile, "-o", "pick.csv") err = runtmp.last_result.err - out = Path(runtmp.output('pick.csv')).read_text() - lines = [ x.strip() for x in out.splitlines() ] - lines = [ x.split(',') for x in lines ] - assert lines[0][0] == 'ident' - assert lines[1][0] == 'GCF_000017325.1' - assert lines[2][0] == 'GCF_000021665.1' + out = Path(runtmp.output("pick.csv")).read_text() + lines = [x.strip() for x in out.splitlines()] + lines = [x.split(",") for x in lines] + assert lines[0][0] == "ident" + assert lines[1][0] == "GCF_000017325.1" + assert lines[2][0] == "GCF_000021665.1" assert len(lines) == 3 assert "searching 1 taxonomy files for 'Shew'" in err - assert 'found 2 matches; saved identifiers to picklist' in err + assert "found 2 matches; saved identifiers to picklist" in err def test_tax_grep_search_shew_sqldb_out(runtmp): # test 'tax grep Shew' on a sqldb, save result to a file - taxfile = utils.get_test_data('tax/test.taxonomy.db') + taxfile = utils.get_test_data("tax/test.taxonomy.db") - runtmp.sourmash('tax', 'grep', 'Shew', '-t', taxfile, '-o', 'pick.csv') + runtmp.sourmash("tax", "grep", "Shew", "-t", taxfile, "-o", "pick.csv") err = runtmp.last_result.err - out = Path(runtmp.output('pick.csv')).read_text() - lines = [ x.strip() for x in out.splitlines() ] - lines = [ x.split(',') for x in lines ] - assert lines[0][0] == 'ident' - assert lines[1][0] == 'GCF_000017325' - assert lines[2][0] == 'GCF_000021665' + out = Path(runtmp.output("pick.csv")).read_text() + lines = [x.strip() for x in out.splitlines()] + lines = [x.split(",") for x in lines] + assert lines[0][0] == "ident" + assert lines[1][0] == "GCF_000017325" + assert lines[2][0] == "GCF_000021665" assert len(lines) == 3 assert "searching 1 taxonomy files for 'Shew'" in err - assert 'found 2 matches; saved identifiers to picklist' in err + assert "found 2 matches; saved identifiers to picklist" in err def test_tax_grep_search_shew_lowercase(runtmp): # test 'tax grep shew' (lowercase), save result to a file - taxfile = utils.get_test_data('tax/test.taxonomy.csv') + taxfile = utils.get_test_data("tax/test.taxonomy.csv") - runtmp.sourmash('tax', 'grep', 'shew', '-t', taxfile, '-o', 'pick.csv') + runtmp.sourmash("tax", "grep", "shew", "-t", taxfile, "-o", "pick.csv") err = runtmp.last_result.err assert "searching 1 taxonomy files for 'shew'" in err - assert 'found 0 matches; saved identifiers to picklist' in err + assert "found 0 matches; saved identifiers to picklist" in err - runtmp.sourmash('tax', 'grep', '-i', 'shew', - '-t', taxfile, '-o', 'pick.csv') + runtmp.sourmash("tax", "grep", "-i", "shew", "-t", taxfile, "-o", "pick.csv") err = 
runtmp.last_result.err assert "searching 1 taxonomy files for 'shew'" in err - assert 'found 2 matches; saved identifiers to picklist' in err - - out = Path(runtmp.output('pick.csv')).read_text() - lines = [ x.strip() for x in out.splitlines() ] - lines = [ x.split(',') for x in lines ] - assert lines[0][0] == 'ident' - assert lines[1][0] == 'GCF_000017325.1' - assert lines[2][0] == 'GCF_000021665.1' + assert "found 2 matches; saved identifiers to picklist" in err + + out = Path(runtmp.output("pick.csv")).read_text() + lines = [x.strip() for x in out.splitlines()] + lines = [x.split(",") for x in lines] + assert lines[0][0] == "ident" + assert lines[1][0] == "GCF_000017325.1" + assert lines[2][0] == "GCF_000021665.1" assert len(lines) == 3 def test_tax_grep_search_shew_out_use_picklist(runtmp): # test 'tax grep Shew', output to a picklist, use picklist - taxfile = utils.get_test_data('tax/test.taxonomy.csv') - dbfile = utils.get_test_data('tax/gtdb-tax-grep.sigs.zip') + taxfile = utils.get_test_data("tax/test.taxonomy.csv") + dbfile = utils.get_test_data("tax/gtdb-tax-grep.sigs.zip") - runtmp.sourmash('tax', 'grep', 'Shew', '-t', taxfile, '-o', 'pick.csv') + runtmp.sourmash("tax", "grep", "Shew", "-t", taxfile, "-o", "pick.csv") - runtmp.sourmash('sig', 'cat', dbfile, '--picklist', - 'pick.csv:ident:ident', '-o', 'pick-out.zip') + runtmp.sourmash( + "sig", "cat", dbfile, "--picklist", "pick.csv:ident:ident", "-o", "pick-out.zip" + ) all_sigs = sourmash.load_file_as_index(dbfile) assert len(all_sigs) == 3 - pick_sigs = sourmash.load_file_as_index(runtmp.output('pick-out.zip')) + pick_sigs = sourmash.load_file_as_index(runtmp.output("pick-out.zip")) assert len(pick_sigs) == 2 - names = [ ss.name.split()[0] for ss in pick_sigs.signatures() ] + names = [ss.name.split()[0] for ss in pick_sigs.signatures()] assert len(names) == 2 - assert 'GCF_000017325.1' in names - assert 'GCF_000021665.1' in names + assert "GCF_000017325.1" in names + assert "GCF_000021665.1" in names def test_tax_grep_search_shew_invert(runtmp): # test 'tax grep -v Shew' - taxfile = utils.get_test_data('tax/test.taxonomy.csv') + taxfile = utils.get_test_data("tax/test.taxonomy.csv") - runtmp.sourmash('tax', 'grep', '-v', 'Shew', '-t', taxfile) + runtmp.sourmash("tax", "grep", "-v", "Shew", "-t", taxfile) out = runtmp.last_result.out err = runtmp.last_result.err - assert "-v/--invert-match specified; returning only lineages that do not match." in err + assert ( + "-v/--invert-match specified; returning only lineages that do not match." 
in err + ) - lines = [ x.strip() for x in out.splitlines() ] - lines = [ x.split(',') for x in lines ] - assert lines[0][0] == 'ident' - assert lines[1][0] == 'GCF_001881345.1' - assert lines[2][0] == 'GCF_003471795.1' + lines = [x.strip() for x in out.splitlines()] + lines = [x.split(",") for x in lines] + assert lines[0][0] == "ident" + assert lines[1][0] == "GCF_001881345.1" + assert lines[2][0] == "GCF_003471795.1" assert len(lines) == 5 assert "searching 1 taxonomy files for 'Shew'" in err - assert 'found 4 matches; saved identifiers to picklist' in err + assert "found 4 matches; saved identifiers to picklist" in err - all_names = set([ x[0] for x in lines ]) - assert 'GCF_000017325.1' not in all_names - assert 'GCF_000021665.1' not in all_names + all_names = set([x[0] for x in lines]) + assert "GCF_000017325.1" not in all_names + assert "GCF_000021665.1" not in all_names def test_tax_grep_search_shew_invert_select_phylum(runtmp): # test 'tax grep -v Shew -r phylum' - taxfile = utils.get_test_data('tax/test.taxonomy.csv') + taxfile = utils.get_test_data("tax/test.taxonomy.csv") - runtmp.sourmash('tax', 'grep', '-v', 'Shew', '-t', taxfile, '-r', 'phylum') + runtmp.sourmash("tax", "grep", "-v", "Shew", "-t", taxfile, "-r", "phylum") out = runtmp.last_result.out err = runtmp.last_result.err - assert "-v/--invert-match specified; returning only lineages that do not match." in err + assert ( + "-v/--invert-match specified; returning only lineages that do not match." in err + ) assert "limiting matches to phylum" - lines = [ x.strip() for x in out.splitlines() ] - lines = [ x.split(',') for x in lines ] - assert lines[0][0] == 'ident' + lines = [x.strip() for x in out.splitlines()] + lines = [x.split(",") for x in lines] + assert lines[0][0] == "ident" assert len(lines) == 7 assert "searching 1 taxonomy files for 'Shew'" in err - assert 'found 6 matches; saved identifiers to picklist' in err + assert "found 6 matches; saved identifiers to picklist" in err - all_names = set([ x[0] for x in lines ]) - assert 'GCF_000017325.1' in all_names - assert 'GCF_000021665.1' in all_names + all_names = set([x[0] for x in lines]) + assert "GCF_000017325.1" in all_names + assert "GCF_000021665.1" in all_names def test_tax_grep_search_shew_invert_select_bad_rank(runtmp): # test 'tax grep -v Shew -r badrank' - should fail - taxfile = utils.get_test_data('tax/test.taxonomy.csv') + taxfile = utils.get_test_data("tax/test.taxonomy.csv") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('tax', 'grep', '-v', 'Shew', '-t', taxfile, - '-r', 'badrank') + runtmp.sourmash("tax", "grep", "-v", "Shew", "-t", taxfile, "-r", "badrank") - out = runtmp.last_result.out err = runtmp.last_result.err print(err) - assert 'error: argument -r/--rank: invalid choice:' in err + assert "error: argument -r/--rank: invalid choice:" in err def test_tax_grep_search_shew_count(runtmp): # test 'tax grep Shew --count' - taxfile = utils.get_test_data('tax/test.taxonomy.csv') + taxfile = utils.get_test_data("tax/test.taxonomy.csv") - runtmp.sourmash('tax', 'grep', 'Shew', '-t', taxfile, '-c') + runtmp.sourmash("tax", "grep", "Shew", "-t", taxfile, "-c") out = runtmp.last_result.out err = runtmp.last_result.err @@ -3327,19 +5176,17 @@ def test_tax_grep_search_shew_count(runtmp): assert not out.strip() assert "searching 1 taxonomy files for 'Shew'" in err - assert not 'found 2 matches; saved identifiers to picklist' in err + assert "found 2 matches; saved identifiers to picklist" not in err def test_tax_grep_multiple_csv(runtmp): # 
grep on multiple CSVs - tax1 = utils.get_test_data('tax/test.taxonomy.csv') - tax2 = utils.get_test_data('tax/protozoa_genbank_lineage.csv') + tax1 = utils.get_test_data("tax/test.taxonomy.csv") + tax2 = utils.get_test_data("tax/protozoa_genbank_lineage.csv") - taxout = runtmp.output('out.csv') + taxout = runtmp.output("out.csv") - runtmp.sourmash('tax', 'grep', "Toxo|Gamma", - '-t', tax1, tax2, - '-o', taxout) + runtmp.sourmash("tax", "grep", "Toxo|Gamma", "-t", tax1, tax2, "-o", taxout) out = runtmp.last_result.out err = runtmp.last_result.err @@ -3350,27 +5197,37 @@ def test_tax_grep_multiple_csv(runtmp): lines = Path(taxout).read_text().splitlines() assert len(lines) == 5 - names = set([ x.split(',')[0] for x in lines ]) - assert 'GCA_000256725' in names - assert 'GCF_000017325.1' in names - assert 'GCF_000021665.1' in names - assert 'GCF_001881345.1' in names + names = set([x.split(",")[0] for x in lines]) + assert "GCA_000256725" in names + assert "GCF_000017325.1" in names + assert "GCF_000021665.1" in names + assert "GCF_001881345.1" in names def test_tax_grep_multiple_csv_empty_force(runtmp): # grep on multiple CSVs, one empty, with --force - tax1 = utils.get_test_data('tax/test.taxonomy.csv') - tax2 = utils.get_test_data('tax/protozoa_genbank_lineage.csv') - tax_empty = runtmp.output('t.csv') + tax1 = utils.get_test_data("tax/test.taxonomy.csv") + tax2 = utils.get_test_data("tax/protozoa_genbank_lineage.csv") + tax_empty = runtmp.output("t.csv") - taxout = runtmp.output('out.csv') + taxout = runtmp.output("out.csv") with open(tax_empty, "w") as fp: fp.write("") print("t_csv: ", tax_empty) - runtmp.sourmash('tax', 'grep', "Toxo|Gamma", - '-t', tax1, tax2, '-t', tax_empty, - '-o', taxout, '--force') + runtmp.sourmash( + "tax", + "grep", + "Toxo|Gamma", + "-t", + tax1, + tax2, + "-t", + tax_empty, + "-o", + taxout, + "--force", + ) out = runtmp.last_result.out err = runtmp.last_result.err @@ -3381,22 +5238,20 @@ def test_tax_grep_multiple_csv_empty_force(runtmp): lines = Path(taxout).read_text().splitlines() assert len(lines) == 5 - names = set([ x.split(',')[0] for x in lines ]) - assert 'GCA_000256725' in names - assert 'GCF_000017325.1' in names - assert 'GCF_000021665.1' in names - assert 'GCF_001881345.1' in names + names = set([x.split(",")[0] for x in lines]) + assert "GCA_000256725" in names + assert "GCF_000017325.1" in names + assert "GCF_000021665.1" in names + assert "GCF_001881345.1" in names def test_tax_grep_duplicate_csv(runtmp): # grep on duplicates => should collapse to uniques on identifiers - tax1 = utils.get_test_data('tax/test.taxonomy.csv') + tax1 = utils.get_test_data("tax/test.taxonomy.csv") - taxout = runtmp.output('out.csv') + taxout = runtmp.output("out.csv") - runtmp.sourmash('tax', 'grep', "Gamma", - '-t', tax1, tax1, - '-o', taxout) + runtmp.sourmash("tax", "grep", "Gamma", "-t", tax1, tax1, "-o", taxout) out = runtmp.last_result.out err = runtmp.last_result.err @@ -3407,20 +5262,19 @@ def test_tax_grep_duplicate_csv(runtmp): lines = Path(taxout).read_text().splitlines() assert len(lines) == 4 - names = set([ x.split(',')[0] for x in lines ]) - assert 'GCF_000017325.1' in names - assert 'GCF_000021665.1' in names - assert 'GCF_001881345.1' in names + names = set([x.split(",")[0] for x in lines]) + assert "GCF_000017325.1" in names + assert "GCF_000021665.1" in names + assert "GCF_001881345.1" in names def test_tax_summarize(runtmp): # test basic operation with summarize - taxfile = utils.get_test_data('tax/test.taxonomy.csv') + taxfile = 
utils.get_test_data("tax/test.taxonomy.csv") - runtmp.sourmash('tax', 'summarize', taxfile) + runtmp.sourmash("tax", "summarize", taxfile) out = runtmp.last_result.out - err = runtmp.last_result.err assert "number of distinct taxonomic lineages: 6" in out assert "rank superkingdom: 1 distinct taxonomic lineages" in out @@ -3434,13 +5288,12 @@ def test_tax_summarize(runtmp): def test_tax_summarize_multiple(runtmp): # test basic operation with summarize on multiple files - tax1 = utils.get_test_data('tax/bacteria_refseq_lineage.csv') - tax2 = utils.get_test_data('tax/protozoa_genbank_lineage.csv') + tax1 = utils.get_test_data("tax/bacteria_refseq_lineage.csv") + tax2 = utils.get_test_data("tax/protozoa_genbank_lineage.csv") - runtmp.sourmash('tax', 'summarize', tax1, tax2) + runtmp.sourmash("tax", "summarize", tax1, tax2) out = runtmp.last_result.out - err = runtmp.last_result.err assert "number of distinct taxonomic lineages: 6" in out assert "rank superkingdom: 2 distinct taxonomic lineages" in out @@ -3454,12 +5307,11 @@ def test_tax_summarize_multiple(runtmp): def test_tax_summarize_empty_line(runtmp): # test basic operation with summarize on a file w/empty line - taxfile = utils.get_test_data('tax/test-empty-line.taxonomy.csv') + taxfile = utils.get_test_data("tax/test-empty-line.taxonomy.csv") - runtmp.sourmash('tax', 'summarize', taxfile) + runtmp.sourmash("tax", "summarize", taxfile) out = runtmp.last_result.out - err = runtmp.last_result.err assert "number of distinct taxonomic lineages: 6" in out assert "rank superkingdom: 1 distinct taxonomic lineages" in out @@ -3473,21 +5325,20 @@ def test_tax_summarize_empty_line(runtmp): def test_tax_summarize_empty(runtmp): # test failure on empty file - taxfile = runtmp.output('no-exist') + taxfile = runtmp.output("no-exist") with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('tax', 'summarize', taxfile) + runtmp.sourmash("tax", "summarize", taxfile) - out = runtmp.last_result.out err = runtmp.last_result.err assert "ERROR while loading taxonomies" in err def test_tax_summarize_csv(runtmp): # test basic operation w/csv output - taxfile = utils.get_test_data('tax/test.taxonomy.csv') + taxfile = utils.get_test_data("tax/test.taxonomy.csv") - runtmp.sourmash('tax', 'summarize', taxfile, '-o', 'ranks.csv') + runtmp.sourmash("tax", "summarize", taxfile, "-o", "ranks.csv") out = runtmp.last_result.out err = runtmp.last_result.err @@ -3495,28 +5346,30 @@ def test_tax_summarize_csv(runtmp): assert "number of distinct taxonomic lineages: 6" in out assert "saved 18 lineage counts to 'ranks.csv'" in err - csv_out = runtmp.output('ranks.csv') + csv_out = runtmp.output("ranks.csv") with sourmash_args.FileInputCSV(csv_out) as r: # count number across ranks as a cheap consistency check c = Counter() for row in r: - val = row['lineage_count'] + val = row["lineage_count"] c[val] += 1 - assert c['3'] == 7 - assert c['2'] == 5 - assert c['1'] == 5 + assert c["3"] == 7 + assert c["2"] == 5 + assert c["1"] == 5 def test_tax_summarize_on_annotate(runtmp): # test summarize on output of annotate basics - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.taxonomy.csv") csvout = runtmp.output("test1.gather.with-lineages.csv") out_dir = os.path.dirname(csvout) - runtmp.run_sourmash('tax', 'annotate', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', out_dir) + runtmp.run_sourmash( + "tax", "annotate", 
"--gather-csv", g_csv, "--taxonomy-csv", tax, "-o", out_dir + ) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -3527,7 +5380,7 @@ def test_tax_summarize_on_annotate(runtmp): # so far so good - now see if we can run summarize! - runtmp.run_sourmash('tax', 'summarize', csvout) + runtmp.run_sourmash("tax", "summarize", csvout) out = runtmp.last_result.out err = runtmp.last_result.err @@ -3546,9 +5399,9 @@ def test_tax_summarize_on_annotate(runtmp): def test_tax_summarize_strain_csv(runtmp): # test basic operation w/csv output on taxonomy with strains - taxfile = utils.get_test_data('tax/test-strain.taxonomy.csv') + taxfile = utils.get_test_data("tax/test-strain.taxonomy.csv") - runtmp.sourmash('tax', 'summarize', taxfile, '-o', 'ranks.csv') + runtmp.sourmash("tax", "summarize", taxfile, "-o", "ranks.csv") out = runtmp.last_result.out err = runtmp.last_result.err @@ -3556,38 +5409,38 @@ def test_tax_summarize_strain_csv(runtmp): assert "number of distinct taxonomic lineages: 6" in out assert "saved 24 lineage counts to 'ranks.csv'" in err - csv_out = runtmp.output('ranks.csv') + csv_out = runtmp.output("ranks.csv") with sourmash_args.FileInputCSV(csv_out) as r: # count number across ranks as a cheap consistency check c = Counter() for row in r: print(row) - val = row['lineage_count'] + val = row["lineage_count"] c[val] += 1 print(list(c.most_common())) - assert c['3'] == 7 - assert c['2'] == 5 - assert c['6'] == 1 - assert c['1'] == 11 + assert c["3"] == 7 + assert c["2"] == 5 + assert c["6"] == 1 + assert c["1"] == 11 def test_tax_summarize_strain_csv_with_lineages(runtmp): # test basic operation w/csv output on lineages-style file w/strain csv - taxfile = utils.get_test_data('tax/test-strain.taxonomy.csv') - lineage_csv = runtmp.output('lin-with-strains.csv') + taxfile = utils.get_test_data("tax/test-strain.taxonomy.csv") + lineage_csv = runtmp.output("lin-with-strains.csv") taxdb = tax_utils.LineageDB.load(taxfile) - with open(lineage_csv, 'w', newline="") as fp: + with open(lineage_csv, "w", newline="") as fp: w = csv.writer(fp) - w.writerow(['name', 'lineage']) + w.writerow(["name", "lineage"]) for k, v in taxdb.items(): linstr = lca_utils.display_lineage(v) w.writerow([k, linstr]) - runtmp.sourmash('tax', 'summarize', lineage_csv, '-o', 'ranks.csv') + runtmp.sourmash("tax", "summarize", lineage_csv, "-o", "ranks.csv") out = runtmp.last_result.out err = runtmp.last_result.err @@ -3595,40 +5448,40 @@ def test_tax_summarize_strain_csv_with_lineages(runtmp): assert "number of distinct taxonomic lineages: 6" in out assert "saved 24 lineage counts to" in err - csv_out = runtmp.output('ranks.csv') + csv_out = runtmp.output("ranks.csv") with sourmash_args.FileInputCSV(csv_out) as r: # count number across ranks as a cheap consistency check c = Counter() for row in r: print(row) - val = row['lineage_count'] + val = row["lineage_count"] c[val] += 1 print(list(c.most_common())) - assert c['3'] == 7 - assert c['2'] == 5 - assert c['6'] == 1 - assert c['1'] == 11 + assert c["3"] == 7 + assert c["2"] == 5 + assert c["6"] == 1 + assert c["1"] == 11 def test_tax_summarize_LINS(runtmp): # test basic operation w/LINs - taxfile = utils.get_test_data('tax/test.LIN-taxonomy.csv') - lineage_csv = runtmp.output('annotated-lin.csv') + taxfile = utils.get_test_data("tax/test.LIN-taxonomy.csv") + lineage_csv = runtmp.output("annotated-lin.csv") taxdb = tax_utils.LineageDB.load(taxfile, lins=True) - with open(lineage_csv, 'w', newline="") as fp: + with open(lineage_csv, "w", newline="") as fp: w 
= csv.writer(fp) - w.writerow(['name', 'lineage']) + w.writerow(["name", "lineage"]) for k, v in taxdb.items(): lin = tax_utils.LINLineageInfo(lineage=v) linstr = lin.display_lineage(truncate_empty=False) print(linstr) w.writerow([k, linstr]) - runtmp.sourmash('tax', 'summarize', lineage_csv, '-o', 'ranks.csv', '--lins') + runtmp.sourmash("tax", "summarize", lineage_csv, "-o", "ranks.csv", "--lins") out = runtmp.last_result.out err = runtmp.last_result.err @@ -3639,137 +5492,209 @@ def test_tax_summarize_LINS(runtmp): assert "number of distinct taxonomic lineages: 6" in out assert "saved 91 lineage counts to" in err - csv_out = runtmp.output('ranks.csv') + csv_out = runtmp.output("ranks.csv") with sourmash_args.FileInputCSV(csv_out) as r: - # count number across ranks as a cheap consistency check + # count number across ranks as a cheap consistency check c = Counter() for row in r: print(row) - val = row['lineage_count'] + val = row["lineage_count"] c[val] += 1 print(list(c.most_common())) - assert c['1'] == 77 - assert c['2'] == 1 - assert c['3'] == 11 - assert c['4'] == 2 + assert c["1"] == 77 + assert c["2"] == 1 + assert c["3"] == 11 + assert c["4"] == 2 def test_metagenome_LIN(runtmp): # test basic metagenome with LIN taxonomy c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.LIN-taxonomy.csv") - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '--lins') + c.run_sourmash("tax", "metagenome", "-g", g_csv, "--taxonomy-csv", tax, "--lins") print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out + assert ( + "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) # 0th rank/position assert "test1,0,0.089,1,md5,test1.sig,0.057,444000,0.925,0" in c.last_result.out assert "test1,0,0.088,0,md5,test1.sig,0.058,442000,0.925,0" in c.last_result.out assert "test1,0,0.028,2,md5,test1.sig,0.016,138000,0.891,0" in c.last_result.out - assert "test1,0,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" in c.last_result.out + assert ( + "test1,0,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" in c.last_result.out + ) # 1st rank/position assert "test1,1,0.089,1;0,md5,test1.sig,0.057,444000,0.925,0" in c.last_result.out assert "test1,1,0.088,0;0,md5,test1.sig,0.058,442000,0.925,0" in c.last_result.out assert "test1,1,0.028,2;0,md5,test1.sig,0.016,138000,0.891,0" in c.last_result.out - assert "test1,1,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" in c.last_result.out + assert ( + "test1,1,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" in c.last_result.out + ) # 2nd rank/position assert "test1,2,0.088,0;0;0,md5,test1.sig,0.058,442000,0.925,0" in c.last_result.out assert "test1,2,0.078,1;0;0,md5,test1.sig,0.050,390000,0.921,0" in c.last_result.out assert "test1,2,0.028,2;0;0,md5,test1.sig,0.016,138000,0.891,0" in c.last_result.out assert "test1,2,0.011,1;0;1,md5,test1.sig,0.007,54000,0.864,0" in c.last_result.out - assert "test1,2,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" in c.last_result.out + assert ( + "test1,2,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" in c.last_result.out + ) # 19th rank/position - assert 
"test1,19,0.088,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.058,442000,0.925,0" in c.last_result.out - assert "test1,19,0.078,1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.050,390000,0.921,0" in c.last_result.out - assert "test1,19,0.028,2;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.016,138000,0.891,0" in c.last_result.out - assert "test1,19,0.011,1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.007,54000,0.864,0" in c.last_result.out - assert "test1,19,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" in c.last_result.out + assert ( + "test1,19,0.088,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.058,442000,0.925,0" + in c.last_result.out + ) + assert ( + "test1,19,0.078,1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.050,390000,0.921,0" + in c.last_result.out + ) + assert ( + "test1,19,0.028,2;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.016,138000,0.891,0" + in c.last_result.out + ) + assert ( + "test1,19,0.011,1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.007,54000,0.864,0" + in c.last_result.out + ) + assert ( + "test1,19,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" + in c.last_result.out + ) def test_metagenome_LIN_lingroups(runtmp): # test lingroups output c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.v450.csv') - tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') + g_csv = utils.get_test_data("tax/test1.gather.v450.csv") + tax = utils.get_test_data("tax/test.LIN-taxonomy.csv") lg_file = runtmp.output("test.lg.csv") - with open(lg_file, 'w') as out: - out.write('lin,name\n') - out.write('0;0;0,lg1\n') - out.write('1;0;0,lg2\n') - out.write('2;0;0,lg3\n') - out.write('1;0;1,lg3\n') + with open(lg_file, "w") as out: + out.write("lin,name\n") + out.write("0;0;0,lg1\n") + out.write("1;0;0,lg2\n") + out.write("2;0;0,lg3\n") + out.write("1;0;1,lg3\n") # write a 19 so we can check the end - out.write('1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,lg4\n') - - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, - '--lins', '--lingroup', lg_file) + out.write("1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,lg4\n") + + c.run_sourmash( + "tax", + "metagenome", + "-g", + g_csv, + "--taxonomy-csv", + tax, + "--lins", + "--lingroup", + lg_file, + ) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err - assert "Read 5 lingroup rows and found 5 distinct lingroup prefixes." in c.last_result.err + assert ( + "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" + in c.last_result.err + ) + assert ( + "Read 5 lingroup rows and found 5 distinct lingroup prefixes." 
+        in c.last_result.err
+    )
     assert "name lin percent_containment num_bp_contained" in c.last_result.out
     assert "lg1 0;0;0 5.82 714000" in c.last_result.out
     assert "lg2 1;0;0 5.05 620000" in c.last_result.out
     assert "lg3 2;0;0 1.56 192000" in c.last_result.out
     assert "lg3 1;0;1 0.65 80000" in c.last_result.out
-    assert "lg4 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 0.65 80000" in c.last_result.out
+    assert (
+        "lg4 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 0.65 80000"
+        in c.last_result.out
+    )
 
 
 def test_metagenome_LIN_human_summary_no_lin_position(runtmp):
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.v450.csv')
-    tax = utils.get_test_data('tax/test.LIN-taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.v450.csv")
+    tax = utils.get_test_data("tax/test.LIN-taxonomy.csv")
 
-    c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax,
-                   '--lins', '-F', "human")
+    c.run_sourmash(
+        "tax", "metagenome", "-g", g_csv, "--taxonomy-csv", tax, "--lins", "-F", "human"
+    )
 
     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)
 
     assert c.last_result.status == 0
-    assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err
+    assert (
+        "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0"
+        in c.last_result.err
+    )
     assert "sample name proportion cANI lineage" in c.last_result.out
     assert "----------- ---------- ---- -------" in c.last_result.out
     assert "test1 86.9% - unclassified" in c.last_result.out
-    assert "test1 5.8% 92.5% 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0" in c.last_result.out
-    assert "test1 5.0% 92.1% 1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0" in c.last_result.out
-    assert "test1 1.6% 89.1% 2;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0" in c.last_result.out
-    assert "test1 0.7% 86.4% 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0" in c.last_result.out
+    assert (
+        "test1 5.8% 92.5% 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0"
+        in c.last_result.out
+    )
+    assert (
+        "test1 5.0% 92.1% 1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0"
+        in c.last_result.out
+    )
+    assert (
+        "test1 1.6% 89.1% 2;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0"
+        in c.last_result.out
+    )
+    assert (
+        "test1 0.7% 86.4% 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0"
+        in c.last_result.out
+    )
 
 
 def test_metagenome_LIN_human_summary_lin_position_5(runtmp):
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.v450.csv')
-    tax = utils.get_test_data('tax/test.LIN-taxonomy.csv')
-
-    c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax,
-                   '--lins', '-F', "human", '--lin-position', '5')
+    g_csv = utils.get_test_data("tax/test1.gather.v450.csv")
+    tax = utils.get_test_data("tax/test.LIN-taxonomy.csv")
+
+    c.run_sourmash(
+        "tax",
+        "metagenome",
+        "-g",
+        g_csv,
+        "--taxonomy-csv",
+        tax,
+        "--lins",
+        "-F",
+        "human",
+        "--lin-position",
+        "5",
+    )
 
     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)
 
     assert c.last_result.status == 0
-    assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err
+    assert (
+        "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0"
+        in c.last_result.err
+    )
     assert "sample name proportion cANI lineage" in c.last_result.out
     assert "----------- ---------- ---- -------" in c.last_result.out
     assert "test1 86.9% - unclassified" in c.last_result.out
@@ -3782,155 +5707,274 @@ def test_metagenome_LIN_human_summary_lin_position_5(runtmp):
 
 def test_metagenome_LIN_krona_lin_position_5(runtmp):
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.v450.csv')
-    tax = utils.get_test_data('tax/test.LIN-taxonomy.csv')
-
-    c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax,
-                   '--lins', '-F', "krona", '--lin-position', '5')
+    g_csv = utils.get_test_data("tax/test1.gather.v450.csv")
+    tax = utils.get_test_data("tax/test.LIN-taxonomy.csv")
+
+    c.run_sourmash(
+        "tax",
+        "metagenome",
+        "-g",
+        g_csv,
+        "--taxonomy-csv",
+        tax,
+        "--lins",
+        "-F",
+        "krona",
+        "--lin-position",
+        "5",
+    )
 
     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)
 
     assert c.last_result.status == 0
-    assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err
+    assert (
+        "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0"
+        in c.last_result.err
+    )
     assert "fraction 0 1 2 3 4 5" in c.last_result.out
     assert "0.08815317112086159 0 0 0 0 0 0" in c.last_result.out
     assert "0.07778220981252493 1 0 0 0 0 0" in c.last_result.out
     assert "0.027522935779816515 2 0 0 0 0 0" in c.last_result.out
     assert "0.010769844435580374 1 0 1 0 0 0" in c.last_result.out
-    assert "0.7957718388512166 unclassified unclassified unclassified unclassified unclassified unclassified" in c.last_result.out
+    assert (
+        "0.7957718388512166 unclassified unclassified unclassified unclassified unclassified unclassified"
+        in c.last_result.out
+    )
 
 
 def test_metagenome_LIN_krona_bad_rank(runtmp):
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.v450.csv')
-    tax = utils.get_test_data('tax/test.LIN-taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.v450.csv")
+    tax = utils.get_test_data("tax/test.LIN-taxonomy.csv")
 
     with pytest.raises(SourmashCommandFailed):
-        c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax,
-                       '--lins', '-F', "krona", '--lin-position', 'strain')
+        c.run_sourmash(
+            "tax",
+            "metagenome",
+            "-g",
+            g_csv,
+            "--taxonomy-csv",
+            tax,
+            "--lins",
+            "-F",
+            "krona",
+            "--lin-position",
+            "strain",
+        )
 
     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)
 
     assert c.last_result.status != 0
-    assert "Invalid '--rank'/'--position' input: 'strain'. '--lins' is specified. Rank must be an integer corresponding to a LIN position." in c.last_result.err
-
+    assert (
+        "Invalid '--rank'/'--position' input: 'strain'. '--lins' is specified. Rank must be an integer corresponding to a LIN position."
+        in c.last_result.err
+    )
 
 
 def test_metagenome_LIN_lingroups_empty_lg_file(runtmp):
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.v450.csv')
-    tax = utils.get_test_data('tax/test.LIN-taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.v450.csv")
+    tax = utils.get_test_data("tax/test.LIN-taxonomy.csv")
 
     lg_file = runtmp.output("test.lg.csv")
-    with open(lg_file, 'w') as out:
+    with open(lg_file, "w") as out:
         out.write("")
 
     with pytest.raises(SourmashCommandFailed):
-        c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax,
-                       '--lins', '--lingroup', lg_file)
+        c.run_sourmash(
+            "tax",
+            "metagenome",
+            "-g",
+            g_csv,
+            "--taxonomy-csv",
+            tax,
+            "--lins",
+            "--lingroup",
+            lg_file,
+        )
 
     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)
 
     assert c.last_result.status != 0
-    assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err
-    assert f"Cannot read lingroups from '{lg_file}'. Is file empty?" in c.last_result.err
+    assert (
+        "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0"
+        in c.last_result.err
+    )
+    assert (
+        f"Cannot read lingroups from '{lg_file}'. Is file empty?" in c.last_result.err
+    )
 
 
 def test_metagenome_LIN_lingroups_bad_cli_inputs(runtmp):
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.v450.csv')
-    tax = utils.get_test_data('tax/test.LIN-taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.v450.csv")
+    tax = utils.get_test_data("tax/test.LIN-taxonomy.csv")
 
     lg_file = runtmp.output("test.lg.csv")
-    with open(lg_file, 'w') as out:
+    with open(lg_file, "w") as out:
         out.write("")
 
     with pytest.raises(SourmashCommandFailed):
-        c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax,
-                       '--lins', '-F', "lingroup")
+        c.run_sourmash(
+            "tax",
+            "metagenome",
+            "-g",
+            g_csv,
+            "--taxonomy-csv",
+            tax,
+            "--lins",
+            "-F",
+            "lingroup",
+        )
 
     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)
 
     assert c.last_result.status != 0
-    assert "Must provide lingroup csv via '--lingroup' in order to output a lingroup report." in c.last_result.err
+    assert (
+        "Must provide lingroup csv via '--lingroup' in order to output a lingroup report."
+        in c.last_result.err
+    )
 
     with pytest.raises(SourmashCommandFailed):
-        c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '-F', "lingroup")
+        c.run_sourmash(
+            "tax", "metagenome", "-g", g_csv, "--taxonomy-csv", tax, "-F", "lingroup"
+        )
 
     print(c.last_result.err)
     assert c.last_result.status != 0
-    assert "Must enable LIN taxonomy via '--lins' in order to use lingroups." in c.last_result.err
+    assert (
+        "Must enable LIN taxonomy via '--lins' in order to use lingroups."
+        in c.last_result.err
+    )
 
     with pytest.raises(SourmashCommandFailed):
-        c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '--lingroup', lg_file)
+        c.run_sourmash(
+            "tax",
+            "metagenome",
+            "-g",
+            g_csv,
+            "--taxonomy-csv",
+            tax,
+            "--lingroup",
+            lg_file,
+        )
 
     print(c.last_result.err)
     assert c.last_result.status != 0
 
     with pytest.raises(SourmashCommandFailed):
-        c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '-F', 'bioboxes')
+        c.run_sourmash(
+            "tax",
+            "metagenome",
+            "-g",
+            g_csv,
+            "--taxonomy-csv",
+            tax,
+            "--lins",
+            "-F",
+            "bioboxes",
+        )
 
     print(c.last_result.err)
     assert c.last_result.status != 0
-    assert "ERROR: The following outputs are incompatible with '--lins': : bioboxes, kreport" in c.last_result.err
+    assert (
+        "ERROR: The following outputs are incompatible with '--lins': : bioboxes, kreport"
+        in c.last_result.err
+    )
 
 
 def test_metagenome_mult_outputs_stdout_fail(runtmp):
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.v450.csv')
-    tax = utils.get_test_data('tax/test.LIN-taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.v450.csv")
+    tax = utils.get_test_data("tax/test.LIN-taxonomy.csv")
 
     with pytest.raises(SourmashCommandFailed):
-        c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax,
-                       '-F', "kreport", 'csv_summary')
+        c.run_sourmash(
+            "tax",
+            "metagenome",
+            "-g",
+            g_csv,
+            "--taxonomy-csv",
+            tax,
+            "-F",
+            "kreport",
+            "csv_summary",
+        )
 
     print(c.last_result.err)
     assert c.last_result.status != 0
-    assert f"Writing to stdout is incompatible with multiple output formats ['kreport', 'csv_summary']" in c.last_result.err
+    assert (
+        "Writing to stdout is incompatible with multiple output formats ['kreport', 'csv_summary']"
+        in c.last_result.err
+    )
 
 
 def test_genome_mult_outputs_stdout_fail(runtmp):
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.v450.csv')
-    tax = utils.get_test_data('tax/test.LIN-taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.v450.csv")
+    tax = utils.get_test_data("tax/test.LIN-taxonomy.csv")
 
     with pytest.raises(SourmashCommandFailed):
-        c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax,
-                       '-F', "lineage_csv", 'csv_summary')
+        c.run_sourmash(
+            "tax",
+            "genome",
+            "-g",
+            g_csv,
+            "--taxonomy-csv",
+            tax,
+            "-F",
+            "lineage_csv",
+            "csv_summary",
+        )
 
     print(c.last_result.err)
     assert c.last_result.status != 0
-    assert f"Writing to stdout is incompatible with multiple output formats ['lineage_csv', 'csv_summary']" in c.last_result.err
+    assert (
+        "Writing to stdout is incompatible with multiple output formats ['lineage_csv', 'csv_summary']"
+        in c.last_result.err
+    )
 
 
 def test_metagenome_LIN_lingroups_lg_only_header(runtmp):
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.v450.csv')
-    tax = utils.get_test_data('tax/test.LIN-taxonomy.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.v450.csv")
+    tax = utils.get_test_data("tax/test.LIN-taxonomy.csv")
 
     lg_file = runtmp.output("test.lg.csv")
-    with open(lg_file, 'w') as out:
-        out.write('lin,name\n')
+    with open(lg_file, "w") as out:
+        out.write("lin,name\n")
 
     with pytest.raises(SourmashCommandFailed):
-        c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax,
-                       '--lins', '--lingroup', lg_file)
+        c.run_sourmash(
+            "tax",
+            "metagenome",
+            "-g",
+            g_csv,
+            "--taxonomy-csv",
+            tax,
+            "--lins",
+            "--lingroup",
+            lg_file,
+        )
 
     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)
 
     assert c.last_result.status != 0
-    assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err
+    assert (
+        "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0"
+        in c.last_result.err
+    )
     assert f"No lingroups loaded from {lg_file}" in c.last_result.err
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 00344ec0d0..a362984532 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -11,20 +11,40 @@
 import sourmash_tst_utils as utils
 
-from sourmash.tax.tax_utils import (ascending_taxlist, get_ident, load_gather_results,
-                                    collect_gather_csvs, check_and_load_gather_csvs,
-                                    LineagePair, QueryInfo, GatherRow, TaxResult, QueryTaxResult,
-                                    SummarizedGatherResult, ClassificationResult, AnnotateTaxResult,
-                                    BaseLineageInfo, RankLineageInfo, LINLineageInfo,
-                                    aggregate_by_lineage_at_rank, format_for_krona,
-                                    write_krona, write_lineage_sample_frac, read_lingroups,
-                                    LineageTree, LineageDB, LineageDB_Sqlite, MultiLineageDB)
+from sourmash.tax.tax_utils import (
+    ascending_taxlist,
+    get_ident,
+    load_gather_results,
+    collect_gather_csvs,
+    check_and_load_gather_csvs,
+    LineagePair,
+    QueryInfo,
+    GatherRow,
+    TaxResult,
+    QueryTaxResult,
+    SummarizedGatherResult,
+    ClassificationResult,
+    AnnotateTaxResult,
+    BaseLineageInfo,
+    RankLineageInfo,
+    LINLineageInfo,
+    aggregate_by_lineage_at_rank,
+    format_for_krona,
+    write_krona,
+    write_lineage_sample_frac,
+    read_lingroups,
+    LineageTree,
+    LineageDB,
+    LineageDB_Sqlite,
+    MultiLineageDB,
+)
+
 
 # utility functions for testing
 def make_mini_taxonomy(tax_info, LIN=False):
-    #pass in list of tuples: (name, lineage)
+    # pass in list of tuples: (name, lineage)
     taxD = {}
-    for (name, lin) in tax_info:
+    for name, lin in tax_info:
         if LIN:
             lineage = LINLineageInfo(lineage_str=lin)
         else:
@@ -32,15 +52,16 @@ def make_mini_taxonomy(tax_info, LIN=False):
         taxD[name] = lineage.filled_lineage
     return taxD
 
+
 def make_mini_taxonomy_with_taxids(tax_info, LIN=False):
     taxD = {}
-    for (name, lin, taxids) in tax_info:
+    for name, lin, taxids in tax_info:
         if LIN:
             lineage = LINLineageInfo(lineage_str=lin)
         else:
             ranks = RankLineageInfo.ranks
-            txs = taxids.split(';')
-            lns = lin.split(';')
+            txs = taxids.split(";")
+            lns = lin.split(";")
             lineage_tups = []
             for n, taxname in enumerate(lns):
                 rk = ranks[n]
@@ -51,20 +72,23 @@ def make_mini_taxonomy_with_taxids(tax_info, LIN=False):
         taxD[name] = lineage.filled_lineage
     return taxD
 
+
 def make_GatherRow(gather_dict=None, exclude_cols=[]):
     """Load artificial gather row (dict) into GatherRow class"""
     # default contains just the essential cols
-    gatherD = {'query_name': 'q1',
-               'query_md5': 'md5',
-               'query_filename': 'query_fn',
-               'name': 'gA',
-               'f_unique_weighted': 0.2,
-               'f_unique_to_query': 0.1,
-               'query_bp':100,
-               'unique_intersect_bp': 20,
-               'remaining_bp': 1,
-               'ksize': 31,
-               'scaled': 1}
+    gatherD = {
+        "query_name": "q1",
+        "query_md5": "md5",
+        "query_filename": "query_fn",
+        "name": "gA",
+        "f_unique_weighted": 0.2,
+        "f_unique_to_query": 0.1,
+        "query_bp": 100,
+        "unique_intersect_bp": 20,
+        "remaining_bp": 1,
+        "ksize": 31,
+        "scaled": 1,
+    }
     if gather_dict is not None:
         gatherD.update(gather_dict)
     for col in exclude_cols:
@@ -73,39 +97,73 @@ def make_GatherRow(gather_dict=None, exclude_cols=[]):
     return gatherRaw
 
 
-def make_TaxResult(gather_dict=None, taxD=None, keep_full_ident=False, keep_ident_version=False, skip_idents=None, LIN=False):
+def make_TaxResult(
+    gather_dict=None,
+    taxD=None,
+    keep_full_ident=False,
+    keep_ident_version=False,
+    skip_idents=None,
+    LIN=False,
+):
     """Make TaxResult from artificial gather row (dict)"""
     gRow = make_GatherRow(gather_dict)
-    taxres = TaxResult(raw=gRow, keep_full_identifiers=keep_full_ident,
-                       keep_identifier_versions=keep_ident_version, lins=LIN)
+    taxres = TaxResult(
+        raw=gRow,
+        keep_full_identifiers=keep_full_ident,
+        keep_identifier_versions=keep_ident_version,
+        lins=LIN,
+    )
     if taxD is not None:
         taxres.get_match_lineage(tax_assignments=taxD, skip_idents=skip_idents)
     return taxres
 
 
-def make_QueryTaxResults(gather_info, taxD=None, single_query=False, keep_full_ident=False, keep_ident_version=False,
-                         skip_idents=None, summarize=False, classify=False, classify_rank=None, c_thresh=0.1, ani_thresh=None,
-                         LIN=False):
+def make_QueryTaxResults(
+    gather_info,
+    taxD=None,
+    single_query=False,
+    keep_full_ident=False,
+    keep_ident_version=False,
+    skip_idents=None,
+    summarize=False,
+    classify=False,
+    classify_rank=None,
+    c_thresh=0.1,
+    ani_thresh=None,
+    LIN=False,
+):
     """Make QueryTaxResult(s) from artificial gather information, formatted as list of gather rows (dicts)"""
 
     gather_results = {}
     this_querytaxres = None
     for gather_infoD in gather_info:
-        taxres = make_TaxResult(gather_infoD, taxD=taxD, keep_full_ident=keep_full_ident,
-                                keep_ident_version=keep_ident_version, skip_idents=skip_idents, LIN=LIN)
+        taxres = make_TaxResult(
+            gather_infoD,
+            taxD=taxD,
+            keep_full_ident=keep_full_ident,
+            keep_ident_version=keep_ident_version,
+            skip_idents=skip_idents,
+            LIN=LIN,
+        )
         query_name = taxres.query_name
         # add to matching QueryTaxResult or create new one
         if not this_querytaxres or not this_querytaxres.is_compatible(taxres):
             # get existing or initialize new
-            this_querytaxres = gather_results.get(query_name, QueryTaxResult(taxres.query_info, lins=LIN))
+            this_querytaxres = gather_results.get(
+                query_name, QueryTaxResult(taxres.query_info, lins=LIN)
+            )
         this_querytaxres.add_taxresult(taxres)
-#        print('missed_ident?', taxres.missed_ident)
+        # print('missed_ident?', taxres.missed_ident)
         gather_results[query_name] = this_querytaxres
     if summarize:
         for query_name, qres in gather_results.items():
             qres.build_summarized_result()
     if classify:
         for query_name, qres in gather_results.items():
-            qres.build_classification_result(rank=classify_rank, containment_threshold=c_thresh, ani_threshold=ani_thresh)
+            qres.build_classification_result(
+                rank=classify_rank,
+                containment_threshold=c_thresh,
+                ani_threshold=ani_thresh,
+            )
     # for convenience: If working with single query, just return that QueryTaxResult.
     if single_query:
         if len(gather_results.keys()) > 1:
@@ -117,17 +175,43 @@ def make_QueryTaxResults(gather_info, taxD=None, single_query=False, keep_full_i
 ## tests
 
 
 def test_ascending_taxlist_1():
-    assert list(ascending_taxlist()) == ['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']
+    assert list(ascending_taxlist()) == [
+        "strain",
+        "species",
+        "genus",
+        "family",
+        "order",
+        "class",
+        "phylum",
+        "superkingdom",
+    ]
 
 
 def test_ascending_taxlist_2():
-    assert list(ascending_taxlist(include_strain=False)) == ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']
+    assert list(ascending_taxlist(include_strain=False)) == [
+        "species",
+        "genus",
+        "family",
+        "order",
+        "class",
+        "phylum",
+        "superkingdom",
+    ]
 
 
 def test_QueryInfo_basic():
     "basic functionality of QueryInfo dataclass"
-    qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100',query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200')
-    assert qInf.query_name == 'q1'
+    qInf = QueryInfo(
+        query_name="q1",
+        query_md5="md5",
+        query_filename="f1",
+        query_bp="100",
+        query_n_hashes="10",
+        ksize="31",
+        scaled="10",
+        total_weighted_hashes="200",
+    )
+    assert qInf.query_name == "q1"
     assert isinstance(qInf.query_n_hashes, int)
     assert isinstance(qInf.ksize, int)
     assert isinstance(qInf.scaled, int)
@@ -137,8 +221,15 @@ def test_QueryInfo_no_hash_info():
     "QueryInfo dataclass for older gather results without query_n_hashes or total_weighted_hashes"
-    qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100',ksize=31,scaled=10)
-    assert qInf.query_name == 'q1'
+    qInf = QueryInfo(
+        query_name="q1",
+        query_md5="md5",
+        query_filename="f1",
+        query_bp="100",
+        ksize=31,
+        scaled=10,
+    )
+    assert qInf.query_name == "q1"
     assert qInf.query_n_hashes == 0
     assert qInf.total_weighted_hashes == 0
     assert qInf.total_weighted_bp == 0
@@ -147,89 +238,213 @@ def test_QueryInfo_missing():
     "check that required args"
     with pytest.raises(TypeError) as exc:
-        QueryInfo(query_name='q1', query_filename='f1',query_bp='100',query_n_hashes='10',ksize=31,scaled=10, total_weighted_hashes=200)
+        QueryInfo(
+            query_name="q1",
+            query_filename="f1",
+            query_bp="100",
+            query_n_hashes="10",
+            ksize=31,
+            scaled=10,
+            total_weighted_hashes=200,
+        )
     print(str(exc))
     assert "missing 1 required positional argument: 'query_md5'" in str(exc)
 
 
 def test_SummarizedGatherResult():
     "basic functionality of SummarizedGatherResult dataclass"
-    qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100',
-                     query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200')
-    sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"),
-                                 f_weighted_at_rank=0.3, bp_match_at_rank=30)
+    qInf = QueryInfo(
+        query_name="q1",
+        query_md5="md5",
+        query_filename="f1",
+        query_bp="100",
+        query_n_hashes="10",
+        ksize="31",
+        scaled="10",
+        total_weighted_hashes="200",
+    )
+    sgr = SummarizedGatherResult(
+        rank="phylum",
+        fraction=0.2,
+        lineage=RankLineageInfo(lineage_str="a;b"),
+        f_weighted_at_rank=0.3,
+        bp_match_at_rank=30,
+    )
     print(sgr)
-    assert sgr.rank=='phylum'
+    assert sgr.rank == "phylum"
 
     sumD = sgr.as_summary_dict(query_info=qInf)
     print(sumD)
-    assert sumD == {'rank': 'phylum', 'fraction': "0.2", 'lineage': 'a;b', 'f_weighted_at_rank': "0.3",
-                    'bp_match_at_rank': "30", 'query_ani_at_rank': None, 'query_name': 'q1',
-                    'query_md5': 'md5', 'query_filename': 'f1', 'total_weighted_hashes': "200"}
+    assert sumD == {
+        "rank": "phylum",
+        "fraction": "0.2",
+        "lineage": "a;b",
+        "f_weighted_at_rank": "0.3",
+        "bp_match_at_rank": "30",
+        "query_ani_at_rank": None,
+        "query_name": "q1",
+        "query_md5": "md5",
+        "query_filename": "f1",
+        "total_weighted_hashes": "200",
+    }
     hD = sgr.as_human_friendly_dict(query_info=qInf)
     print(hD)
-    assert hD == {'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b', 'f_weighted_at_rank': '30.0%',
-                  'bp_match_at_rank': "30", 'query_ani_at_rank': '-    ', 'query_name': 'q1',
-                  'query_md5': 'md5', 'query_filename': 'f1', 'total_weighted_hashes': "200"}
+    assert hD == {
+        "rank": "phylum",
+        "fraction": "0.200",
+        "lineage": "a;b",
+        "f_weighted_at_rank": "30.0%",
+        "bp_match_at_rank": "30",
+        "query_ani_at_rank": "-    ",
+        "query_name": "q1",
+        "query_md5": "md5",
+        "query_filename": "f1",
+        "total_weighted_hashes": "200",
+    }
     krD = sgr.as_kreport_dict(query_info=qInf)
     print(krD)
-    assert krD == {'ncbi_taxid': None, 'sci_name': 'b', 'rank_code': 'P', 'num_bp_assigned': "0",
-                   'percent_containment': '30.00', 'num_bp_contained': "600"}
-    lD = sgr.as_lineage_dict(ranks = RankLineageInfo().ranks, query_info=qInf)
+    assert krD == {
+        "ncbi_taxid": None,
+        "sci_name": "b",
+        "rank_code": "P",
+        "num_bp_assigned": "0",
+        "percent_containment": "30.00",
+        "num_bp_contained": "600",
+    }
+    lD = sgr.as_lineage_dict(ranks=RankLineageInfo().ranks, query_info=qInf)
     print(lD)
-    assert lD == {'ident': 'q1', 'superkingdom': 'a', 'phylum': 'b', 'class': '', 'order': '',
-                  'family': '', 'genus': '', 'species': '', 'strain': ''}
+    assert lD == {
+        "ident": "q1",
+        "superkingdom": "a",
+        "phylum": "b",
+        "class": "",
+        "order": "",
+        "family": "",
+        "genus": "",
+        "species": "",
+        "strain": "",
+    }
     cami = sgr.as_cami_bioboxes()
     print(cami)
-    assert cami == [None, 'phylum', None, 'a|b', '30.00']
+    assert cami == [None, "phylum", None, "a|b", "30.00"]
 
 
 def test_SummarizedGatherResult_withtaxids():
     "basic functionality of SummarizedGatherResult dataclass"
-    qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100',
-                     query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200')
-    lin = [LineagePair(rank='superkingdom', name='a', taxid='1'), LineagePair(rank='phylum', name='b', taxid=2)]
-    sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage=lin),
-                                 f_weighted_at_rank=0.3, bp_match_at_rank=30)
+    qInf = QueryInfo(
+        query_name="q1",
+        query_md5="md5",
+        query_filename="f1",
+        query_bp="100",
+        query_n_hashes="10",
+        ksize="31",
+        scaled="10",
+        total_weighted_hashes="200",
+    )
+    lin = [
+        LineagePair(rank="superkingdom", name="a", taxid="1"),
+        LineagePair(rank="phylum", name="b", taxid=2),
+    ]
+    sgr = SummarizedGatherResult(
+        rank="phylum",
+        fraction=0.2,
+        lineage=RankLineageInfo(lineage=lin),
+        f_weighted_at_rank=0.3,
+        bp_match_at_rank=30,
+    )
     print(sgr)
-    assert sgr.rank=='phylum'
+    assert sgr.rank == "phylum"
     sumD = sgr.as_summary_dict(query_info=qInf)
     print(sumD)
-    assert sumD == {'rank': 'phylum', 'fraction': "0.2", 'lineage': 'a;b', 'f_weighted_at_rank': "0.3",
-                    'bp_match_at_rank': "30", 'query_ani_at_rank': None, 'query_name': 'q1',
-                    'query_md5': 'md5', 'query_filename': 'f1', 'total_weighted_hashes': "200"}
+    assert sumD == {
+        "rank": "phylum",
+        "fraction": "0.2",
+        "lineage": "a;b",
+        "f_weighted_at_rank": "0.3",
+        "bp_match_at_rank": "30",
+        "query_ani_at_rank": None,
+        "query_name": "q1",
+        "query_md5": "md5",
+        "query_filename": "f1",
+        "total_weighted_hashes": "200",
+    }
     hD = sgr.as_human_friendly_dict(query_info=qInf)
     print(hD)
-    assert hD == {'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b', 'f_weighted_at_rank': '30.0%',
-                  'bp_match_at_rank': "30", 'query_ani_at_rank': '-    ', 'query_name': 'q1',
-                  'query_md5': 'md5', 'query_filename': 'f1', 'total_weighted_hashes': "200"}
+    assert hD == {
+        "rank": "phylum",
+        "fraction": "0.200",
+        "lineage": "a;b",
+        "f_weighted_at_rank": "30.0%",
+        "bp_match_at_rank": "30",
+        "query_ani_at_rank": "-    ",
+        "query_name": "q1",
+        "query_md5": "md5",
+        "query_filename": "f1",
+        "total_weighted_hashes": "200",
+    }
     krD = sgr.as_kreport_dict(query_info=qInf)
     print(krD)
-    assert krD == {'ncbi_taxid': '2', 'sci_name': 'b', 'rank_code': 'P', 'num_bp_assigned': "0",
-                   'percent_containment': '30.00', 'num_bp_contained': "600"}
-    lD = sgr.as_lineage_dict(ranks = RankLineageInfo().ranks, query_info=qInf)
+    assert krD == {
+        "ncbi_taxid": "2",
+        "sci_name": "b",
+        "rank_code": "P",
+        "num_bp_assigned": "0",
+        "percent_containment": "30.00",
+        "num_bp_contained": "600",
+    }
+    lD = sgr.as_lineage_dict(ranks=RankLineageInfo().ranks, query_info=qInf)
     print(lD)
-    assert lD == {'ident': 'q1', 'superkingdom': 'a', 'phylum': 'b', 'class': '', 'order': '',
-                  'family': '', 'genus': '', 'species': '', 'strain': ''}
+    assert lD == {
+        "ident": "q1",
+        "superkingdom": "a",
+        "phylum": "b",
+        "class": "",
+        "order": "",
+        "family": "",
+        "genus": "",
+        "species": "",
+        "strain": "",
+    }
     cami = sgr.as_cami_bioboxes()
     print(cami)
-    assert cami == ['2', 'phylum', '1|2', 'a|b', '30.00']
+    assert cami == ["2", "phylum", "1|2", "a|b", "30.00"]
 
 
 def test_SummarizedGatherResult_LINs():
     "SummarizedGatherResult with LINs"
-    qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100',
-                     query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200')
-    sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=LINLineageInfo(lineage_str="0;0;1"),
-                                 f_weighted_at_rank=0.3, bp_match_at_rank=30)
+    qInf = QueryInfo(
+        query_name="q1",
+        query_md5="md5",
+        query_filename="f1",
+        query_bp="100",
+        query_n_hashes="10",
+        ksize="31",
+        scaled="10",
+        total_weighted_hashes="200",
+    )
+    sgr = SummarizedGatherResult(
+        rank="phylum",
+        fraction=0.2,
+        lineage=LINLineageInfo(lineage_str="0;0;1"),
+        f_weighted_at_rank=0.3,
+        bp_match_at_rank=30,
+    )
     lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name")
     print(lgD)
-    assert lgD == {'name': "lg_name", "lin": "0;0;1",
-                   'percent_containment': '30.00', 'num_bp_contained': "600"}
+    assert lgD == {
+        "name": "lg_name",
+        "lin": "0;0;1",
+        "percent_containment": "30.00",
+        "num_bp_contained": "600",
+    }
     lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name")
     print(lgD)
-    assert lgD == {'name': "lg_name", "lin": "0;0;1",
-                   'percent_containment': '30.00', 'num_bp_contained': "600"}
+    assert lgD == {
+        "name": "lg_name",
+        "lin": "0;0;1",
+        "percent_containment": "30.00",
+        "num_bp_contained": "600",
+    }
     with pytest.raises(ValueError) as exc:
         sgr.as_kreport_dict(query_info=qInf)
     print(str(exc))
@@ -242,164 +457,344 @@ def test_SummarizedGatherResult_set_query_ani():
     "Check ANI estimation within SummarizedGatherResult dataclass"
-    qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100',
-                     query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200')
-    sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"),
-                                 f_weighted_at_rank=0.3, bp_match_at_rank=30)
+    qInf = QueryInfo(
+        query_name="q1",
+        query_md5="md5",
+        query_filename="f1",
+        query_bp="100",
+        query_n_hashes="10",
+        ksize="31",
+        scaled="10",
+        total_weighted_hashes="200",
+    )
+    sgr = SummarizedGatherResult(
+        rank="phylum",
+        fraction=0.2,
+        lineage=RankLineageInfo(lineage_str="a;b"),
+        f_weighted_at_rank=0.3,
+        bp_match_at_rank=30,
+    )
     sgr.set_query_ani(query_info=qInf)
     print(sgr.query_ani_at_rank)
-    assert sgr.query_ani_at_rank == approx(0.949, rel=1e-3) 
+    assert sgr.query_ani_at_rank == approx(0.949, rel=1e-3)
 
     # ANI can be calculated with query_bp OR query_n_hashes. Remove each and check the results are identical
-    qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100',
-                     query_n_hashes=0,ksize='31',scaled='10', total_weighted_hashes='200')
-    sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"),
-                                 f_weighted_at_rank=0.3, bp_match_at_rank=30)
+    qInf = QueryInfo(
+        query_name="q1",
+        query_md5="md5",
+        query_filename="f1",
+        query_bp="100",
+        query_n_hashes=0,
+        ksize="31",
+        scaled="10",
+        total_weighted_hashes="200",
+    )
+    sgr = SummarizedGatherResult(
+        rank="phylum",
+        fraction=0.2,
+        lineage=RankLineageInfo(lineage_str="a;b"),
+        f_weighted_at_rank=0.3,
+        bp_match_at_rank=30,
+    )
     sgr.set_query_ani(query_info=qInf)
     print(sgr.query_ani_at_rank)
-    assert sgr.query_ani_at_rank == approx(0.949, rel=1e-3) 
+    assert sgr.query_ani_at_rank == approx(0.949, rel=1e-3)
 
     # try without query_bp
-    qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp=0,
-                     query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200')
-    sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"),
-                                 f_weighted_at_rank=0.3, bp_match_at_rank=30)
+    qInf = QueryInfo(
+        query_name="q1",
+        query_md5="md5",
+        query_filename="f1",
+        query_bp=0,
+        query_n_hashes="10",
+        ksize="31",
+        scaled="10",
+        total_weighted_hashes="200",
+    )
+    sgr = SummarizedGatherResult(
+        rank="phylum",
+        fraction=0.2,
+        lineage=RankLineageInfo(lineage_str="a;b"),
+        f_weighted_at_rank=0.3,
+        bp_match_at_rank=30,
+    )
     sgr.set_query_ani(query_info=qInf)
     print(sgr.query_ani_at_rank)
-    assert sgr.query_ani_at_rank == approx(0.949, rel=1e-3) 
+    assert sgr.query_ani_at_rank == approx(0.949, rel=1e-3)
 
 
 def test_SummarizedGatherResult_greater_than_1():
     "basic functionality of SummarizedGatherResult dataclass"
     # fraction > 1
     with pytest.raises(ValueError) as exc:
-        SummarizedGatherResult(rank="phylum", fraction=0.3, lineage=RankLineageInfo(lineage_str="a;b"),
-                               f_weighted_at_rank=1.2, bp_match_at_rank=30)
+        SummarizedGatherResult(
+            rank="phylum",
+            fraction=0.3,
+            lineage=RankLineageInfo(lineage_str="a;b"),
+            f_weighted_at_rank=1.2,
+            bp_match_at_rank=30,
+        )
     print(str(exc))
     assert "> 100% of the query!" in str(exc)
 
     # f_weighted > 1
     with pytest.raises(ValueError) as exc:
-        SummarizedGatherResult(rank="phylum", fraction=1.2, lineage=RankLineageInfo(lineage_str="a;b"),
-                               f_weighted_at_rank=0.3, bp_match_at_rank=30)
+        SummarizedGatherResult(
+            rank="phylum",
+            fraction=1.2,
+            lineage=RankLineageInfo(lineage_str="a;b"),
+            f_weighted_at_rank=0.3,
+            bp_match_at_rank=30,
+        )
     print(str(exc))
    assert "> 100% of the query!" in str(exc)
 
 
 def test_SummarizedGatherResult_0_fraction():
     with pytest.raises(ValueError) as exc:
-        SummarizedGatherResult(rank="phylum", fraction=-.1, lineage=RankLineageInfo(lineage_str="a;b"),
-                               f_weighted_at_rank=0.3, bp_match_at_rank=30)
+        SummarizedGatherResult(
+            rank="phylum",
+            fraction=-0.1,
+            lineage=RankLineageInfo(lineage_str="a;b"),
+            f_weighted_at_rank=0.3,
+            bp_match_at_rank=30,
+        )
     err_msg = "Summarized fraction is <=0% of the query! This should not occur."
     assert err_msg in str(exc)
-    #assert cr.status == 'nomatch'
-    
+    # assert cr.status == 'nomatch'
+
     with pytest.raises(ValueError) as exc:
-        SummarizedGatherResult(rank="phylum", fraction=.1, lineage=RankLineageInfo(lineage_str="a;b"),
-                               f_weighted_at_rank=0, bp_match_at_rank=30)
+        SummarizedGatherResult(
+            rank="phylum",
+            fraction=0.1,
+            lineage=RankLineageInfo(lineage_str="a;b"),
+            f_weighted_at_rank=0,
+            bp_match_at_rank=30,
+        )
     print(str(exc))
     assert err_msg in str(exc)
 
 
 def test_SummarizedGatherResult_species_kreport():
     "basic functionality of SummarizedGatherResult dataclass"
-    qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100',
-                     query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200')
-    sgr = SummarizedGatherResult(rank="species", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b;c;d;e;f;g"),
-                                 f_weighted_at_rank=0.3, bp_match_at_rank=30)
+    qInf = QueryInfo(
+        query_name="q1",
+        query_md5="md5",
+        query_filename="f1",
+        query_bp="100",
+        query_n_hashes="10",
+        ksize="31",
+        scaled="10",
+        total_weighted_hashes="200",
+    )
+    sgr = SummarizedGatherResult(
+        rank="species",
+        fraction=0.2,
+        lineage=RankLineageInfo(lineage_str="a;b;c;d;e;f;g"),
+        f_weighted_at_rank=0.3,
+        bp_match_at_rank=30,
+    )
     print(sgr)
-    assert sgr.rank=='species'
+    assert sgr.rank == "species"
     krD = sgr.as_kreport_dict(query_info=qInf)
     print(krD)
-    assert krD == {'ncbi_taxid': None, 'sci_name': 'g', 'rank_code': 'S', 'num_bp_assigned': "600",
-                   'percent_containment': '30.00', 'num_bp_contained': "600"}
+    assert krD == {
+        "ncbi_taxid": None,
+        "sci_name": "g",
+        "rank_code": "S",
+        "num_bp_assigned": "600",
+        "percent_containment": "30.00",
+        "num_bp_contained": "600",
+    }
 
 
 def test_SummarizedGatherResult_summary_dict_limit_float():
     "basic functionality of SummarizedGatherResult dataclass"
-    qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100',
-                     query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200')
-    sgr = SummarizedGatherResult(rank="phylum", fraction=0.123456, lineage=RankLineageInfo(lineage_str="a;b"),
-                                 f_weighted_at_rank=0.345678, bp_match_at_rank=30)
+    qInf = QueryInfo(
+        query_name="q1",
+        query_md5="md5",
+        query_filename="f1",
+        query_bp="100",
+        query_n_hashes="10",
+        ksize="31",
+        scaled="10",
+        total_weighted_hashes="200",
+    )
+    sgr = SummarizedGatherResult(
+        rank="phylum",
+        fraction=0.123456,
+        lineage=RankLineageInfo(lineage_str="a;b"),
+        f_weighted_at_rank=0.345678,
+        bp_match_at_rank=30,
+    )
     print(sgr)
-    assert sgr.rank=='phylum'
+    assert sgr.rank == "phylum"
 
     sumD = sgr.as_summary_dict(query_info=qInf)
     print(sumD)
-    assert sumD == {'rank': 'phylum', 'fraction': "0.123456", 'lineage': 'a;b', 'f_weighted_at_rank': "0.345678",
-                    'bp_match_at_rank': "30", 'query_ani_at_rank': None, 'query_name': 'q1',
-                    'query_md5': 'md5', 'query_filename': 'f1', 'total_weighted_hashes': "200"}
-
+    assert sumD == {
+        "rank": "phylum",
+        "fraction": "0.123456",
+        "lineage": "a;b",
+        "f_weighted_at_rank": "0.345678",
+        "bp_match_at_rank": "30",
+ "query_ani_at_rank": None, + "query_name": "q1", + "query_md5": "md5", + "query_filename": "f1", + "total_weighted_hashes": "200", + } + sumD = sgr.as_summary_dict(query_info=qInf, limit_float=True) print(sumD) - assert sumD == {'rank': 'phylum', 'fraction': "0.123", 'lineage': 'a;b', 'f_weighted_at_rank': "0.346", - 'bp_match_at_rank': "30", 'query_ani_at_rank': None, 'query_name': 'q1', - 'query_md5': 'md5', 'query_filename': 'f1', 'total_weighted_hashes': "200"} + assert sumD == { + "rank": "phylum", + "fraction": "0.123", + "lineage": "a;b", + "f_weighted_at_rank": "0.346", + "bp_match_at_rank": "30", + "query_ani_at_rank": None, + "query_name": "q1", + "query_md5": "md5", + "query_filename": "f1", + "total_weighted_hashes": "200", + } def test_ClassificationResult(): "basic functionality of ClassificationResult dataclass" - qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100', - query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') - cr = ClassificationResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"), - f_weighted_at_rank=0.3, bp_match_at_rank=30, query_ani_at_rank=0.97) + qInf = QueryInfo( + query_name="q1", + query_md5="md5", + query_filename="f1", + query_bp="100", + query_n_hashes="10", + ksize="31", + scaled="10", + total_weighted_hashes="200", + ) + cr = ClassificationResult( + rank="phylum", + fraction=0.2, + lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, + bp_match_at_rank=30, + query_ani_at_rank=0.97, + ) cr.set_status(query_info=qInf, containment_threshold=0.1) - assert cr.status == 'match' + assert cr.status == "match" print(cr.query_ani_at_rank) - assert cr.query_ani_at_rank == approx(0.949, rel=1e-3) + assert cr.query_ani_at_rank == approx(0.949, rel=1e-3) cr.set_status(query_info=qInf, containment_threshold=0.35) - assert cr.status == 'below_threshold' - lD = cr.as_lineage_dict(ranks = RankLineageInfo().ranks, query_info=qInf) + assert cr.status == "below_threshold" + lD = cr.as_lineage_dict(ranks=RankLineageInfo().ranks, query_info=qInf) print(lD) - assert lD == {'ident': 'q1', 'superkingdom': 'a', 'phylum': 'b', 'class': '', 'order': '', - 'family': '', 'genus': '', 'species': '', 'strain': ''} + assert lD == { + "ident": "q1", + "superkingdom": "a", + "phylum": "b", + "class": "", + "order": "", + "family": "", + "genus": "", + "species": "", + "strain": "", + } def test_ClassificationResult_greater_than_1(): "basic functionality of SummarizedGatherResult dataclass" # fraction > 1 with pytest.raises(ValueError) as exc: - ClassificationResult(rank="phylum", fraction=0.3, lineage=RankLineageInfo(lineage_str="a;b"), - f_weighted_at_rank=1.2, bp_match_at_rank=30) + ClassificationResult( + rank="phylum", + fraction=0.3, + lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=1.2, + bp_match_at_rank=30, + ) print(str(exc)) assert "> 100% of the query!" in str(exc) # f_weighted > 1 with pytest.raises(ValueError) as exc: - ClassificationResult(rank="phylum", fraction=1.2, lineage=RankLineageInfo(lineage_str="a;b"), - f_weighted_at_rank=0.3, bp_match_at_rank=30) + ClassificationResult( + rank="phylum", + fraction=1.2, + lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, + bp_match_at_rank=30, + ) print(str(exc)) assert "> 100% of the query!" 
in str(exc) def test_ClassificationResult_0_fraction(): with pytest.raises(ValueError) as exc: - ClassificationResult(rank="phylum", fraction=-.1, lineage=RankLineageInfo(lineage_str="a;b"), - f_weighted_at_rank=0.3, bp_match_at_rank=30) + ClassificationResult( + rank="phylum", + fraction=-0.1, + lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, + bp_match_at_rank=30, + ) err_msg = "Summarized fraction is <=0% of the query! This should not occur." assert err_msg in str(exc) - #assert cr.status == 'nomatch' - + # assert cr.status == 'nomatch' + with pytest.raises(ValueError) as exc: - ClassificationResult(rank="phylum", fraction=.1, lineage=RankLineageInfo(lineage_str="a;b"), - f_weighted_at_rank=0, bp_match_at_rank=30) + ClassificationResult( + rank="phylum", + fraction=0.1, + lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0, + bp_match_at_rank=30, + ) print(str(exc)) assert err_msg in str(exc) def test_ClassificationResult_build_krona_result(): - qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100', - query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') - cr = ClassificationResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"), - f_weighted_at_rank=0.3, bp_match_at_rank=30, query_ani_at_rank=0.97) - #cr.set_status(query_info=qInf, rank='phylum') - kr, ukr = cr.build_krona_result(rank='phylum') + QueryInfo( + query_name="q1", + query_md5="md5", + query_filename="f1", + query_bp="100", + query_n_hashes="10", + ksize="31", + scaled="10", + total_weighted_hashes="200", + ) + cr = ClassificationResult( + rank="phylum", + fraction=0.2, + lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, + bp_match_at_rank=30, + query_ani_at_rank=0.97, + ) + # cr.set_status(query_info=qInf, rank='phylum') + kr, ukr = cr.build_krona_result(rank="phylum") print(kr) - assert kr == (0.2, 'a', 'b') + assert kr == (0.2, "a", "b") print(ukr) - assert ukr == (0.8, 'unclassified', 'unclassified') + assert ukr == (0.8, "unclassified", "unclassified") def test_ClassificationResult_build_krona_result_no_rank(): - qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100', - query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') - cr = ClassificationResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"), - f_weighted_at_rank=0.3, bp_match_at_rank=30, query_ani_at_rank=0.97) + qInf = QueryInfo( + query_name="q1", + query_md5="md5", + query_filename="f1", + query_bp="100", + query_n_hashes="10", + ksize="31", + scaled="10", + total_weighted_hashes="200", + ) + cr = ClassificationResult( + rank="phylum", + fraction=0.2, + lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, + bp_match_at_rank=30, + query_ani_at_rank=0.97, + ) cr.set_status(query_info=qInf, containment_threshold=0.1) @@ -407,7 +802,7 @@ def test_GatherRow_old_gather(): # gather does not contain query_name column gA = {"name": "gA.1 name"} with pytest.raises(TypeError) as exc: - make_GatherRow(gA, exclude_cols=['query_bp']) + make_GatherRow(gA, exclude_cols=["query_bp"]) print(str(exc)) assert "__init__() missing 1 required positional argument: 'query_bp'" in str(exc) @@ -433,7 +828,12 @@ def test_AnnotateTaxResult_get_ident_default(): def test_AnnotateTaxResult_get_ident_idcol(): - gA = {"name": "n1", "match_name": "n2", "ident": "n3", "accession": "n4"} # gather result with match name as GCF_001881345.1 + gA = { + "name": "n1", + "match_name": 
"n2", + "ident": "n3", + "accession": "n4", + } # gather result with match name as GCF_001881345.1 taxres = AnnotateTaxResult(raw=gA) print(taxres.match_ident) assert taxres.match_ident == "n1" @@ -449,7 +849,12 @@ def test_AnnotateTaxResult_get_ident_idcol(): def test_AnnotateTaxResult_get_ident_idcol_fail(): - gA = {"name": "n1", "match_name": "n2", "ident": "n3", "accession": "n4"} # gather result with match name as GCF_001881345.1 + gA = { + "name": "n1", + "match_name": "n2", + "ident": "n3", + "accession": "n4", + } # gather result with match name as GCF_001881345.1 with pytest.raises(ValueError) as exc: AnnotateTaxResult(raw=gA, id_col="NotACol") print(str(exc)) @@ -467,7 +872,7 @@ def test_TaxResult_get_ident_split_but_keep_version(): taxres = make_TaxResult(gA, keep_ident_version=True) print("raw ident: ", taxres.raw.name) print("keep_full?: ", taxres.keep_full_identifiers) - print("keep_version?: ",taxres.keep_identifier_versions) + print("keep_version?: ", taxres.keep_identifier_versions) print("final ident: ", taxres.match_ident) assert taxres.match_ident == "GCF_001881345.1" @@ -475,9 +880,9 @@ def test_TaxResult_get_ident_split_but_keep_version(): def test_AnnotateTaxResult_get_ident_split_but_keep_version(): gA = {"name": "GCF_001881345.1 secondname"} taxres = AnnotateTaxResult(gA, keep_identifier_versions=True) - print("raw ident: ", taxres.raw['name']) + print("raw ident: ", taxres.raw["name"]) print("keep_full?: ", taxres.keep_full_identifiers) - print("keep_version?: ",taxres.keep_identifier_versions) + print("keep_version?: ", taxres.keep_identifier_versions) print("final ident: ", taxres.match_ident) assert taxres.match_ident == "GCF_001881345.1" @@ -493,7 +898,7 @@ def test_TaxResult_get_ident_keep_full(): taxres = make_TaxResult(gA, keep_full_ident=True) print("raw ident: ", taxres.raw.name) print("keep_full?: ", taxres.keep_full_identifiers) - print("keep_version?: ",taxres.keep_identifier_versions) + print("keep_version?: ", taxres.keep_identifier_versions) print("final ident: ", taxres.match_ident) assert taxres.match_ident == "GCF_001881345.1 secondname" @@ -501,32 +906,32 @@ def test_TaxResult_get_ident_keep_full(): def test_AnnotateTaxResult_get_ident_keep_full(): gA = {"name": "GCF_001881345.1 secondname"} taxres = AnnotateTaxResult(gA, keep_full_identifiers=True) - print("raw ident: ", taxres.raw['name']) + print("raw ident: ", taxres.raw["name"]) print("keep_full?: ", taxres.keep_full_identifiers) - print("keep_version?: ",taxres.keep_identifier_versions) + print("keep_version?: ", taxres.keep_identifier_versions) print("final ident: ", taxres.match_ident) assert taxres.match_ident == "GCF_001881345.1 secondname" def test_collect_gather_csvs(runtmp): - g_csv = utils.get_test_data('tax/test1.gather.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") from_file = runtmp.output("tmp-from-file.txt") - with open(from_file, 'w') as fp: + with open(from_file, "w") as fp: fp.write(f"{g_csv}\n") gather_files = collect_gather_csvs([g_csv], from_file=from_file) print("gather_files: ", gather_files) assert len(gather_files) == 1 - assert basename(gather_files[0]) == 'test1.gather.csv' + assert basename(gather_files[0]) == "test1.gather.csv" def test_check_and_load_gather_csvs_empty(runtmp): - g_res = runtmp.output('empty.gather.csv') - with open(g_res, 'w') as fp: + g_res = runtmp.output("empty.gather.csv") + with open(g_res, "w") as fp: fp.write("") csvs = [g_res] # load taxonomy csv - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = 
utils.get_test_data("tax/test.taxonomy.csv") tax_assign = MultiLineageDB.load([taxonomy_csv], keep_full_identifiers=1) print(tax_assign) @@ -537,24 +942,27 @@ def test_check_and_load_gather_csvs_empty(runtmp): def test_check_and_load_gather_csvs_with_empty_force(runtmp): - g_csv = utils.get_test_data('tax/test1.gather.csv') + g_csv = utils.get_test_data("tax/test1.gather.csv") # make gather results with taxonomy name not in tax_assign - g_res2 = runtmp.output('gA.gather.csv') - g_results = [x.replace("GCF_001881345.1", "gA") + "\n" for x in Path(g_csv).read_text().splitlines()] - with open(g_res2, 'w') as fp: + g_res2 = runtmp.output("gA.gather.csv") + g_results = [ + x.replace("GCF_001881345.1", "gA") + "\n" + for x in Path(g_csv).read_text().splitlines() + ] + with open(g_res2, "w") as fp: fp.writelines(g_results) # make empty gather results - g_res3 = runtmp.output('empty.gather.csv') - with open(g_res3, 'w') as fp: + g_res3 = runtmp.output("empty.gather.csv") + with open(g_res3, "w") as fp: fp.write("") csvs = [g_res2, g_res3] # load taxonomy csv - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign = MultiLineageDB.load([taxonomy_csv], - keep_full_identifiers=False, - keep_identifier_versions=False) + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + tax_assign = MultiLineageDB.load( + [taxonomy_csv], keep_full_identifiers=False, keep_identifier_versions=False + ) print(tax_assign) # check gather results and missing ids gather_results = check_and_load_gather_csvs(csvs, tax_assign, force=True) @@ -562,214 +970,269 @@ def test_check_and_load_gather_csvs_with_empty_force(runtmp): q_res = gather_results[0] assert len(q_res.raw_taxresults) == 4 assert q_res.n_missed == 1 - assert 'gA' in q_res.missed_idents + assert "gA" in q_res.missed_idents assert q_res.n_skipped == 0 def test_check_and_load_gather_lineage_csvs_empty(runtmp): # try loading an empty annotated gather file - g_res = runtmp.output('empty.gather-tax.csv') - with open(g_res, 'w') as fp: + g_res = runtmp.output("empty.gather-tax.csv") + with open(g_res, "w") as fp: fp.write("") with pytest.raises(ValueError) as exc: - tax_assign = LineageDB.load_from_gather_with_lineages(g_res) + LineageDB.load_from_gather_with_lineages(g_res) assert "cannot read taxonomy assignments" in str(exc.value) def test_check_and_load_gather_lineage_csvs_bad_header(runtmp): # test on file with wrong headers - g_res = runtmp.output('empty.gather-tax.csv') - with open(g_res, 'w', newline="") as fp: + g_res = runtmp.output("empty.gather-tax.csv") + with open(g_res, "w", newline="") as fp: fp.write("x,y,z") with pytest.raises(ValueError) as exc: - tax_assign = LineageDB.load_from_gather_with_lineages(g_res) - assert "Expected headers 'name' and 'lineage' not found. Is this a with-lineages file?" in str(exc.value) + LineageDB.load_from_gather_with_lineages(g_res) + assert ( + "Expected headers 'name' and 'lineage' not found. Is this a with-lineages file?" 
+        in str(exc.value)
+    )
 
 
 def test_check_and_load_gather_lineage_csvs_dne(runtmp):
     # test loading with-lineage file that does not exist
-    g_res = runtmp.output('empty.gather-tax.csv')
+    g_res = runtmp.output("empty.gather-tax.csv")
 
     with pytest.raises(ValueError) as exc:
-        tax_assign = LineageDB.load_from_gather_with_lineages(g_res)
+        LineageDB.load_from_gather_with_lineages(g_res)
     assert "does not exist" in str(exc.value)
 
 
 def test_check_and_load_gather_lineage_csvs_isdir(runtmp):
     # test loading a with-lineage file that is actually a directory
-    g_res = runtmp.output('empty.gather-tax.csv')
+    g_res = runtmp.output("empty.gather-tax.csv")
     os.mkdir(g_res)
 
     with pytest.raises(ValueError) as exc:
-        tax_assign = LineageDB.load_from_gather_with_lineages(g_res)
+        LineageDB.load_from_gather_with_lineages(g_res)
     assert "is a directory" in str(exc.value)
 
 
 def test_check_and_load_gather_csvs_fail_on_missing(runtmp):
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
     # make gather results with taxonomy name not in tax_assign
-    g_res2 = runtmp.output('gA.gather.csv')
-    g_results = [x.replace("GCF_001881345.1", "gA") + "\n" for x in Path(g_csv).read_text().splitlines()]
-    with open(g_res2, 'w') as fp:
+    g_res2 = runtmp.output("gA.gather.csv")
+    g_results = [
+        x.replace("GCF_001881345.1", "gA") + "\n"
+        for x in Path(g_csv).read_text().splitlines()
+    ]
+    with open(g_res2, "w") as fp:
         fp.writelines(g_results)
 
     csvs = [g_res2]
 
     # load taxonomy csv
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
     tax_assign = MultiLineageDB.load([taxonomy_csv], keep_full_identifiers=1)
     print(tax_assign)
     # check gather results and missing ids
     with pytest.raises(ValueError) as exc:
-        check_and_load_gather_csvs(csvs, tax_assign, fail_on_missing_taxonomy=True, force=True)
+        check_and_load_gather_csvs(
+            csvs, tax_assign, fail_on_missing_taxonomy=True, force=True
+        )
     assert "Failing, as requested via --fail-on-missing-taxonomy" in str(exc)
 
 
 def test_load_gather_results():
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
-    tax_assign = MultiLineageDB.load([taxonomy_csv],
-                                     keep_full_identifiers=False,
-                                     keep_identifier_versions=False)
-    gather_csv = utils.get_test_data('tax/test1.gather.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
+    tax_assign = MultiLineageDB.load(
+        [taxonomy_csv], keep_full_identifiers=False, keep_identifier_versions=False
+    )
+    gather_csv = utils.get_test_data("tax/test1.gather.csv")
     gather_results, header = load_gather_results(gather_csv, tax_assignments=tax_assign)
     assert len(gather_results) == 1
     for query_name, res in gather_results.items():
-        assert query_name == 'test1'
+        assert query_name == "test1"
         assert len(res.raw_taxresults) == 4
 
 
 def test_load_gather_results_gzipped(runtmp):
-    gather_csv = utils.get_test_data('tax/test1.gather.csv')
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
-    tax_assign = MultiLineageDB.load([taxonomy_csv],
-                                     keep_full_identifiers=False,
-                                     keep_identifier_versions=False)
-    gather_csv = utils.get_test_data('tax/test1.gather.csv')
+    gather_csv = utils.get_test_data("tax/test1.gather.csv")
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
+    tax_assign = MultiLineageDB.load(
+        [taxonomy_csv], keep_full_identifiers=False, keep_identifier_versions=False
+    )
+    gather_csv = utils.get_test_data("tax/test1.gather.csv")
 
     # rewrite gather_csv as gzipped csv
-    gz_gather = runtmp.output('g.csv.gz')
-    with open(gather_csv, 'rb') as f_in, gzip.open(gz_gather, 'wb') as f_out:
+    gz_gather = runtmp.output("g.csv.gz")
+    with open(gather_csv, "rb") as f_in, gzip.open(gz_gather, "wb") as f_out:
         f_out.writelines(f_in)
 
-    #gather_results, header, seen_queries = load_gather_results(gz_gather)
+    # gather_results, header, seen_queries = load_gather_results(gz_gather)
     gather_results, header = load_gather_results(gz_gather, tax_assignments=tax_assign)
     assert len(gather_results) == 1
     for query_name, res in gather_results.items():
-        assert query_name == 'test1'
+        assert query_name == "test1"
         assert len(res.raw_taxresults) == 4
 
 
 def test_load_gather_results_bad_header(runtmp):
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
-    tax_assign = MultiLineageDB.load([taxonomy_csv],
-                                     keep_full_identifiers=False,
-                                     keep_identifier_versions=False)
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
-
-    bad_g_csv = runtmp.output('g.csv')
-
-    #creates bad gather result
-    bad_g = [x.replace("f_unique_to_query", "nope") + "\n" for x in Path(g_csv).read_text().splitlines()]
-    with open(bad_g_csv, 'w') as fp:
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
+    tax_assign = MultiLineageDB.load(
+        [taxonomy_csv], keep_full_identifiers=False, keep_identifier_versions=False
+    )
+    g_csv = utils.get_test_data("tax/test1.gather.csv")
+
+    bad_g_csv = runtmp.output("g.csv")
+
+    # creates bad gather result
+    bad_g = [
+        x.replace("f_unique_to_query", "nope") + "\n"
+        for x in Path(g_csv).read_text().splitlines()
+    ]
+    with open(bad_g_csv, "w") as fp:
         fp.writelines(bad_g)
     print("bad_gather_results: \n", bad_g)
 
     with pytest.raises(ValueError) as exc:
-        gather_results, header = load_gather_results(bad_g_csv, tax_assignments=tax_assign)
-    assert f"'{bad_g_csv}' is missing columns needed for taxonomic summarization" in str(exc.value)
+        gather_results, header = load_gather_results(
+            bad_g_csv, tax_assignments=tax_assign
+        )
+    assert (
+        f"'{bad_g_csv}' is missing columns needed for taxonomic summarization"
+        in str(exc.value)
+    )
 
 
 def test_load_gather_results_empty(runtmp):
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
-    tax_assign = MultiLineageDB.load([taxonomy_csv],
-                                     keep_full_identifiers=False,
-                                     keep_identifier_versions=False)
-    empty_csv = runtmp.output('g.csv')
-
-    #creates empty gather result
-    with open(empty_csv, 'w') as fp:
-        fp.write('')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
+    tax_assign = MultiLineageDB.load(
+        [taxonomy_csv], keep_full_identifiers=False, keep_identifier_versions=False
+    )
+    empty_csv = runtmp.output("g.csv")
+
+    # creates empty gather result
+    with open(empty_csv, "w") as fp:
        fp.write("")
 
     with pytest.raises(ValueError) as exc:
-        gather_results, header = load_gather_results(empty_csv, tax_assignments=tax_assign)
-    assert f"Cannot read gather results from '{empty_csv}'. Is file empty?" in str(exc.value)
+        gather_results, header = load_gather_results(
+            empty_csv, tax_assignments=tax_assign
+        )
+    assert f"Cannot read gather results from '{empty_csv}'. Is file empty?" in str(
+        exc.value
+    )
 
 
 def test_load_taxonomy_csv():
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
     tax_assign = MultiLineageDB.load([taxonomy_csv])
     print("taxonomy assignments: \n", tax_assign)
-    assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 'GCF_000021665.1']
-    assert len(tax_assign) == 6 # should have read 6 rows
+    assert list(tax_assign.keys()) == [
+        "GCF_001881345.1",
+        "GCF_009494285.1",
+        "GCF_013368705.1",
+        "GCF_003471795.1",
+        "GCF_000017325.1",
+        "GCF_000021665.1",
+    ]
+    assert len(tax_assign) == 6  # should have read 6 rows
 
 
 def test_load_taxonomy_csv_LIN():
-    taxonomy_csv = utils.get_test_data('tax/test.LIN-taxonomy.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.LIN-taxonomy.csv")
     tax_assign = MultiLineageDB.load([taxonomy_csv], lins=True)
     print("taxonomy assignments: \n", tax_assign)
-    assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 'GCF_000021665.1']
-    #assert list(tax_assign.keys()) == ["GCF_000010525.1", "GCF_000007365.1", "GCF_000007725.1", "GCF_000009605.1", "GCF_000021065.1", "GCF_000021085.1"]
-    assert len(tax_assign) == 6 # should have read 6 rows
+    assert list(tax_assign.keys()) == [
+        "GCF_001881345.1",
+        "GCF_009494285.1",
+        "GCF_013368705.1",
+        "GCF_003471795.1",
+        "GCF_000017325.1",
+        "GCF_000021665.1",
+    ]
+    # assert list(tax_assign.keys()) == ["GCF_000010525.1", "GCF_000007365.1", "GCF_000007725.1", "GCF_000009605.1", "GCF_000021065.1", "GCF_000021085.1"]
+    assert len(tax_assign) == 6  # should have read 6 rows
     print(tax_assign.available_ranks)
-    assert tax_assign.available_ranks == {str(x) for x in range(0,20)}
+    assert tax_assign.available_ranks == {str(x) for x in range(0, 20)}
 
 
 def test_load_taxonomy_csv_LIN_fail():
-    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
 
     with pytest.raises(ValueError) as exc:
         MultiLineageDB.load([taxonomy_csv], lins=True)
-    assert f"'lin' column not found: cannot read LIN taxonomy assignments from {taxonomy_csv}." in str(exc.value)
+    assert (
+        f"'lin' column not found: cannot read LIN taxonomy assignments from {taxonomy_csv}."
+        in str(exc.value)
+    )
 
 
 def test_load_taxonomy_csv_LIN_mismatch_in_taxfile(runtmp):
-    taxonomy_csv = utils.get_test_data('tax/test.LIN-taxonomy.csv')
-    mimatchLIN_csv = runtmp.output('mmLIN-taxonomy.csv')
-    with open(mimatchLIN_csv, 'w') as mm:
-        tax21=[]
+    taxonomy_csv = utils.get_test_data("tax/test.LIN-taxonomy.csv")
+    mimatchLIN_csv = runtmp.output("mmLIN-taxonomy.csv")
+    with open(mimatchLIN_csv, "w") as mm:
+        tax21 = []
         tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()]
         for n, taxline in enumerate(tax):
-            if n == 2: # add ;0 to a LIN
-                taxlist = taxline.split(',')
-                taxlist[1] += ';0' # add 21st position to LIN
+            if n == 2:  # add ;0 to a LIN
+                taxlist = taxline.split(",")
+                taxlist[1] += ";0"  # add 21st position to LIN
                 tax21.append(",".join(taxlist))
             else:
                 tax21.append(taxline)
         mm.write("\n".join(tax21))
     with pytest.raises(ValueError) as exc:
         MultiLineageDB.load([mimatchLIN_csv], lins=True)
-    assert "For taxonomic summarization, all LIN assignments must use the same number of LIN positions." in str(exc.value)
+    assert (
+        "For taxonomic summarization, all LIN assignments must use the same number of LIN positions."
+ in str(exc.value) + ) def test_load_taxonomy_csv_gzip(runtmp): # test loading a gzipped taxonomy csv file - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_gz = runtmp.output('tax.csv.gz') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + tax_gz = runtmp.output("tax.csv.gz") - with gzip.open(tax_gz, 'wt') as outfp: - with open(taxonomy_csv, 'rt') as infp: + with gzip.open(tax_gz, "wt") as outfp: + with open(taxonomy_csv) as infp: data = infp.read() outfp.write(data) tax_assign = MultiLineageDB.load([tax_gz]) print("taxonomy assignments: \n", tax_assign) - assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 'GCF_000021665.1'] - assert len(tax_assign) == 6 # should have read 6 rows + assert list(tax_assign.keys()) == [ + "GCF_001881345.1", + "GCF_009494285.1", + "GCF_013368705.1", + "GCF_003471795.1", + "GCF_000017325.1", + "GCF_000021665.1", + ] + assert len(tax_assign) == 6 # should have read 6 rows def test_load_taxonomy_csv_split_id(): - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign = MultiLineageDB.load([taxonomy_csv], keep_full_identifiers=0, - keep_identifier_versions=False) + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + tax_assign = MultiLineageDB.load( + [taxonomy_csv], keep_full_identifiers=0, keep_identifier_versions=False + ) print("taxonomy assignments: \n", tax_assign) - assert list(tax_assign.keys()) == ['GCF_001881345', 'GCF_009494285', 'GCF_013368705', 'GCF_003471795', 'GCF_000017325', 'GCF_000021665'] - assert len(tax_assign) == 6 # should have read 6 rows + assert list(tax_assign.keys()) == [ + "GCF_001881345", + "GCF_009494285", + "GCF_013368705", + "GCF_003471795", + "GCF_000017325", + "GCF_000021665", + ] + assert len(tax_assign) == 6 # should have read 6 rows def test_load_taxonomy_csv_with_ncbi_id(runtmp): - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") upd_csv = runtmp.output("updated_taxonomy.csv") - with open(upd_csv, 'w') as new_tax: + with open(upd_csv, "w") as new_tax: tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()] ncbi_id = "ncbi_id after_space" fake_lin = [ncbi_id] + ["sk", "phy", "cls", "ord", "fam", "gen", "sp"] @@ -779,14 +1242,22 @@ def test_load_taxonomy_csv_with_ncbi_id(runtmp): tax_assign = MultiLineageDB.load([upd_csv], keep_full_identifiers=True) print("taxonomy assignments: \n", tax_assign) - assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 'GCF_000021665.1', "ncbi_id after_space"] + assert list(tax_assign.keys()) == [ + "GCF_001881345.1", + "GCF_009494285.1", + "GCF_013368705.1", + "GCF_003471795.1", + "GCF_000017325.1", + "GCF_000021665.1", + "ncbi_id after_space", + ] assert len(tax_assign) == 7 # should have read 7 rows def test_load_taxonomy_csv_split_id_ncbi(runtmp): - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") upd_csv = runtmp.output("updated_taxonomy.csv") - with open(upd_csv, 'w') as new_tax: + with open(upd_csv, "w") as new_tax: tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()] ncbi_id = "ncbi_id after_space" fake_lin = [ncbi_id] + ["sk", "phy", "cls", "ord", "fam", "gen", "sp"] @@ -794,24 +1265,34 @@ def test_load_taxonomy_csv_split_id_ncbi(runtmp): tax.append(ncbi_tax) new_tax.write("\n".join(tax)) - 
tax_assign = MultiLineageDB.load([upd_csv], keep_full_identifiers=False, - keep_identifier_versions=False) + tax_assign = MultiLineageDB.load( + [upd_csv], keep_full_identifiers=False, keep_identifier_versions=False + ) print("taxonomy assignments: \n", tax_assign) - assert list(tax_assign.keys()) == ['GCF_001881345', 'GCF_009494285', 'GCF_013368705', 'GCF_003471795', 'GCF_000017325', 'GCF_000021665', "ncbi_id"] - assert len(tax_assign) == 7 # should have read 7 rows + assert list(tax_assign.keys()) == [ + "GCF_001881345", + "GCF_009494285", + "GCF_013368705", + "GCF_003471795", + "GCF_000017325", + "GCF_000021665", + "ncbi_id", + ] + assert len(tax_assign) == 7 # should have read 7 rows # check for non-sensical args. with pytest.raises(ValueError): - tax_assign = MultiLineageDB.load([upd_csv], keep_full_identifiers=1, - keep_identifier_versions=False) + tax_assign = MultiLineageDB.load( + [upd_csv], keep_full_identifiers=1, keep_identifier_versions=False + ) def test_load_taxonomy_csv_duplicate(runtmp): - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") duplicated_csv = runtmp.output("duplicated_taxonomy.csv") - with open(duplicated_csv, 'w') as dup: + with open(duplicated_csv, "w") as dup: tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()] - tax.append(tax[1] + 'FOO') # add first tax_assign again + tax.append(tax[1] + "FOO") # add first tax_assign again print(tax[-1]) dup.write("\n".join(tax)) @@ -823,73 +1304,132 @@ def test_load_taxonomy_csv_duplicate(runtmp): def test_load_taxonomy_csv_duplicate_force(runtmp): - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") duplicated_csv = runtmp.output("duplicated_taxonomy.csv") - with open(duplicated_csv, 'w') as dup: + with open(duplicated_csv, "w") as dup: tax = [x.rstrip() for x in Path(taxonomy_csv).read_text().splitlines()] - tax.append(tax[1]) # add first tax_assign again + tax.append(tax[1]) # add first tax_assign again dup.write("\n".join(tax)) # now force tax_assign = MultiLineageDB.load([duplicated_csv], force=True) print("taxonomy assignments: \n", tax_assign) - assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 'GCF_000021665.1'] + assert list(tax_assign.keys()) == [ + "GCF_001881345.1", + "GCF_009494285.1", + "GCF_013368705.1", + "GCF_003471795.1", + "GCF_000017325.1", + "GCF_000021665.1", + ] def test_format_for_krona_summarization(): """test format for krona""" # make gather results - # make mini taxonomy + # make mini taxonomy gA_tax = ("gA", "a;b") gB_tax = ("gB", "a;c") - taxD = make_mini_taxonomy([gA_tax,gB_tax]) + taxD = make_mini_taxonomy([gA_tax, gB_tax]) - gather_results = [{'query_name': 'queryA', 'name': 'gA', 'f_unique_weighted': 0.2,'f_unique_to_query': 0.2,'unique_intersect_bp': 50}, - {'query_name': 'queryA', "name": 'gB', 'f_unique_weighted': 0.3,'f_unique_to_query': 0.3,'unique_intersect_bp': 30}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, summarize=True, single_query=True) - kres, header = format_for_krona([q_res], 'superkingdom') - assert header == ['fraction', 'superkingdom'] + gather_results = [ + { + "query_name": "queryA", + "name": "gA", + "f_unique_weighted": 0.2, + "f_unique_to_query": 0.2, + "unique_intersect_bp": 50, + }, + { + "query_name": "queryA", + "name": "gB", + "f_unique_weighted": 0.3, + "f_unique_to_query": 0.3, + 
"unique_intersect_bp": 30, + }, + ] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, summarize=True, single_query=True + ) + kres, header = format_for_krona([q_res], "superkingdom") + assert header == ["fraction", "superkingdom"] print("krona_res: ", kres) - assert kres == [(0.5, 'a'), (0.5, 'unclassified')] - kres, header = format_for_krona([q_res], 'phylum') - assert header == ['fraction', 'superkingdom', 'phylum'] - assert kres == [(0.3, 'a', 'c'), (0.2, 'a', 'b'), (0.5, 'unclassified', 'unclassified')] + assert kres == [(0.5, "a"), (0.5, "unclassified")] + kres, header = format_for_krona([q_res], "phylum") + assert header == ["fraction", "superkingdom", "phylum"] + assert kres == [ + (0.3, "a", "c"), + (0.2, "a", "b"), + (0.5, "unclassified", "unclassified"), + ] def test_format_for_krona_classification(): """test format for krona""" # make gather results - # make mini taxonomy + # make mini taxonomy gA_tax = ("gA", "a;b") gB_tax = ("gB", "a;c") - taxD = make_mini_taxonomy([gA_tax,gB_tax]) + taxD = make_mini_taxonomy([gA_tax, gB_tax]) - gather_results = [{'query_name': 'queryA', 'name': 'gA', 'f_unique_weighted': 0.2,'f_unique_to_query': 0.2,'unique_intersect_bp': 50}, - {'query_name': 'queryA', "name": 'gB', 'f_unique_weighted': 0.3,'f_unique_to_query': 0.3,'unique_intersect_bp': 30}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, classify=True, single_query=True) - kres, header = format_for_krona([q_res], 'superkingdom', classification=True) - assert header == ['fraction', 'superkingdom'] + gather_results = [ + { + "query_name": "queryA", + "name": "gA", + "f_unique_weighted": 0.2, + "f_unique_to_query": 0.2, + "unique_intersect_bp": 50, + }, + { + "query_name": "queryA", + "name": "gB", + "f_unique_weighted": 0.3, + "f_unique_to_query": 0.3, + "unique_intersect_bp": 30, + }, + ] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, classify=True, single_query=True + ) + kres, header = format_for_krona([q_res], "superkingdom", classification=True) + assert header == ["fraction", "superkingdom"] print("krona_res: ", kres) - assert kres == [(0.5, 'a')]#, (0.5, 'unclassified')] - kres, header = format_for_krona([q_res], 'phylum', classification=True) - assert header == ['fraction', 'superkingdom', 'phylum'] - assert kres == [(0.3, 'a', 'c')]#, (0.7, 'unclassified', 'unclassified')] + assert kres == [(0.5, "a")] # , (0.5, 'unclassified')] + kres, header = format_for_krona([q_res], "phylum", classification=True) + assert header == ["fraction", "superkingdom", "phylum"] + assert kres == [(0.3, "a", "c")] # , (0.7, 'unclassified', 'unclassified')] def test_format_for_krona_improper_rank(): """test format for krona""" # make gather results - # make mini taxonomy + # make mini taxonomy gA_tax = ("gA", "a;b") gB_tax = ("gB", "a;c") - taxD = make_mini_taxonomy([gA_tax,gB_tax]) + taxD = make_mini_taxonomy([gA_tax, gB_tax]) - gather_results = [{'query_name': 'queryA', 'name': 'gA', 'f_unique_weighted': 0.2,'f_unique_to_query': 0.2,'unique_intersect_bp': 50}, - {'query_name': 'queryA', "name": 'gB', 'f_unique_weighted': 0.3,'f_unique_to_query': 0.3,'unique_intersect_bp': 30}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, summarize=True, single_query=True) + gather_results = [ + { + "query_name": "queryA", + "name": "gA", + "f_unique_weighted": 0.2, + "f_unique_to_query": 0.2, + "unique_intersect_bp": 50, + }, + { + "query_name": "queryA", + "name": "gB", + "f_unique_weighted": 0.3, + "f_unique_to_query": 0.3, 
+ "unique_intersect_bp": 30, + }, + ] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, summarize=True, single_query=True + ) with pytest.raises(ValueError) as exc: - format_for_krona([q_res], 'NotARank') + format_for_krona([q_res], "NotARank") print(str(exc)) assert "Rank 'NotARank' not present in summarized ranks." in str(exc) @@ -897,33 +1437,57 @@ def test_format_for_krona_improper_rank(): def test_format_for_krona_summarization_two_queries(): """test format for krona with multiple queries (normalize by n_queries)""" # make gather results - # make mini taxonomy + # make mini taxonomy gA_tax = ("gA", "a;b") gB_tax = ("gB", "a;c") - taxD = make_mini_taxonomy([gA_tax,gB_tax]) + taxD = make_mini_taxonomy([gA_tax, gB_tax]) - gather_results = [{'query_name': 'queryA', 'name': 'gA', 'f_unique_weighted': 0.2,'f_unique_to_query': 0.2,'unique_intersect_bp': 50}, - {'query_name': 'queryA', "name": 'gB', 'f_unique_weighted': 0.3,'f_unique_to_query': 0.3,'unique_intersect_bp': 30}, - {'query_name': 'queryB', "name": 'gB', 'f_unique_weighted': 0.5,'f_unique_to_query': 0.5,'unique_intersect_bp': 50}] + gather_results = [ + { + "query_name": "queryA", + "name": "gA", + "f_unique_weighted": 0.2, + "f_unique_to_query": 0.2, + "unique_intersect_bp": 50, + }, + { + "query_name": "queryA", + "name": "gB", + "f_unique_weighted": 0.3, + "f_unique_to_query": 0.3, + "unique_intersect_bp": 30, + }, + { + "query_name": "queryB", + "name": "gB", + "f_unique_weighted": 0.5, + "f_unique_to_query": 0.5, + "unique_intersect_bp": 50, + }, + ] gres = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, summarize=True) - kres, header = format_for_krona(list(gres.values()), 'superkingdom') - assert header == ['fraction', 'superkingdom'] + kres, header = format_for_krona(list(gres.values()), "superkingdom") + assert header == ["fraction", "superkingdom"] print("krona_res: ", kres) - assert kres == [(0.5, 'a'), (0.5, 'unclassified')] - kres, header = format_for_krona(list(gres.values()), 'phylum') - assert header == ['fraction', 'superkingdom', 'phylum'] - assert kres == [(0.4, 'a', 'c'), (0.1, 'a', 'b'), (0.5, 'unclassified', 'unclassified')] + assert kres == [(0.5, "a"), (0.5, "unclassified")] + kres, header = format_for_krona(list(gres.values()), "phylum") + assert header == ["fraction", "superkingdom", "phylum"] + assert kres == [ + (0.4, "a", "c"), + (0.1, "a", "b"), + (0.5, "unclassified", "unclassified"), + ] def test_write_krona(runtmp): """test two matches, equal f_unique_to_query""" - krona_results = [(0.5, 'a', 'b', 'c'), (0.5, 'a', 'b', 'd')] - header = ['fraction', 'superkingdom', 'phylum', 'class'] - outk= runtmp.output("outkrona.tsv") - with open(outk, 'w') as out_fp: + krona_results = [(0.5, "a", "b", "c"), (0.5, "a", "b", "d")] + header = ["fraction", "superkingdom", "phylum", "class"] + outk = runtmp.output("outkrona.tsv") + with open(outk, "w") as out_fp: write_krona(header, krona_results, out_fp) - kr = [x.strip().split('\t') for x in Path(outk).read_text().splitlines()] + kr = [x.strip().split("\t") for x in Path(outk).read_text().splitlines()] print("krona_results_from_file: \n", kr) assert kr[0] == ["fraction", "superkingdom", "phylum", "class"] assert kr[1] == ["0.5", "a", "b", "c"] @@ -931,65 +1495,73 @@ def test_write_krona(runtmp): def test_write_lineage_sample_frac(runtmp): - outfrac = runtmp.output('outfrac.csv') - sample_names = ['sample1', 'sample2'] - sk_linD = {'a': {'sample1': '0.500' ,'sample2': '0.700'}} - with open(outfrac, 'w') as out_fp: + outfrac = 
runtmp.output("outfrac.csv") + sample_names = ["sample1", "sample2"] + sk_linD = {"a": {"sample1": "0.500", "sample2": "0.700"}} + with open(outfrac, "w") as out_fp: write_lineage_sample_frac(sample_names, sk_linD, out_fp) - frac_lines = [x.strip().split('\t') for x in Path(outfrac).read_text().splitlines()] + frac_lines = [x.strip().split("\t") for x in Path(outfrac).read_text().splitlines()] print("csv_lines: ", frac_lines) - assert frac_lines == [['lineage', 'sample1', 'sample2'], ['a', '0.500', '0.700']] + assert frac_lines == [["lineage", "sample1", "sample2"], ["a", "0.500", "0.700"]] - phy_linD = {'a;b': {'sample1': '0.500'}, 'a;c': {'sample2': '0.700'}} - with open(outfrac, 'w') as out_fp: + phy_linD = {"a;b": {"sample1": "0.500"}, "a;c": {"sample2": "0.700"}} + with open(outfrac, "w") as out_fp: write_lineage_sample_frac(sample_names, phy_linD, out_fp) - frac_lines = [x.strip().split('\t') for x in Path(outfrac).read_text().splitlines()] + frac_lines = [x.strip().split("\t") for x in Path(outfrac).read_text().splitlines()] print("csv_lines: ", frac_lines) - assert frac_lines == [['lineage', 'sample1', 'sample2'], ['a;b', '0.500', '0'], ['a;c', '0', '0.700']] + assert frac_lines == [ + ["lineage", "sample1", "sample2"], + ["a;b", "0.500", "0"], + ["a;c", "0", "0.700"], + ] def test_write_lineage_sample_frac_format_lineage(runtmp): - outfrac = runtmp.output('outfrac.csv') - sample_names = ['sample1', 'sample2'] - sk_lineage='a' + outfrac = runtmp.output("outfrac.csv") + sample_names = ["sample1", "sample2"] + sk_lineage = "a" print(sk_lineage) - sk_linD = {sk_lineage: {'sample1': '0.500' ,'sample2': '0.700'}} - with open(outfrac, 'w') as out_fp: + sk_linD = {sk_lineage: {"sample1": "0.500", "sample2": "0.700"}} + with open(outfrac, "w") as out_fp: write_lineage_sample_frac(sample_names, sk_linD, out_fp) - frac_lines = [x.strip().split('\t') for x in Path(outfrac).read_text().splitlines()] + frac_lines = [x.strip().split("\t") for x in Path(outfrac).read_text().splitlines()] print("csv_lines: ", frac_lines) - assert frac_lines == [['lineage', 'sample1', 'sample2'], ['a', '0.500', '0.700']] + assert frac_lines == [["lineage", "sample1", "sample2"], ["a", "0.500", "0.700"]] - phy_lineage='a;b' + phy_lineage = "a;b" print(phy_lineage) - phy2_lineage = 'a;c' + phy2_lineage = "a;c" print(phy2_lineage) - phy_linD = {phy_lineage: {'sample1': '0.500'}, phy2_lineage: {'sample2': '0.700'}} - with open(outfrac, 'w') as out_fp: + phy_linD = {phy_lineage: {"sample1": "0.500"}, phy2_lineage: {"sample2": "0.700"}} + with open(outfrac, "w") as out_fp: write_lineage_sample_frac(sample_names, phy_linD, out_fp) - frac_lines = [x.strip().split('\t') for x in Path(outfrac).read_text().splitlines()] + frac_lines = [x.strip().split("\t") for x in Path(outfrac).read_text().splitlines()] print("csv_lines: ", frac_lines) - assert frac_lines == [['lineage', 'sample1', 'sample2'], ['a;b', '0.500', '0'], ['a;c', '0', '0.700']] + assert frac_lines == [ + ["lineage", "sample1", "sample2"], + ["a;b", "0.500", "0"], + ["a;c", "0", "0.700"], + ] def test_tax_multi_load_files(runtmp): # test loading various good and bad files - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - taxonomy_csv2 = utils.get_test_data('tax/test-strain.taxonomy.csv') - badcsv = utils.get_test_data('tax/47+63_x_gtdb-rs202.gather.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + taxonomy_csv2 = utils.get_test_data("tax/test-strain.taxonomy.csv") + badcsv = utils.get_test_data("tax/47+63_x_gtdb-rs202.gather.csv") 
db = MultiLineageDB.load([taxonomy_csv]) assert len(db) == 6 - assert 'strain' not in db.available_ranks + assert "strain" not in db.available_ranks db = MultiLineageDB.load([taxonomy_csv2]) assert len(db) == 6 - assert 'strain' in db.available_ranks - assert db['GCF_001881345.1'][0].rank == 'superkingdom' + assert "strain" in db.available_ranks + assert db["GCF_001881345.1"][0].rank == "superkingdom" # load a string rather than a list with pytest.raises(TypeError): @@ -1001,75 +1573,83 @@ def test_tax_multi_load_files(runtmp): # load a directory with pytest.raises(ValueError): - MultiLineageDB.load([runtmp.output('')]) + MultiLineageDB.load([runtmp.output("")]) # file does not exist with pytest.raises(ValueError): - MultiLineageDB.load([runtmp.output('no-such-file')]) + MultiLineageDB.load([runtmp.output("no-such-file")]) def test_tax_sql_load_new_file(runtmp): # test loading a newer-format sql file with sourmash_internals table - taxonomy_db = utils.get_test_data('sqlite/test.taxonomy.db') + taxonomy_db = utils.get_test_data("sqlite/test.taxonomy.db") db = MultiLineageDB.load([taxonomy_db]) print(list(db.keys())) assert len(db) == 6 - assert 'strain' not in db.available_ranks - assert db['GCF_001881345'][0].rank == 'superkingdom' + assert "strain" not in db.available_ranks + assert db["GCF_001881345"][0].rank == "superkingdom" def test_tax_multi_load_files_shadowed(runtmp): # test loading various good and bad files - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - taxonomy_csv2 = utils.get_test_data('tax/test-strain.taxonomy.csv') - taxonomy_db = utils.get_test_data('tax/test.taxonomy.db') - - db = MultiLineageDB.load([taxonomy_csv, taxonomy_csv2, taxonomy_db], - keep_full_identifiers=False, - keep_identifier_versions=False) + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + taxonomy_csv2 = utils.get_test_data("tax/test-strain.taxonomy.csv") + taxonomy_db = utils.get_test_data("tax/test.taxonomy.db") + + db = MultiLineageDB.load( + [taxonomy_csv, taxonomy_csv2, taxonomy_db], + keep_full_identifiers=False, + keep_identifier_versions=False, + ) assert len(db.shadowed_identifiers()) == 6 # we should have everything including strain assert set(RankLineageInfo().taxlist) == set(db.available_ranks) - db = MultiLineageDB.load([taxonomy_csv, taxonomy_db], - keep_full_identifiers=False, - keep_identifier_versions=False) + db = MultiLineageDB.load( + [taxonomy_csv, taxonomy_db], + keep_full_identifiers=False, + keep_identifier_versions=False, + ) assert len(db.shadowed_identifiers()) == 6 assert set(RankLineageInfo().taxlist[:-1]) == set(db.available_ranks) def test_tax_multi_save_files(runtmp, keep_identifiers, keep_versions): # test save - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") if keep_identifiers and not keep_versions: with pytest.raises(ValueError): - db = MultiLineageDB.load([taxonomy_csv], - keep_full_identifiers=keep_identifiers, - keep_identifier_versions=keep_versions) + db = MultiLineageDB.load( + [taxonomy_csv], + keep_full_identifiers=keep_identifiers, + keep_identifier_versions=keep_versions, + ) return - db = MultiLineageDB.load([taxonomy_csv], - keep_full_identifiers=keep_identifiers, - keep_identifier_versions=keep_versions) + db = MultiLineageDB.load( + [taxonomy_csv], + keep_full_identifiers=keep_identifiers, + keep_identifier_versions=keep_versions, + ) - out_db = runtmp.output('out.db') - out_csv = runtmp.output('out.csv') - out2_csv = runtmp.output('out2.csv') + out_db 
= runtmp.output("out.db") + out_csv = runtmp.output("out.csv") + out2_csv = runtmp.output("out2.csv") # can't save to fp with sql - with open(out_csv, 'wt') as fp: + with open(out_csv, "w") as fp: with pytest.raises(ValueError): - db.save(fp, 'sql') + db.save(fp, "sql") # these should all work... - with open(out_csv, 'wt') as fp: - db.save(fp, 'csv') + with open(out_csv, "w") as fp: + db.save(fp, "csv") - db.save(out2_csv, 'csv') - db.save(out_db, 'sql') + db.save(out2_csv, "csv") + db.save(out_db, "sql") # ...and be equal db1 = db.load([out_db]) @@ -1078,19 +1658,20 @@ def test_tax_multi_save_files(runtmp, keep_identifiers, keep_versions): def strip_strain(it): for k, v in it: - if v[-1].rank == 'strain': + if v[-1].rank == "strain": v = v[:-1] yield k, v import pprint + db_items = list(strip_strain(db.items())) db1_items = list(strip_strain(db1.items())) db2_items = list(strip_strain(db2.items())) db3_items = list(strip_strain(db3.items())) pprint.pprint(db_items) - print('XXX') + print("XXX") pprint.pprint(list(db1_items)) - print('XXX') + print("XXX") pprint.pprint(list(db2_items)) assert set(db_items) == set(db1_items) @@ -1100,18 +1681,18 @@ def strip_strain(it): def test_lineage_db_csv_load(runtmp): # test LineageDB.load - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - taxonomy_csv2 = utils.get_test_data('tax/test-strain.taxonomy.csv') - badcsv = utils.get_test_data('tax/47+63_x_gtdb-rs202.gather.csv') - badcsv2 = utils.get_test_data('tax/test-missing-ranks.taxonomy.csv') + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") + taxonomy_csv2 = utils.get_test_data("tax/test-strain.taxonomy.csv") + badcsv = utils.get_test_data("tax/47+63_x_gtdb-rs202.gather.csv") + badcsv2 = utils.get_test_data("tax/test-missing-ranks.taxonomy.csv") db = LineageDB.load(taxonomy_csv) assert len(db) == 6 - assert 'strain' not in db.available_ranks + assert "strain" not in db.available_ranks db = LineageDB.load(taxonomy_csv2) assert len(db) == 6 - assert 'strain' in db.available_ranks + assert "strain" in db.available_ranks # load the wrong kind of csv with pytest.raises(ValueError): @@ -1123,32 +1704,32 @@ def test_lineage_db_csv_load(runtmp): # load a directory with pytest.raises(ValueError): - LineageDB.load(runtmp.output('')) + LineageDB.load(runtmp.output("")) # file does not exist with pytest.raises(ValueError): - LineageDB.load(runtmp.output('no-such-file')) + LineageDB.load(runtmp.output("no-such-file")) # construct a CSV with bad headers - with open(runtmp.output('xxx.csv'), 'w', newline="") as fp: - fp.write('x,y,z\n') + with open(runtmp.output("xxx.csv"), "w", newline="") as fp: + fp.write("x,y,z\n") with pytest.raises(ValueError): - LineageDB.load(runtmp.output('xxx.csv')) + LineageDB.load(runtmp.output("xxx.csv")) def test_lineage_db_sql_load(runtmp): # test LineageDB_sqlite.load - taxonomy_db = utils.get_test_data('tax/test.taxonomy.db') - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_db = utils.get_test_data("tax/test.taxonomy.db") + taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv") db = LineageDB_Sqlite.load(taxonomy_db) assert bool(db) assert len(db) == 6 db.available_ranks - assert 'strain' not in db.available_ranks - assert db['GCF_001881345'][0].rank == 'superkingdom' + assert "strain" not in db.available_ranks + assert db["GCF_001881345"][0].rank == "superkingdom" with pytest.raises(KeyError): - db['foo'] + db["foo"] # load any kind of CSV with pytest.raises(ValueError): @@ -1156,57 +1737,63 @@ def 
test_lineage_db_sql_load(runtmp): # load a directory with pytest.raises(ValueError): - LineageDB_Sqlite.load(runtmp.output('')) + LineageDB_Sqlite.load(runtmp.output("")) # file does not exist with pytest.raises(ValueError): - LineageDB_Sqlite.load(runtmp.output('no-such-file')) + LineageDB_Sqlite.load(runtmp.output("no-such-file")) def test_LineagePair(): - lin = LineagePair(rank="rank1", name='name1') + lin = LineagePair(rank="rank1", name="name1") print(lin) - assert lin.rank=="rank1" - assert lin.name =="name1" - assert lin.taxid==None + assert lin.rank == "rank1" + assert lin.name == "name1" + assert lin.taxid is None def test_LineagePair_1(): - lin = LineagePair(rank="rank1", name='name1', taxid=1) - assert lin.rank=="rank1" - assert lin.name =="name1" - assert lin.taxid==1 + lin = LineagePair(rank="rank1", name="name1", taxid=1) + assert lin.rank == "rank1" + assert lin.name == "name1" + assert lin.taxid == 1 print(lin) def test_BaseLineageInfo_init_empty(): - ranks=["A", "B", "C"] + ranks = ["A", "B", "C"] taxinf = BaseLineageInfo(ranks=ranks) print(taxinf.lineage) print(taxinf.lineage_str) - assert taxinf.zip_lineage()== ['', '', ''] # this is a bit odd, but it's what preserves empty ranks... + assert taxinf.zip_lineage() == [ + "", + "", + "", + ] # this is a bit odd, but it's what preserves empty ranks... print(taxinf.filled_lineage) assert taxinf.filled_lineage == () - assert taxinf.lowest_lineage_name == None - assert taxinf.lowest_lineage_taxid == None + assert taxinf.lowest_lineage_name is None + assert taxinf.lowest_lineage_taxid is None assert taxinf.filled_ranks == () - assert taxinf.name_at_rank("A") == None - assert taxinf.lowest_rank == None + assert taxinf.name_at_rank("A") is None + assert taxinf.lowest_rank is None assert taxinf.display_lineage() == "" assert taxinf.display_lineage(null_as_unclassified=True) == "unclassified" def test_BaseLineageInfo_init_lineage_str(): x = "a;b;c" - ranks=["A", "B", "C"] + ranks = ["A", "B", "C"] taxinf = BaseLineageInfo(lineage_str=x, ranks=ranks) print(taxinf.lineage) print(taxinf.lineage_str) - assert taxinf.zip_lineage()== ['a', 'b', 'c'] + assert taxinf.zip_lineage() == ["a", "b", "c"] print(taxinf.filled_lineage) - assert taxinf.filled_lineage == (LineagePair(rank='A', name='a', taxid=None), - LineagePair(rank='B', name='b', taxid=None), - LineagePair(rank='C', name='c', taxid=None)) + assert taxinf.filled_lineage == ( + LineagePair(rank="A", name="a", taxid=None), + LineagePair(rank="B", name="b", taxid=None), + LineagePair(rank="C", name="c", taxid=None), + ) assert taxinf.lowest_lineage_name == "c" assert taxinf.lowest_rank == "C" assert taxinf.name_at_rank("A") == "a" @@ -1214,37 +1801,39 @@ def test_BaseLineageInfo_init_lineage_str(): def test_BaseLineageInfo_init_lineage_str_comma_sep(): x = "a,b,c" - ranks=["A", "B", "C"] + ranks = ["A", "B", "C"] taxinf = BaseLineageInfo(lineage_str=x, ranks=ranks) print(taxinf.lineage) print(taxinf.lineage_str) - assert taxinf.zip_lineage()== ['a', 'b', 'c'] + assert taxinf.zip_lineage() == ["a", "b", "c"] print(taxinf.filled_lineage) assert taxinf.lowest_lineage_name == "c" def test_BaseLineageInfo_init_lineage_tups(): - ranks=["A", "B", "C"] - lin_tups = (LineagePair(rank="A", name='a'), LineagePair(rank="C", name='b')) + ranks = ["A", "B", "C"] + lin_tups = (LineagePair(rank="A", name="a"), LineagePair(rank="C", name="b")) taxinf = BaseLineageInfo(lineage=lin_tups, ranks=ranks) print(taxinf.lineage) print(taxinf.lineage_str) - assert taxinf.zip_lineage()== ['a', '', 'b'] + assert 
taxinf.zip_lineage() == ["a", "", "b"] def test_BaseLineageInfo_init_lca_lineage_tups(): - ranks=["A", "B", "C"] - lin_tups = (LineagePair(rank="A", name='a'), LineagePair(rank="C", name='b')) + ranks = ["A", "B", "C"] + lin_tups = (LineagePair(rank="A", name="a"), LineagePair(rank="C", name="b")) taxinf = BaseLineageInfo(lineage=lin_tups, ranks=ranks) print(taxinf.lineage) print(taxinf.lineage_str) - assert taxinf.zip_lineage()== ['a', '', 'b'] + assert taxinf.zip_lineage() == ["a", "", "b"] def test_BaseLineageInfo_init_no_ranks(): x = "a;b;c" - rankD = {"superkingdom": "a", "phylum": "b", "class": "c"} - lin_tups = (LineagePair(rank="rank2", name='name1'), LineagePair(rank="rank1", name='name1')) + lin_tups = ( + LineagePair(rank="rank2", name="name1"), + LineagePair(rank="rank1", name="name1"), + ) with pytest.raises(TypeError) as exc: BaseLineageInfo(lineage_str=x) print(exc) @@ -1256,9 +1845,8 @@ def test_BaseLineageInfo_init_no_ranks(): def test_BaseLineageInfo_init_with_wrong_ranks(): - ranks=["A", "B", "C"] - lin_tups = [LineagePair(rank="rank1", name='name1')] - linD = {"rank1": "a"} + ranks = ["A", "B", "C"] + lin_tups = [LineagePair(rank="rank1", name="name1")] with pytest.raises(ValueError) as exc: BaseLineageInfo(lineage=lin_tups, ranks=ranks) print(str(exc)) @@ -1266,7 +1854,7 @@ def test_BaseLineageInfo_init_with_wrong_ranks(): def test_BaseLineageInfo_init_not_lineagepair(): - ranks=["A", "B", "C"] + ranks = ["A", "B", "C"] lin_tups = (("rank1", "name1"),) with pytest.raises(ValueError) as exc: BaseLineageInfo(lineage=lin_tups, ranks=ranks) @@ -1276,7 +1864,16 @@ def test_BaseLineageInfo_init_not_lineagepair(): def test_RankLineageInfo_taxlist(): taxinf = RankLineageInfo() - taxranks = ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain') + taxranks = ( + "superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + "strain", + ) assert taxinf.taxlist == taxranks assert taxinf.ascending_taxlist == taxranks[::-1] @@ -1286,14 +1883,14 @@ def test_RankLineageInfo_init_lineage_str(): taxinf = RankLineageInfo(lineage_str=x) print(taxinf.lineage) print(taxinf.lineage_str) - assert taxinf.zip_lineage()== ['a', 'b', 'c', '', '', '', '', ''] + assert taxinf.zip_lineage() == ["a", "b", "c", "", "", "", "", ""] def test_LINLineageInfo_init_empty(): taxinf = LINLineageInfo() assert taxinf.n_lin_positions == 0 - assert taxinf.zip_lineage()== [] - assert taxinf.display_lineage()== "" + assert taxinf.zip_lineage() == [] + assert taxinf.display_lineage() == "" assert taxinf.filled_ranks == () assert taxinf.n_filled_pos == 0 @@ -1304,7 +1901,7 @@ def test_LINLineageInfo_init_n_pos(): print(taxinf.lineage) print(taxinf.lineage_str) assert taxinf.n_lin_positions == 5 - assert taxinf.zip_lineage()== ['', '', '', '', ''] + assert taxinf.zip_lineage() == ["", "", "", "", ""] assert taxinf.filled_ranks == () assert taxinf.n_filled_pos == 0 @@ -1316,8 +1913,8 @@ def test_LINLineageInfo_init_n_pos_and_lineage_str(): print(taxinf.lineage) print(taxinf.lineage_str) assert taxinf.n_lin_positions == 5 - assert taxinf.zip_lineage()== ['0', '0', '1', '', ''] - assert taxinf.filled_ranks == ("0","1","2") + assert taxinf.zip_lineage() == ["0", "0", "1", "", ""] + assert taxinf.filled_ranks == ("0", "1", "2") assert taxinf.n_filled_pos == 3 @@ -1327,7 +1924,10 @@ def test_LINLineageInfo_init_n_pos_and_lineage_str_fail(): with pytest.raises(ValueError) as exc: LINLineageInfo(lineage_str=x, n_lin_positions=n_pos) print(str(exc)) - assert "Provided 
'n_lin_positions' has fewer positions than provided 'lineage_str'." in str(exc) + assert ( + "Provided 'n_lin_positions' has fewer positions than provided 'lineage_str'." + in str(exc) + ) def test_LINLineageInfo_init_lineage_str_only(): @@ -1336,8 +1936,8 @@ def test_LINLineageInfo_init_lineage_str_only(): print(taxinf.lineage) print(taxinf.lineage_str) assert taxinf.n_lin_positions == 3 - assert taxinf.zip_lineage()== ['0', '0', '1'] - assert taxinf.filled_ranks == ("0","1","2") + assert taxinf.zip_lineage() == ["0", "0", "1"] + assert taxinf.filled_ranks == ("0", "1", "2") assert taxinf.n_filled_pos == 3 @@ -1350,12 +1950,15 @@ def test_LINLineageInfo_init_not_lineagepair(): def test_LINLineageInfo_init_lineagepair(): - lin_tups = (LineagePair("rank1", "name1"), LineagePair("rank2", None),) + lin_tups = ( + LineagePair("rank1", "name1"), + LineagePair("rank2", None), + ) taxinf = LINLineageInfo(lineage=lin_tups) print(taxinf.lineage) assert taxinf.n_lin_positions == 2 - assert taxinf.zip_lineage()== ["name1", ""] - assert taxinf.zip_lineage(truncate_empty=True)== ["name1"] + assert taxinf.zip_lineage() == ["name1", ""] + assert taxinf.zip_lineage(truncate_empty=True) == ["name1"] assert taxinf.filled_ranks == ("rank1",) assert taxinf.ranks == ("rank1", "rank2") assert taxinf.n_filled_pos == 1 @@ -1363,7 +1966,7 @@ def test_LINLineageInfo_init_lineagepair(): def test_lca_LINLineageInfo_diff_n_pos(): x = "0;0;1" - y = '0' + y = "0" lin1 = LINLineageInfo(lineage_str=x) lin2 = LINLineageInfo(lineage_str=y) assert lin1.is_compatible(lin2) @@ -1376,30 +1979,30 @@ def test_lca_LINLineageInfo_diff_n_pos(): def test_lca_LINLineageInfo_no_lca(): x = "0;0;1" - y = '12;0;1' + y = "12;0;1" lin1 = LINLineageInfo(lineage_str=x) lin2 = LINLineageInfo(lineage_str=y) assert lin1.is_compatible(lin2) assert lin2.is_compatible(lin1) lca_from_lin1 = lin1.find_lca(lin2) lca_from_lin2 = lin2.find_lca(lin1) - assert lca_from_lin1 == lca_from_lin2 == None + assert lca_from_lin1 == lca_from_lin2 is None def test_lca_RankLineageInfo_no_lca(): x = "a;b;c" - y = 'd;e;f;g' + y = "d;e;f;g" lin1 = RankLineageInfo(lineage_str=x) lin2 = RankLineageInfo(lineage_str=y) assert lin1.is_compatible(lin2) assert lin2.is_compatible(lin1) lca_from_lin1 = lin1.find_lca(lin2) lca_from_lin2 = lin2.find_lca(lin1) - assert lca_from_lin1 == lca_from_lin2 == None + assert lca_from_lin1 == lca_from_lin2 is None def test_incompatibility_LINLineageInfo_RankLineageInfo(): - x="a;b;c" + x = "a;b;c" lin1 = RankLineageInfo(lineage_str=x) lin2 = LINLineageInfo(lineage_str=x) assert not lin1.is_compatible(lin2) @@ -1408,64 +2011,75 @@ def test_incompatibility_LINLineageInfo_RankLineageInfo(): def test_RankLineageInfo_init_lineage_str_with_ranks_as_list(): x = "a;b;c" - taxranks = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'] + taxranks = [ + "superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + ] taxinf = RankLineageInfo(lineage_str=x, ranks=taxranks) print(taxinf.lineage) print(taxinf.lineage_str) - assert taxinf.zip_lineage()== ['a', 'b', 'c', '', '', '', ''] + assert taxinf.zip_lineage() == ["a", "b", "c", "", "", "", ""] def test_RankLineageInfo_init_lineage_tups(): - x = (LineagePair(rank="superkingdom", name='a'), LineagePair(rank="phylum", name='b')) + x = ( + LineagePair(rank="superkingdom", name="a"), + LineagePair(rank="phylum", name="b"), + ) taxinf = RankLineageInfo(lineage=x) print(taxinf.lineage) print(taxinf.lineage_str) - assert taxinf.zip_lineage()== ['a', 'b', '', 
'', '', '', '', ''] + assert taxinf.zip_lineage() == ["a", "b", "", "", "", "", "", ""] def test_RankLineageInfo_init_lineage_dict_fail(): - ranks=["A", "B", "C"] - lin_tups = (LineagePair(rank="A", name='a'), LineagePair(rank="C", name='b')) + ranks = ["A", "B", "C"] + lin_tups = (LineagePair(rank="A", name="a"), LineagePair(rank="C", name="b")) with pytest.raises(ValueError) as exc: - taxinf = RankLineageInfo(ranks=ranks, lineage_dict=lin_tups) + RankLineageInfo(ranks=ranks, lineage_dict=lin_tups) print(str(exc)) assert "is not dictionary" in str(exc) def test_RankLineageInfo_init_lineage_dict(): - x = {'rank1': 'name1', 'rank2': 'name2'} + x = {"rank1": "name1", "rank2": "name2"} taxinf = RankLineageInfo(lineage_dict=x, ranks=["rank1", "rank2"]) print("ranks: ", taxinf.ranks) print("lineage: ", taxinf.lineage) print("zipped lineage: ", taxinf.zip_lineage()) - assert taxinf.zip_lineage()== ['name1', 'name2'] + assert taxinf.zip_lineage() == ["name1", "name2"] def test_RankLineageInfo_init_lineage_dict_default_ranks(): - x = {"superkingdom":'a',"phylum":'b'} + x = {"superkingdom": "a", "phylum": "b"} taxinf = RankLineageInfo(lineage_dict=x) print(taxinf.lineage) print(taxinf.lineage_str) - assert taxinf.zip_lineage()== ['a', 'b', '', '', '', '', '', ''] + assert taxinf.zip_lineage() == ["a", "b", "", "", "", "", "", ""] def test_RankLineageInfo_init_lineage_dict_withtaxpath(): - x = {'rank1': 'name1', 'rank2': 'name2', 'taxpath': "1|2"} + x = {"rank1": "name1", "rank2": "name2", "taxpath": "1|2"} taxinf = RankLineageInfo(lineage_dict=x, ranks=["rank1", "rank2"]) print("ranks: ", taxinf.ranks) print("lineage: ", taxinf.lineage) print("zipped lineage: ", taxinf.zip_lineage()) print("zipped taxids: ", taxinf.zip_taxid()) - assert taxinf.zip_lineage()== ['name1', 'name2'] - assert taxinf.zip_taxid()== ['1', '2'] + assert taxinf.zip_lineage() == ["name1", "name2"] + assert taxinf.zip_taxid() == ["1", "2"] assert taxinf.lowest_lineage_taxid == "2" assert taxinf.lowest_lineage_name == "name2" def test_RankLineageInfo_init_lineage_str_lineage_dict_test_eq(): x = "a;b;c" - ranks=["A", "B", "C"] + ranks = ["A", "B", "C"] rankD = {"A": "a", "B": "b", "C": "c"} lin1 = RankLineageInfo(lineage_str=x, ranks=ranks) lin2 = RankLineageInfo(lineage_dict=rankD, ranks=ranks) @@ -1473,56 +2087,56 @@ def test_RankLineageInfo_init_lineage_str_lineage_dict_test_eq(): def test_RankLineageInfo_init_lineage_dict_missing_rank(): - x = {'superkingdom': 'name1', 'class': 'name2'} + x = {"superkingdom": "name1", "class": "name2"} taxinf = RankLineageInfo(lineage_dict=x) print("ranks: ", taxinf.ranks) print("lineage: ", taxinf.lineage) print("zipped lineage: ", taxinf.zip_lineage()) - assert taxinf.zip_lineage()== ['name1', '', 'name2', '', '', '', '', ''] - assert taxinf.zip_lineage(truncate_empty=True)== ['name1', '', 'name2'] + assert taxinf.zip_lineage() == ["name1", "", "name2", "", "", "", "", ""] + assert taxinf.zip_lineage(truncate_empty=True) == ["name1", "", "name2"] def test_RankLineageInfo_init_lineage_dict_missing_rank_with_taxpath(): - x = {'superkingdom': 'name1', 'class': 'name2', 'taxpath': '1||2'} + x = {"superkingdom": "name1", "class": "name2", "taxpath": "1||2"} taxinf = RankLineageInfo(lineage_dict=x) print("ranks: ", taxinf.ranks) print("lineage: ", taxinf.lineage) print("zipped lineage: ", taxinf.zip_lineage()) - assert taxinf.zip_lineage()== ['name1', '', 'name2', '', '', '', '', ''] - assert taxinf.zip_taxid()== ['1', '', '2', '', '', '', '', ''] + assert taxinf.zip_lineage() == ["name1", "", 
"name2", "", "", "", "", ""] + assert taxinf.zip_taxid() == ["1", "", "2", "", "", "", "", ""] def test_RankLineageInfo_init_lineage_dict_name_taxpath_mismatch(): # If there's no name, we don't report the taxpath, because lineage is not "filled". # Is this desired behavior? - x = {'superkingdom': 'name1', 'taxpath': '1||2'} + x = {"superkingdom": "name1", "taxpath": "1||2"} taxinf = RankLineageInfo(lineage_dict=x) print("ranks: ", taxinf.ranks) print("lineage: ", taxinf.lineage) print("zipped lineage: ", taxinf.zip_lineage()) - assert taxinf.zip_lineage()== ['name1', '', '', '', '', '', '', ''] - assert taxinf.zip_taxid()== ['1', '', '', '', '', '', '', ''] + assert taxinf.zip_lineage() == ["name1", "", "", "", "", "", "", ""] + assert taxinf.zip_taxid() == ["1", "", "", "", "", "", "", ""] def test_RankLineageInfo_init_lineage_dict_name_taxpath_missing_taxids(): # If there's no name, we don't report the taxpath, because lineage is not "filled". # Is this desired behavior? - x = {'superkingdom': 'name1', 'phylum': "name2", "class": "name3", 'taxpath': '|2'} + x = {"superkingdom": "name1", "phylum": "name2", "class": "name3", "taxpath": "|2"} taxinf = RankLineageInfo(lineage_dict=x) print("ranks: ", taxinf.ranks) print("lineage: ", taxinf.lineage) print("zipped lineage: ", taxinf.zip_lineage()) print("zipped taxids: ", taxinf.zip_taxid()) - assert taxinf.zip_lineage()== ['name1', 'name2', 'name3', '', '', '', '', ''] - assert taxinf.zip_taxid()== ['', '2', '', '', '', '', '', ''] + assert taxinf.zip_lineage() == ["name1", "name2", "name3", "", "", "", "", ""] + assert taxinf.zip_taxid() == ["", "2", "", "", "", "", "", ""] def test_RankLineageInfo_init_lineage_dict_taxpath_too_long(): - x = {'superkingdom': 'name1', 'class': 'name2', 'taxpath': '1||2||||||||||'} + x = {"superkingdom": "name1", "class": "name2", "taxpath": "1||2||||||||||"} with pytest.raises(ValueError) as exc: RankLineageInfo(lineage_dict=x) print(str(exc)) - assert f"Number of NCBI taxids (13) exceeds number of ranks (8)" in str(exc) + assert "Number of NCBI taxids (13) exceeds number of ranks (8)" in str(exc) def test_RankLineageInfo_init_lineage_str_lineage_dict_test_eq(): @@ -1540,7 +2154,7 @@ def test_RankLineageInfo_init_lineage_str_1_truncate(): taxinf = RankLineageInfo(lineage_str=x) print(taxinf.lineage) print(taxinf.lineage_str) - assert taxinf.zip_lineage(truncate_empty=True)== ['a', 'b', 'c'] + assert taxinf.zip_lineage(truncate_empty=True) == ["a", "b", "c"] def test_RankLineageInfo_init_lineage_str_2(): @@ -1548,7 +2162,7 @@ def test_RankLineageInfo_init_lineage_str_2(): taxinf = RankLineageInfo(lineage_str=x) print(taxinf.lineage) print(taxinf.lineage_str) - assert taxinf.zip_lineage()== ['a', 'b', '', 'c' '', '', '', '', ''] + assert taxinf.zip_lineage() == ["a", "b", "", "c" "", "", "", "", ""] def test_RankLineageInfo_init_lineage_str_2_truncate(): @@ -1556,72 +2170,92 @@ def test_RankLineageInfo_init_lineage_str_2_truncate(): taxinf = RankLineageInfo(lineage_str=x) print(taxinf.lineage) print(taxinf.lineage_str) - assert taxinf.zip_lineage(truncate_empty=True)== ['a', 'b', '', 'c'] + assert taxinf.zip_lineage(truncate_empty=True) == ["a", "b", "", "c"] def test_RankLineageInfo_init_lineage_with_incorrect_rank(): - x = [ LineagePair('superkingdom', 'a'), LineagePair("NotARank", ''), LineagePair('class', 'c') ] + x = [ + LineagePair("superkingdom", "a"), + LineagePair("NotARank", ""), + LineagePair("class", "c"), + ] with pytest.raises(ValueError) as exc: RankLineageInfo(lineage=x) print(str(exc)) - assert 
f"Rank 'NotARank' not present in " in str(exc) + assert "Rank 'NotARank' not present in " in str(exc) def test_zip_lineage_1(): - x = [ LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b') ] + x = [LineagePair("superkingdom", "a"), LineagePair("phylum", "b")] taxinf = RankLineageInfo(lineage=x) print("ranks: ", taxinf.ranks) print("zipped lineage: ", taxinf.zip_lineage()) - assert taxinf.zip_lineage() == ['a', 'b', '', '', '', '', '', ''] + assert taxinf.zip_lineage() == ["a", "b", "", "", "", "", "", ""] def test_zip_lineage_2(): - x = [ LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b') ] + x = [LineagePair("superkingdom", "a"), LineagePair("phylum", "b")] taxinf = RankLineageInfo(lineage=x) print("ranks: ", taxinf.ranks) print("zipped lineage: ", taxinf.zip_lineage(truncate_empty=True)) - assert taxinf.zip_lineage(truncate_empty=True) == ['a', 'b'] + assert taxinf.zip_lineage(truncate_empty=True) == ["a", "b"] def test_zip_lineage_3(): - x = [ LineagePair('superkingdom', 'a'), LineagePair(None, ''), LineagePair('class', 'c') ] + x = [ + LineagePair("superkingdom", "a"), + LineagePair(None, ""), + LineagePair("class", "c"), + ] taxinf = RankLineageInfo(lineage=x) - assert taxinf.zip_lineage() == ['a', '', 'c', '', '', '', '', ''] + assert taxinf.zip_lineage() == ["a", "", "c", "", "", "", "", ""] def test_zip_lineage_3_truncate(): - x = [ LineagePair('superkingdom', 'a'), LineagePair(None, ''), LineagePair('class', 'c') ] + x = [ + LineagePair("superkingdom", "a"), + LineagePair(None, ""), + LineagePair("class", "c"), + ] taxinf = RankLineageInfo(lineage=x) - assert taxinf.zip_lineage(truncate_empty=True) == ['a', '', 'c'] + assert taxinf.zip_lineage(truncate_empty=True) == ["a", "", "c"] def test_zip_lineage_4(): - x = [ LineagePair('superkingdom', 'a'), LineagePair('class', 'c') ] + x = [LineagePair("superkingdom", "a"), LineagePair("class", "c")] taxinf = RankLineageInfo(lineage=x) - assert taxinf.zip_lineage(truncate_empty=True) == ['a', '', 'c'] + assert taxinf.zip_lineage(truncate_empty=True) == ["a", "", "c"] def test_display_lineage_1(): - x = [ LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b') ] + x = [LineagePair("superkingdom", "a"), LineagePair("phylum", "b")] taxinf = RankLineageInfo(lineage=x) assert taxinf.display_lineage() == "a;b" def test_display_lineage_2(): - x = [ LineagePair('superkingdom', 'a'), LineagePair(None, ''), LineagePair('class', 'c') ] + x = [ + LineagePair("superkingdom", "a"), + LineagePair(None, ""), + LineagePair("class", "c"), + ] taxinf = RankLineageInfo(lineage=x) assert taxinf.display_lineage() == "a;;c" def test_display_taxid_1(): - x = [ LineagePair('superkingdom', 'a', 1), LineagePair('phylum', 'b', 2) ] + x = [LineagePair("superkingdom", "a", 1), LineagePair("phylum", "b", 2)] taxinf = RankLineageInfo(lineage=x) print(taxinf) assert taxinf.display_taxid() == "1;2" def test_display_taxid_2(): - x = [ LineagePair('superkingdom', 'name1', 1), LineagePair(None, ''), LineagePair ('class', 'name2',2) ] + x = [ + LineagePair("superkingdom", "name1", 1), + LineagePair(None, ""), + LineagePair("class", "name2", 2), + ] taxinf = RankLineageInfo(lineage=x) print(taxinf) assert taxinf.display_taxid() == "1;;2" @@ -1629,54 +2263,53 @@ def test_display_taxid_2(): def test_is_lineage_match_1(): # basic behavior: match at order and above, but not at family or below. 
-    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__e')
-    lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    lin1 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__e")
+    lin2 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
     print(lin1.lineage)
     assert lin1.is_compatible(lin2)
-    assert lin1.is_lineage_match(lin2, 'superkingdom')
-    assert lin2.is_lineage_match(lin1, 'superkingdom')
-    assert lin1.is_lineage_match(lin2, 'phylum')
-    assert lin2.is_lineage_match(lin1, 'phylum')
-    assert lin1.is_lineage_match(lin2, 'class')
-    assert lin2.is_lineage_match(lin1, 'class')
-    assert lin1.is_lineage_match(lin2, 'order')
-    assert lin2.is_lineage_match(lin1, 'order')
-
-    assert not lin1.is_lineage_match(lin2, 'family')
-    assert not lin2.is_lineage_match(lin1, 'family')
-    assert not lin1.is_lineage_match(lin2, 'genus')
-    assert not lin2.is_lineage_match(lin1, 'genus')
-    assert not lin1.is_lineage_match(lin2, 'species')
-    assert not lin2.is_lineage_match(lin1, 'species')
+    assert lin1.is_lineage_match(lin2, "superkingdom")
+    assert lin2.is_lineage_match(lin1, "superkingdom")
+    assert lin1.is_lineage_match(lin2, "phylum")
+    assert lin2.is_lineage_match(lin1, "phylum")
+    assert lin1.is_lineage_match(lin2, "class")
+    assert lin2.is_lineage_match(lin1, "class")
+    assert lin1.is_lineage_match(lin2, "order")
+    assert lin2.is_lineage_match(lin1, "order")
+
+    assert not lin1.is_lineage_match(lin2, "family")
+    assert not lin2.is_lineage_match(lin1, "family")
+    assert not lin1.is_lineage_match(lin2, "genus")
+    assert not lin2.is_lineage_match(lin1, "genus")
+    assert not lin1.is_lineage_match(lin2, "species")
+    assert not lin2.is_lineage_match(lin1, "species")

     lca_from_lin1 = lin1.find_lca(lin2)
     print(lca_from_lin1.display_lineage())
     lca_from_lin2 = lin2.find_lca(lin1)
     assert lca_from_lin1 == lca_from_lin2
     assert lca_from_lin1.display_lineage() == "d__a;p__b;c__c;o__d"
-

 def test_is_lineage_match_2():
     # match at family, and above, levels; no genus or species to match
-    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
-    lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    lin1 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
+    lin2 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
     assert lin1.is_compatible(lin2)
-    assert lin1.is_lineage_match(lin2, 'superkingdom')
-    assert lin2.is_lineage_match(lin1, 'superkingdom')
-    assert lin1.is_lineage_match(lin2, 'phylum')
-    assert lin2.is_lineage_match(lin1, 'phylum')
-    assert lin1.is_lineage_match(lin2, 'class')
-    assert lin2.is_lineage_match(lin1, 'class')
-    assert lin1.is_lineage_match(lin2, 'order')
-    assert lin2.is_lineage_match(lin1, 'order')
-    assert lin1.is_lineage_match(lin2, 'family')
-    assert lin2.is_lineage_match(lin1, 'family')
-
-    assert not lin1.is_lineage_match(lin2, 'genus')
-    assert not lin2.is_lineage_match(lin1, 'genus')
-    assert not lin1.is_lineage_match(lin2, 'species')
-    assert not lin2.is_lineage_match(lin1, 'species')
+    assert lin1.is_lineage_match(lin2, "superkingdom")
+    assert lin2.is_lineage_match(lin1, "superkingdom")
+    assert lin1.is_lineage_match(lin2, "phylum")
+    assert lin2.is_lineage_match(lin1, "phylum")
+    assert lin1.is_lineage_match(lin2, "class")
+    assert lin2.is_lineage_match(lin1, "class")
+    assert lin1.is_lineage_match(lin2, "order")
+    assert lin2.is_lineage_match(lin1, "order")
+    assert lin1.is_lineage_match(lin2, "family")
+    assert lin2.is_lineage_match(lin1, "family")
+
+    assert not lin1.is_lineage_match(lin2, "genus")
+    assert not lin2.is_lineage_match(lin1, "genus")
+    assert not lin1.is_lineage_match(lin2, "species")
+    assert not lin2.is_lineage_match(lin1, "species")

     lca_from_lin1 = lin1.find_lca(lin2)
     print(lca_from_lin1.display_lineage())
@@ -1688,70 +2321,79 @@ def test_is_lineage_match_2():

 def test_is_lineage_match_3():
     # one lineage is empty
     lin1 = RankLineageInfo()
-    lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    lin2 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
     assert lin1.is_compatible(lin2)
-    assert not lin1.is_lineage_match(lin2, 'superkingdom')
-    assert not lin2.is_lineage_match(lin1, 'superkingdom')
-    assert not lin1.is_lineage_match(lin2, 'phylum')
-    assert not lin2.is_lineage_match(lin1, 'phylum')
-    assert not lin1.is_lineage_match(lin2, 'class')
-    assert not lin2.is_lineage_match(lin1, 'class')
-    assert not lin1.is_lineage_match(lin2, 'order')
-    assert not lin2.is_lineage_match(lin1, 'order')
-    assert not lin1.is_lineage_match(lin2, 'family')
-    assert not lin2.is_lineage_match(lin1, 'family')
-    assert not lin1.is_lineage_match(lin2, 'genus')
-    assert not lin2.is_lineage_match(lin1, 'genus')
-    assert not lin1.is_lineage_match(lin2, 'species')
-    assert not lin2.is_lineage_match(lin1, 'species')
+    assert not lin1.is_lineage_match(lin2, "superkingdom")
+    assert not lin2.is_lineage_match(lin1, "superkingdom")
+    assert not lin1.is_lineage_match(lin2, "phylum")
+    assert not lin2.is_lineage_match(lin1, "phylum")
+    assert not lin1.is_lineage_match(lin2, "class")
+    assert not lin2.is_lineage_match(lin1, "class")
+    assert not lin1.is_lineage_match(lin2, "order")
+    assert not lin2.is_lineage_match(lin1, "order")
+    assert not lin1.is_lineage_match(lin2, "family")
+    assert not lin2.is_lineage_match(lin1, "family")
+    assert not lin1.is_lineage_match(lin2, "genus")
+    assert not lin2.is_lineage_match(lin1, "genus")
+    assert not lin1.is_lineage_match(lin2, "species")
+    assert not lin2.is_lineage_match(lin1, "species")


 def test_is_lineage_match_incorrect_ranks():
-    #test comparison with incompatible ranks
-    taxranks = ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain')
-    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__e', ranks=taxranks[::-1])
-    lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    # test comparison with incompatible ranks
+    taxranks = (
+        "superkingdom",
+        "phylum",
+        "class",
+        "order",
+        "family",
+        "genus",
+        "species",
+        "strain",
+    )
+    lin1 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__e", ranks=taxranks[::-1])
+    lin2 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
     print(lin1.lineage)
     assert not lin1.is_compatible(lin2)
     with pytest.raises(ValueError) as exc:
-        lin1.is_lineage_match(lin2, 'superkingdom')
+        lin1.is_lineage_match(lin2, "superkingdom")
     print(str(exc))
-    assert 'Cannot compare lineages from taxonomies with different ranks.' in str(exc)
+    assert "Cannot compare lineages from taxonomies with different ranks." in str(exc)


 def test_is_lineage_match_improper_rank():
-    #test comparison with incompatible ranks
-    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__e')
-    lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    # test comparison with incompatible ranks
+    lin1 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__e")
+    lin2 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
     print(lin1.lineage)
     assert lin1.is_compatible(lin2)
     with pytest.raises(ValueError) as exc:
-        lin1.is_lineage_match(lin2, 'NotARank')
+        lin1.is_lineage_match(lin2, "NotARank")
     print(str(exc))
     assert "Desired Rank 'NotARank' not available for this lineage" in str(exc)


 def test_pop_to_rank_1():
     # basic behavior - pop to order?
-    lin1 = RankLineageInfo(lineage_str='d__a;p__b;c__c;o__d')
-    lin2 = RankLineageInfo(lineage_str='d__a;p__b;c__c;o__d;f__f')
+    lin1 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d")
+    lin2 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
     print(lin1)
-    popped = lin2.pop_to_rank('order')
+    popped = lin2.pop_to_rank("order")
     print(popped)
     assert popped == lin1


 def test_pop_to_rank_2():
     # what if we're already above rank?
-    lin2 = RankLineageInfo(lineage_str='d__a;p__b;c__c;o__d;f__f')
-    print(lin2.pop_to_rank('species'))
-    assert lin2.pop_to_rank('species') == lin2
+    lin2 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
+    print(lin2.pop_to_rank("species"))
+    assert lin2.pop_to_rank("species") == lin2


 def test_pop_to_rank_rank_not_avail():
-    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    lin1 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
     with pytest.raises(ValueError) as exc:
         lin1.pop_to_rank("NotARank")
     print(str(exc))
@@ -1759,15 +2401,17 @@ def test_pop_to_rank_rank_not_avail():

 def test_lineage_at_rank_norank():
-    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    lin1 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
     with pytest.raises(TypeError) as exc:
         lin1.lineage_at_rank()
     print(str(exc))
-    assert "lineage_at_rank() missing 1 required positional argument: 'rank'" in str(exc)
+    assert "lineage_at_rank() missing 1 required positional argument: 'rank'" in str(
+        exc
+    )


 def test_lineage_at_rank_rank_not_avail():
-    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    lin1 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
     with pytest.raises(ValueError) as exc:
         lin1.lineage_at_rank("NotARank")
     print(str(exc))
@@ -1775,27 +2419,33 @@ def test_lineage_at_rank_rank_not_avail():

 def test_lineage_at_rank_1():
-    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
-    print(lin1.lineage_at_rank('superkingdom'))
-
-    assert lin1.lineage_at_rank('superkingdom') == (LineagePair(rank='superkingdom', name='d__a', taxid=None),)
-    print(lin1.lineage_at_rank('class'))
-    assert lin1.lineage_at_rank('class') == (LineagePair(rank='superkingdom', name='d__a', taxid=None),
-                                             LineagePair(rank='phylum', name='p__b', taxid=None),
-                                             LineagePair(rank='class', name='c__c', taxid=None))
+    lin1 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
+    print(lin1.lineage_at_rank("superkingdom"))
+
+    assert lin1.lineage_at_rank("superkingdom") == (
+        LineagePair(rank="superkingdom", name="d__a", taxid=None),
+    )
+    print(lin1.lineage_at_rank("class"))
+    assert lin1.lineage_at_rank("class") == (
+        LineagePair(rank="superkingdom", name="d__a", taxid=None),
+        LineagePair(rank="phylum", name="p__b", taxid=None),
+        LineagePair(rank="class", name="c__c", taxid=None),
+    )


 def test_lineage_at_rank_below_rank():
-    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
-    print(lin1.lineage_at_rank('superkingdom'))
+    lin1 = RankLineageInfo(lineage_str="d__a;p__b;c__c;o__d;f__f")
+    print(lin1.lineage_at_rank("superkingdom"))
     # if rank is not provided, we only return the filled lineage, to follow original pop_to_rank behavior.
-    print(lin1.lineage_at_rank('genus'))
-    assert lin1.lineage_at_rank('genus') == (LineagePair(rank='superkingdom', name='d__a', taxid=None),
-                                             LineagePair(rank='phylum', name='p__b', taxid=None),
-                                             LineagePair(rank='class', name='c__c', taxid=None),
-                                             LineagePair(rank='order', name='o__d', taxid=None),
-                                             LineagePair(rank='family', name='f__f', taxid=None))
+    print(lin1.lineage_at_rank("genus"))
+    assert lin1.lineage_at_rank("genus") == (
+        LineagePair(rank="superkingdom", name="d__a", taxid=None),
+        LineagePair(rank="phylum", name="p__b", taxid=None),
+        LineagePair(rank="class", name="c__c", taxid=None),
+        LineagePair(rank="order", name="o__d", taxid=None),
+        LineagePair(rank="family", name="f__f", taxid=None),
+    )


 def test_TaxResult_get_match_lineage_1():
@@ -1825,13 +2475,15 @@ def test_TaxResult_get_match_lineage_skip_ident():
     gA = {"name": "gA.1 name"}
     taxres = make_TaxResult(gA)
-    taxres.get_match_lineage(tax_assignments=taxD, skip_idents=['gA'])
+    taxres.get_match_lineage(tax_assignments=taxD, skip_idents=["gA"])
     print("skipped_ident?: ", taxres.skipped_ident)
     print("missed_ident?: ", taxres.missed_ident)
     assert taxres.skipped_ident == True
     assert taxres.lineageInfo == RankLineageInfo()
     assert taxres.lineageInfo.display_lineage() == ""
-    assert taxres.lineageInfo.display_lineage(null_as_unclassified=True) == "unclassified"
+    assert (
+        taxres.lineageInfo.display_lineage(null_as_unclassified=True) == "unclassified"
+    )


 def test_TaxResult_get_match_lineage_missed_ident_fail_on_missing():
@@ -1840,14 +2492,16 @@ def test_TaxResult_get_match_lineage_missed_ident_fail_on_missing():
     gA = {"name": "gA.1 name"}
     taxres = make_TaxResult(gA)
-    taxres.get_match_lineage(tax_assignments=taxD, skip_idents=['gB'])
+    taxres.get_match_lineage(tax_assignments=taxD, skip_idents=["gB"])
     print("skipped_ident?: ", taxres.skipped_ident)
     print("missed_ident?: ", taxres.missed_ident)
     assert taxres.skipped_ident == False
     assert taxres.missed_ident == True
     assert taxres.lineageInfo == RankLineageInfo()
     assert taxres.lineageInfo.display_lineage() == ""
-    assert taxres.lineageInfo.display_lineage(null_as_unclassified=True) == "unclassified"
+    assert (
+        taxres.lineageInfo.display_lineage(null_as_unclassified=True) == "unclassified"
+    )


 def test_TaxResult_get_match_lineage_missed_ident_fail_on_missing():
@@ -1857,7 +2511,9 @@ def test_TaxResult_get_match_lineage_missed_ident_fail_on_missing():
     gA = {"name": "gA.1 name"}
     taxres = make_TaxResult(gA)
     with pytest.raises(ValueError) as exc:
-        taxres.get_match_lineage(tax_assignments=taxD, skip_idents=['gB'], fail_on_missing_taxonomy=True)
+        taxres.get_match_lineage(
+            tax_assignments=taxD, skip_idents=["gB"], fail_on_missing_taxonomy=True
+        )
     print(str(exc))
     assert "Error: ident 'gA' is not in the taxonomy database." in str(exc)
@@ -1881,7 +2537,16 @@ def test_QueryTaxResult():
     assert q_res.skipped_idents == set()
     assert q_res.missed_idents == set()
     assert q_res.summarized_lineage_results == {}
-    taxranks = ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain')
+    taxranks = (
+        "superkingdom",
+        "phylum",
+        "class",
+        "order",
+        "family",
+        "genus",
+        "species",
+        "strain",
+    )
     assert q_res.ranks == taxranks
     assert q_res.ascending_ranks == taxranks[::-1]
@@ -1891,7 +2556,7 @@ def test_QueryTaxResult_add_incompatible():
     tax_info = [("gA", "a;b;c")]
     taxD = make_mini_taxonomy(tax_info=tax_info)
     taxres = make_TaxResult(taxD=taxD)
-    taxres2 = make_TaxResult({'query_name': 'q2'}, taxD=taxD)
+    taxres2 = make_TaxResult({"query_name": "q2"}, taxD=taxD)
     # initialize
     q_res = QueryTaxResult(taxres.query_info)
     # check that new querytaxres is compatible with taxres and not taxres2
@@ -1906,22 +2571,25 @@ def test_QueryTaxResult_add_incompatible():

 def test_QueryTaxResult_add_without_tax_info():
     "initialize and add a taxresult with missed ident"
-    taxres = make_TaxResult() # do not add taxonomic info
+    taxres = make_TaxResult()  # do not add taxonomic info
     # initialize
     q_res = QueryTaxResult(taxres.query_info)
     print("attempted to add lineage info?: ", taxres.match_lineage_attempted)
     with pytest.raises(ValueError) as exc:
         q_res.add_taxresult(taxres)
     print(str(exc))
-    assert "Error: Cannot add TaxResult. Please use get_match_lineage() to add taxonomic lineage information first." in str(exc)
-
-
+    assert (
+        "Error: Cannot add TaxResult. Please use get_match_lineage() to add taxonomic lineage information first."
+        in str(exc)
+    )
+
+
 def test_QueryTaxResult_add_skipped_ident():
     "initialize and add a taxresult with skipped ident"
     gA_tax = ("gA", "a;b;c")
     taxD = make_mini_taxonomy([gA_tax])
-    taxres = make_TaxResult(taxD=taxD, skip_idents = ['gA'])
-#    taxres.get_match_lineage(tax_assignments=taxD, skip_idents=['gA'])
+    taxres = make_TaxResult(taxD=taxD, skip_idents=["gA"])
+    # taxres.get_match_lineage(tax_assignments=taxD, skip_idents=['gA'])
     # initialize
     q_res = QueryTaxResult(taxres.query_info)
     q_res.add_taxresult(taxres)
@@ -1953,16 +2621,16 @@ def test_QueryTaxResult_track_missed_and_skipped():
     taxD = make_mini_taxonomy(tax_info=tax_info)
     # make results
     taxres = make_TaxResult()
-    taxres2 = make_TaxResult({"name": 'gB'}) # skipped
-    taxres3 = make_TaxResult({"name": 'gB'}) # skipped
-    taxres4 = make_TaxResult({"name": 'gC'}) # skipped
-    taxres5 = make_TaxResult({"name": 'gD'}) # missed
-    taxres6 = make_TaxResult({"name": 'gE'}) # missed
+    taxres2 = make_TaxResult({"name": "gB"})  # skipped
+    taxres3 = make_TaxResult({"name": "gB"})  # skipped
+    taxres4 = make_TaxResult({"name": "gC"})  # skipped
+    taxres5 = make_TaxResult({"name": "gD"})  # missed
+    taxres6 = make_TaxResult({"name": "gE"})  # missed
     # initialize
     q_res = QueryTaxResult(taxres.query_info)
     # add taxonomic info to taxres, add to q_res
     for n, tr in enumerate([taxres, taxres2, taxres3, taxres4, taxres5, taxres6]):
-        tr.get_match_lineage(tax_assignments=taxD, skip_idents=['gB', 'gC'])
+        tr.get_match_lineage(tax_assignments=taxD, skip_idents=["gB", "gC"])
         print("num: ", n)
         print("skipped?: ", tr.skipped_ident)
         print("missed?: ", tr.missed_ident)
@@ -1972,18 +2640,27 @@ def test_QueryTaxResult_track_missed_and_skipped():
     print(q_res.n_missed)
     assert q_res.n_missed == 2
     assert q_res.n_skipped == 3
-    assert 'gB' in q_res.skipped_idents
+    assert "gB" in q_res.skipped_idents
     assert len(q_res.skipped_idents) == 2
-    assert 'gD' in q_res.missed_idents
+    assert "gD" in q_res.missed_idents
     assert q_res.summarized_lineage_results == {}


 def test_QueryTaxResult_track_missed_and_skipped_using_fn():
     "make sure missed and skipped idents are being tracked. Same as above but use helper fn."
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB'}, {"name": 'gB'}, {"name": 'gC'}, {"name": 'gD'}, {"name": 'gE'}]
-    gres = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, skip_idents=['gB', 'gC'])
-    # should have 6 results for default query 'q1'
+    gather_results = [
+        {},
+        {"name": "gB"},
+        {"name": "gB"},
+        {"name": "gC"},
+        {"name": "gD"},
+        {"name": "gE"},
+    ]
+    gres = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, skip_idents=["gB", "gC"]
+    )
+    # should have 6 results for default query 'q1'
     print(gres.keys())
     q_res = next(iter(gres.values()))
     assert len(q_res.raw_taxresults) == 6
@@ -1991,237 +2668,411 @@ def test_QueryTaxResult_track_missed_and_skipped_using_fn():
     print(q_res.n_missed)
     assert q_res.n_missed == 2
     assert q_res.n_skipped == 3
-    assert 'gB' in q_res.skipped_idents
+    assert "gB" in q_res.skipped_idents
     assert len(q_res.skipped_idents) == 2
-    assert 'gD' in q_res.missed_idents
+    assert "gD" in q_res.missed_idents
     assert q_res.summarized_lineage_results == {}


 def test_QueryTaxResult_summarize_up_ranks_1():
     "basic functionality: summarize up ranks"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB'}]
+    gather_results = [{}, {"name": "gB"}]
     gres = make_QueryTaxResults(gather_info=gather_results, taxD=taxD)
     assert len(gres.keys()) == 1
     q_res = next(iter(gres.values()))
     # now summarize up the ranks
     q_res.summarize_up_ranks()
     assert len(q_res.raw_taxresults) == 2
-    #print(q_res.sum_uniq_weighted.values())
-    #print(q_res.sum_uniq_weighted['superkingdom'])
-    assert list(q_res.sum_uniq_weighted.keys()) == ['class', 'phylum', 'superkingdom']
-    assert q_res.sum_uniq_weighted['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.4)}
-    assert q_res.sum_uniq_to_query['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.2)}
-    assert q_res.sum_uniq_bp['superkingdom'] == {RankLineageInfo(lineage_str="a"): 40}
-    assert q_res.sum_uniq_weighted['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.4)}
+    # print(q_res.sum_uniq_weighted.values())
+    # print(q_res.sum_uniq_weighted['superkingdom'])
+    assert list(q_res.sum_uniq_weighted.keys()) == ["class", "phylum", "superkingdom"]
+    assert q_res.sum_uniq_weighted["superkingdom"] == {
+        RankLineageInfo(lineage_str="a"): approx(0.4)
+    }
+    assert q_res.sum_uniq_to_query["superkingdom"] == {
+        RankLineageInfo(lineage_str="a"): approx(0.2)
+    }
+    assert q_res.sum_uniq_bp["superkingdom"] == {RankLineageInfo(lineage_str="a"): 40}
+    assert q_res.sum_uniq_weighted["phylum"] == {
+        RankLineageInfo(lineage_str="a;b"): approx(0.4)
+    }
+    assert q_res.sum_uniq_to_query["phylum"] == {
+        RankLineageInfo(lineage_str="a;b"): approx(0.2)
+    }
+    assert q_res.sum_uniq_bp["phylum"] == {RankLineageInfo(lineage_str="a;b"): 40}
+    assert q_res.sum_uniq_weighted["class"] == {
+        RankLineageInfo(lineage_str="a;b;c"): approx(0.2),
+        RankLineageInfo(lineage_str="a;b;d"): approx(0.2),
+    }
+    assert q_res.sum_uniq_to_query["class"] == {
+        RankLineageInfo(lineage_str="a;b;c"): approx(0.1),
+        RankLineageInfo(lineage_str="a;b;d"): approx(0.1),
+    }
+    assert q_res.sum_uniq_bp["class"] == {
+        RankLineageInfo(lineage_str="a;b;c"): 20,
+        RankLineageInfo(lineage_str="a;b;d"): 20,
+    }


 def test_QueryTaxResult_summarize_up_ranks_2():
     "summarize up ranks: different values"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB','f_unique_weighted': 0.1,'f_unique_to_query': 0.05,'unique_intersect_bp': 10,}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    gather_results = [
+        {},
+        {
+            "name": "gB",
+            "f_unique_weighted": 0.1,
+            "f_unique_to_query": 0.05,
+            "unique_intersect_bp": 10,
+        },
+    ]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
     # now summarize up the ranks
     q_res.summarize_up_ranks()
     assert len(q_res.raw_taxresults) == 2
     print(q_res.sum_uniq_weighted.values())
-    print(q_res.sum_uniq_weighted['superkingdom'])
-    assert q_res.sum_uniq_weighted['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.3)}
-    assert q_res.sum_uniq_to_query['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.15)}
-    assert q_res.sum_uniq_bp['superkingdom'] == {RankLineageInfo(lineage_str="a"): 30}
-    assert q_res.sum_uniq_weighted['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.3)}
-    assert q_res.sum_uniq_to_query['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.15)}
-    assert q_res.sum_uniq_bp['phylum'] == {RankLineageInfo(lineage_str="a;b"): 30}
-    assert q_res.sum_uniq_weighted['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.2),
-                                                RankLineageInfo(lineage_str="a;b;d"): approx(0.1)}
-    assert q_res.sum_uniq_to_query['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.1),
-                                                RankLineageInfo(lineage_str="a;b;d"): approx(0.05)}
-    assert q_res.sum_uniq_bp['class'] == {RankLineageInfo(lineage_str="a;b;c"): 20,
-                                          RankLineageInfo(lineage_str="a;b;d"): 10}
+    print(q_res.sum_uniq_weighted["superkingdom"])
+    assert q_res.sum_uniq_weighted["superkingdom"] == {
+        RankLineageInfo(lineage_str="a"): approx(0.3)
+    }
+    assert q_res.sum_uniq_to_query["superkingdom"] == {
+        RankLineageInfo(lineage_str="a"): approx(0.15)
+    }
+    assert q_res.sum_uniq_bp["superkingdom"] == {RankLineageInfo(lineage_str="a"): 30}
+    assert q_res.sum_uniq_weighted["phylum"] == {
+        RankLineageInfo(lineage_str="a;b"): approx(0.3)
+    }
+    assert q_res.sum_uniq_to_query["phylum"] == {
+        RankLineageInfo(lineage_str="a;b"): approx(0.15)
+    }
+    assert q_res.sum_uniq_bp["phylum"] == {RankLineageInfo(lineage_str="a;b"): 30}
+    assert q_res.sum_uniq_weighted["class"] == {
+        RankLineageInfo(lineage_str="a;b;c"): approx(0.2),
+        RankLineageInfo(lineage_str="a;b;d"): approx(0.1),
+    }
+    assert q_res.sum_uniq_to_query["class"] == {
+        RankLineageInfo(lineage_str="a;b;c"): approx(0.1),
+        RankLineageInfo(lineage_str="a;b;d"): approx(0.05),
+    }
+    assert q_res.sum_uniq_bp["class"] == {
+        RankLineageInfo(lineage_str="a;b;c"): 20,
+        RankLineageInfo(lineage_str="a;b;d"): 10,
+    }


 def test_QueryTaxResult_summarize_up_ranks_missing_lineage():
     "basic functionality: summarize up ranks"
     taxD = make_mini_taxonomy([("gA", "a;b;c")])
-    gather_results = [{}, {"name": 'gB'}]
+    gather_results = [{}, {"name": "gB"}]
     gres = make_QueryTaxResults(gather_info=gather_results, taxD=taxD)
     assert len(gres.keys()) == 1
     q_res = next(iter(gres.values()))
     # now summarize up the ranks
     q_res.summarize_up_ranks()
     assert len(q_res.raw_taxresults) == 2
-    #print(q_res.sum_uniq_weighted.values())
-    print(q_res.sum_uniq_weighted['superkingdom'])
-    assert q_res.sum_uniq_weighted['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.2)}
-    assert q_res.sum_uniq_to_query['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.1)}
-    assert q_res.sum_uniq_bp['superkingdom'] == {RankLineageInfo(lineage_str="a"): 20}
-    assert q_res.sum_uniq_weighted['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.2)}
-    assert q_res.sum_uniq_to_query['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.1)}
-    assert q_res.sum_uniq_bp['phylum'] == {RankLineageInfo(lineage_str="a;b"): 20}
-    assert q_res.sum_uniq_weighted['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.2)}
-    assert q_res.sum_uniq_to_query['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.1)}
-    assert q_res.sum_uniq_bp['class'] == {RankLineageInfo(lineage_str="a;b;c"): 20}
+    # print(q_res.sum_uniq_weighted.values())
+    print(q_res.sum_uniq_weighted["superkingdom"])
+    assert q_res.sum_uniq_weighted["superkingdom"] == {
+        RankLineageInfo(lineage_str="a"): approx(0.2)
+    }
+    assert q_res.sum_uniq_to_query["superkingdom"] == {
+        RankLineageInfo(lineage_str="a"): approx(0.1)
+    }
+    assert q_res.sum_uniq_bp["superkingdom"] == {RankLineageInfo(lineage_str="a"): 20}
+    assert q_res.sum_uniq_weighted["phylum"] == {
+        RankLineageInfo(lineage_str="a;b"): approx(0.2)
+    }
+    assert q_res.sum_uniq_to_query["phylum"] == {
+        RankLineageInfo(lineage_str="a;b"): approx(0.1)
+    }
+    assert q_res.sum_uniq_bp["phylum"] == {RankLineageInfo(lineage_str="a;b"): 20}
+    assert q_res.sum_uniq_weighted["class"] == {
+        RankLineageInfo(lineage_str="a;b;c"): approx(0.2)
+    }
+    assert q_res.sum_uniq_to_query["class"] == {
+        RankLineageInfo(lineage_str="a;b;c"): approx(0.1)
+    }
+    assert q_res.sum_uniq_bp["class"] == {RankLineageInfo(lineage_str="a;b;c"): 20}


 def test_QueryTaxResult_summarize_up_ranks_skipped_lineage():
     "basic functionality: summarize up ranks"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB'}]
-    gres = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, skip_idents=['gB'])
+    gather_results = [{}, {"name": "gB"}]
+    gres = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, skip_idents=["gB"]
+    )
     assert len(gres.keys()) == 1
     q_res = next(iter(gres.values()))
     # now summarize up the ranks
     q_res.summarize_up_ranks()
     assert len(q_res.raw_taxresults) == 2
-    assert list(q_res.sum_uniq_weighted.keys()) == ['class', 'phylum', 'superkingdom']
-    #print(q_res.sum_uniq_weighted.values())
-    print(q_res.sum_uniq_weighted['superkingdom'])
-    assert q_res.sum_uniq_weighted['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.2)}
-    assert q_res.sum_uniq_to_query['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.1)}
-    assert q_res.sum_uniq_bp['superkingdom'] == {RankLineageInfo(lineage_str="a"): 20}
-    assert q_res.sum_uniq_weighted['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.2)}
-    assert q_res.sum_uniq_to_query['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.1)}
-    assert q_res.sum_uniq_bp['phylum'] == {RankLineageInfo(lineage_str="a;b"): 20}
-    assert q_res.sum_uniq_weighted['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.2)}
-    assert q_res.sum_uniq_to_query['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.1)}
-    assert q_res.sum_uniq_bp['class'] == {RankLineageInfo(lineage_str="a;b;c"): 20}
+    assert list(q_res.sum_uniq_weighted.keys()) == ["class", "phylum", "superkingdom"]
+    # print(q_res.sum_uniq_weighted.values())
+    print(q_res.sum_uniq_weighted["superkingdom"])
+    assert q_res.sum_uniq_weighted["superkingdom"] == {
+        RankLineageInfo(lineage_str="a"): approx(0.2)
+    }
+    assert q_res.sum_uniq_to_query["superkingdom"] == {
+        RankLineageInfo(lineage_str="a"): approx(0.1)
+    }
+    assert q_res.sum_uniq_bp["superkingdom"] == {RankLineageInfo(lineage_str="a"): 20}
+    assert q_res.sum_uniq_weighted["phylum"] == {
+        RankLineageInfo(lineage_str="a;b"): approx(0.2)
+    }
+    assert q_res.sum_uniq_to_query["phylum"] == {
+        RankLineageInfo(lineage_str="a;b"): approx(0.1)
+    }
+    assert q_res.sum_uniq_bp["phylum"] == {RankLineageInfo(lineage_str="a;b"): 20}
+    assert q_res.sum_uniq_weighted["class"] == {
+        RankLineageInfo(lineage_str="a;b;c"): approx(0.2)
+    }
+    assert q_res.sum_uniq_to_query["class"] == {
+        RankLineageInfo(lineage_str="a;b;c"): approx(0.1)
+    }
+    assert q_res.sum_uniq_bp["class"] == {RankLineageInfo(lineage_str="a;b;c"): 20}


 def test_QueryTaxResult_summarize_up_ranks_perfect_match():
     "summarize up ranks: different values"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{'f_unique_to_query': 1.0}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    gather_results = [{"f_unique_to_query": 1.0}]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
     # now summarize up the ranks
     q_res.summarize_up_ranks()
     assert len(q_res.raw_taxresults) == 1
     print(q_res.sum_uniq_weighted.values())
-    print(q_res.sum_uniq_to_query['superkingdom'])
-    assert list(q_res.sum_uniq_to_query['superkingdom'].values()) == [1.0]
-    assert 'gA' in q_res.perfect_match
+    print(q_res.sum_uniq_to_query["superkingdom"])
+    assert list(q_res.sum_uniq_to_query["superkingdom"].values()) == [1.0]
+    assert "gA" in q_res.perfect_match


 def test_QueryTaxResult_summarize_up_ranks_already_summarized():
     "summarize up ranks: error, already summarized"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{'f_unique_to_query': 1.0}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    gather_results = [{"f_unique_to_query": 1.0}]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
     # now summarize up the ranks
     q_res.summarize_up_ranks()
     with pytest.raises(ValueError) as exc:
         q_res.summarize_up_ranks()
     print(str(exc))
     assert "Error: already summarized" in str(exc)
-
+

 def test_QueryTaxResult_summarize_up_ranks_already_summarized_force():
     "summarize up ranks: already summarized but force"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB','f_unique_weighted': 0.1,'f_unique_to_query': 0.05,'unique_intersect_bp': 10,}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    gather_results = [
+        {},
+        {
+            "name": "gB",
+            "f_unique_weighted": 0.1,
+            "f_unique_to_query": 0.05,
+            "unique_intersect_bp": 10,
+        },
+    ]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
     # now summarize up the ranks
     q_res.summarize_up_ranks()
     q_res.summarize_up_ranks(force_resummarize=True)
-    assert list(q_res.sum_uniq_weighted.keys()) == ['class', 'phylum', 'superkingdom']
+    assert list(q_res.sum_uniq_weighted.keys()) == ["class", "phylum", "superkingdom"]

-    #check that all results are still good
+    # check that all results are still good
     assert len(q_res.raw_taxresults) == 2
-    assert q_res.sum_uniq_weighted['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.3)}
-    assert q_res.sum_uniq_weighted['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.3)}
-    assert q_res.sum_uniq_to_query['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.15)}
-    assert q_res.sum_uniq_bp['phylum'] == {RankLineageInfo(lineage_str="a;b"): 30}
-    assert q_res.sum_uniq_to_query['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.1),
-                                                RankLineageInfo(lineage_str="a;b;d"): approx(0.05)}
-    assert q_res.sum_uniq_weighted['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.2),
-                                                RankLineageInfo(lineage_str="a;b;d"): approx(0.1)}
-    assert q_res.sum_uniq_bp['class'] == {RankLineageInfo(lineage_str="a;b;c"): 20,
-                                          RankLineageInfo(lineage_str="a;b;d"): 10}
+    assert q_res.sum_uniq_weighted["superkingdom"] == {
+        RankLineageInfo(lineage_str="a"): approx(0.3)
+    }
+    assert q_res.sum_uniq_weighted["phylum"] == {
+        RankLineageInfo(lineage_str="a;b"): approx(0.3)
+    }
+    assert q_res.sum_uniq_to_query["phylum"] == {
+        RankLineageInfo(lineage_str="a;b"): approx(0.15)
+    }
+    assert q_res.sum_uniq_bp["phylum"] == {RankLineageInfo(lineage_str="a;b"): 30}
+    assert q_res.sum_uniq_to_query["class"] == {
+        RankLineageInfo(lineage_str="a;b;c"): approx(0.1),
+        RankLineageInfo(lineage_str="a;b;d"): approx(0.05),
+    }
+    assert q_res.sum_uniq_weighted["class"] == {
+        RankLineageInfo(lineage_str="a;b;c"): approx(0.2),
+        RankLineageInfo(lineage_str="a;b;d"): approx(0.1),
+    }
+    assert q_res.sum_uniq_bp["class"] == {
+        RankLineageInfo(lineage_str="a;b;c"): 20,
+        RankLineageInfo(lineage_str="a;b;d"): 10,
+    }


 def test_QueryTaxResult_summarize_up_ranks_single_rank():
     "summarize up ranks: different values"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB','f_unique_weighted': 0.1,'f_unique_to_query': 0.05,'unique_intersect_bp': 10,}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    gather_results = [
+        {},
+        {
+            "name": "gB",
+            "f_unique_weighted": 0.1,
+            "f_unique_to_query": 0.05,
+            "unique_intersect_bp": 10,
+        },
+    ]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
     # now summarize up the ranks
-    q_res.summarize_up_ranks(single_rank='phylum')
+    q_res.summarize_up_ranks(single_rank="phylum")
     assert len(q_res.raw_taxresults) == 2
-    assert list(q_res.sum_uniq_weighted.keys()) == ['phylum']
+    assert list(q_res.sum_uniq_weighted.keys()) == ["phylum"]
     print(q_res.sum_uniq_weighted.keys())
     print(q_res.sum_uniq_weighted.values())
-    print(q_res.sum_uniq_weighted['phylum'])
-    assert q_res.sum_uniq_weighted['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.3)}
-    assert list(q_res.sum_uniq_to_query['phylum'].values()) == [approx(0.15)]
-    assert list(q_res.sum_uniq_bp['phylum'].values()) == [30]
-    assert q_res.summarized_ranks == ['phylum']
+    print(q_res.sum_uniq_weighted["phylum"])
+    assert q_res.sum_uniq_weighted["phylum"] == {
+        RankLineageInfo(lineage_str="a;b"): approx(0.3)
+    }
+    assert list(q_res.sum_uniq_to_query["phylum"].values()) == [approx(0.15)]
+    assert list(q_res.sum_uniq_bp["phylum"].values()) == [30]
+    assert q_res.summarized_ranks == ["phylum"]
+

 def test_QueryTaxResult_summarize_up_ranks_single_rank_not_available():
     "summarize up ranks: different values"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB','f_unique_weighted': 0.1,'f_unique_to_query': 0.05,'unique_intersect_bp': 10,}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    gather_results = [
+        {},
+        {
+            "name": "gB",
+            "f_unique_weighted": 0.1,
+            "f_unique_to_query": 0.05,
+            "unique_intersect_bp": 10,
+        },
+    ]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
     # now summarize up the ranks
     with pytest.raises(ValueError) as exc:
-        q_res.summarize_up_ranks(single_rank='NotARank')
+        q_res.summarize_up_ranks(single_rank="NotARank")
     print(str(exc))
-    assert "Error: rank 'NotARank' not in available ranks (strain, species, genus, family, order, class, phylum, superkingdom)" in str(exc)
+    assert (
+        "Error: rank 'NotARank' not in available ranks (strain, species, genus, family, order, class, phylum, superkingdom)"
+        in str(exc)
+    )


 def test_QueryTaxResult_summarize_up_ranks_single_rank_not_filled():
     "summarize up ranks: different values"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB','f_unique_weighted': 0.1,'f_unique_to_query': 0.05,'unique_intersect_bp': 10,}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    gather_results = [
+        {},
+        {
+            "name": "gB",
+            "f_unique_weighted": 0.1,
+            "f_unique_to_query": 0.05,
+            "unique_intersect_bp": 10,
+        },
+    ]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
     # now summarize up the ranks
     with pytest.raises(ValueError) as exc:
-        q_res.summarize_up_ranks(single_rank='species')
+        q_res.summarize_up_ranks(single_rank="species")
     print(str(exc))
-    assert "Error: rank 'species' was not available for any matching lineages." in str(exc)
+    assert "Error: rank 'species' was not available for any matching lineages." in str(
+        exc
+    )


 def test_QueryTaxResult_build_summarized_result_1():
     "basic functionality: build summarized_result"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB'}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    gather_results = [{}, {"name": "gB"}]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
     q_res.build_summarized_result()
     print(q_res.summarized_lineage_results.keys())
-    sk = [SummarizedGatherResult(rank='superkingdom', fraction=0.2, f_weighted_at_rank=0.4,
-                                 lineage=RankLineageInfo(lineage_str='a'),
-                                 bp_match_at_rank=40, query_ani_at_rank=approx(0.95, rel=1e-2)),
-          SummarizedGatherResult(rank='superkingdom', fraction=0.8, f_weighted_at_rank=0.6,
-                                 lineage=RankLineageInfo(), bp_match_at_rank=60, query_ani_at_rank=None)]
-    print(q_res.summarized_lineage_results['superkingdom'])
-    assert q_res.summarized_lineage_results['superkingdom'] == sk
-    print(q_res.summarized_lineage_results['phylum'])
-    phy = [SummarizedGatherResult(rank='phylum', fraction=0.2, f_weighted_at_rank=0.4,
-                                  lineage=RankLineageInfo(lineage_str='a;b'),
-                                  bp_match_at_rank=40, query_ani_at_rank=approx(0.95, rel=1e-2)),
-           SummarizedGatherResult(rank='phylum', fraction=0.8, f_weighted_at_rank=0.6,
-                                  lineage=RankLineageInfo(), bp_match_at_rank=60, query_ani_at_rank=None)]
-    assert q_res.summarized_lineage_results['phylum'] == phy
-    print(q_res.summarized_lineage_results['class'])
-    cl = [SummarizedGatherResult(rank='class', fraction=0.1, f_weighted_at_rank=0.2,
-                                 lineage=RankLineageInfo(lineage_str='a;b;c'),
-                                 bp_match_at_rank=20, query_ani_at_rank=approx(0.93, rel=1e-2)),
-          SummarizedGatherResult(rank='class', fraction=0.1, f_weighted_at_rank=0.2,
-                                 lineage=RankLineageInfo(lineage_str='a;b;d'),
-                                 bp_match_at_rank=20, query_ani_at_rank=approx(0.93, rel=1e-2)),
-          SummarizedGatherResult(rank='class', fraction=0.8, f_weighted_at_rank=0.6,
-                                 lineage=RankLineageInfo(), bp_match_at_rank=60, query_ani_at_rank=None)]
-    assert q_res.summarized_lineage_results['class'] == cl
-
-    assert q_res.total_f_weighted['phylum'] == approx(0.4)
-    assert q_res.total_f_classified['class'] == approx(0.2)
-    assert q_res.total_bp_classified['superkingdom'] == 40
+    sk = [
+        SummarizedGatherResult(
+            rank="superkingdom",
+            fraction=0.2,
+            f_weighted_at_rank=0.4,
+            lineage=RankLineageInfo(lineage_str="a"),
+            bp_match_at_rank=40,
+            query_ani_at_rank=approx(0.95, rel=1e-2),
+        ),
+        SummarizedGatherResult(
+            rank="superkingdom",
+            fraction=0.8,
+            f_weighted_at_rank=0.6,
+            lineage=RankLineageInfo(),
+            bp_match_at_rank=60,
+            query_ani_at_rank=None,
+        ),
+    ]
+    print(q_res.summarized_lineage_results["superkingdom"])
+    assert q_res.summarized_lineage_results["superkingdom"] == sk
+    print(q_res.summarized_lineage_results["phylum"])
+    phy = [
+        SummarizedGatherResult(
+            rank="phylum",
+            fraction=0.2,
+            f_weighted_at_rank=0.4,
+            lineage=RankLineageInfo(lineage_str="a;b"),
+            bp_match_at_rank=40,
+            query_ani_at_rank=approx(0.95, rel=1e-2),
+        ),
+        SummarizedGatherResult(
+            rank="phylum",
+            fraction=0.8,
+            f_weighted_at_rank=0.6,
+            lineage=RankLineageInfo(),
+            bp_match_at_rank=60,
+            query_ani_at_rank=None,
+        ),
+    ]
+    assert q_res.summarized_lineage_results["phylum"] == phy
+    print(q_res.summarized_lineage_results["class"])
+    cl = [
+        SummarizedGatherResult(
+            rank="class",
+            fraction=0.1,
+            f_weighted_at_rank=0.2,
+            lineage=RankLineageInfo(lineage_str="a;b;c"),
+            bp_match_at_rank=20,
+            query_ani_at_rank=approx(0.93, rel=1e-2),
+        ),
+        SummarizedGatherResult(
+            rank="class",
+            fraction=0.1,
+            f_weighted_at_rank=0.2,
+            lineage=RankLineageInfo(lineage_str="a;b;d"),
+            bp_match_at_rank=20,
+            query_ani_at_rank=approx(0.93, rel=1e-2),
+        ),
+        SummarizedGatherResult(
+            rank="class",
+            fraction=0.8,
+            f_weighted_at_rank=0.6,
+            lineage=RankLineageInfo(),
+            bp_match_at_rank=60,
+            query_ani_at_rank=None,
+        ),
+    ]
+    assert q_res.summarized_lineage_results["class"] == cl
+
+    assert q_res.total_f_weighted["phylum"] == approx(0.4)
+    assert q_res.total_f_classified["class"] == approx(0.2)
+    assert q_res.total_bp_classified["superkingdom"] == 40


 def test_QueryTaxResult_build_summarized_result_2():
@@ -2231,19 +3082,39 @@ def test_QueryTaxResult_build_summarized_result_2():
     gB_tax = ("gB", "a;c")
     taxD = make_mini_taxonomy([gA_tax, gB_tax])
     # make gather results
-    gather_results = [{'query_name': 'queryA', 'name': 'gA', 'f_unique_weighted': 0.5,'f_unique_to_query': 0.5,'unique_intersect_bp': 50},
-                      {'query_name': 'queryA', "name": 'gB', 'f_unique_weighted': 0.4,'f_unique_to_query': 0.3,'unique_intersect_bp': 30},
-                      {'query_name': 'queryB', "name": 'gB', 'f_unique_weighted': 0.3,'f_unique_to_query': 0.3,'unique_intersect_bp': 30}]
+    gather_results = [
+        {
+            "query_name": "queryA",
+            "name": "gA",
+            "f_unique_weighted": 0.5,
+            "f_unique_to_query": 0.5,
+            "unique_intersect_bp": 50,
+        },
+        {
+            "query_name": "queryA",
+            "name": "gB",
+            "f_unique_weighted": 0.4,
+            "f_unique_to_query": 0.3,
+            "unique_intersect_bp": 30,
+        },
+        {
+            "query_name": "queryB",
+            "name": "gB",
+            "f_unique_weighted": 0.3,
+            "f_unique_to_query": 0.3,
+            "unique_intersect_bp": 30,
+        },
+    ]
     gres = make_QueryTaxResults(gather_info=gather_results, taxD=taxD)
-
+
     for query_name, q_res in gres.items():
-        q_res.build_summarized_result() # summarize and build result
-        sk = q_res.summarized_lineage_results['superkingdom']
-        phy = q_res.summarized_lineage_results['phylum']
+        q_res.build_summarized_result()  # summarize and build result
+        sk = q_res.summarized_lineage_results["superkingdom"]
+        phy = q_res.summarized_lineage_results["phylum"]
         assert len(sk) == 2
         assert sk[0].lineage == RankLineageInfo(lineage_str="a")
         print(phy)
-        if query_name == 'queryA':
+        if query_name == "queryA":
             # check superkingdom results
             assert sk[0].fraction == approx(0.8)
             assert sk[0].f_weighted_at_rank == approx(0.9)
@@ -2257,16 +3128,16 @@ def test_QueryTaxResult_build_summarized_result_2():
             assert phy[0].fraction == approx(0.5)
             assert phy[0].f_weighted_at_rank == approx(0.5)
             assert phy[0].bp_match_at_rank == 50
-            assert phy[0].lineage == RankLineageInfo(lineage_str="a;b")
+            assert phy[0].lineage == RankLineageInfo(lineage_str="a;b")
             assert phy[1].fraction == approx(0.3)
             assert phy[1].f_weighted_at_rank == approx(0.4)
             assert phy[1].bp_match_at_rank == 30
-            assert phy[1].lineage == RankLineageInfo(lineage_str="a;c")
+            assert phy[1].lineage == RankLineageInfo(lineage_str="a;c")
             assert phy[2].fraction == approx(0.2)
             assert phy[2].f_weighted_at_rank == approx(0.1)
             assert phy[2].bp_match_at_rank == 20
             assert phy[2].lineage == RankLineageInfo()
-        if query_name == 'queryB':
+        if query_name == "queryB":
             # check superkingdom results
             assert sk[0].fraction == approx(0.3)
             assert sk[0].f_weighted_at_rank == approx(0.3)
@@ -2280,7 +3151,7 @@ def test_QueryTaxResult_build_summarized_result_2():
             assert phy[0].fraction == approx(0.3)
             assert phy[0].f_weighted_at_rank == approx(0.3)
             assert phy[0].bp_match_at_rank == 30
-            assert phy[0].lineage == RankLineageInfo(lineage_str="a;c")
+            assert phy[0].lineage == RankLineageInfo(lineage_str="a;c")
             assert phy[1].fraction == approx(0.7)
             assert phy[1].f_weighted_at_rank == approx(0.7)
             assert phy[1].bp_match_at_rank == 70
@@ -2290,91 +3161,183 @@ def test_QueryTaxResult_build_summarized_result_missing_lineage():
     "build summarized_result with missing lineage"
     taxD = make_mini_taxonomy([("gA", "a;b;c")])
-    gather_results = [{}, {"name": 'gB'}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    gather_results = [{}, {"name": "gB"}]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
     q_res.build_summarized_result()
     print(q_res.summarized_lineage_results.keys())
-    print(q_res.summarized_lineage_results['superkingdom'])
-
-    sk = [SummarizedGatherResult(rank='superkingdom', fraction=0.1, f_weighted_at_rank=0.2,
-                                 lineage=RankLineageInfo(lineage_str="a"),
-                                 bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)),
-          SummarizedGatherResult(rank='superkingdom', fraction=0.9, lineage=RankLineageInfo(),f_weighted_at_rank=0.8,
-                                 bp_match_at_rank=80, query_ani_at_rank=None)]
-    assert q_res.summarized_lineage_results['superkingdom'] == sk
-    print(q_res.summarized_lineage_results['phylum'])
-    phy = [SummarizedGatherResult(rank='phylum', fraction=0.1, f_weighted_at_rank=0.2,
-                                  lineage=RankLineageInfo(lineage_str="a;b"),
-                                  bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)),
-           SummarizedGatherResult(rank='phylum', fraction=0.9, lineage=RankLineageInfo(),f_weighted_at_rank=0.8,
-                                  bp_match_at_rank=80, query_ani_at_rank=None)]
-    assert q_res.summarized_lineage_results['phylum'] == phy
-    print(q_res.summarized_lineage_results['class'])
-    cl = [SummarizedGatherResult(rank='class', fraction=0.1, lineage= RankLineageInfo(lineage_str="a;b;c"),
-                                 f_weighted_at_rank=0.2, bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)),
-          SummarizedGatherResult(rank='class', fraction=0.9, lineage=RankLineageInfo(), f_weighted_at_rank=0.8,
-                                 bp_match_at_rank=80, query_ani_at_rank=None)]
-    assert q_res.summarized_lineage_results['class'] == cl
-
-    assert q_res.total_f_weighted['phylum'] == approx(0.2)
-    assert q_res.total_f_classified['class'] == approx(0.1)
-    assert q_res.total_bp_classified['superkingdom'] == 20
+    print(q_res.summarized_lineage_results["superkingdom"])
+
+    sk = [
+        SummarizedGatherResult(
+            rank="superkingdom",
+            fraction=0.1,
+            f_weighted_at_rank=0.2,
+            lineage=RankLineageInfo(lineage_str="a"),
+            bp_match_at_rank=20,
+            query_ani_at_rank=approx(0.928, rel=1e-2),
+        ),
+        SummarizedGatherResult(
+            rank="superkingdom",
+            fraction=0.9,
+            lineage=RankLineageInfo(),
+            f_weighted_at_rank=0.8,
+            bp_match_at_rank=80,
+            query_ani_at_rank=None,
+        ),
+    ]
+    assert q_res.summarized_lineage_results["superkingdom"] == sk
+    print(q_res.summarized_lineage_results["phylum"])
+    phy = [
+        SummarizedGatherResult(
+            rank="phylum",
+            fraction=0.1,
+            f_weighted_at_rank=0.2,
+            lineage=RankLineageInfo(lineage_str="a;b"),
+            bp_match_at_rank=20,
+            query_ani_at_rank=approx(0.928, rel=1e-2),
+        ),
+        SummarizedGatherResult(
+            rank="phylum",
+            fraction=0.9,
+            lineage=RankLineageInfo(),
+            f_weighted_at_rank=0.8,
+            bp_match_at_rank=80,
+            query_ani_at_rank=None,
+        ),
+    ]
+    assert q_res.summarized_lineage_results["phylum"] == phy
+    print(q_res.summarized_lineage_results["class"])
+    cl = [
+        SummarizedGatherResult(
+            rank="class",
+            fraction=0.1,
+            lineage=RankLineageInfo(lineage_str="a;b;c"),
+            f_weighted_at_rank=0.2,
+            bp_match_at_rank=20,
+            query_ani_at_rank=approx(0.928, rel=1e-2),
+        ),
+        SummarizedGatherResult(
+            rank="class",
+            fraction=0.9,
+            lineage=RankLineageInfo(),
+            f_weighted_at_rank=0.8,
+            bp_match_at_rank=80,
+            query_ani_at_rank=None,
+        ),
+    ]
+    assert q_res.summarized_lineage_results["class"] == cl
+
+    assert q_res.total_f_weighted["phylum"] == approx(0.2)
+    assert q_res.total_f_classified["class"] == approx(0.1)
+    assert q_res.total_bp_classified["superkingdom"] == 20


 def test_QueryTaxResult_build_summarized_result_skipped_lineage():
     "build summarized_result with skipped lineage"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB'}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, skip_idents=['gB'])
+    gather_results = [{}, {"name": "gB"}]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True, skip_idents=["gB"]
+    )
     q_res.build_summarized_result()
     print(q_res.summarized_lineage_results.keys())
-    print(q_res.summarized_lineage_results['superkingdom'])
-
-    sk = [SummarizedGatherResult(rank='superkingdom', fraction=0.1, f_weighted_at_rank=0.2,
-                                 lineage=RankLineageInfo(lineage_str="a"),
-                                 bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)),
-          SummarizedGatherResult(rank='superkingdom', fraction=0.9, lineage=RankLineageInfo(),f_weighted_at_rank=0.8,
-                                 bp_match_at_rank=80, query_ani_at_rank=None)]
-    assert q_res.summarized_lineage_results['superkingdom'] == sk
-    print(q_res.summarized_lineage_results['phylum'])
-    phy = [SummarizedGatherResult(rank='phylum', fraction=0.1, lineage=RankLineageInfo(lineage_str="a;b"),
-                                  f_weighted_at_rank=0.2, bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)),
-           SummarizedGatherResult(rank='phylum', fraction=0.9, lineage=RankLineageInfo(), f_weighted_at_rank=0.8, bp_match_at_rank=80,
-                                  query_ani_at_rank=None)]
+    print(q_res.summarized_lineage_results["superkingdom"])
+
+    sk = [
+        SummarizedGatherResult(
+            rank="superkingdom",
+            fraction=0.1,
+            f_weighted_at_rank=0.2,
+            lineage=RankLineageInfo(lineage_str="a"),
+            bp_match_at_rank=20,
+            query_ani_at_rank=approx(0.928, rel=1e-2),
+        ),
+        SummarizedGatherResult(
+            rank="superkingdom",
+            fraction=0.9,
+            lineage=RankLineageInfo(),
+            f_weighted_at_rank=0.8,
+            bp_match_at_rank=80,
+            query_ani_at_rank=None,
+        ),
+    ]
+    assert q_res.summarized_lineage_results["superkingdom"] == sk
+    print(q_res.summarized_lineage_results["phylum"])
+    phy = [
+        SummarizedGatherResult(
+            rank="phylum",
+            fraction=0.1,
+            lineage=RankLineageInfo(lineage_str="a;b"),
+            f_weighted_at_rank=0.2,
+            bp_match_at_rank=20,
+            query_ani_at_rank=approx(0.928, rel=1e-2),
+        ),
+        SummarizedGatherResult(
+            rank="phylum",
+            fraction=0.9,
+            lineage=RankLineageInfo(),
+            f_weighted_at_rank=0.8,
+            bp_match_at_rank=80,
+            query_ani_at_rank=None,
+        ),
+    ]
+    assert q_res.summarized_lineage_results["phylum"] == phy
+    print(q_res.summarized_lineage_results["class"])
+    cl = [
+        SummarizedGatherResult(
+            rank="class",
+            fraction=0.1,
+            lineage=RankLineageInfo(lineage_str="a;b;c"),
+            f_weighted_at_rank=0.2,
+            bp_match_at_rank=20,
+            query_ani_at_rank=approx(0.928, rel=1e-2),
+        ),
+        SummarizedGatherResult(
+            rank="class",
+            fraction=0.9,
+            lineage=RankLineageInfo(),
+            f_weighted_at_rank=0.8,
+            bp_match_at_rank=80,
+            query_ani_at_rank=None,
+        ),
+    ]
+    assert q_res.summarized_lineage_results["class"] == cl
+
+    assert q_res.total_f_weighted["phylum"] == approx(0.2)
+    assert q_res.total_f_classified["class"] == approx(0.1)
+    assert q_res.total_bp_classified["superkingdom"] == 20


 def test_QueryTaxResult_build_summarized_result_over100percent():
     "summarize up ranks: different values"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB','f_unique_to_query': 0.95}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    gather_results = [{}, {"name": "gB", "f_unique_to_query": 0.95}]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
     # now summarize up the ranks
     assert len(q_res.raw_taxresults) == 2
     with pytest.raises(ValueError) as exc:
         q_res.build_summarized_result()
     print(str(exc))
-    assert "Summarized fraction is > 100% of the query! This should not be possible" in str(exc)
+    assert (
+        "Summarized fraction is > 100% of the query! This should not be possible"
+        in str(exc)
+    )


 def test_build_summarized_result_rank_fail_not_available_resummarize():
     "build classification result"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB'}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
-    q_res.summarize_up_ranks('superkingdom')
+    gather_results = [{}, {"name": "gB"}]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
+    q_res.summarize_up_ranks("superkingdom")
     with pytest.raises(ValueError) as exc:
-        q_res.build_summarized_result(single_rank='order')
+        q_res.build_summarized_result(single_rank="order")
     print(str(exc))
     assert "Error: rank 'order' not in summarized rank(s), superkingdom" in str(exc)
@@ -2386,15 +3349,31 @@ def test_aggregate_by_lineage_at_rank():
     gB_tax = ("gB", "a;c")
     taxD = make_mini_taxonomy([gA_tax, gB_tax])
     # make gather results
-    gather_results = [{'query_name': 'queryA', 'name': 'gA', 'f_unique_weighted': 0.5,'f_unique_to_query': 0.4,'unique_intersect_bp': 50},
-                      {'query_name': 'queryA', "name": 'gB', 'f_unique_weighted': 0.3,'f_unique_to_query': 0.3,'unique_intersect_bp': 30}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True)
-    summarized, all_queries = aggregate_by_lineage_at_rank([q_res], rank='phylum', by_query=False)
+    gather_results = [
+        {
+            "query_name": "queryA",
+            "name": "gA",
+            "f_unique_weighted": 0.5,
+            "f_unique_to_query": 0.4,
+            "unique_intersect_bp": 50,
+        },
+        {
+            "query_name": "queryA",
+            "name": "gB",
+            "f_unique_weighted": 0.3,
+            "f_unique_to_query": 0.3,
+            "unique_intersect_bp": 30,
+        },
+    ]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True, summarize=True
+    )
+    summarized, all_queries = aggregate_by_lineage_at_rank(
+        [q_res], rank="phylum", by_query=False
+    )
     print(summarized)
-    assert summarized == {'a;b': 0.4,
-                          'a;c': 0.3,
-                          'unclassified': approx(0.3, rel=1e-2)}
-    assert all_queries == ['queryA']
+    assert summarized == {"a;b": 0.4, "a;c": 0.3, "unclassified": approx(0.3, rel=1e-2)}
+    assert all_queries == ["queryA"]


 def test_aggregate_by_lineage_at_rank_not_available():
@@ -2404,11 +3383,27 @@ def test_aggregate_by_lineage_at_rank_not_available():
     gB_tax = ("gB", "a;c")
     taxD = make_mini_taxonomy([gA_tax, gB_tax])
     # make gather results
-    gather_results = [{'query_name': 'queryA', 'name': 'gA', 'f_unique_weighted': 0.5,'f_unique_to_query': 0.4,'unique_intersect_bp': 50},
-                      {'query_name': 'queryA', "name": 'gB', 'f_unique_weighted': 0.3,'f_unique_to_query': 0.3,'unique_intersect_bp': 30}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True)
+    gather_results = [
+        {
+            "query_name": "queryA",
+            "name": "gA",
+            "f_unique_weighted": 0.5,
+            "f_unique_to_query": 0.4,
+            "unique_intersect_bp": 50,
+        },
+        {
+            "query_name": "queryA",
+            "name": "gB",
+            "f_unique_weighted": 0.3,
+            "f_unique_to_query": 0.3,
+            "unique_intersect_bp": 30,
+        },
+    ]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True, summarize=True
+    )
     with pytest.raises(ValueError) as exc:
-        aggregate_by_lineage_at_rank([q_res], rank='species', by_query=False)
+        aggregate_by_lineage_at_rank([q_res], rank="species", by_query=False)
     print(str(exc))
     assert "Error: rank 'species' not available for aggregation." in str(exc)
@@ -2420,49 +3415,85 @@ def test_aggregate_by_lineage_at_rank_by_query():
     gB_tax = ("gB", "a;c")
     taxD = make_mini_taxonomy([gA_tax, gB_tax])
     # make gather results
-    gather_results = [{'query_name': 'queryA', 'name': 'gA', 'f_unique_weighted': 0.2,'f_unique_to_query': 0.2,'unique_intersect_bp': 50},
-                      {'query_name': 'queryA', "name": 'gB', 'f_unique_weighted': 0.3,'f_unique_to_query': 0.3,'unique_intersect_bp': 30},
-                      {'query_name': 'queryB', "name": 'gB', 'f_unique_weighted': 0.4,'f_unique_to_query': 0.4,'unique_intersect_bp': 30}]
+    gather_results = [
+        {
+            "query_name": "queryA",
+            "name": "gA",
+            "f_unique_weighted": 0.2,
+            "f_unique_to_query": 0.2,
+            "unique_intersect_bp": 50,
+        },
+        {
+            "query_name": "queryA",
+            "name": "gB",
+            "f_unique_weighted": 0.3,
+            "f_unique_to_query": 0.3,
+            "unique_intersect_bp": 30,
+        },
+        {
+            "query_name": "queryB",
+            "name": "gB",
+            "f_unique_weighted": 0.4,
+            "f_unique_to_query": 0.4,
+            "unique_intersect_bp": 30,
+        },
+    ]
     gres = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, summarize=True)
     # check by query
-    summarized, all_queries = aggregate_by_lineage_at_rank(gres.values(), rank='superkingdom', by_query=True)
+    summarized, all_queries = aggregate_by_lineage_at_rank(
+        gres.values(), rank="superkingdom", by_query=True
+    )
     print(summarized)
-    assert summarized == {"a": {'queryA': 0.5, 'queryB': 0.4},
-                          "unclassified": {'queryA': 0.5, 'queryB': 0.6}}
-    #assert summarized == {'a': {'queryA': approx(0.1, rel=1e-2), 'queryB': 0.7}}
-    assert all_queries == ['queryA', 'queryB']
-    summarized, all_queries = aggregate_by_lineage_at_rank(gres.values(), rank='phylum', by_query=True)
+    assert summarized == {
+        "a": {"queryA": 0.5, "queryB": 0.4},
+        "unclassified": {"queryA": 0.5, "queryB": 0.6},
+    }
+    # assert summarized == {'a': {'queryA': approx(0.1, rel=1e-2), 'queryB': 0.7}}
+    assert all_queries == ["queryA", "queryB"]
+    summarized, all_queries = aggregate_by_lineage_at_rank(
+        gres.values(), rank="phylum", by_query=True
+    )
     print(summarized)
-    assert summarized == {'a;c': {'queryA': 0.3, 'queryB': 0.4},
-                          'a;b': {'queryA': 0.2},
-                          "unclassified": {'queryA': 0.5, 'queryB': 0.6}}
-
+    assert summarized == {
+        "a;c": {"queryA": 0.3, "queryB": 0.4},
+        "a;b": {"queryA": 0.2},
+        "unclassified": {"queryA": 0.5, "queryB": 0.6},
+    }
+

 def test_build_classification_result_containment_threshold_fail():
     "classification result: improper containment threshold"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB'}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    gather_results = [{}, {"name": "gB"}]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
     with pytest.raises(ValueError) as exc:
         q_res.build_classification_result(containment_threshold=1.2)
     print(str(exc))
-    assert "Containment threshold must be between 0 and 1 (input value: 1.2)." in str(exc)
+    assert "Containment threshold must be between 0 and 1 (input value: 1.2)." in str(
+        exc
+    )
     with pytest.raises(ValueError) as exc:
-        q_res.build_classification_result(containment_threshold=-.1)
+        q_res.build_classification_result(containment_threshold=-0.1)
     print(str(exc))
-    assert "Containment threshold must be between 0 and 1 (input value: -0.1)." in str(exc)
+    assert "Containment threshold must be between 0 and 1 (input value: -0.1)." in str(
+        exc
+    )


 def test_build_classification_result_containment_threshold():
     "basic functionality: build classification result using containment threshold"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB'}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    gather_results = [{}, {"name": "gB"}]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
     q_res.build_classification_result(containment_threshold=0.1)
     print("classif: ", q_res.classification_result)
-    assert q_res.classification_result.status == 'match'
-    assert q_res.classification_result.rank == 'class'
+    assert q_res.classification_result.status == "match"
+    assert q_res.classification_result.rank == "class"
     assert q_res.classification_result.fraction == 0.1
     assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b;c")
     assert q_res.classification_result.f_weighted_at_rank == 0.2
@@ -2471,8 +3502,8 @@ def test_build_classification_result_containment_threshold():
     q_res.build_classification_result(containment_threshold=0.2)
     print("classif: ", q_res.classification_result)
-    assert q_res.classification_result.status == 'match'
-    assert q_res.classification_result.rank == 'phylum'
+    assert q_res.classification_result.status == "match"
+    assert q_res.classification_result.rank == "phylum"
     assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b")
     assert q_res.classification_result.f_weighted_at_rank == 0.4
     assert q_res.classification_result.fraction == 0.2
@@ -2481,8 +3512,8 @@ def test_build_classification_result_containment_threshold():
     q_res.build_classification_result(containment_threshold=1.0)
     print("classif: ", q_res.classification_result)
-    assert q_res.classification_result.status == 'below_threshold'
-    assert q_res.classification_result.rank == 'superkingdom'
+    assert q_res.classification_result.status == "below_threshold"
+    assert q_res.classification_result.rank == "superkingdom"
     assert q_res.classification_result.fraction == 0.2
    assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a")
     assert q_res.classification_result.f_weighted_at_rank == 0.4
@@ -2493,23 +3524,25 @@ def test_build_classification_result_containment_threshold():

 def test_build_classification_result_ani_threshold():
     "basic functionality: build classification result"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB'}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    gather_results = [{}, {"name": "gB"}]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )

-    q_res.build_classification_result(ani_threshold=.92)
+    q_res.build_classification_result(ani_threshold=0.92)
     print("classif: ", q_res.classification_result)
-    assert q_res.classification_result.status == 'match'
-    assert q_res.classification_result.rank == 'class'
+    assert q_res.classification_result.status == "match"
+    assert q_res.classification_result.rank == "class"
     assert q_res.classification_result.fraction == 0.1
     assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b;c")
     assert q_res.classification_result.f_weighted_at_rank == 0.2
     assert q_res.classification_result.bp_match_at_rank == 20
     assert q_res.classification_result.query_ani_at_rank == approx(0.928, rel=1e-2)

-    q_res.build_classification_result(ani_threshold=0.94) # should classify at phylum
+    q_res.build_classification_result(ani_threshold=0.94)  # should classify at phylum
     print("classif: ", q_res.classification_result)
-    assert q_res.classification_result.status == 'match'
-    assert q_res.classification_result.rank == 'phylum'
+    assert q_res.classification_result.status == "match"
+    assert q_res.classification_result.rank == "phylum"
     assert q_res.classification_result.fraction == 0.2
     assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b")
     assert q_res.classification_result.f_weighted_at_rank == 0.4
@@ -2519,8 +3552,8 @@ def test_build_classification_result_ani_threshold():
     # superk result, but doesn't meet ANI threshold
     q_res.build_classification_result(ani_threshold=0.96)
     print("classif: ", q_res.classification_result)
-    assert q_res.classification_result.status == 'below_threshold'
-    assert q_res.classification_result.rank == 'superkingdom'
+    assert q_res.classification_result.status == "below_threshold"
+    assert q_res.classification_result.rank == "superkingdom"
     assert q_res.classification_result.fraction == 0.2
     assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a")
     assert q_res.classification_result.f_weighted_at_rank == 0.4
@@ -2531,14 +3564,16 @@ def test_build_classification_result_ani_threshold():

 def test_build_classification_result_ani_threshold_fail():
     "classification result: improper ANI threshold"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB'}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    gather_results = [{}, {"name": "gB"}]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
     with pytest.raises(ValueError) as exc:
         q_res.build_classification_result(ani_threshold=1.2)
     print(str(exc))
     assert "ANI threshold must be between 0 and 1 (input value: 1.2)." in str(exc)
     with pytest.raises(ValueError) as exc:
-        q_res.build_classification_result(ani_threshold=-.1)
+        q_res.build_classification_result(ani_threshold=-0.1)
     print(str(exc))
     assert "ANI threshold must be between 0 and 1 (input value: -0.1)." in str(exc)
@@ -2546,22 +3581,28 @@ def test_build_classification_result_ani_threshold_fail():

 def test_build_classification_result_rank_fail_not_filled():
     "classification result: rank not available (wasn't filled in tax lineage matches)"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB'}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    gather_results = [{}, {"name": "gB"}]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
     with pytest.raises(ValueError) as exc:
-        q_res.build_classification_result(rank='order')
+        q_res.build_classification_result(rank="order")
     print(str(exc))
-    assert "Error: rank 'order' was not available for any matching lineages." in str(exc)
+    assert "Error: rank 'order' was not available for any matching lineages." in str(
+        exc
+    )


 def test_build_classification_result_rank_fail_not_available_resummarize():
     "classification result: rank not available (wasn't summarized)"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB'}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
-    q_res.summarize_up_ranks('superkingdom')
+    gather_results = [{}, {"name": "gB"}]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
+    q_res.summarize_up_ranks("superkingdom")
     with pytest.raises(ValueError) as exc:
-        q_res.build_classification_result(rank='order')
+        q_res.build_classification_result(rank="order")
     print(str(exc))
     assert "Error: rank 'order' not in summarized rank(s), superkingdom" in str(exc)
@@ -2569,33 +3610,40 @@ def test_build_classification_result_rank_fail_not_available_resummarize():

 def test_build_classification_result_rank_fail_not_available():
     "classification result: rank not available"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB'}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    gather_results = [{}, {"name": "gB"}]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
     with pytest.raises(ValueError) as exc:
-        q_res.build_classification_result(rank='NotARank')
+        q_res.build_classification_result(rank="NotARank")
     print(str(exc))
-    assert "Error: rank 'NotARank' not in available ranks (strain, species, genus, family, order, class, phylum, superkingdom)" in str(exc)
+    assert (
+        "Error: rank 'NotARank' not in available ranks (strain, species, genus, family, order, class, phylum, superkingdom)"
+        in str(exc)
+    )


 def test_build_classification_result_rank_containment_threshold():
     "classification result - rank and containment threshold (default)"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB'}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    gather_results = [{}, {"name": "gB"}]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )

-    q_res.build_classification_result(rank='class')
+    q_res.build_classification_result(rank="class")
     print("classif: ", q_res.classification_result)
-    assert q_res.classification_result.status == 'match'
-    assert q_res.classification_result.rank == 'class'
+    assert q_res.classification_result.status == "match"
+    assert q_res.classification_result.rank == "class"
     assert q_res.classification_result.fraction == 0.1
     assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b;c")
     assert q_res.classification_result.f_weighted_at_rank == 0.2
     assert q_res.classification_result.bp_match_at_rank == 20
     assert q_res.classification_result.query_ani_at_rank == approx(0.928, rel=1e-2)

-    q_res.build_classification_result(rank='class', containment_threshold=0.4)
-    assert q_res.classification_result.status == 'below_threshold'
-    assert q_res.classification_result.rank == 'class'
+    q_res.build_classification_result(rank="class", containment_threshold=0.4)
+    assert q_res.classification_result.status == "below_threshold"
+    assert q_res.classification_result.rank == "class"
     assert q_res.classification_result.fraction == 0.1
     assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b;c")
     assert q_res.classification_result.f_weighted_at_rank == 0.2
@@ -2606,21 +3654,23 @@ def test_build_classification_result_rank_ani_threshold():
     "classification result with rank and ANI threshold"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB'}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
-
-    q_res.build_classification_result(rank='class', ani_threshold=0.92)
-    assert q_res.classification_result.status == 'match'
-    assert q_res.classification_result.rank == 'class'
+    gather_results = [{}, {"name": "gB"}]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
+
+    q_res.build_classification_result(rank="class", ani_threshold=0.92)
+    assert q_res.classification_result.status == "match"
+    assert q_res.classification_result.rank == "class"
     assert q_res.classification_result.fraction == 0.1
     assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b;c")
     assert q_res.classification_result.f_weighted_at_rank == 0.2
     assert q_res.classification_result.bp_match_at_rank == 20
     assert q_res.classification_result.query_ani_at_rank == approx(0.928, rel=1e-2)

-    q_res.build_classification_result(rank='class', ani_threshold=0.95)
-    assert q_res.classification_result.status == 'below_threshold'
-    assert q_res.classification_result.rank == 'class'
+    q_res.build_classification_result(rank="class", ani_threshold=0.95)
+    assert q_res.classification_result.status == "below_threshold"
+    assert q_res.classification_result.rank == "class"
     assert q_res.classification_result.fraction == 0.1
     assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b;c")
     assert q_res.classification_result.f_weighted_at_rank == 0.2
@@ -2631,55 +3681,63 @@ def test_krona_classified():
     "basic functionality: build classification result using containment threshold"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB'}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    gather_results = [{}, {"name": "gB"}]
+    q_res = make_QueryTaxResults(
+        gather_info=gather_results, taxD=taxD, single_query=True
+    )
     q_res.build_classification_result()
-    assert q_res.krona_classified == None
-    q_res.build_classification_result(rank='phylum')#, force_resummarize=True)
+    assert q_res.krona_classified is None
+    q_res.build_classification_result(rank="phylum")  # , force_resummarize=True)
     print(q_res.krona_classified)
-    assert q_res.krona_classified == (0.2, 'a', 'b')
-    assert
q_res.krona_unclassified == (0.8, 'unclassified', 'unclassified') - q_res.build_classification_result(rank='superkingdom') + assert q_res.krona_classified == (0.2, "a", "b") + assert q_res.krona_unclassified == (0.8, "unclassified", "unclassified") + q_res.build_classification_result(rank="superkingdom") print(q_res.krona_classified) - assert q_res.krona_classified == (0.2, 'a') - assert q_res.krona_unclassified == (0.8, 'unclassified') + assert q_res.krona_classified == (0.2, "a") + assert q_res.krona_unclassified == (0.8, "unclassified") # make sure this goes back to None if we reclassify without rank q_res.build_classification_result() - assert q_res.krona_classified == None - assert q_res.krona_unclassified == None + assert q_res.krona_classified is None + assert q_res.krona_unclassified is None assert q_res.krona_header == [] def test_make_krona_header_basic(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] + gather_results = [{}, {"name": "gB"}] phy_header = ["fraction", "superkingdom", "phylum"] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) - q_res.build_classification_result(rank='phylum') + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True + ) + q_res.build_classification_result(rank="phylum") print(q_res.krona_classified) print(q_res.krona_header) assert q_res.krona_header == phy_header - hd = q_res.make_krona_header('phylum') + hd = q_res.make_krona_header("phylum") print("header: ", hd) assert hd == phy_header def test_make_krona_header_basic_1(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] + gather_results = [{}, {"name": "gB"}] class_header = ["fraction", "superkingdom", "phylum", "class"] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) - q_res.build_classification_result(rank='class') + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True + ) + q_res.build_classification_result(rank="class") assert q_res.krona_header == class_header - hd = q_res.make_krona_header(min_rank='class') + hd = q_res.make_krona_header(min_rank="class") print("header: ", hd) assert hd == class_header def test_make_krona_header_fail(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, summarize=True + ) with pytest.raises(ValueError) as exc: q_res.make_krona_header("order") assert "Rank 'order' not present in summarized ranks." 
in str(exc.value) @@ -2690,305 +3748,740 @@ def test_make_krona_header_fail(): def test_make_human_summary(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) - hs = q_res.make_human_summary(display_rank = "superkingdom") + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, summarize=True + ) + hs = q_res.make_human_summary(display_rank="superkingdom") print(hs) - assert hs == [{'rank': 'superkingdom', 'fraction': '0.800', 'lineage': 'unclassified', - 'f_weighted_at_rank': '60.0%', 'bp_match_at_rank': "60", 'query_ani_at_rank': '- ', - 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', - 'total_weighted_hashes': "0"}, - {'rank': 'superkingdom', 'fraction': '0.200', 'lineage': "a", - 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': "40", 'query_ani_at_rank': '94.9%', - 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': "0"}] + assert hs == [ + { + "rank": "superkingdom", + "fraction": "0.800", + "lineage": "unclassified", + "f_weighted_at_rank": "60.0%", + "bp_match_at_rank": "60", + "query_ani_at_rank": "- ", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "superkingdom", + "fraction": "0.200", + "lineage": "a", + "f_weighted_at_rank": "40.0%", + "bp_match_at_rank": "40", + "query_ani_at_rank": "94.9%", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + ] def test_make_human_summary_2(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) - hs = q_res.make_human_summary(display_rank = "phylum") + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, summarize=True + ) + hs = q_res.make_human_summary(display_rank="phylum") print(hs) - assert hs == [{'rank': 'phylum', 'fraction': '0.800', 'lineage': 'unclassified', - 'f_weighted_at_rank': '60.0%', 'bp_match_at_rank': "60", 'query_ani_at_rank': '- ', - 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', - 'total_weighted_hashes': "0"}, - {'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b', - 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': "40", 'query_ani_at_rank': '94.9%', - 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': "0"}] + assert hs == [ + { + "rank": "phylum", + "fraction": "0.800", + "lineage": "unclassified", + "f_weighted_at_rank": "60.0%", + "bp_match_at_rank": "60", + "query_ani_at_rank": "- ", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "phylum", + "fraction": "0.200", + "lineage": "a;b", + "f_weighted_at_rank": "40.0%", + "bp_match_at_rank": "40", + "query_ani_at_rank": "94.9%", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + ] def test_make_human_summary_classification(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, 
single_query=True, classify=True, classify_rank="superkingdom") - hs = q_res.make_human_summary(display_rank = "superkingdom", classification=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, + taxD=taxD, + single_query=True, + classify=True, + classify_rank="superkingdom", + ) + hs = q_res.make_human_summary(display_rank="superkingdom", classification=True) print(hs) - assert hs == [{'rank': 'superkingdom', 'fraction': '0.200', 'lineage': 'a', - 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': "40", - 'query_ani_at_rank': '94.9%', 'status': 'match', 'query_name': 'q1', - 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': "0"}] + assert hs == [ + { + "rank": "superkingdom", + "fraction": "0.200", + "lineage": "a", + "f_weighted_at_rank": "40.0%", + "bp_match_at_rank": "40", + "query_ani_at_rank": "94.9%", + "status": "match", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + } + ] def test_make_human_summary_classification_2(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, classify=True, classify_rank="phylum") - hs = q_res.make_human_summary(display_rank = "phylum", classification=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, + taxD=taxD, + single_query=True, + classify=True, + classify_rank="phylum", + ) + hs = q_res.make_human_summary(display_rank="phylum", classification=True) print(hs) - assert hs == [{'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b', - 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': "40", - 'query_ani_at_rank': '94.9%', 'status': 'match', - 'query_name': 'q1', 'query_md5': 'md5', - 'query_filename': 'query_fn', 'total_weighted_hashes': "0"}] + assert hs == [ + { + "rank": "phylum", + "fraction": "0.200", + "lineage": "a;b", + "f_weighted_at_rank": "40.0%", + "bp_match_at_rank": "40", + "query_ani_at_rank": "94.9%", + "status": "match", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + } + ] def test_make_full_summary(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, summarize=True + ) header, fs = q_res.make_full_summary() - assert header == ['query_name', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', - 'f_weighted_at_rank', 'bp_match_at_rank', 'query_ani_at_rank', 'total_weighted_hashes'] + assert header == [ + "query_name", + "rank", + "fraction", + "lineage", + "query_md5", + "query_filename", + "f_weighted_at_rank", + "bp_match_at_rank", + "query_ani_at_rank", + "total_weighted_hashes", + ] print(fs) - assert fs == [{'rank': 'superkingdom', 'fraction': '0.2', 'lineage': 'a', 'f_weighted_at_rank': '0.4', - 'bp_match_at_rank': '40', 'query_ani_at_rank': approx(0.949,rel=1e-3), 'query_name': 'q1', - 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, - {'rank': 'superkingdom', 'fraction': '0.8', 'lineage': 'unclassified', 'f_weighted_at_rank': - '0.6', 'bp_match_at_rank': '60', 'query_ani_at_rank': None, - 'query_name': 
'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', - 'total_weighted_hashes': '0'}, - {'rank': 'phylum', 'fraction': '0.2', 'lineage': 'a;b', 'f_weighted_at_rank': '0.4', - 'bp_match_at_rank': '40', 'query_ani_at_rank': approx(0.949,rel=1e-3), 'query_name': 'q1', - 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, - {'rank': 'phylum', 'fraction': '0.8', 'lineage': 'unclassified', 'f_weighted_at_rank': '0.6', - 'bp_match_at_rank': '60', 'query_ani_at_rank': None, 'query_name': 'q1', 'query_md5': 'md5', - 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, - {'rank': 'class', 'fraction': '0.1', 'lineage': 'a;b;c', 'f_weighted_at_rank': '0.2', - 'bp_match_at_rank': '20', 'query_ani_at_rank': approx(0.928, rel=1e-3), - 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, - {'rank': 'class', 'fraction': '0.1', 'lineage': 'a;b;d','f_weighted_at_rank': '0.2', - 'bp_match_at_rank': '20', 'query_ani_at_rank': approx(0.928, rel=1e-3), 'query_name': 'q1', - 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, - {'rank': 'class', 'fraction': '0.8', 'lineage': 'unclassified', 'f_weighted_at_rank': '0.6', - 'bp_match_at_rank': '60', 'query_ani_at_rank': None, 'query_name': 'q1', 'query_md5': 'md5', - 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}] - + assert fs == [ + { + "rank": "superkingdom", + "fraction": "0.2", + "lineage": "a", + "f_weighted_at_rank": "0.4", + "bp_match_at_rank": "40", + "query_ani_at_rank": approx(0.949, rel=1e-3), + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "superkingdom", + "fraction": "0.8", + "lineage": "unclassified", + "f_weighted_at_rank": "0.6", + "bp_match_at_rank": "60", + "query_ani_at_rank": None, + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "phylum", + "fraction": "0.2", + "lineage": "a;b", + "f_weighted_at_rank": "0.4", + "bp_match_at_rank": "40", + "query_ani_at_rank": approx(0.949, rel=1e-3), + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "phylum", + "fraction": "0.8", + "lineage": "unclassified", + "f_weighted_at_rank": "0.6", + "bp_match_at_rank": "60", + "query_ani_at_rank": None, + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "class", + "fraction": "0.1", + "lineage": "a;b;c", + "f_weighted_at_rank": "0.2", + "bp_match_at_rank": "20", + "query_ani_at_rank": approx(0.928, rel=1e-3), + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "class", + "fraction": "0.1", + "lineage": "a;b;d", + "f_weighted_at_rank": "0.2", + "bp_match_at_rank": "20", + "query_ani_at_rank": approx(0.928, rel=1e-3), + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "class", + "fraction": "0.8", + "lineage": "unclassified", + "f_weighted_at_rank": "0.6", + "bp_match_at_rank": "60", + "query_ani_at_rank": None, + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + ] + header, fs = q_res.make_full_summary(limit_float=True) - assert header == ['query_name', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', - 
'f_weighted_at_rank', 'bp_match_at_rank', 'query_ani_at_rank', 'total_weighted_hashes'] + assert header == [ + "query_name", + "rank", + "fraction", + "lineage", + "query_md5", + "query_filename", + "f_weighted_at_rank", + "bp_match_at_rank", + "query_ani_at_rank", + "total_weighted_hashes", + ] print(fs) - assert fs == [{'rank': 'superkingdom', 'fraction': '0.200', 'lineage': 'a', 'f_weighted_at_rank': '0.400', - 'bp_match_at_rank': '40', 'query_ani_at_rank': "0.949", 'query_name': 'q1', - 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, - {'rank': 'superkingdom', 'fraction': '0.800', 'lineage': 'unclassified', 'f_weighted_at_rank': - '0.600', 'bp_match_at_rank': '60', 'query_ani_at_rank': None, - 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', - 'total_weighted_hashes': '0'}, - {'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b', 'f_weighted_at_rank': '0.400', - 'bp_match_at_rank': '40', 'query_ani_at_rank': "0.949", 'query_name': 'q1', - 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, - {'rank': 'phylum', 'fraction': '0.800', 'lineage': 'unclassified', 'f_weighted_at_rank': '0.600', - 'bp_match_at_rank': '60', 'query_ani_at_rank': None, 'query_name': 'q1', 'query_md5': 'md5', - 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, - {'rank': 'class', 'fraction': '0.100', 'lineage': 'a;b;c', 'f_weighted_at_rank': '0.200', - 'bp_match_at_rank': '20', 'query_ani_at_rank': "0.928", - 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, - {'rank': 'class', 'fraction': '0.100', 'lineage': 'a;b;d','f_weighted_at_rank': '0.200', - 'bp_match_at_rank': '20', 'query_ani_at_rank': "0.928", 'query_name': 'q1', - 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, - {'rank': 'class', 'fraction': '0.800', 'lineage': 'unclassified', 'f_weighted_at_rank': '0.600', - 'bp_match_at_rank': '60', 'query_ani_at_rank': None, 'query_name': 'q1', 'query_md5': 'md5', - 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}] + assert fs == [ + { + "rank": "superkingdom", + "fraction": "0.200", + "lineage": "a", + "f_weighted_at_rank": "0.400", + "bp_match_at_rank": "40", + "query_ani_at_rank": "0.949", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "superkingdom", + "fraction": "0.800", + "lineage": "unclassified", + "f_weighted_at_rank": "0.600", + "bp_match_at_rank": "60", + "query_ani_at_rank": None, + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "phylum", + "fraction": "0.200", + "lineage": "a;b", + "f_weighted_at_rank": "0.400", + "bp_match_at_rank": "40", + "query_ani_at_rank": "0.949", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "phylum", + "fraction": "0.800", + "lineage": "unclassified", + "f_weighted_at_rank": "0.600", + "bp_match_at_rank": "60", + "query_ani_at_rank": None, + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "class", + "fraction": "0.100", + "lineage": "a;b;c", + "f_weighted_at_rank": "0.200", + "bp_match_at_rank": "20", + "query_ani_at_rank": "0.928", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "class", + 
"fraction": "0.100", + "lineage": "a;b;d", + "f_weighted_at_rank": "0.200", + "bp_match_at_rank": "20", + "query_ani_at_rank": "0.928", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + { + "rank": "class", + "fraction": "0.800", + "lineage": "unclassified", + "f_weighted_at_rank": "0.600", + "bp_match_at_rank": "60", + "query_ani_at_rank": None, + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + "total_weighted_hashes": "0", + }, + ] def test_make_full_summary_summarization_fail(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=False) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, summarize=False + ) with pytest.raises(ValueError) as exc: q_res.make_full_summary() print(str(exc)) - assert 'not summarized yet' in str(exc) + assert "not summarized yet" in str(exc) def test_make_full_summary_classification(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, classify=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, classify=True + ) header, fs = q_res.make_full_summary(classification=True) - assert header == ["query_name", "status", "rank", "fraction", "lineage", - "query_md5", "query_filename", "f_weighted_at_rank", - "bp_match_at_rank", "query_ani_at_rank"] + assert header == [ + "query_name", + "status", + "rank", + "fraction", + "lineage", + "query_md5", + "query_filename", + "f_weighted_at_rank", + "bp_match_at_rank", + "query_ani_at_rank", + ] print(fs) - assert fs == [{'rank': 'class', 'fraction': '0.1', 'lineage': 'a;b;c', 'f_weighted_at_rank': '0.2', - 'bp_match_at_rank': '20', 'query_ani_at_rank': approx(0.928, rel=1e-3), - 'status': 'match', 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn'}] + assert fs == [ + { + "rank": "class", + "fraction": "0.1", + "lineage": "a;b;c", + "f_weighted_at_rank": "0.2", + "bp_match_at_rank": "20", + "query_ani_at_rank": approx(0.928, rel=1e-3), + "status": "match", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + } + ] + - def test_make_full_summary_classification_limit_float(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, classify=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, classify=True + ) header, fs = q_res.make_full_summary(classification=True, limit_float=True) - assert header == ["query_name", "status", "rank", "fraction", "lineage", - "query_md5", "query_filename", "f_weighted_at_rank", - "bp_match_at_rank", "query_ani_at_rank"] + assert header == [ + "query_name", + "status", + "rank", + "fraction", + "lineage", + "query_md5", + "query_filename", + "f_weighted_at_rank", + "bp_match_at_rank", + "query_ani_at_rank", + ] print(fs) - assert fs == [{'rank': 'class', 'fraction': '0.100', 'lineage': 'a;b;c', 'f_weighted_at_rank': '0.200', - 'bp_match_at_rank': '20', 'query_ani_at_rank': "0.928", - 'status': 
'match', 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn'}] + assert fs == [ + { + "rank": "class", + "fraction": "0.100", + "lineage": "a;b;c", + "f_weighted_at_rank": "0.200", + "bp_match_at_rank": "20", + "query_ani_at_rank": "0.928", + "status": "match", + "query_name": "q1", + "query_md5": "md5", + "query_filename": "query_fn", + } + ] def test_make_full_summary_classification_fail(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, summarize=True + ) with pytest.raises(ValueError) as exc: q_res.make_full_summary(classification=True) print(str(exc)) - assert 'not classified yet' in str(exc) + assert "not classified yet" in str(exc) def test_make_kreport_results(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;c;d;e;f;g")]) - #need to go down to species to check that `num_bp_assigned` is happening correctly - gather_results = [{"total_weighted_hashes":100}, {"name": 'gB', "total_weighted_hashes":100}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + # need to go down to species to check that `num_bp_assigned` is happening correctly + gather_results = [ + {"total_weighted_hashes": 100}, + {"name": "gB", "total_weighted_hashes": 100}, + ] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, summarize=True + ) header, krepD = q_res.make_kreport_results() print(krepD) - assert krepD == [{'num_bp_assigned': '0', 'percent_containment': '40.00', 'num_bp_contained': '40', - 'rank_code': 'D', 'sci_name': 'a', 'ncbi_taxid': None}, - {'num_bp_assigned': '60', 'percent_containment': '60.00', 'num_bp_contained': '60', - 'sci_name': 'unclassified', 'rank_code': 'U', 'ncbi_taxid': None}, - {'num_bp_assigned': '0', 'percent_containment': '40.00', 'num_bp_contained': '40', - 'rank_code': 'P', 'sci_name': 'b', 'ncbi_taxid': None}, - {'num_bp_assigned': '0', 'percent_containment': '40.00', 'num_bp_contained': '40', - 'rank_code': 'C', 'sci_name': 'c', 'ncbi_taxid': None}, - {'num_bp_assigned': '0', 'percent_containment': '20.00', 'num_bp_contained': '20', - 'rank_code': 'O', 'sci_name': 'd', 'ncbi_taxid': None}, - {'num_bp_assigned': '0', 'percent_containment': '20.00', 'num_bp_contained': '20', - 'rank_code': 'F', 'sci_name': 'e', 'ncbi_taxid': None}, - {'num_bp_assigned': '0', 'percent_containment': '20.00', 'num_bp_contained': '20', - 'rank_code': 'G', 'sci_name': 'f', 'ncbi_taxid': None}, - {'num_bp_assigned': '20', 'percent_containment': '20.00', 'num_bp_contained': '20', - 'rank_code': 'S', 'sci_name': 'g', 'ncbi_taxid': None}] + assert krepD == [ + { + "num_bp_assigned": "0", + "percent_containment": "40.00", + "num_bp_contained": "40", + "rank_code": "D", + "sci_name": "a", + "ncbi_taxid": None, + }, + { + "num_bp_assigned": "60", + "percent_containment": "60.00", + "num_bp_contained": "60", + "sci_name": "unclassified", + "rank_code": "U", + "ncbi_taxid": None, + }, + { + "num_bp_assigned": "0", + "percent_containment": "40.00", + "num_bp_contained": "40", + "rank_code": "P", + "sci_name": "b", + "ncbi_taxid": None, + }, + { + "num_bp_assigned": "0", + "percent_containment": "40.00", + "num_bp_contained": "40", + "rank_code": "C", + "sci_name": "c", + "ncbi_taxid": None, + }, + { + 
"num_bp_assigned": "0", + "percent_containment": "20.00", + "num_bp_contained": "20", + "rank_code": "O", + "sci_name": "d", + "ncbi_taxid": None, + }, + { + "num_bp_assigned": "0", + "percent_containment": "20.00", + "num_bp_contained": "20", + "rank_code": "F", + "sci_name": "e", + "ncbi_taxid": None, + }, + { + "num_bp_assigned": "0", + "percent_containment": "20.00", + "num_bp_contained": "20", + "rank_code": "G", + "sci_name": "f", + "ncbi_taxid": None, + }, + { + "num_bp_assigned": "20", + "percent_containment": "20.00", + "num_bp_contained": "20", + "rank_code": "S", + "sci_name": "g", + "ncbi_taxid": None, + }, + ] def test_make_kreport_results_with_taxids(): - taxD = make_mini_taxonomy_with_taxids([("gA", "a;b;c", "1;2;3"), ("gB", "a;b;c;d;e;f;g", "1;2;3;4;5;6;7")]) + taxD = make_mini_taxonomy_with_taxids( + [("gA", "a;b;c", "1;2;3"), ("gB", "a;b;c;d;e;f;g", "1;2;3;4;5;6;7")] + ) print(taxD) - #need to go down to species to check that `num_bp_assigned` is happening correctly - gather_results = [{"total_weighted_hashes":100}, {"name": 'gB', "total_weighted_hashes":100}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + # need to go down to species to check that `num_bp_assigned` is happening correctly + gather_results = [ + {"total_weighted_hashes": 100}, + {"name": "gB", "total_weighted_hashes": 100}, + ] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, summarize=True + ) header, krepD = q_res.make_kreport_results() print(krepD) - assert krepD == [{'num_bp_assigned': '0', 'percent_containment': '40.00', 'num_bp_contained': '40', - 'rank_code': 'D', 'sci_name': 'a', 'ncbi_taxid': '1'}, - {'num_bp_assigned': '60', 'percent_containment': '60.00', 'num_bp_contained': '60', - 'sci_name': 'unclassified', 'rank_code': 'U', 'ncbi_taxid': None}, - {'num_bp_assigned': '0', 'percent_containment': '40.00', 'num_bp_contained': '40', - 'rank_code': 'P', 'sci_name': 'b', 'ncbi_taxid': '2'}, - {'num_bp_assigned': '0', 'percent_containment': '40.00', 'num_bp_contained': '40', - 'rank_code': 'C', 'sci_name': 'c', 'ncbi_taxid': '3'}, - {'num_bp_assigned': '0', 'percent_containment': '20.00', 'num_bp_contained': '20', - 'rank_code': 'O', 'sci_name': 'd', 'ncbi_taxid': '4'}, - {'num_bp_assigned': '0', 'percent_containment': '20.00', 'num_bp_contained': '20', - 'rank_code': 'F', 'sci_name': 'e', 'ncbi_taxid': '5'}, - {'num_bp_assigned': '0', 'percent_containment': '20.00', 'num_bp_contained': '20', - 'rank_code': 'G', 'sci_name': 'f', 'ncbi_taxid': '6'}, - {'num_bp_assigned': '20', 'percent_containment': '20.00', 'num_bp_contained': '20', - 'rank_code': 'S', 'sci_name': 'g', 'ncbi_taxid': '7'}] + assert krepD == [ + { + "num_bp_assigned": "0", + "percent_containment": "40.00", + "num_bp_contained": "40", + "rank_code": "D", + "sci_name": "a", + "ncbi_taxid": "1", + }, + { + "num_bp_assigned": "60", + "percent_containment": "60.00", + "num_bp_contained": "60", + "sci_name": "unclassified", + "rank_code": "U", + "ncbi_taxid": None, + }, + { + "num_bp_assigned": "0", + "percent_containment": "40.00", + "num_bp_contained": "40", + "rank_code": "P", + "sci_name": "b", + "ncbi_taxid": "2", + }, + { + "num_bp_assigned": "0", + "percent_containment": "40.00", + "num_bp_contained": "40", + "rank_code": "C", + "sci_name": "c", + "ncbi_taxid": "3", + }, + { + "num_bp_assigned": "0", + "percent_containment": "20.00", + "num_bp_contained": "20", + "rank_code": "O", + "sci_name": "d", + "ncbi_taxid": "4", + }, + { + 
"num_bp_assigned": "0", + "percent_containment": "20.00", + "num_bp_contained": "20", + "rank_code": "F", + "sci_name": "e", + "ncbi_taxid": "5", + }, + { + "num_bp_assigned": "0", + "percent_containment": "20.00", + "num_bp_contained": "20", + "rank_code": "G", + "sci_name": "f", + "ncbi_taxid": "6", + }, + { + "num_bp_assigned": "20", + "percent_containment": "20.00", + "num_bp_contained": "20", + "rank_code": "S", + "sci_name": "g", + "ncbi_taxid": "7", + }, + ] def test_make_kreport_results_fail(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=False) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, summarize=False + ) with pytest.raises(ValueError) as exc: q_res.make_kreport_results() print(str(exc)) - assert 'not summarized yet' in str(exc) + assert "not summarized yet" in str(exc) def test_make_kreport_results_fail_pre_v450(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, summarize=True + ) with pytest.raises(ValueError) as exc: q_res.make_kreport_results() print(str(exc)) - assert "cannot produce 'kreport' format from gather results before sourmash v4.5.0" in str(exc) + assert ( + "cannot produce 'kreport' format from gather results before sourmash v4.5.0" + in str(exc) + ) def test_make_cami_results_with_taxids(): - taxD = make_mini_taxonomy_with_taxids([("gA", "a;b;c", "1;2;3"), ("gB", "a;b;c;d;e;f;g", "1;2;3;4;5;6;7")]) + taxD = make_mini_taxonomy_with_taxids( + [("gA", "a;b;c", "1;2;3"), ("gB", "a;b;c;d;e;f;g", "1;2;3;4;5;6;7")] + ) print(taxD) - #need to go down to species to check that `num_bp_assigned` is happening correctly - gather_results = [{"total_weighted_hashes":100}, {"name": 'gB', "total_weighted_hashes":100}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + # need to go down to species to check that `num_bp_assigned` is happening correctly + gather_results = [ + {"total_weighted_hashes": 100}, + {"name": "gB", "total_weighted_hashes": 100}, + ] + q_res = make_QueryTaxResults( + gather_info=gather_results, taxD=taxD, single_query=True, summarize=True + ) header, camires = q_res.make_cami_bioboxes() print(camires) - assert camires == [['1', 'superkingdom', '1', 'a', '40.00'], - ['2', 'phylum', '1|2', 'a|b', '40.00'], - ['3', 'class', '1|2|3', 'a|b|c', '40.00'], - ['4', 'order', '1|2|3|4', 'a|b|c|d', '20.00'], - ['5', 'family', '1|2|3|4|5', 'a|b|c|d|e', '20.00'], - ['6', 'genus', '1|2|3|4|5|6', 'a|b|c|d|e|f', '20.00'], - ['7', 'species', '1|2|3|4|5|6|7', 'a|b|c|d|e|f|g', '20.00']] + assert camires == [ + ["1", "superkingdom", "1", "a", "40.00"], + ["2", "phylum", "1|2", "a|b", "40.00"], + ["3", "class", "1|2|3", "a|b|c", "40.00"], + ["4", "order", "1|2|3|4", "a|b|c|d", "20.00"], + ["5", "family", "1|2|3|4|5", "a|b|c|d|e", "20.00"], + ["6", "genus", "1|2|3|4|5|6", "a|b|c|d|e|f", "20.00"], + ["7", "species", "1|2|3|4|5|6|7", "a|b|c|d|e|f|g", "20.00"], + ] def test_make_lingroup_results(): - taxD = make_mini_taxonomy([("gA", "1;0;0"), ("gB", "1;0;1"), ("gC", "1;1;0")], LIN=True) + taxD = 
make_mini_taxonomy( + [("gA", "1;0;0"), ("gB", "1;0;1"), ("gC", "1;1;0")], LIN=True + ) print(taxD) - lingroupD = {"1":"lg1", "1;0":'lg2', '1;1': "lg3"} + lingroupD = {"1": "lg1", "1;0": "lg2", "1;1": "lg3"} print(lingroupD) - gather_results = [{"total_weighted_hashes":100}, - {"name": 'gB', "total_weighted_hashes":100}, - {"name": 'gC', "total_weighted_hashes":100}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True, LIN=True) + gather_results = [ + {"total_weighted_hashes": 100}, + {"name": "gB", "total_weighted_hashes": 100}, + {"name": "gC", "total_weighted_hashes": 100}, + ] + q_res = make_QueryTaxResults( + gather_info=gather_results, + taxD=taxD, + single_query=True, + summarize=True, + LIN=True, + ) print(q_res.summarized_lineage_results) - header, lgD = q_res.make_lingroup_results(LINgroupsD = lingroupD) + header, lgD = q_res.make_lingroup_results(LINgroupsD=lingroupD) print(header) - assert header == ['name', 'lin', 'percent_containment', 'num_bp_contained'] + assert header == ["name", "lin", "percent_containment", "num_bp_contained"] # order may change, just check that each lg entry is present in list of results - lg1 = {'percent_containment': '60.00', 'num_bp_contained': '60', - 'lin': '1', 'name': 'lg1'} - lg2 = {'percent_containment': '40.00', 'num_bp_contained': '40', - 'lin': '1;0', 'name': 'lg2'} - lg3 = {'percent_containment': '20.00', 'num_bp_contained': '20', - 'lin': '1;1', 'name': 'lg3'} + lg1 = { + "percent_containment": "60.00", + "num_bp_contained": "60", + "lin": "1", + "name": "lg1", + } + lg2 = { + "percent_containment": "40.00", + "num_bp_contained": "40", + "lin": "1;0", + "name": "lg2", + } + lg3 = { + "percent_containment": "20.00", + "num_bp_contained": "20", + "lin": "1;1", + "name": "lg3", + } assert lg1 in lgD assert lg2 in lgD assert lg3 in lgD def test_make_lingroup_results_fail_pre_v450(): - taxD = make_mini_taxonomy([("gA", "1;0;0"), ("gB", "1;0;1"), ("gC", "1;1;0")], LIN=True) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True, LIN=True) - lingroupD = {"1":"lg1", "1;0":'lg2', '1;1': "lg3"} + taxD = make_mini_taxonomy( + [("gA", "1;0;0"), ("gB", "1;0;1"), ("gC", "1;1;0")], LIN=True + ) + gather_results = [{}, {"name": "gB"}] + q_res = make_QueryTaxResults( + gather_info=gather_results, + taxD=taxD, + single_query=True, + summarize=True, + LIN=True, + ) + lingroupD = {"1": "lg1", "1;0": "lg2", "1;1": "lg3"} with pytest.raises(ValueError) as exc: q_res.make_lingroup_results(lingroupD) print(str(exc)) - assert "cannot produce 'lingroup' format from gather results before sourmash v4.5.0" in str(exc) + assert ( + "cannot produce 'lingroup' format from gather results before sourmash v4.5.0" + in str(exc) + ) def test_read_lingroups(runtmp): lg_file = runtmp.output("test.lg.csv") - with open(lg_file, 'w') as out: - out.write('lin,name\n') - out.write('1,lg1\n') - out.write('1;0,lg2\n') - out.write('1;1,lg3\n') + with open(lg_file, "w") as out: + out.write("lin,name\n") + out.write("1,lg1\n") + out.write("1;0,lg2\n") + out.write("1;1,lg3\n") lgD = read_lingroups(lg_file) - assert lgD == {"1":"lg1", "1;0":'lg2', '1;1': "lg3"} + assert lgD == {"1": "lg1", "1;0": "lg2", "1;1": "lg3"} + def test_read_lingroups_empty_file(runtmp): lg_file = runtmp.output("test.lg.csv") - with open(lg_file, 'w') as out: + with open(lg_file, "w") as out: out.write("") with pytest.raises(ValueError) as exc: read_lingroups(lg_file) @@ 
-2998,8 +4491,8 @@ def test_read_lingroups_empty_file(runtmp): def test_read_lingroups_only_header(runtmp): lg_file = runtmp.output("test.lg.csv") - with open(lg_file, 'w') as out: - out.write('lin,name\n') + with open(lg_file, "w") as out: + out.write("lin,name\n") with pytest.raises(ValueError) as exc: read_lingroups(lg_file) print(str(exc)) @@ -3008,8 +4501,8 @@ def test_read_lingroups_only_header(runtmp): def test_read_lingroups_bad_header(runtmp): lg_file = runtmp.output("test.lg.csv") - with open(lg_file, 'w') as out: - out.write('LINgroup_pfx,LINgroup_nm\n') + with open(lg_file, "w") as out: + out.write("LINgroup_pfx,LINgroup_nm\n") with pytest.raises(ValueError) as exc: read_lingroups(lg_file) print(str(exc)) @@ -3021,8 +4514,10 @@ def test_LineageTree_init(): lin1 = RankLineageInfo(lineage_str=x) print(lin1) tree = LineageTree([lin1]) - assert tree.tree == { LineagePair('superkingdom', 'a'): - { LineagePair('phylum', 'b') : {}} } + assert tree.tree == { + LineagePair("superkingdom", "a"): {LineagePair("phylum", "b"): {}} + } + def test_LineageTree_init_mult(): x = "a;b" @@ -3031,10 +4526,14 @@ def test_LineageTree_init_mult(): lin2 = RankLineageInfo(lineage_str=y) print(lin1) from sourmash.tax.tax_utils import LineageTree + tree = LineageTree([lin1, lin2]) - assert tree.tree == {LineagePair(rank='superkingdom', name='a', taxid=None): - {LineagePair(rank='phylum', name='b', taxid=None): {}, - LineagePair(rank='phylum', name='c', taxid=None): {}}} + assert tree.tree == { + LineagePair(rank="superkingdom", name="a", taxid=None): { + LineagePair(rank="phylum", name="b", taxid=None): {}, + LineagePair(rank="phylum", name="c", taxid=None): {}, + } + } def test_LineageTree_init_and_add_lineage(): @@ -3044,13 +4543,18 @@ def test_LineageTree_init_and_add_lineage(): lin2 = RankLineageInfo(lineage_str=y) print(lin1) from sourmash.tax.tax_utils import LineageTree + tree = LineageTree([lin1]) - assert tree.tree == { LineagePair('superkingdom', 'a'): - { LineagePair('phylum', 'b') : {}} } + assert tree.tree == { + LineagePair("superkingdom", "a"): {LineagePair("phylum", "b"): {}} + } tree.add_lineage(lin2) - assert tree.tree == {LineagePair(rank='superkingdom', name='a', taxid=None): - {LineagePair(rank='phylum', name='b', taxid=None): {}, - LineagePair(rank='phylum', name='c', taxid=None): {}}} + assert tree.tree == { + LineagePair(rank="superkingdom", name="a", taxid=None): { + LineagePair(rank="phylum", name="b", taxid=None): {}, + LineagePair(rank="phylum", name="c", taxid=None): {}, + } + } def test_LineageTree_init_and_add_lineages(): @@ -3060,13 +4564,18 @@ def test_LineageTree_init_and_add_lineages(): lin2 = RankLineageInfo(lineage_str=y) print(lin1) from sourmash.tax.tax_utils import LineageTree + tree = LineageTree([lin1]) - assert tree.tree == { LineagePair('superkingdom', 'a'): - { LineagePair('phylum', 'b') : {}} } + assert tree.tree == { + LineagePair("superkingdom", "a"): {LineagePair("phylum", "b"): {}} + } tree.add_lineages([lin2]) - assert tree.tree == {LineagePair(rank='superkingdom', name='a', taxid=None): - {LineagePair(rank='phylum', name='b', taxid=None): {}, - LineagePair(rank='phylum', name='c', taxid=None): {}}} + assert tree.tree == { + LineagePair(rank="superkingdom", name="a", taxid=None): { + LineagePair(rank="phylum", name="b", taxid=None): {}, + LineagePair(rank="phylum", name="c", taxid=None): {}, + } + } def test_build_tree_RankLineageInfo(): @@ -3074,8 +4583,9 @@ def test_build_tree_RankLineageInfo(): lin1 = RankLineageInfo(lineage_str=x) print(lin1) tree = 
LineageTree([lin1]) - assert tree.tree == { LineagePair('superkingdom', 'a'): - { LineagePair('phylum', 'b') : {}} } + assert tree.tree == { + LineagePair("superkingdom", "a"): {LineagePair("phylum", "b"): {}} + } def test_build_tree_LINLineageInfo(): @@ -3083,8 +4593,7 @@ def test_build_tree_LINLineageInfo(): lin1 = LINLineageInfo(lineage_str=x) print(lin1) tree = LineageTree([lin1]) - assert tree.tree == { LineagePair('0', '0'): - { LineagePair('1', '3') : {}} } + assert tree.tree == {LineagePair("0", "0"): {LineagePair("1", "3"): {}}} def test_build_tree_2(): @@ -3094,68 +4603,96 @@ def test_build_tree_2(): lin2 = RankLineageInfo(lineage_str=y) print(lin1) print(lin2) - tree = LineageTree([lin1,lin2]) + tree = LineageTree([lin1, lin2]) - assert tree.tree == { LineagePair('superkingdom', 'a'): { LineagePair('phylum', 'b') : {}, - LineagePair('phylum', 'c') : {}} } + assert tree.tree == { + LineagePair("superkingdom", "a"): { + LineagePair("phylum", "b"): {}, + LineagePair("phylum", "c"): {}, + } + } def test_build_tree_2_LineagePairs(): # build tree from LineagePairs - tree = LineageTree([[LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b')], - [LineagePair('superkingdom', 'a'), LineagePair('phylum', 'c')], - ]) + tree = LineageTree( + [ + [LineagePair("superkingdom", "a"), LineagePair("phylum", "b")], + [LineagePair("superkingdom", "a"), LineagePair("phylum", "c")], + ] + ) - assert tree.tree == { LineagePair('superkingdom', 'a'): { LineagePair('phylum', 'b') : {}, - LineagePair('phylum', 'c') : {}} } + assert tree.tree == { + LineagePair("superkingdom", "a"): { + LineagePair("phylum", "b"): {}, + LineagePair("phylum", "c"): {}, + } + } def test_build_tree_3(): # empty phylum name - x='a;' + x = "a;" lin1 = RankLineageInfo(lineage_str=x) tree = LineageTree([lin1]) - assert tree.tree == { LineagePair('superkingdom', 'a'): {} } + assert tree.tree == {LineagePair("superkingdom", "a"): {}} def test_build_tree_3_LineagePairs(): # empty phylum name: LineagePair input - lin1 = (LineagePair('superkingdom', "a", '3'), - LineagePair('phylum', '', ''),) + lin1 = ( + LineagePair("superkingdom", "a", "3"), + LineagePair("phylum", "", ""), + ) tree = LineageTree([lin1]) - assert tree.tree == { LineagePair('superkingdom', 'a', '3'): {} } + assert tree.tree == {LineagePair("superkingdom", "a", "3"): {}} def test_build_tree_5(): with pytest.raises(ValueError): - tree = LineageTree([]) + LineageTree([]) def test_build_tree_5b(): with pytest.raises(ValueError): - tree = LineageTree("") + LineageTree("") def test_build_tree_iterable(): with pytest.raises(ValueError) as exc: - tree = LineageTree(RankLineageInfo()) - assert "Must pass in an iterable containing LineagePair or LineageInfo objects" in str(exc) + LineageTree(RankLineageInfo()) + assert ( + "Must pass in an iterable containing LineagePair or LineageInfo objects" + in str(exc) + ) def test_find_lca(): - x='a;b' + x = "a;b" lin1 = RankLineageInfo(lineage_str=x) tree = LineageTree([lin1]) lca = tree.find_lca() - assert lca == ((LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b'),), 0) + assert lca == ( + ( + LineagePair("superkingdom", "a"), + LineagePair("phylum", "b"), + ), + 0, + ) def test_find_lca_LineagePairs(): - tree = LineageTree([[LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2')]]) + tree = LineageTree([[LineagePair("rank1", "name1"), LineagePair("rank2", "name2")]]) lca = tree.find_lca() - assert lca == ((LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2'),), 0) + assert lca == ( + ( + 
LineagePair("rank1", "name1"), + LineagePair("rank2", "name2"), + ), + 0, + ) def test_find_lca_2(): @@ -3167,7 +4704,7 @@ def test_find_lca_2(): tree = LineageTree([lin1, lin2]) lca = tree.find_lca() - assert lca == ((LineagePair('superkingdom', 'a'),), 2) + assert lca == ((LineagePair("superkingdom", "a"),), 2) def test_find_lca_LIN(): @@ -3179,17 +4716,20 @@ def test_find_lca_LIN(): tree = LineageTree([lin1, lin2]) lca = tree.find_lca() - assert lca == ((LineagePair('0', '5'),), 2) + assert lca == ((LineagePair("0", "5"),), 2) print(lca) def test_find_lca_2_LineagePairs(): - tree = LineageTree([[LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2a')], - [LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2b')], - ]) + tree = LineageTree( + [ + [LineagePair("rank1", "name1"), LineagePair("rank2", "name2a")], + [LineagePair("rank1", "name1"), LineagePair("rank2", "name2b")], + ] + ) lca = tree.find_lca() - assert lca == ((LineagePair('rank1', 'name1'),), 2) + assert lca == ((LineagePair("rank1", "name1"),), 2) def test_find_lca_3(): @@ -3198,7 +4738,7 @@ def test_find_lca_3(): tree = LineageTree([lin1, lin2]) lca, reason = tree.find_lca() - assert lca == lin1.filled_lineage # find most specific leaf node + assert lca == lin1.filled_lineage # find most specific leaf node print(lca) @@ -3214,12 +4754,17 @@ def test_build_tree_with_initial(): lca = tree.find_lca() print(lca) - assert lca == ((LineagePair(rank='superkingdom', name='a', taxid=None), - LineagePair(rank='phylum', name='b', taxid=None)), 2) + assert lca == ( + ( + LineagePair(rank="superkingdom", name="a", taxid=None), + LineagePair(rank="phylum", name="b", taxid=None), + ), + 2, + ) tree.add_lineages([lin3]) lca2 = tree.find_lca() print(lca2) - assert lca2 == ((LineagePair('superkingdom', 'a'),), 2) + assert lca2 == ((LineagePair("superkingdom", "a"),), 2) def test_LineageTree_find_ordered_paths(): @@ -3234,14 +4779,22 @@ def test_LineageTree_find_ordered_paths(): paths = tree.ordered_paths() print(paths) - assert paths == [(LineagePair(rank='superkingdom', name='a', taxid=None), - LineagePair(rank='phylum', name='e', taxid=None)), - (LineagePair(rank='superkingdom', name='a', taxid=None), - LineagePair(rank='phylum', name='b', taxid=None), - LineagePair(rank='class', name='c', taxid=None)), - (LineagePair(rank='superkingdom', name='a', taxid=None), - LineagePair(rank='phylum', name='b', taxid=None), - LineagePair(rank='class', name='d', taxid=None))] + assert paths == [ + ( + LineagePair(rank="superkingdom", name="a", taxid=None), + LineagePair(rank="phylum", name="e", taxid=None), + ), + ( + LineagePair(rank="superkingdom", name="a", taxid=None), + LineagePair(rank="phylum", name="b", taxid=None), + LineagePair(rank="class", name="c", taxid=None), + ), + ( + LineagePair(rank="superkingdom", name="a", taxid=None), + LineagePair(rank="phylum", name="b", taxid=None), + LineagePair(rank="class", name="d", taxid=None), + ), + ] def test_LineageTree_find_ordered_paths_include_internal(): @@ -3257,14 +4810,24 @@ def test_LineageTree_find_ordered_paths_include_internal(): print(paths) - assert paths == [(LineagePair(rank='superkingdom', name='a', taxid=None),), - (LineagePair(rank='superkingdom', name='a', taxid=None), - LineagePair(rank='phylum', name='e', taxid=None)), - (LineagePair(rank='superkingdom', name='a', taxid=None), - LineagePair(rank='phylum', name='b', taxid=None)), - (LineagePair(rank='superkingdom', name='a', taxid=None), - LineagePair(rank='phylum', name='b', taxid=None), - LineagePair(rank='class', 
name='c', taxid=None)), - (LineagePair(rank='superkingdom', name='a', taxid=None), - LineagePair(rank='phylum', name='b', taxid=None), - LineagePair(rank='class', name='d', taxid=None))] + assert paths == [ + (LineagePair(rank="superkingdom", name="a", taxid=None),), + ( + LineagePair(rank="superkingdom", name="a", taxid=None), + LineagePair(rank="phylum", name="e", taxid=None), + ), + ( + LineagePair(rank="superkingdom", name="a", taxid=None), + LineagePair(rank="phylum", name="b", taxid=None), + ), + ( + LineagePair(rank="superkingdom", name="a", taxid=None), + LineagePair(rank="phylum", name="b", taxid=None), + LineagePair(rank="class", name="c", taxid=None), + ), + ( + LineagePair(rank="superkingdom", name="a", taxid=None), + LineagePair(rank="phylum", name="b", taxid=None), + LineagePair(rank="class", name="d", taxid=None), + ), + ] diff --git a/tests/test_test_framework.py b/tests/test_test_framework.py index abf7e2c93a..85bb3e1020 100644 --- a/tests/test_test_framework.py +++ b/tests/test_test_framework.py @@ -5,4 +5,4 @@ def test_failed_sourmash_exception(runtmp): with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('') + runtmp.sourmash("") diff --git a/tox.ini b/tox.ini index 0e5602628c..1806e48778 100644 --- a/tox.ini +++ b/tox.ini @@ -1,28 +1,34 @@ [tox] -env_list = - py311, - py312, - py310, - coverage, - docs, - package_description - fix_lint, - hypothesis, - khmer, - khmer_master -min_version = 3.27 isolated_build = true skip_missing_interpreters = true +env_list = + py311, + py312, + py310, + coverage, + docs, + package_description + fix_lint, + hypothesis, + khmer, + khmer_master +min_version = 3.27 [testenv] description = run the tests with pytest under {basepython} +deps = + pip>=19.3.1 +extras = + storage + test +commands = + pytest \ + --cov "{envsitepackagesdir}/sourmash" \ + --cov-config "{toxinidir}/tox.ini" \ + --cov-report= \ + --junitxml {toxworkdir}/junit.{envname}.xml \ + {posargs:doc tests} package = wheel -wheel_build_env = .pkg -set_env = - PIP_DISABLE_VERSION_CHECK = 1 - COVERAGE_FILE = {env:COVERAGE_FILE:{toxworkdir}/.coverage.{envname}} - VIRTUALENV_NO_DOWNLOAD = 1 - PIP_EXTRA_INDEX_URL = https://antocuni.github.io/pypy-wheels/manylinux2010 pass_env = TOXENV CURL_CA_BUNDLE @@ -38,140 +44,160 @@ pass_env = PYTHONTRACEMALLOC LIBCLANG_PATH BINDGEN_EXTRA_CLANG_ARGS -deps = - pip >= 19.3.1 -extras = - test - storage -commands = pytest \ - --cov "{envsitepackagesdir}/sourmash" \ - --cov-config "{toxinidir}/tox.ini" \ - --cov-report= \ - --junitxml {toxworkdir}/junit.{envname}.xml \ - {posargs:doc tests} + NIX_LD +set_env = + PIP_DISABLE_VERSION_CHECK = 1 + COVERAGE_FILE = {env:COVERAGE_FILE:{toxworkdir}/.coverage.{envname}} + VIRTUALENV_NO_DOWNLOAD = 1 + PIP_EXTRA_INDEX_URL = https://antocuni.github.io/pypy-wheels/manylinux2010 +wheel_build_env = .pkg [testenv:.pkg] pass_env = - LIBCLANG_PATH - BINDGEN_EXTRA_CLANG_ARGS + LIBCLANG_PATH + BINDGEN_EXTRA_CLANG_ARGS [testenv:pypy3] deps = - pip >= 19.3.1 - psutil <= 5.6.7 + pip>=19.3.1 + psutil<=5.6.7 [testenv:hypothesis] -commands = pytest \ - --cov "{envsitepackagesdir}/sourmash" \ - --cov-config "{toxinidir}/tox.ini" \ - --cov-report= \ - --junitxml {toxworkdir}/junit.{envname}.xml \ - --run-hypothesis \ - --hypothesis-show-statistics \ - --hypothesis-profile ci \ - {posargs:.} +commands = + pytest \ + --cov "{envsitepackagesdir}/sourmash" \ + --cov-config "{toxinidir}/tox.ini" \ + --cov-report= \ + --junitxml {toxworkdir}/junit.{envname}.xml \ + --run-hypothesis \ + --hypothesis-show-statistics \ + 
--hypothesis-profile ci \ + {posargs:.} [testenv:khmer] basepython = python3.10 deps = - khmer -commands = pytest \ - --cov "{envsitepackagesdir}/sourmash" \ - --cov-config "{toxinidir}/tox.ini" \ - --cov-report= \ - --junitxml {toxworkdir}/junit.{envname}.xml \ - -k test_nodegraph \ - {posargs:.} + khmer +commands = + pytest \ + --cov "{envsitepackagesdir}/sourmash" \ + --cov-config "{toxinidir}/tox.ini" \ + --cov-report= \ + --junitxml {toxworkdir}/junit.{envname}.xml \ + -k test_nodegraph \ + {posargs:.} [testenv:khmer_master] basepython = python3.10 deps = - -e git+https://github.com/dib-lab/khmer.git\#egg=khmer -commands = pytest \ - --cov "{envsitepackagesdir}/sourmash" \ - --cov-config "{toxinidir}/tox.ini" \ - --cov-report= \ - --junitxml {toxworkdir}/junit.{envname}.xml \ - -k test_nodegraph \ - {posargs:.} + -e git+https://github.com/dib-lab/khmer.git\#egg=khmer +commands = + pytest \ + --cov "{envsitepackagesdir}/sourmash" \ + --cov-config "{toxinidir}/tox.ini" \ + --cov-report= \ + --junitxml {toxworkdir}/junit.{envname}.xml \ + -k test_nodegraph \ + {posargs:.} [testenv:asv] description = run asv for benchmarking (compare current commit with latest) deps = - asv==0.5.1 - virtualenv changedir = {toxinidir} commands = - asv machine --yes - asv continuous latest HEAD {posargs} + asv==0.5.1 + virtualenv changedir = {toxinidir} commands = + asv machine --yes + asv continuous latest HEAD {posargs} [testenv:docs] description = invoke sphinx-build to build the HTML docs basepython = python3.10 -extras = doc +extras = + doc +commands = + sphinx-build -d "{toxworkdir}/docs_doctree" doc "{toxworkdir}/docs_out" --color -bhtml {posargs} + python -c 'import pathlib; print("documentation available under file://\{0\}".format(pathlib.Path(r"{toxworkdir}") / "docs_out" / "index.html"))' allowlist_externals = pandoc -pass_env = HOME change_dir = {toxinidir} -#commands = sphinx-build -d "{toxworkdir}/docs_doctree" doc "{toxworkdir}/docs_out" --color -W -bhtml {posargs} -commands = sphinx-build -d "{toxworkdir}/docs_doctree" doc "{toxworkdir}/docs_out" --color -bhtml {posargs} - python -c 'import pathlib; print("documentation available under file://\{0\}".format(pathlib.Path(r"{toxworkdir}") / "docs_out" / "index.html"))' +pass_env = HOME [testenv:package_description] description = check that the long description is valid basepython = python3.10 -deps = twine >= 1.12.1 - # TODO installing readme-renderer[md] should not be necessary - readme-renderer[md] >= 24.0 - pip >= 19.1 skip_install = true -change_dir = {toxinidir} +deps = + pip>=19.1 + readme-renderer[md]>=24 + twine>=1.12.1 extras = +commands = + pip wheel -w {envtmpdir}/build --no-deps .
+ twine check {envtmpdir}/build/* +change_dir = {toxinidir} [testenv:mypy] description = run mypy checker basepython = python3.10 +deps = + mypy +commands = + mypy src/sourmash pass_env = {[testenv]pass_env} - # without PROGRAMDATA cloning using git for Windows will fail with an `error setting certificate verify locations` error - PROGRAMDATA -deps = mypy -commands = mypy src/sourmash + PROGRAMDATA [testenv:fix_lint] description = format the code base to adhere to our styles, and complain about what we cannot do automatically basepython = python3.10 +skip_install = true +deps = + pre-commit>=2 +extras = + lint +commands = + pre-commit run --all-files --show-diff-on-failure {posargs} + python -c 'import pathlib; print("hint: run \{\} install to add checks as pre-commit hook".format(pathlib.Path(r"{envdir}") / "bin" / "pre-commit"))' pass_env = {[testenv]pass_env} - # without PROGRAMDATA cloning using git for Windows will fail with an `error setting certificate verify locations` error - PROGRAMDATA - PRE_COMMIT_HOME -extras = lint -deps = pre-commit>=2 -skip_install = True -commands = pre-commit run --all-files --show-diff-on-failure {posargs} - python -c 'import pathlib; print("hint: run \{\} install to add checks as pre-commit hook".format(pathlib.Path(r"{envdir}") / "bin" / "pre-commit"))' + PROGRAMDATA + PRE_COMMIT_HOME [testenv:coverage] description = [run locally after tests]: combine coverage data and create report; - generates a diff coverage against origin/latest (can be changed by setting DIFF_AGAINST env var) -deps = {[testenv]deps} - coverage >= 5.0.1 - diff_cover -skip_install = True + generates a diff coverage against origin/latest (can be changed by setting DIFF_AGAINST env var) +skip_install = true +deps = + {[testenv]deps} + coverage>=5.0.1 + diff_cover +parallel_show_output = true +commands = + coverage combine + coverage report -i -m + coverage xml -i -o {toxworkdir}/coverage.xml + coverage html -i -d {toxworkdir}/htmlcov + diff-cover --compare-branch {env:DIFF_AGAINST:origin/latest} {toxworkdir}/coverage.xml +depends = py312, py311, py310, pypy3 pass_env = {[testenv]pass_env} - DIFF_AGAINST + DIFF_AGAINST set_env = COVERAGE_FILE={toxworkdir}/.coverage -commands = coverage combine - coverage report -i -m - coverage xml -i -o {toxworkdir}/coverage.xml - coverage html -i -d {toxworkdir}/htmlcov - diff-cover --compare-branch {env:DIFF_AGAINST:origin/latest} {toxworkdir}/coverage.xml -depends = py312, py311, py310, pypy3 -parallel_show_output = True [testenv:X] description = print the positional arguments passed in with echo -commands = echo {posargs} +commands = + echo {posargs} + +[testenv:dev] +description = dev environment with all deps at {envdir} +usedevelop = true +deps = + {[testenv]deps} +extras = + doc + storage + test +commands = + python -m pip list --format=columns + python -c "print(r'{envpython}')" [coverage:run] branch = true @@ -190,20 +216,20 @@ exclude_lines = [coverage:paths] source = src/sourmash/ - tests/ - */.tox/*/lib/python*/site-packages/sourmash - */.tox/pypy*/site-packages/sourmash - */.tox\*\Lib\site-packages\sourmash - */src/sourmash - *\src\sourmash - */tests - *\tests + tests/ + */.tox/*/lib/python*/site-packages/sourmash + */.tox/pypy*/site-packages/sourmash + */.tox\*\Lib\site-packages\sourmash + */src/sourmash + *\src\sourmash + */tests + *\tests [gh-actions] python = - 3.10: py310, docs, package_description, coverage - 3.11: py311, coverage - 3.12: py312, coverage + 3.10: py310, docs, package_description, coverage + 3.11: py311, coverage + 
3.12: py312, coverage [flake8] max-complexity = 22 @@ -212,14 +238,3 @@ ignore = E203, W503, C901, E402, B011 [pep8] max-line-length = 99 - -[testenv:dev] -description = dev environment with all deps at {envdir} -extras = - test - storage - doc -deps = {[testenv]deps} -usedevelop = True -commands = python -m pip list --format=columns - python -c "print(r'{envpython}')" diff --git a/utils/cardinality_estimate_confidence.py b/utils/cardinality_estimate_confidence.py index 1f8471fbeb..85c6e5cc75 100644 --- a/utils/cardinality_estimate_confidence.py +++ b/utils/cardinality_estimate_confidence.py @@ -13,7 +13,7 @@ def set_size_chernoff(set_size, scale, relative_error=0.05): @param relative_error: the desired relative error (defaults to 5%) @return: float (the upper bound probability) """ - upper_bound = 1 - 2 * np.exp(- relative_error**2*set_size/(scale * 3)) + upper_bound = 1 - 2 * np.exp(-(relative_error**2) * set_size / (scale * 3)) return upper_bound @@ -28,7 +28,9 @@ def get_set_size(scale, num_sketches): return int(np.floor(scale * num_sketches)) -def set_size_estimate_is_accurate(scale, num_sketches, relative_error=0.05, confidence=0.95): +def set_size_estimate_is_accurate( + scale, num_sketches, relative_error=0.05, confidence=0.95 +): set_size = get_set_size(scale, num_sketches) probability = set_size_chernoff(set_size, scale, relative_error) if probability >= confidence: @@ -38,48 +40,96 @@ def set_size_estimate_is_accurate(scale, num_sketches, relative_error=0.05, conf def test_set_size_chernoff(): - eps = 10**(-6) + eps = 10 ** (-6) rel_error = 0.01 set_size = 1000000 - s = 1/0.1 # I'm used to using a scale value between 0 and 1 + s = 1 / 0.1 # I'm used to using a scale value between 0 and 1 value_from_mathematica = 0.928652 - assert np.abs(set_size_chernoff(set_size, s, rel_error) - value_from_mathematica) < eps + assert ( + np.abs(set_size_chernoff(set_size, s, rel_error) - value_from_mathematica) < eps + ) rel_error = 0.05 set_size = 10000 s = 1 value_from_mathematica = 0.999519 - assert np.abs(set_size_chernoff(set_size, s, rel_error) - value_from_mathematica) < eps + assert ( + np.abs(set_size_chernoff(set_size, s, rel_error) - value_from_mathematica) < eps + ) rel_error = 0.001 set_size = 10 - s = 1/.01 + s = 1 / 0.01 value_from_mathematica = -1 - assert np.abs(set_size_chernoff(set_size, s, rel_error) - value_from_mathematica) < eps + assert ( + np.abs(set_size_chernoff(set_size, s, rel_error) - value_from_mathematica) < eps + ) def test_set_size_estimate_is_accurate(): - eps = 10 ** (-6) + 10 ** (-6) rel_error = 0.05 set_size = 1000000 s = 1 / 0.1 # I'm used to using a scale value between 0 and 1 num_sketches = set_size / s # idealized case confidence = 0.95 - assert set_size_estimate_is_accurate(scale=s, num_sketches=num_sketches, relative_error=rel_error, confidence=confidence) is True + assert ( + set_size_estimate_is_accurate( + scale=s, + num_sketches=num_sketches, + relative_error=rel_error, + confidence=confidence, + ) + is True + ) confidence = set_size_chernoff(set_size=set_size, scale=s, relative_error=rel_error) - assert set_size_estimate_is_accurate(scale=s, num_sketches=num_sketches, relative_error=rel_error, confidence=confidence) is True + assert ( + set_size_estimate_is_accurate( + scale=s, + num_sketches=num_sketches, + relative_error=rel_error, + confidence=confidence, + ) + is True + ) # Horrible values - assert set_size_estimate_is_accurate(scale=10000, num_sketches=num_sketches, relative_error=0, confidence=1) is False + assert ( + 
set_size_estimate_is_accurate( + scale=10000, num_sketches=num_sketches, relative_error=0, confidence=1 + ) + is False + ) # Less horrible, but still bad values confidence = set_size_chernoff(set_size=set_size, scale=s, relative_error=rel_error) - assert set_size_estimate_is_accurate(scale=s, num_sketches=num_sketches, relative_error=rel_error, confidence=confidence*2) is False + assert ( + set_size_estimate_is_accurate( + scale=s, + num_sketches=num_sketches, + relative_error=rel_error, + confidence=confidence * 2, + ) + is False + ) # one where the confidence is negative - rel_error = .001 + rel_error = 0.001 set_size = 10 s = 100 - num_sketches = set_size/s - assert set_size_estimate_is_accurate(scale=s, num_sketches=num_sketches, relative_error=rel_error, confidence=confidence) is False - assert set_size_estimate_is_accurate(scale=s, num_sketches=0, relative_error=rel_error, confidence=confidence) is False + num_sketches = set_size / s + assert ( + set_size_estimate_is_accurate( + scale=s, + num_sketches=num_sketches, + relative_error=rel_error, + confidence=confidence, + ) + is False + ) + assert ( + set_size_estimate_is_accurate( + scale=s, num_sketches=0, relative_error=rel_error, confidence=confidence + ) + is False + ) def run_tests(): @@ -87,7 +137,7 @@ def run_tests(): test_set_size_estimate_is_accurate() -if __name__ == '__main__': +if __name__ == "__main__": print("Running tests") run_tests() print("Tests completed successfully") diff --git a/utils/check-tree.py b/utils/check-tree.py index 12fc0190de..639e376e3b 100644 --- a/utils/check-tree.py +++ b/utils/check-tree.py @@ -7,12 +7,12 @@ import sourmash from sourmash.sbtmh import search_minhashes -THRESHOLD=0.08 +THRESHOLD = 0.08 def main(): p = argparse.ArgumentParser() - p.add_argument('sbt') + p.add_argument("sbt") args = p.parse_args() db = sourmash.sbtmh.load_sbt_index(args.sbt) @@ -21,11 +21,11 @@ def main(): for leaf in db.leaves(): query = leaf.data matches = db.find(search_minhashes, query, threshold) - matches = list([ x.data for x in matches ]) + matches = list([x.data for x in matches]) if query not in matches: print(query) assert 0 - -if __name__ == '__main__': + +if __name__ == "__main__": main() diff --git a/utils/compute-dna-mh-another-way.py b/utils/compute-dna-mh-another-way.py index aad7198092..c197298d75 100755 --- a/utils/compute-dna-mh-another-way.py +++ b/utils/compute-dna-mh-another-way.py @@ -7,7 +7,9 @@ The output of this is used in test_sourmash.py to verify our C++ code. """ -__complementTranslation = { "A": "T", "C": "G", "G": "C", "T": "A", "N": "N" } +__complementTranslation = {"A": "T", "C": "G", "G": "C", "T": "A", "N": "N"} + + def complement(s): """ Return complement of 's'. 
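Note on this and the following utility-script hunks: complement/reverse implement the standard reverse-complement convention. A minimal standalone sketch (illustrative only, not part of the patch; the translation table is copied verbatim from the script above):

__complementTranslation = {"A": "T", "C": "G", "G": "C", "T": "A", "N": "N"}


def complement(s):
    # map each base to its Watson-Crick complement
    return "".join(__complementTranslation[b] for b in s)


def reverse(s):
    # reverse the string
    return "".join(reversed(s))


# reverse complement of ACGTN
assert reverse(complement("ACGTN")) == "NACGT"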
@@ -26,21 +28,24 @@ def reverse(s): def kmers(seq, k): for start in range(len(seq) - k + 1): - yield seq[start:start + k] + yield seq[start : start + k] + ### K = 21 -import sys, screed +import sys +import screed import mmh3 import sourmash -print('imported sourmash:', sourmash, file=sys.stderr) + +print("imported sourmash:", sourmash, file=sys.stderr) import sourmash.signature record = next(iter(screed.open(sys.argv[1]))) -print('loaded', record.name, file=sys.stderr) -revcomp = reverse(complement((record.sequence))) +print("loaded", record.name, file=sys.stderr) +revcomp = reverse(complement(record.sequence)) mh = sourmash.MinHash(ksize=K, n=500, is_protein=False) @@ -69,5 +74,5 @@ def kmers(seq, k): mh.add_hash(hash) -s = sourmash.signature.SourmashSignature('', mh, name=record.name) +s = sourmash.signature.SourmashSignature("", mh, name=record.name) print(sourmash.signature.save_signatures([s])) diff --git a/utils/compute-input-prot-another-way.py b/utils/compute-input-prot-another-way.py index 5c1202eaee..7dec10d849 100755 --- a/utils/compute-input-prot-another-way.py +++ b/utils/compute-input-prot-another-way.py @@ -7,25 +7,77 @@ The output of this is used in test_sourmash.py to verify our C++ code. """ -dna_to_aa={'TTT':'F','TTC':'F', 'TTA':'L','TTG':'L', - 'TCT':'S','TCC':'S','TCA':'S','TCG':'S', - 'TAT':'Y','TAC':'Y', 'TAA':'*','TAG':'*','TGA':'*', - 'TGT':'C','TGC':'C', 'TGG':'W', - 'CTT':'L','CTC':'L','CTA':'L','CTG':'L', - 'CCT':'P','CCC':'P','CCA':'P','CCG':'P', - 'CAT':'H','CAC':'H', 'CAA':'Q','CAG':'Q', - 'CGT':'R','CGC':'R','CGA':'R','CGG':'R', - 'ATT':'I','ATC':'I','ATA':'I', 'ATG':'M', - 'ACT':'T','ACC':'T','ACA':'T','ACG':'T', - 'AAT':'N','AAC':'N', 'AAA':'K','AAG':'K', - 'AGT':'S','AGC':'S', 'AGA':'R','AGG':'R', - 'GTT':'V','GTC':'V','GTA':'V','GTG':'V', - 'GCT':'A','GCC':'A','GCA':'A','GCG':'A', - 'GAT':'D','GAC':'D', 'GAA':'E','GAG':'E', - 'GGT':'G','GGC':'G','GGA':'G','GGG':'G'} - - -__complementTranslation = { "A": "T", "C": "G", "G": "C", "T": "A", "N": "N" } +dna_to_aa = { + "TTT": "F", + "TTC": "F", + "TTA": "L", + "TTG": "L", + "TCT": "S", + "TCC": "S", + "TCA": "S", + "TCG": "S", + "TAT": "Y", + "TAC": "Y", + "TAA": "*", + "TAG": "*", + "TGA": "*", + "TGT": "C", + "TGC": "C", + "TGG": "W", + "CTT": "L", + "CTC": "L", + "CTA": "L", + "CTG": "L", + "CCT": "P", + "CCC": "P", + "CCA": "P", + "CCG": "P", + "CAT": "H", + "CAC": "H", + "CAA": "Q", + "CAG": "Q", + "CGT": "R", + "CGC": "R", + "CGA": "R", + "CGG": "R", + "ATT": "I", + "ATC": "I", + "ATA": "I", + "ATG": "M", + "ACT": "T", + "ACC": "T", + "ACA": "T", + "ACG": "T", + "AAT": "N", + "AAC": "N", + "AAA": "K", + "AAG": "K", + "AGT": "S", + "AGC": "S", + "AGA": "R", + "AGG": "R", + "GTT": "V", + "GTC": "V", + "GTA": "V", + "GTG": "V", + "GCT": "A", + "GCC": "A", + "GCA": "A", + "GCG": "A", + "GAT": "D", + "GAC": "D", + "GAA": "E", + "GAG": "E", + "GGT": "G", + "GGC": "G", + "GGA": "G", + "GGG": "G", +} + + +__complementTranslation = {"A": "T", "C": "G", "G": "C", "T": "A", "N": "N"} + + def complement(s): """ Return complement of 's'. 
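The dna_to_aa table above feeds a straightforward six-frame translation (three forward frames plus three on the reverse complement), exercised by the peptides/translate helpers in the hunks that follow. A short usage sketch, with a deliberately trimmed table covering only the codons in the example:

# trimmed codon table for illustration; the script carries the full 64-codon map
dna_to_aa = {"ATG": "M", "GCC": "A", "TGA": "*"}


def peptides(seq, start):
    # translate successive codons starting at `start`; unknown or partial codons become X
    for i in range(start, len(seq), 3):
        yield dna_to_aa.get(seq[i : i + 3], "X")


assert "".join(peptides("ATGGCCTGA", 0)) == "MA*"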
@@ -44,7 +96,7 @@ def reverse(s): def peptides(seq, start): for i in range(start, len(seq), 3): - yield dna_to_aa.get(seq[i:i+3], "X") + yield dna_to_aa.get(seq[i : i + 3], "X") def translate(seq): @@ -52,27 +104,31 @@ def translate(seq): pep = peptides(seq, i) yield "".join(pep) - revcomp = reverse(complement((seq))) + revcomp = reverse(complement(seq)) for i in range(3): pep = peptides(revcomp, i) yield "".join(pep) + def kmers(seq, k): for start in range(len(seq) - k + 1): - yield seq[start:start + k] + yield seq[start : start + k] + ### K = 21 -import sys, screed +import sys +import screed import mmh3 import sourmash -print('imported sourmash:', sourmash, file=sys.stderr) + +print("imported sourmash:", sourmash, file=sys.stderr) import sourmash.signature record = next(iter(screed.open(sys.argv[1]))) -print('loaded', record.name, file=sys.stderr) +print("loaded", record.name, file=sys.stderr) mh = sourmash.MinHash(ksize=K, n=500, is_protein=True) prot_ksize = int(K / 3) @@ -86,5 +142,5 @@ def kmers(seq, k): mh.add_hash(hash) -s = sourmash.signature.SourmashSignature('', mh, name=record.name) +s = sourmash.signature.SourmashSignature("", mh, name=record.name) print(sourmash.signature.save_signatures([s])) diff --git a/utils/compute-prot-mh-another-way.py b/utils/compute-prot-mh-another-way.py index 6295204f3b..e859268d05 100755 --- a/utils/compute-prot-mh-another-way.py +++ b/utils/compute-prot-mh-another-way.py @@ -7,25 +7,77 @@ The output of this is used in test_sourmash.py to verify our C++ code. """ -dna_to_aa={'TTT':'F','TTC':'F', 'TTA':'L','TTG':'L', - 'TCT':'S','TCC':'S','TCA':'S','TCG':'S', - 'TAT':'Y','TAC':'Y', 'TAA':'*','TAG':'*','TGA':'*', - 'TGT':'C','TGC':'C', 'TGG':'W', - 'CTT':'L','CTC':'L','CTA':'L','CTG':'L', - 'CCT':'P','CCC':'P','CCA':'P','CCG':'P', - 'CAT':'H','CAC':'H', 'CAA':'Q','CAG':'Q', - 'CGT':'R','CGC':'R','CGA':'R','CGG':'R', - 'ATT':'I','ATC':'I','ATA':'I', 'ATG':'M', - 'ACT':'T','ACC':'T','ACA':'T','ACG':'T', - 'AAT':'N','AAC':'N', 'AAA':'K','AAG':'K', - 'AGT':'S','AGC':'S', 'AGA':'R','AGG':'R', - 'GTT':'V','GTC':'V','GTA':'V','GTG':'V', - 'GCT':'A','GCC':'A','GCA':'A','GCG':'A', - 'GAT':'D','GAC':'D', 'GAA':'E','GAG':'E', - 'GGT':'G','GGC':'G','GGA':'G','GGG':'G'} - - -__complementTranslation = { "A": "T", "C": "G", "G": "C", "T": "A", "N": "N" } +dna_to_aa = { + "TTT": "F", + "TTC": "F", + "TTA": "L", + "TTG": "L", + "TCT": "S", + "TCC": "S", + "TCA": "S", + "TCG": "S", + "TAT": "Y", + "TAC": "Y", + "TAA": "*", + "TAG": "*", + "TGA": "*", + "TGT": "C", + "TGC": "C", + "TGG": "W", + "CTT": "L", + "CTC": "L", + "CTA": "L", + "CTG": "L", + "CCT": "P", + "CCC": "P", + "CCA": "P", + "CCG": "P", + "CAT": "H", + "CAC": "H", + "CAA": "Q", + "CAG": "Q", + "CGT": "R", + "CGC": "R", + "CGA": "R", + "CGG": "R", + "ATT": "I", + "ATC": "I", + "ATA": "I", + "ATG": "M", + "ACT": "T", + "ACC": "T", + "ACA": "T", + "ACG": "T", + "AAT": "N", + "AAC": "N", + "AAA": "K", + "AAG": "K", + "AGT": "S", + "AGC": "S", + "AGA": "R", + "AGG": "R", + "GTT": "V", + "GTC": "V", + "GTA": "V", + "GTG": "V", + "GCT": "A", + "GCC": "A", + "GCA": "A", + "GCG": "A", + "GAT": "D", + "GAC": "D", + "GAA": "E", + "GAG": "E", + "GGT": "G", + "GGC": "G", + "GGA": "G", + "GGG": "G", +} + + +__complementTranslation = {"A": "T", "C": "G", "G": "C", "T": "A", "N": "N"} + + def complement(s): """ Return complement of 's'. 
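As with the DNA script, this one recomputes hashes independently to verify sourmash's C++ code. A hedged sketch of the DNA hashing convention these scripts rely on, assuming sourmash's documented scheme (murmur64 with seed 42 over the lexicographically smaller of a k-mer and its reverse complement; mmh3.hash64 returns signed values, hence the mask). The protein scripts hash the translated peptide k-mers directly, without canonicalization.

import mmh3


def revcomp(s):
    # reverse complement via str.translate
    return s.translate(str.maketrans("ACGTN", "TGCAN"))[::-1]


def sourmash_style_hash(kmer):
    # hash the canonical strand, then mask to the unsigned 64-bit value sourmash stores
    canonical = min(kmer, revcomp(kmer))
    return mmh3.hash64(canonical, seed=42)[0] & 0xFFFFFFFFFFFFFFFF


print(sourmash_style_hash("ACGTACGTACGTACGTACGTA"))  # a 21-mer, matching K = 21 above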
@@ -44,7 +96,7 @@ def reverse(s): def peptides(seq, start): for i in range(start, len(seq), 3): - yield dna_to_aa.get(seq[i:i+3], "X") + yield dna_to_aa.get(seq[i : i + 3], "X") def translate(seq): @@ -52,27 +104,31 @@ def translate(seq): pep = peptides(seq, i) yield "".join(pep) - revcomp = reverse(complement((seq))) + revcomp = reverse(complement(seq)) for i in range(3): pep = peptides(revcomp, i) yield "".join(pep) + def kmers(seq, k): for start in range(len(seq) - k + 1): - yield seq[start:start + k] + yield seq[start : start + k] + ### K = 21 -import sys, screed +import sys +import screed import mmh3 import sourmash -print('imported sourmash:', sourmash, file=sys.stderr) + +print("imported sourmash:", sourmash, file=sys.stderr) import sourmash.signature record = next(iter(screed.open(sys.argv[1]))) -print('loaded', record.name, file=sys.stderr) +print("loaded", record.name, file=sys.stderr) mh = sourmash.MinHash(ksize=K, n=500, is_protein=True) prot_ksize = int(K / 3) @@ -87,5 +143,5 @@ def kmers(seq, k): mh.add_hash(hash) -s = sourmash.signature.SourmashSignature('', mh, name=record.name) +s = sourmash.signature.SourmashSignature("", mh, name=record.name) print(sourmash.signature.save_signatures([s]))
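One closing note, on utils/cardinality_estimate_confidence.py earlier in this patch: the bound it implements is P >= 1 - 2 * exp(-(eps^2) * n / (3 * s)) for true set size n, scale s, and relative error eps. A self-contained check (the function body is copied from the diff) that reproduces the first Mathematica reference value from its tests:

import numpy as np


def set_size_chernoff(set_size, scale, relative_error=0.05):
    # Chernoff-style lower bound on the probability that the scaled-MinHash
    # cardinality estimate falls within relative_error of the true set size
    return 1 - 2 * np.exp(-(relative_error**2) * set_size / (scale * 3))


# matches value_from_mathematica = 0.928652 in test_set_size_chernoff
print(round(set_size_chernoff(1_000_000, 1 / 0.1, 0.01), 6))  # 0.928652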