From 148f49068ea959c26680218eb0986b89aa900074 Mon Sep 17 00:00:00 2001 From: Siddharth Uppal Date: Tue, 11 Apr 2023 16:14:30 -0500 Subject: [PATCH 01/11] =?UTF-8?q?=F0=9F=A7=91=E2=80=8D=F0=9F=94=A7=20?= =?UTF-8?q?=F0=9F=93=9D=20Fix=20docs=20(#323)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Pin sphinx to version 6 * readthedocs build now requires installing autometa using `pip` in .readthedocs.yml * Add mocks for gdown, attrs, numpy, pandas, scipy, numba, skbio, trimap * Pin docutils between 0.18 and 0.20 * Pin sphinx_rtd_theme to 1.2 --- .readthedocs.yaml | 2 ++ docs/requirements.txt | 7 ++++--- docs/source/conf.py | 17 ++++++++++++++++- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index eb39074fa..be33b59ba 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -20,3 +20,5 @@ formats: all python: install: - requirements: docs/requirements.txt + - method: pip + path: . diff --git a/docs/requirements.txt b/docs/requirements.txt index 5333dd0f9..9ef24e897 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,4 @@ -sphinx==4.2.0 -sphinx_rtd_theme==1.0.0 -readthedocs-sphinx-search==0.1.1 \ No newline at end of file +sphinx==6.0 +sphinx_rtd_theme==1.2 +readthedocs-sphinx-search==0.1.1 +docutils>=0.18,<0.20 diff --git a/docs/source/conf.py b/docs/source/conf.py index 0920262c2..4a9bfe2de 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -17,7 +17,22 @@ for dirpath, dirnames, filenames in os.walk("../../", topdown=True): sys.path.insert(0, os.path.abspath(dirpath)) -autodoc_mock_imports = ["Bio", "hdbscan", "tsne", "sklearn", "umap", "tqdm"] +autodoc_mock_imports = [ + "Bio", + "hdbscan", + "tsne", + "sklearn", + "umap", + "tqdm", + "pandas", + "numpy", + "scipy", + "numba", + "skbio", + "trimap", + "attrs", + "gdown", +] # fmt: off import parse_argparse From 32f44d0fb324fbb6b25e2b040d688a5acbd7b3b0 Mon Sep 17 00:00:00 2001 From: 
kaw97 <31460812+kaw97@users.noreply.github.com> Date: Fri, 21 Apr 2023 12:06:23 -0500 Subject: [PATCH 02/11] Reorder `autometa-binning` parameters in step-by-step tutorial (#314) - `autometa-binning` parameter explanation is now in the same order as the commands are input - deprecated `--domain` has been replaced with `--rank-filter-name` --- docs/source/step-by-step-tutorial.rst | 70 ++++++++++++++------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/docs/source/step-by-step-tutorial.rst b/docs/source/step-by-step-tutorial.rst index 4c70aaaca..28728a970 100644 --- a/docs/source/step-by-step-tutorial.rst +++ b/docs/source/step-by-step-tutorial.rst @@ -637,51 +637,53 @@ Use the following command to perform binning: --coverages $HOME/tutorial/78mbp_metagenome.coverages.tsv \ --gc-content $HOME/tutorial/78mbp_metagenome.gc_content.tsv \ --markers $HOME/tutorial/78mbp_metagenome.markers.tsv \ + --output-binning $HOME/tutorial/78mbp_metagenome.binning.tsv \ + --output-main $HOME/tutorial/78mbp_metagenome.main.tsv \ --clustering-method dbscan \ --completeness 20 \ --purity 90 \ --cov-stddev-limit 25 \ --gc-stddev-limit 5 \ --taxonomy $HOME/tutorial/78mbp_metagenome.taxonomy.tsv \ - --output-binning $HOME/tutorial/78mbp_metagenome.binning.tsv \ - --output-main $HOME/tutorial/78mbp_metagenome.main.tsv \ --starting-rank superkingdom \ - --rank-filter superkingdom + --rank-filter superkingdom \ --rank-name-filter bacteria Let us dissect the above command: -+-------------------------+-----------------------------------------------------------------------------------------+-------------+ -| Flag | Function | Requirement | -+=========================+=========================================================================================+=============+ -| ``--kmers`` | Path to embedded k-mer frequencies table | Required | -+-------------------------+-----------------------------------------------------------------------------------------+-------------+ 
-| ``--coverages`` | Path to metagenome coverages table | Required | -+-------------------------+-----------------------------------------------------------------------------------------+-------------+ -| ``--gc-content`` | Path to metagenome GC contents table | Required | -+-------------------------+-----------------------------------------------------------------------------------------+-------------+ -| ``--markers`` | Path to Autometa annotated markers table | Required | -+-------------------------+-----------------------------------------------------------------------------------------+-------------+ -| ``--output-binning`` | Path to write Autometa binning results | Required | -+-------------------------+-----------------------------------------------------------------------------------------+-------------+ -| ``--output-main`` | Path to write Autometa main table | Required | -+-------------------------+-----------------------------------------------------------------------------------------+-------------+ -| ``--clustering-method`` | Clustering algorithm to use for recursive binning. 
Choices dbscan (default) and hdbscan | Optional | -+-------------------------+-----------------------------------------------------------------------------------------+-------------+ -| ``--completeness`` | completeness cutoff to retain cluster (default 20) | Optional | -+-------------------------+-----------------------------------------------------------------------------------------+-------------+ -| ``--purity`` | purity cutoff to retain cluster (default 95) | Optional | -+-------------------------+-----------------------------------------------------------------------------------------+-------------+ -| ``--cov-stddev-limit`` | coverage standard deviation limit to retain cluster (default 25) | Optional | -+-------------------------+-----------------------------------------------------------------------------------------+-------------+ -| ``--gc-stddev-limit`` | GC content standard deviation limit to retain cluster (default 5) | Optional | -+-------------------------+-----------------------------------------------------------------------------------------+-------------+ -| ``--taxonomy`` | Path to Autometa assigned taxonomies table | Required | -+-------------------------+-----------------------------------------------------------------------------------------+-------------+ -| ``--starting-rank`` | Canonical rank at which to begin subsetting taxonomy (default: superkingdom) | Optional | -+-------------------------+-----------------------------------------------------------------------------------------+-------------+ -| ``--domain`` | Kingdom to consider. 
Choices bacteria (default) and archaea | Optional | -+-------------------------+-----------------------------------------------------------------------------------------+-------------+ ++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+ +| Flag | Function | Requirement | ++=========================+====================================================================================================================+=============+ +| ``--kmers`` | Path to embedded k-mer frequencies table | Required | ++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+ +| ``--coverages`` | Path to metagenome coverages table | Required | ++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+ +| ``--gc-content`` | Path to metagenome GC contents table | Required | ++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+ +| ``--markers`` | Path to Autometa annotated markers table | Required | ++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+ +| ``--output-binning`` | Path to write Autometa binning results | Required | ++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+ +| ``--output-main`` | Path to write Autometa main table | Required | ++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+ +| ``--clustering-method`` | Clustering algorithm to use for 
recursive binning. Choices dbscan (default) and hdbscan | Optional | ++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+ +| ``--completeness`` | completeness cutoff to retain cluster (default 20) | Optional | ++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+ +| ``--purity`` | purity cutoff to retain cluster (default 95) | Optional | ++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+ +| ``--cov-stddev-limit`` | coverage standard deviation limit to retain cluster (default 25) | Optional | ++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+ +| ``--gc-stddev-limit`` | GC content standard deviation limit to retain cluster (default 5) | Optional | ++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+ +| ``--taxonomy`` | Path to Autometa assigned taxonomies table | Required | ++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+ +| ``--starting-rank`` | Canonical rank at which to begin subsetting taxonomy (default: superkingdom) | Optional | ++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+ +| ``--rank-filter`` | Canonical rank to subset by the value provided by ``--rank-name-filter`` default: superkingdom | Optional | 
++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+ +| ``--rank-name-filter`` | Only retrieve contigs with this value in the canonical rank column provided in ``rank-filter`` (default: bacteria) | Optional | ++-------------------------+--------------------------------------------------------------------------------------------------------------------+-------------+ You can view the complete command-line options using ``autometa-binning -h`` From f13ee918691a87e2e89a162b6c423588d2545c36 Mon Sep 17 00:00:00 2001 From: shaneroesemann <59748289+shaneroesemann@users.noreply.github.com> Date: Thu, 10 Aug 2023 11:03:49 -0500 Subject: [PATCH 03/11] =?UTF-8?q?=F0=9F=8E=A8=20=F0=9F=8D=8F=20Issue=20330?= =?UTF-8?q?=20redo=20(#338)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * implements changes by @Sidd in issue #329 in a separate new PR * add pre commit hook to remove unused imports * :art::green_heart: removed sed/cut changes that belong to another PR --- .pre-commit-config.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b55482320..e1ba631c4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,3 +11,7 @@ repos: - id: end-of-file-fixer - id: debug-statements - id: check-merge-conflict + - repo: https://github.com/hadialqattan/pycln + rev: v2.1.5 # Possible releases: https://github.com/hadialqattan/pycln/releases + hooks: + - id: pycln \ No newline at end of file From 546b06145799a0bfcb86779c563014532259b6ea Mon Sep 17 00:00:00 2001 From: Evan Rees <25933122+WiscEvan@users.noreply.github.com> Date: Thu, 17 Aug 2023 12:40:37 -0400 Subject: [PATCH 04/11] :fire::whale::green_heart: Fix docker-builds by removing pinned dependencies (#340) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 
πŸ’šπŸ³πŸ”₯⬆️ Remove pins for scipy, scikit-learn and joblib - πŸ’š 🐳 Add build schedule for Autometa docker images > This will help to more quickly identify when builds begin failing > Add `nightly` tag for scheduled build - :whale: change user workdir to `/Autometa` --- .github/workflows/docker_autometa.yml | 3 +++ Dockerfile | 9 +++++---- autometa-env.yml | 6 +++--- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/.github/workflows/docker_autometa.yml b/.github/workflows/docker_autometa.yml index 4bf3d727f..b6871839e 100644 --- a/.github/workflows/docker_autometa.yml +++ b/.github/workflows/docker_autometa.yml @@ -30,6 +30,8 @@ on: branches: - main - dev + schedule: + - cron: '0 0 * * *' # every day at midnight jobs: docker_autometa: @@ -50,6 +52,7 @@ jobs: type=raw,value=latest,enable=${{ endsWith(github.ref, github.event.repository.default_branch) }} type=raw,value={{branch}} type=semver,pattern={{version}} + type=schedule,pattern=nightly - name: Login to DockerHub if: github.event_name != 'pull_request' uses: docker/login-action@v1 diff --git a/Dockerfile b/Dockerfile index 5a7d541a0..edc2f042f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM continuumio/miniconda3 +FROM condaforge/mambaforge:latest LABEL maintainer="jason.kwan@wisc.edu" # Copyright 2022 Ian J. Miller, Evan R. Rees, Kyle Wolf, Siddharth Uppal, @@ -25,11 +25,12 @@ RUN apt-get update --allow-releaseinfo-change \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* COPY autometa-env.yml ./ -RUN conda env update -n base --file=autometa-env.yml \ - && conda clean --all -y +RUN mamba env update -n base --file=autometa-env.yml \ + && mamba clean --all -y -COPY . . +COPY . 
/Autometa +WORKDIR /Autometa RUN make install && make clean # NOTE: DB_DIR must be an absolute path (not a relative path) diff --git a/autometa-env.yml b/autometa-env.yml index d1b2b8d1d..1c93fd218 100644 --- a/autometa-env.yml +++ b/autometa-env.yml @@ -12,7 +12,7 @@ dependencies: - gdown - hdbscan - hmmer - - joblib==1.1.0 # See https://stackoverflow.com/a/73830525/12671809 + - joblib - numba>=0.47 - numpy>=1.13 - pandas>=1.1 @@ -24,8 +24,8 @@ dependencies: - rsync - samtools>=1.11 - scikit-bio - - scipy==1.8.1 #force scipy 1.8 until scikit-bio updates to 1.9, https://github.com/KwanLab/Autometa/issues/285 - - scikit-learn==0.24 # prevent error from joblib in multiprocessing distance calculations + - scipy + - scikit-learn - seqkit - tqdm - trimap From 583369e0d4e6dfea3d93478b384de22b4d332cfb Mon Sep 17 00:00:00 2001 From: kaw97 <31460812+kaw97@users.noreply.github.com> Date: Wed, 23 Aug 2023 09:31:27 -0500 Subject: [PATCH 05/11] singularity image urls (#316) Add singularity urls for autometa 2.2.0 --- modules/local/align_reads.nf | 2 +- modules/local/binning.nf | 2 +- modules/local/binning_summary.nf | 2 +- modules/local/hmmer_hmmsearch_filter.nf | 2 +- modules/local/length_table.nf | 2 +- modules/local/majority_vote.nf | 2 +- modules/local/markers.nf | 2 +- modules/local/prepare_lca.nf | 2 +- modules/local/reduce_lca.nf | 2 +- modules/local/split_kingdoms.nf | 2 +- modules/local/unclustered_recruitment.nf | 2 +- subworkflows/local/prepare_ncbi_taxinfo.nf | 6 +++--- subworkflows/local/prepare_nr.nf | 4 ++-- 13 files changed, 16 insertions(+), 16 deletions(-) diff --git a/modules/local/align_reads.nf b/modules/local/align_reads.nf index 1a5fd618e..472a8a25f 100644 --- a/modules/local/align_reads.nf +++ b/modules/local/align_reads.nf @@ -15,7 +15,7 @@ process ALIGN_READS { conda (params.enable_conda ? 
"bioconda::autometa" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { container "jasonkwan/autometa:${params.autometa_image_tag}" } diff --git a/modules/local/binning.nf b/modules/local/binning.nf index a8b9ebba3..877a65014 100644 --- a/modules/local/binning.nf +++ b/modules/local/binning.nf @@ -11,7 +11,7 @@ process BINNING { conda (params.enable_conda ? "bioconda::autometa" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { container "jasonkwan/autometa:${params.autometa_image_tag}" } diff --git a/modules/local/binning_summary.nf b/modules/local/binning_summary.nf index ad5554e6d..f0c8010db 100644 --- a/modules/local/binning_summary.nf +++ b/modules/local/binning_summary.nf @@ -12,7 +12,7 @@ process BINNING_SUMMARY { conda (params.enable_conda ? "bioconda::autometa" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { container "jasonkwan/autometa:${params.autometa_image_tag}" } diff --git a/modules/local/hmmer_hmmsearch_filter.nf b/modules/local/hmmer_hmmsearch_filter.nf index a21be8b3a..26d1d8cee 100644 --- a/modules/local/hmmer_hmmsearch_filter.nf +++ b/modules/local/hmmer_hmmsearch_filter.nf @@ -24,7 +24,7 @@ process HMMER_HMMSEARCH_FILTER { conda (params.enable_conda ? 
"autometa" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { container "jasonkwan/autometa:${params.autometa_image_tag}" } diff --git a/modules/local/length_table.nf b/modules/local/length_table.nf index d6b95de69..a94c76833 100644 --- a/modules/local/length_table.nf +++ b/modules/local/length_table.nf @@ -12,7 +12,7 @@ process LENGTH_TABLE { conda (params.enable_conda ? "bioconda::autometa" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { container "jasonkwan/autometa:${params.autometa_image_tag}" } diff --git a/modules/local/majority_vote.nf b/modules/local/majority_vote.nf index 846773a1b..6271b7bd2 100644 --- a/modules/local/majority_vote.nf +++ b/modules/local/majority_vote.nf @@ -12,7 +12,7 @@ process MAJORITY_VOTE { conda (params.enable_conda ? "bioconda::autometa" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { container "jasonkwan/autometa:${params.autometa_image_tag}" } diff --git a/modules/local/markers.nf b/modules/local/markers.nf index 5f458ad8b..5835735b7 100644 --- a/modules/local/markers.nf +++ b/modules/local/markers.nf @@ -13,7 +13,7 @@ process MARKERS { conda (params.enable_conda ? 
"autometa" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { container "jasonkwan/autometa:${params.autometa_image_tag}" } diff --git a/modules/local/prepare_lca.nf b/modules/local/prepare_lca.nf index e11015153..ce712cd3f 100644 --- a/modules/local/prepare_lca.nf +++ b/modules/local/prepare_lca.nf @@ -10,7 +10,7 @@ process PREPARE_LCA { conda (params.enable_conda ? "bioconda::autometa" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { container "jasonkwan/autometa:${params.autometa_image_tag}" } diff --git a/modules/local/reduce_lca.nf b/modules/local/reduce_lca.nf index 3424d2255..031564557 100644 --- a/modules/local/reduce_lca.nf +++ b/modules/local/reduce_lca.nf @@ -12,7 +12,7 @@ process REDUCE_LCA { conda (params.enable_conda ? "bioconda::autometa" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { container "jasonkwan/autometa:${params.autometa_image_tag}" } diff --git a/modules/local/split_kingdoms.nf b/modules/local/split_kingdoms.nf index f93c12fd8..396cd828e 100644 --- a/modules/local/split_kingdoms.nf +++ b/modules/local/split_kingdoms.nf @@ -12,7 +12,7 @@ process SPLIT_KINGDOMS { conda (params.enable_conda ? 
"bioconda::autometa" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { container "jasonkwan/autometa:${params.autometa_image_tag}" } diff --git a/modules/local/unclustered_recruitment.nf b/modules/local/unclustered_recruitment.nf index dee238caf..460bf1b9a 100644 --- a/modules/local/unclustered_recruitment.nf +++ b/modules/local/unclustered_recruitment.nf @@ -12,7 +12,7 @@ process RECRUIT { conda (params.enable_conda ? "autometa" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { container "jasonkwan/autometa:${params.autometa_image_tag}" } diff --git a/subworkflows/local/prepare_ncbi_taxinfo.nf b/subworkflows/local/prepare_ncbi_taxinfo.nf index b11c39c7c..5b6737393 100644 --- a/subworkflows/local/prepare_ncbi_taxinfo.nf +++ b/subworkflows/local/prepare_ncbi_taxinfo.nf @@ -15,7 +15,7 @@ process TEST_DOWNLOAD { conda (params.enable_conda ? "conda-forge::rsync=3.2.3" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { container "jasonkwan/autometa:${params.autometa_image_tag}" } @@ -38,7 +38,7 @@ process DOWNLOAD_ACESSION2TAXID { conda (params.enable_conda ? 
"conda-forge::rsync=3.2.3" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { container "jasonkwan/autometa:${params.autometa_image_tag}" } @@ -71,7 +71,7 @@ process DOWNLOAD_TAXDUMP { conda (params.enable_conda ? "conda-forge::rsync=3.2.3" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { container "jasonkwan/autometa:${params.autometa_image_tag}" } diff --git a/subworkflows/local/prepare_nr.nf b/subworkflows/local/prepare_nr.nf index da70d61cc..0d1de4b68 100644 --- a/subworkflows/local/prepare_nr.nf +++ b/subworkflows/local/prepare_nr.nf @@ -16,7 +16,7 @@ process DOWNLOAD_NR { conda (params.enable_conda ? "conda-forge::rsync=3.2.3" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { container "jasonkwan/autometa:${params.autometa_image_tag}" } @@ -46,7 +46,7 @@ process TEST_DOWNLOAD { conda (params.enable_conda ? 
"conda-forge::rsync=3.2.3" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { container "jasonkwan/autometa:${params.autometa_image_tag}" } From 04b2926c150536ab44bba25ac5008909b70eb953 Mon Sep 17 00:00:00 2001 From: Evan Rees <25933122+WiscEvan@users.noreply.github.com> Date: Thu, 24 Aug 2023 11:03:06 -0400 Subject: [PATCH 06/11] =?UTF-8?q?=F0=9F=92=9A:bug::snake:=20=E2=AC=86?= =?UTF-8?q?=EF=B8=8F=20Fix=20pytest=20and=20resolve=20hdbscan=20dependency?= =?UTF-8?q?=20(#341)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 💚:bug::snake: Fix bug in unclustered recruitment that occurs when 0 predictions pass the confidence filter * 💚⬆️ Add pin `scikit-learn>=1.3` * 💚🔥Remove pin `hdbscan` (hdbscan available in `sklearn.cluster` in v1.3) * :snake::art: Replace `conda` commands with `mamba` in `Makefile` * :memo: replace instances of conda with mamba * :fire: Remove virtualenv commands from Makefile (resolves #331) * :memo: Replace conda with mamba in workflows --- Makefile | 24 ++-- autometa-env.yml | 4 +- autometa/binning/recursive_dbscan.py | 28 +--- autometa/binning/unclustered_recruitment.py | 10 +- docs/source/bash-workflow.rst | 25 ++-- docs/source/benchmarking.rst | 32 ++--- docs/source/how-to-contribute.rst | 10 +- docs/source/installation.rst | 85 +++++++++--- docs/source/nextflow-workflow.rst | 145 ++++++++++---------- docs/source/step-by-step-tutorial.rst | 2 +- tests/environment.yml | 6 +- 11 files changed, 204 insertions(+), 167 deletions(-) diff --git a/Makefile b/Makefile index c082a18d2..71b0112ba 100644 --- a/Makefile +++ b/Makefile @@ -10,10 +10,10 @@ PYTHON_INTERPRETER = python3 # This was retrieved from 
https://drive.google.com/file/d/1bSlPldaq3C6Cf9Y5Rm7iwtUDcjxAaeEk/view?usp=sharing TEST_DATA_FILEID = 1bSlPldaq3C6Cf9Y5Rm7iwtUDcjxAaeEk -ifeq (,$(shell which conda)) -HAS_CONDA=False +ifeq (,$(shell which mamba)) +HAS_MAMBA=False else -HAS_CONDA=True +HAS_MAMBA=True endif ################################################################################# @@ -35,20 +35,18 @@ black: ## Set up python interpreter environment create_environment: autometa-env.yml -ifeq (True,$(HAS_CONDA)) - @echo ">>> Detected conda, creating conda environment." +ifeq (True,$(HAS_MAMBA)) + @echo ">>> Detected mamba, creating mamba environment." ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER))) - conda env create --file=autometa-env.yml + mamba env create --file=autometa-env.yml else @echo "It looks like you are not using python 3. Autometa is only compatible with python 3. Please upgrade." endif - @echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)" + @echo ">>> New mamba env created. Activate with:\nsource activate $(PROJECT_NAME)" else - $(PYTHON_INTERPRETER) -m pip install -q virtualenv virtualenvwrapper - @echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in shell startup file\n\ - export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n" - @bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)" - @echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)" + @echo "Mamba not detected. Please install before proceeding..." 
+ @echo "Mamba docs: https://mamba.readthedocs.io/en/latest/" + exit endif ################################################################################# @@ -61,7 +59,7 @@ install: setup.py ## Install dependencies for test environment test_environment: tests/environment.yml - conda env update -n $(PROJECT_NAME) --file=$< + mamba env update -n $(PROJECT_NAME) --file=$< ## Build docker image from Dockerfile (auto-taggged as jasonkwan/autometa:) image: Dockerfile diff --git a/autometa-env.yml b/autometa-env.yml index 1c93fd218..72b1b4629 100644 --- a/autometa-env.yml +++ b/autometa-env.yml @@ -10,9 +10,7 @@ dependencies: - bowtie2 - diamond>=2.0 - gdown - - hdbscan - hmmer - - joblib - numba>=0.47 - numpy>=1.13 - pandas>=1.1 @@ -25,7 +23,7 @@ dependencies: - samtools>=1.11 - scikit-bio - scipy - - scikit-learn + - scikit-learn>=1.3 - seqkit - tqdm - trimap diff --git a/autometa/binning/recursive_dbscan.py b/autometa/binning/recursive_dbscan.py index 713e08673..35efa2b4f 100644 --- a/autometa/binning/recursive_dbscan.py +++ b/autometa/binning/recursive_dbscan.py @@ -16,8 +16,7 @@ import pandas as pd import numpy as np -from sklearn.cluster import DBSCAN -from hdbscan import HDBSCAN +from sklearn.cluster import DBSCAN, HDBSCAN from numba import config @@ -235,8 +234,7 @@ def run_hdbscan( df: pd.DataFrame, min_cluster_size: int, min_samples: int, - cache_dir: str = None, - core_dist_n_jobs: int = -1, + n_jobs: int = -1, ) -> pd.DataFrame: """Run clustering on `df` at provided `min_cluster_size`. @@ -261,14 +259,9 @@ def run_hdbscan( The number of samples in a neighborhood for a point to be considered a core point. - cache_dir : str, optional - Used to cache the output of the computation of the tree. - By default, no caching is done. If a string is given, it is the - path to the caching directory. - - core_dist_n_jobs: int + n_jobs: int Number of parallel jobs to run in core distance computations. 
- For ``core_dist_n_jobs`` below -1, (n_cpus + 1 + core_dist_n_jobs) are used. + For ``n_jobs`` below -1, (n_cpus + 1 + n_jobs) are used. Returns ------- @@ -304,8 +297,7 @@ def run_hdbscan( min_samples=min_samples, cluster_selection_method="leaf", allow_single_cluster=True, - memory=cache_dir, - core_dist_n_jobs=core_dist_n_jobs, + n_jobs=n_jobs, ).fit_predict(features_df.to_numpy()) clusters = pd.Series(clusters, index=df.index, name="cluster") # NOTE: HDBSCAN labels outliers with -1 @@ -325,7 +317,7 @@ def recursive_hdbscan( verbose: bool = False, ) -> Tuple[pd.DataFrame, pd.DataFrame]: """Recursively run HDBSCAN starting with defaults and iterating the min_samples - and min_cluster_size until only 1 cluster is recovered. + and min_cluster_size until only 1 cluster is recovered. Parameters ---------- @@ -372,14 +364,12 @@ def recursive_hdbscan( n_clusters = float("inf") best_median = float("-inf") best_df = pd.DataFrame() - cache_dir = tempfile.mkdtemp() while n_clusters > 1: binned_df = run_hdbscan( table, min_cluster_size=min_cluster_size, min_samples=min_samples, - cache_dir=cache_dir, - core_dist_n_jobs=n_jobs, + n_jobs=n_jobs, ) df, metrics_df = add_metrics(df=binned_df, markers_df=markers_df) filtered_df = apply_binning_metrics_filter( @@ -403,8 +393,6 @@ def recursive_hdbscan( ) if min_cluster_size >= max_min_cluster_size: - shutil.rmtree(cache_dir) - cache_dir = tempfile.mkdtemp() min_samples += 1 min_cluster_size = 2 else: @@ -416,8 +404,6 @@ def recursive_hdbscan( if min_samples >= max_min_samples: max_min_cluster_size *= 2 - # clean up cache now that we are out of while loop - shutil.rmtree(cache_dir) # Check our df is not empty from while loop if best_df.empty: if verbose: diff --git a/autometa/binning/unclustered_recruitment.py b/autometa/binning/unclustered_recruitment.py index 7b025a980..fa8bf9284 100644 --- a/autometa/binning/unclustered_recruitment.py +++ b/autometa/binning/unclustered_recruitment.py @@ -407,9 +407,13 @@ def 
get_confidence_filtered_predictions( # Filter predictions by confidence threshold confidence_threshold = num_classifications * confidence df = df[df.max(axis="columns") >= confidence_threshold] - filtered_predictions = df.idxmax(axis="columns") - filtered_predictions.name = "cluster" - return filtered_predictions.to_frame() + if df.empty: + filtered_predictions = pd.DataFrame( + [], columns=["contig", "cluster"] + ).set_index("contig") + else: + filtered_predictions = df.idxmax(axis="columns").to_frame(name="cluster") + return filtered_predictions def filter_contaminating_predictions( diff --git a/docs/source/bash-workflow.rst b/docs/source/bash-workflow.rst index 80b8a7d1d..a17a6b4d6 100644 --- a/docs/source/bash-workflow.rst +++ b/docs/source/bash-workflow.rst @@ -14,17 +14,16 @@ Getting Started Compute Environment Setup ************************* -If you have not previously installed/used Conda, you can get it using the -Miniconda installer appropriate to your system, here: ``_ +If you have not previously installed/used mamba_, you can get it from Mambaforge_. -You may either create a new Conda environment named "autometa"... +You may either create a new mamba environment named "autometa"... .. code-block:: bash - conda create -n autometa -c bioconda autometa - # Then, once Conda has finished creating the environment + mamba create -n autometa -c conda-forge -c bioconda autometa + # Then, once mamba has finished creating the environment # you may activate it: - conda activate autometa + mamba activate autometa \.\.\. or install Autometa into any of your existing environments. @@ -32,13 +31,13 @@ This installs Autometa in your current active environment: .. code-block:: bash - conda install -c bioconda autometa + mamba install -c conda-forge -c bioconda autometa The next command installs Autometa in the provided environment: .. 
code-block:: bash - conda install -n -c bioconda autometa + mamba install -n -c conda-forge -c bioconda autometa Download Workflow Template ************************** @@ -128,7 +127,7 @@ Alignments Preparation .. note:: The following example requires ``bwa``, ``kart`` and ``samtools`` - ``conda install -c bioconda bwa kart samtools`` + ``mamba install -c bioconda bwa kart samtools`` .. code-block:: bash @@ -158,7 +157,7 @@ ORFs **** .. note:: - The following example requires ``prodigal``. e.g. ``conda install -c bioconda prodigal`` + The following example requires ``prodigal``. e.g. ``mamba install -c bioconda prodigal`` .. code-block:: bash @@ -175,7 +174,7 @@ Diamond blastp Preparation ************************** .. note:: - The following example requires ``diamond``. e.g. ``conda install -c bioconda diamond`` + The following example requires ``diamond``. e.g. ``mamba install -c bioconda diamond`` .. code-block:: bash @@ -267,7 +266,7 @@ For example, with slurm: .. caution:: - Make sure your conda autometa environment is activated or the autometa entrypoints will not be available. + Make sure your mamba autometa environment is activated or the autometa entrypoints will not be available. Additional parameters ##################### @@ -323,3 +322,5 @@ See :ref:`advanced-usage-binning` section for details .. _Trimmomatic: http://www.usadellab.org/cms/?page=trimmomatic .. _FastQC: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ .. _metaQuast: http://quast.sourceforge.net/metaquast +.. _Mambaforge: https://github.com/conda-forge/miniforge#mambaforge +.. _mamba: https://mamba.readthedocs.io/en/latest/ diff --git a/docs/source/benchmarking.rst b/docs/source/benchmarking.rst index f58eab3e6..6c7aca511 100644 --- a/docs/source/benchmarking.rst +++ b/docs/source/benchmarking.rst @@ -7,11 +7,11 @@ Benchmarking .. 
note:: - The most recent Autometa benchmarking results covering multiple modules and input parameters are hosted on our - `KwanLab/metaBenchmarks `_ Github repository and provide a range of + The most recent Autometa benchmarking results covering multiple modules and input parameters are hosted on our + `KwanLab/metaBenchmarks `_ Github repository and provide a range of analyses covering multiple stages and parameter sets. These benchmarks are available with their own respective - modules so that the community may easily assess how Autometa's novel (``taxon-profiling``, ``clustering``, - ``binning``, ``refinement``) algorithms perform compared to current state-of-the-art methods. Tools were selected for + modules so that the community may easily assess how Autometa's novel (``taxon-profiling``, ``clustering``, + ``binning``, ``refinement``) algorithms perform compared to current state-of-the-art methods. Tools were selected for benchmarking based on their relevance to environmental, single-assembly, reference-free binning pipelines. Benchmarking with the ``autometa-benchmark`` module @@ -51,7 +51,7 @@ Example benchmarking with simulated communities # Set community size (see above for selection/download of other community types) community_size=78Mbp - + # Inputs ## NOTE: predictions and reference were downloaded using autometa-download-dataset predictions="$HOME/Autometa/autometa/datasets/simulated/${community_size}/taxonomy.tsv.gz" # required columns -> contig, taxid @@ -73,7 +73,7 @@ Example benchmarking with simulated communities --output-classification-reports $reports .. note:: - Using ``--benchmark=classification`` requires the path to a directory containing files (nodes.dmp, names.dmp, merged.dmp) + Using ``--benchmark=classification`` requires the path to a directory containing files (nodes.dmp, names.dmp, merged.dmp) from NCBI's taxdump tarball. This should be supplied using the ``--ncbi`` parameter. 
Clustering @@ -95,7 +95,7 @@ Example benchmarking with simulated communities # Outputs output_wide="${community_size}.clustering_benchmarks.wide.tsv.gz" output_long="${community_size}.clustering_benchmarks.long.tsv.gz" - + autometa-benchmark \ --benchmark clustering \ --predictions $predictions \ @@ -114,16 +114,16 @@ Example benchmarking with simulated communities # Set community size (see above for selection/download of other community types) community_size=78Mbp - + # Inputs ## NOTE: predictions and reference were downloaded using autometa-download-dataset predictions="$HOME/Autometa/autometa/datasets/simulated/${community_size}/binning.tsv.gz" # required columns -> contig, cluster reference="$HOME/Autometa/autometa/datasets/simulated/${community_size}/reference_assignments.tsv.gz" - + # Outputs output_wide="${community_size}.binning_benchmarks.wide.tsv.gz" output_long="${community_size}.binning_benchmarks.long.tsv.gz" - + autometa-benchmark \ --benchmark binning-classification \ --predictions $predictions \ @@ -172,7 +172,7 @@ Autometa is packaged with a built-in module that allows any user to download any To use retrieve these datasets one simply needs to run the ``autometa-download-dataset`` command. For example, to download the reference assignments for a simulated community as well as the most recent Autometa -binning and taxon-profiling predictions for this community, provide the following parameters: +binning and taxon-profiling predictions for this community, provide the following parameters: .. code:: bash @@ -195,15 +195,15 @@ Using ``gdrive`` You can download the individual assemblies of different datasests with the help of ``gdown`` using command line (This is what ``autometa-download-dataset`` is using behind the scenes). If you have installed ``autometa`` using -``conda`` then ``gdown`` should already be installed. If not, you can install it using -``conda install -c conda-forge gdown`` or ``pip install gdown``. 
+``mamba`` then ``gdown`` should already be installed. If not, you can install it using +``mamba install -c conda-forge gdown`` or ``pip install gdown``. Example for the 78Mbp simulated community """"""""""""""""""""""""""""""""""""""""" 1. Navigate to the 78Mbp community dataset using the `link `_ mentioned above. -2. Get the file ID by navigating to any of the files and right clicking, then selecting the ``get link`` option. - This will have a ``copy link`` button that you should use. The link for the metagenome assembly +2. Get the file ID by navigating to any of the files and right clicking, then selecting the ``get link`` option. + This will have a ``copy link`` button that you should use. The link for the metagenome assembly (ie. ``metagenome.fna.gz``) should look like this : ``https://drive.google.com/file/d/15CB8rmQaHTGy7gWtZedfBJkrwr51bb2y/view?usp=sharing`` 3. The file ID is within the ``/`` forward slashes between ``file/d/`` and ``/``, e.g: @@ -313,4 +313,4 @@ e.g. ``-l 1250`` would translate to 1250Mbp as the sum of total lengths for all # -s : the standard deviation of DNA/RNA fragment size for paired-end simulations. # -l : the length of reads to be simulated $ coverage = ((250 * reads) / (length * 1000000)) - $ art_illumina -p -ss HS25 -l 125 -f $coverage -o simulated_reads -m 275 -s 90 -i asm_path \ No newline at end of file + $ art_illumina -p -ss HS25 -l 125 -f $coverage -o simulated_reads -m 275 -s 90 -i asm_path diff --git a/docs/source/how-to-contribute.rst b/docs/source/how-to-contribute.rst index a210ccc42..9c5ddcc7a 100644 --- a/docs/source/how-to-contribute.rst +++ b/docs/source/how-to-contribute.rst @@ -16,10 +16,10 @@ Autometa builds documentation using `readthedocs `__. .. 
code-block:: bash - # Activate your autometa conda environment - conda activate autometa + # Activate your autometa mamba environment + mamba activate autometa # Install dependencies - conda install -n autometa -c conda-forge \ + mamba install -n autometa -c conda-forge \ sphinx sphinx_rtd_theme # List all make options make @@ -38,8 +38,8 @@ You will have to install certain dependencies as well as test data to be able to .. code-block:: bash - # Activate your autometa conda environment - conda activate autometa + # Activate your autometa mamba environment + mamba activate autometa # List all make options make # Install dependencies for test environment diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 498b7087b..c32e3e3fa 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -4,8 +4,8 @@ Installation ============ -Currently Autometa package installation is supported by conda_ and docker_. -For installation using conda, we suggest downloading miniconda_. +Currently Autometa package installation is supported by mamba_, and docker_. +For installation using mamba, download mamba from Mambaforge_. .. attention:: @@ -14,23 +14,74 @@ For installation using conda, we suggest downloading miniconda_. Direct installation (Quickest) ============================== -#. Install miniconda_ +#. Install mamba_ + + .. code-block:: bash + + wget "https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-$(uname)-$(uname -m).sh" + bash Mambaforge-$(uname)-$(uname -m).sh + + Follow the installation prompts and when you get to this: + + .. code-block:: bash + + Do you wish the installer to initialize Mambaforge + by running conda init? [yes|no] + [no] >>> yes + + This will require restarting the terminal, or resetting + the terminal with the source command + + .. code-block:: bash + + # To resolve the comment: + # ==> For changes to take effect, close and re-open your current shell. 
<== + # type: + source ~/.bashrc + + .. note:: + + If you already have conda installed, you can install mamba as a drop-in replacement. + + .. code-block:: bash + + conda install -n base -c conda-forge mamba -y + + #. Create a new environment with ``autometa`` installed: .. code-block:: bash - conda create -c bioconda -n autometa autometa + mamba create -c conda-forge -c bioconda -n autometa autometa + + .. note:: + + You may add the ``bioconda`` and ``conda-forge`` channels to your mamba + config to simplify the command. + + .. code-block:: bash + + mamba config --append channels bioconda + mamba config --append channels conda-forge + + Now mamba will search the ``bioconda`` and ``conda-forge`` + channels alongside the defaults channel. + + .. code-block:: bash + + mamba create -n autometa autometa + #. Activate ``autometa`` environment: .. code-block:: - conda activate autometa + mamba activate autometa Install from source (using make) ================================ -Download and install miniconda_. Now run the following commands: +Download and install mamba_. Now run the following commands: .. code-block:: bash @@ -43,11 +94,11 @@ Download and install miniconda_. Now run the following commands: # Navigate into the cloned repository cd Autometa - # create autometa conda environment + # create autometa mamba environment make create_environment - # activate autometa conda environment - conda activate autometa + # activate autometa mamba environment + mamba activate autometa # install autometa source code in autometa environment make install @@ -59,7 +110,7 @@ Download and install miniconda_. Now run the following commands: Install from source (full commands) =================================== -Download and install miniconda_. Now run the following commands: +Download and install mamba_. Now run the following commands: .. code-block:: bash @@ -73,10 +124,10 @@ Download and install miniconda_.
Now run the following commands: cd Autometa # Construct the autometa environment from autometa-env.yml - conda env create --file=autometa-env.yml + mamba env create --file=autometa-env.yml # Activate environment - conda activate autometa + mamba activate autometa # Install the autometa code base from source python -m pip install . --ignore-installed --no-deps -vv @@ -115,8 +166,8 @@ To run the tests, however, you'll first need to install the following packages a .. code-block:: bash - # Activate your autometa conda environment - conda activate autometa + # Activate your autometa mamba environment + mamba activate autometa # List all make options make @@ -141,12 +192,12 @@ You can now run different unit tests using the following commands: make unit_test_wip .. note:: + As a shortcut you can also create the test environment and run **all** the unit tests using ``make unit_test`` command. For more information about the above commands see the :ref:`Contributing Guidelines` page. Additional unit tests are provided in the test directory. These are designed to aid in future development of autometa. -.. _conda: https://docs.conda.io/en/latest/ -.. _miniconda: https://docs.conda.io/en/latest/miniconda.html +.. _mamba: https://mamba.readthedocs.io/en/latest/index.html +.. _Mambaforge: https://github.com/conda-forge/miniforge#mambaforge .. _Docker: https://www.docker.com/ -.. _anaconda: https://www.anaconda.com/ diff --git a/docs/source/nextflow-workflow.rst b/docs/source/nextflow-workflow.rst index ba097318e..ad5790e2f 100644 --- a/docs/source/nextflow-workflow.rst +++ b/docs/source/nextflow-workflow.rst @@ -16,12 +16,12 @@ System Requirements Currently the nextflow pipeline requires Docker 🐳 so it must be installed on your system. If you don't have Docker installed you can install it from `docs.docker.com/get-docker `_. We plan on removing this dependency in future versions, so that other dependency managers -(e.g. Conda, Singularity, etc) can be used. +(e.g. 
Conda, Mamba, Singularity, etc) can be used. Nextflow runs on any Posix compatible system. Detailed system requirements can be found in the `nextflow documentation `_ -Nextflow (required) and nf-core tools (optional but highly recommended) installation will be discussed in :ref:`install-nextflow-nfcore-with-conda`. +Nextflow (required) and nf-core tools (optional but highly recommended) installation will be discussed in :ref:`install-nextflow-nfcore-with-mamba`. Data Preparation ################ @@ -138,7 +138,7 @@ Example ``sample_sheet.csv`` Quick Start ########### -The following is a condensed summary of steps required to get Autometa installed, configured and running. +The following is a condensed summary of steps required to get Autometa installed, configured and running. There are links throughout to the appropriate documentation sections that can provide more detail if required. Installation @@ -146,14 +146,14 @@ Installation For full installation instructions, please see the :ref:`installation-page` section -If you would like to install Autometa via conda (I'd recommend it, its almost foolproof!), -you'll need to first install Miniconda on your system. You can do this in a few easy steps: +If you would like to install Autometa via mamba (I'd recommend it, its almost foolproof!), +you'll need to first download the Mambaforge_ installer on your system. You can do this in a few easy steps: -1. Type in the following and then hit enter. This will download the Miniconda installer to your home directory. +1. Type in the following and then hit enter. This will download the Mambaforge installer to your home directory. .. code-block:: bash - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O $HOME/Miniconda3-latest-Linux-x86_64.sh + wget "https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-$(uname)-$(uname -m).sh" -O "$HOME/Mambaforge-$(uname)-$(uname -m).sh" .. 
note:: @@ -163,44 +163,44 @@ you'll need to first install Miniconda on your system. You can do this in a few .. code-block:: bash - bash $HOME/Miniconda3-latest-Linux-x86_64.sh + bash $HOME/Mambaforge-$(uname)-$(uname -m).sh + # On my machine this was /home/sam/Mambaforge-Linux-x86_64.sh 3. Follow all of the prompts. Keep pressing enter until it asks you to accept. Then type yes and enter. Say yes to everything. -.. note:: +.. note:: - If for whatever reason, you accidentally said no to the initialization, do not fear. + If for whatever reason, you accidentally said no to the initialization, do not fear. We can fix this by running the initialization with the following command: .. code-block:: bash - cd $HOME/miniconda3/bin/ - ./conda init - + $HOME/mambaforge/bin/mamba init + -4. Finally, for the changes to take effect, you'll need to run the following line of code which effectively acts as a "refresh" +4. Finally, for the changes to take effect, you'll need to run the following line of code which effectively acts as a "refresh" .. code-block:: bash - - source ~/.bashrc -Now that you have conda up and running, its time to install the Autometa conda environment. Run the following code: + source $HOME/.bashrc + +Now that you have mamba up and running, it's time to install the Autometa mamba environment. Run the following code: .. code-block:: bash - conda env create --file=https://raw.githubusercontent.com/KwanLab/Autometa/main/nextflow-env.yml - + mamba env create --file=https://raw.githubusercontent.com/KwanLab/Autometa/main/nextflow-env.yml + .. attention:: - You will only need to run the installation (code above) once.
The installation does NOT need to be performed every time you wish to use Autometa. - Once installation is complete, the conda environment (which holds all the tools that Autometa needs) will live on your server/computer + You will only need to run the installation (code above) once. The installation does NOT need to be performed every time you wish to use Autometa. + Once installation is complete, the mamba environment (which holds all the tools that Autometa needs) will live on your server/computer much like any other program you install. -Anytime you would like to run Autometa, you'll need to activate the conda environment. To activate the environment you'll need to run the following command: +Anytime you would like to run Autometa, you'll need to activate the mamba environment. To activate the environment you'll need to run the following command: .. code-block:: bash - conda activate autometa-nf + mamba activate autometa-nf Configuring a scheduler *********************** @@ -239,13 +239,13 @@ Then copy the following code block into that new file ("agrp" is the slurm parti } } -Keep this file somewhere central to you. For the sake of this example I will be keeping it in a folder called "Useful scripts" in my home directory +Keep this file somewhere central to you. For the sake of this example I will be keeping it in a folder called "Useful scripts" in my home directory because that is a central point for me where I know I can easily find the file and it won't be moved e.g. :code:`/home/sam/Useful_scripts/slurm_nextflow.config` -Save your new file with Ctrl+O and then exit nano with Ctrl+O. +Save your new file with Ctrl+O and then exit nano with Ctrl+X. -Installation and set up is now complete. πŸŽ‰ πŸ₯³ +Installation and set up is now complete. 🎉 🥳 Running Autometa **************** @@ -253,19 +253,19 @@ Running Autometa For a comprehensive list of features and options and how to use them please see :ref:`Running the pipeline` Autometa can bin one or several metagenomic datasets in one run. Regardless of the number of metagenomes you
Regardless of the number of metagenomes you -want to process, you will need to provide a sample sheet which specifies the name of your sample, the full path to +want to process, you will need to provide a sample sheet which specifies the name of your sample, the full path to where that data is found and how to retrieve the sample's contig coverage information. -If the metagenome was assembled via SPAdes, Autometa can extract coverage and contig length information from the sequence headers. +If the metagenome was assembled via SPAdes, Autometa can extract coverage and contig length information from the sequence headers. -If you used a different assembler you will need to provide either raw reads or a table containing contig/scaffold coverage information. -Full details for data preparation may be found under :ref:`sample-sheet-preparation` +If you used a different assembler you will need to provide either raw reads or a table containing contig/scaffold coverage information. +Full details for data preparation may be found under :ref:`sample-sheet-preparation` -First ensure that your Autometa conda environment is activated. You can activate your environment by running: +First ensure that your Autometa mamba environment is activated. You can activate your environment by running: .. code-block:: bash - - conda activate autometa-nf + + mamba activate autometa-nf Run the following code to launch Autometa: @@ -275,7 +275,7 @@ Run the following code to launch Autometa: .. note:: - You may want to note where you have saved your input sample sheet prior to running the launch command. + You may want to note where you have saved your input sample sheet prior to running the launch command. It is much easier (and less error prone) to copy/paste the sample sheet file path when specifying the input (We'll get to this later in :ref:`quickstart-menu-4`). You will now use the arrow keys to move up and down between your options and hit your "Enter" or "Return" key to make your choice. 
@@ -296,8 +296,8 @@ You will now use the arrow keys to move up and down between your options and hit Choose a version ---------------- -The double, right-handed arrows should already indicate the latest release of Autometa (in our case ``2.0.0``). -The latest version of the tool will always be at the top of the list with older versions descending below. +The double, right-handed arrows should already indicate the latest release of Autometa (in our case ``2.0.0``). +The latest version of the tool will always be at the top of the list with older versions descending below. To select the latest version, ensure that the double, right-handed arrows are next to ``2.0.0``, then hit "Enter". .. image:: ../img/Menu1.png @@ -311,7 +311,7 @@ Pick the ``Command line`` option. .. note:: - Unless you've done some fancy server networking (i.e. tunneling and port-forwarding), + Unless you've done some fancy server networking (i.e. tunneling and port-forwarding), or are using Autometa locally, ``Command line`` is your *only* option. .. image:: ../img/Menu2.png @@ -321,7 +321,7 @@ Pick the ``Command line`` option. General nextflow parameters --------------------------- -If you are using a scheduler (Slurm in this example), ``-profile`` is the only option you'll need to change. +If you are using a scheduler (Slurm in this example), ``-profile`` is the only option you'll need to change. If you are not using a scheduler, you may skip this step. .. image:: ../img/Menu3.png @@ -331,12 +331,12 @@ If you are not using a scheduler, you may skip this step. Input and Output ---------------- -Now we need to give Autometa the full paths to our input sample sheet, output results folder -and output logs folder (aka where trace files are stored). +Now we need to give Autometa the full paths to our input sample sheet, output results folder +and output logs folder (aka where trace files are stored). .. 
note:: - A new folder, named by its respective sample value, will be created within the output results folder for + A new folder, named by its respective sample value, will be created within the output results folder for each metagenome listed in the sample sheet. .. image:: ../img/Menu4.png @@ -346,14 +346,14 @@ and output logs folder (aka where trace files are stored). Binning parameters ------------------ -If you're not sure what you're doing I would recommend only changing ``length_cutoff``. -The default cutoff is 3000bp, which means that any contigs/scaffolds smaller than 3000bp will not be considered for binning. +If you're not sure what you're doing I would recommend only changing ``length_cutoff``. +The default cutoff is 3000bp, which means that any contigs/scaffolds smaller than 3000bp will not be considered for binning. .. note:: - This cutoff will depend on how good your assembly is: e.g. if your N50 is 1200bp, I would choose a cutoff of 1000. + This cutoff will depend on how good your assembly is: e.g. if your N50 is 1200bp, I would choose a cutoff of 1000. If your N50 is more along the lines of 5000, I would leave the cutoff at the default 3000. I would strongly recommend - against choosing a number below 900 here. In the example below, I have chosen a cutoff of 1000bp as my assembly was + against choosing a number below 900 here. In the example below, I have chosen a cutoff of 1000bp as my assembly was not particularly great (the N50 is 1100bp). .. image:: ../img/Menu5.png @@ -363,17 +363,17 @@ The default cutoff is 3000bp, which means that any contigs/scaffolds smaller tha Additional Autometa options --------------------------- -Here you have a choice to make: +Here you have a choice to make: -* By enabling taxonomy aware mode, Autometa will attempt to use taxonomic data to make your bins more accurate. +* By enabling taxonomy aware mode, Autometa will attempt to use taxonomic data to make your bins more accurate. 
-However, this is a more computationally expensive step and will make the process take longer. +However, this is a more computationally expensive step and will make the process take longer. * By leaving this option as the default ``False`` option, Autometa will bin according to coverage and kmer patterns. Despite your choice, you will need to provide a path to the necessary databases using the ``single_db_dir`` option. -In the example below, I have enabled the taxonomy aware mode and provided the path to where the databases are stored -(in my case this is :code:`/home/sam/Databases`). +In the example below, I have enabled the taxonomy aware mode and provided the path to where the databases are stored +(in my case this is :code:`/home/sam/Databases`). For additional details on required databases, see the :ref:`Databases` section. @@ -384,13 +384,13 @@ For additional details on required databases, see the :ref:`Databases` section. Computational parameters ------------------------ -This will depend on the computational resources you have available. You could start with the default values and see -how the binning goes. If you have particularly complex datasets you may want to bump this up a bit. For your -average metagenome, you won't need more than 150Gb of memory. I've opted to use 75 Gb as a -starting point for a few biocrust (somewhat diverse) metagenomes. +This will depend on the computational resources you have available. You could start with the default values and see +how the binning goes. If you have particularly complex datasets you may want to bump this up a bit. For your +average metagenome, you won't need more than 150Gb of memory. I've opted to use 75 Gb as a +starting point for a few biocrust (somewhat diverse) metagenomes. .. note:: - + These options correspond to the resources provided to *each* process of Autometa, *not* the entire workflow itself. 
Also, for TB worth of assembled data you may want to try the :ref:`autometa-bash-workflow` using the @@ -409,7 +409,7 @@ to prevent immediately performing the nextflow run command. .. image:: ../img/launch_choice.png -If you recall, we created a file called :code:`slurm_nextflow.config` that contains the information Autometa will need to communicate with the Slurm scheduler. +If you recall, we created a file called :code:`slurm_nextflow.config` that contains the information Autometa will need to communicate with the Slurm scheduler. We need to include that file using the :code:`-c` flag (or configuration flag). Therefore to launch the Autometa workflow, run the following command: .. note:: @@ -433,41 +433,40 @@ Basic While the Autometa Nextflow pipeline can be run using Nextflow directly, we designed it using nf-core standards and templating to provide an easier user experience through use of the nf-core "tools" python library. The directions below demonstrate using a minimal -Conda environment to install Nextflow and nf-core tools and then running the Autometa pipeline. +mamba environment to install Nextflow and nf-core tools and then running the Autometa pipeline. -.. _install-nextflow-nfcore-with-conda: +.. _install-nextflow-nfcore-with-mamba: -Installing Nextflow and nf-core tools with Conda +Installing Nextflow and nf-core tools with mamba ************************************************ -If you have not previously installed/used Conda, you can get it using the -Miniconda installer appropriate to your system, here: ``_ +If you have not previously installed/used mamba_, you can get it from Mambaforge_. -After installing conda, running the following command will create a minimal -Conda environment named "autometa-nf", and install Nextflow and nf-core tools. +After installing mamba, running the following command will create a minimal +mamba environment named "autometa-nf", and install Nextflow and nf-core tools. .. 
code-block:: bash - conda env create --file=https://raw.githubusercontent.com/KwanLab/Autometa/main/nextflow-env.yml + mamba env create --file=https://raw.githubusercontent.com/KwanLab/Autometa/main/nextflow-env.yml If you receive the message... .. code-block:: bash - CondaValueError: prefix already exists: + CondaValueError: prefix already exists: /home/user/mambaforge/envs/autometa-nf ...it means you have already created the environment. If you want to overwrite/update the environment then add the :code:`--force` flag to the end of the command. .. code-block:: bash - conda env create --file=https://raw.githubusercontent.com/KwanLab/Autometa/main/nextflow-env.yml --force + mamba env create --file=https://raw.githubusercontent.com/KwanLab/Autometa/main/nextflow-env.yml --force -Once Conda has finished creating the environment be sure to activate it: +Once mamba has finished creating the environment be sure to activate it: .. code-block:: bash - conda activate autometa-nf + mamba activate autometa-nf Using nf-core @@ -484,7 +483,7 @@ start the pipeline launch process. nf-core launch KwanLab/Autometa .. caution:: - + nf-core will give a list of revisions to use following the above command. Any of the version 1.* revisions are NOT supported. @@ -543,7 +542,7 @@ The other parameter is a nextflow argument, specified with :code:`-profile`. Thi are able to successfully configure these profiles, please get in touch or submit a pull request and we will add these configurations to the repository. 
- - :code:`conda`: Enables running all processes using `conda `_ + - :code:`mamba`: Enables running all processes using `mamba `_ - :code:`singularity`: Enables running all processes using `singularity `_ - :code:`podman`: Enables running all processes using `podman `_ - :code:`shifter`: Enables running all processes using `shifter `_ @@ -581,7 +580,7 @@ using the :code:`nextflow run ...` command by prepending the parameter name with You can run the ``KwanLab/Autometa`` project without using nf-core if you already have a correctly formatted parameters file. (like the one generated from ``nf-core launch ...``, i.e. ``nf-params.json``) - + .. code-block:: bash nextflow run KwanLab/Autometa -params-file nf-params.json -profile slurm -resume @@ -795,7 +794,7 @@ Visualizing the Workflow ------------------------ You can visualize the entire workflow ie. create the directed acyclic graph (DAG) of processes from the written DOT file. First install -`Graphviz `_ (``conda install -c anaconda graphviz``) then do ``dot -Tpng < pipeline_info/autometa-dot > autometa-dag.png`` to get the +`Graphviz `_ (``mamba install -c anaconda graphviz``) then do ``dot -Tpng < pipeline_info/autometa-dot > autometa-dag.png`` to get the in the ``png`` format. Configuring your process executor @@ -868,3 +867,5 @@ To use this tagged version (or any other Autometa image tag) add the argument `` .. _Trimmomatic: http://www.usadellab.org/cms/?page=trimmomatic .. _FastQC: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ .. _metaQuast: http://quast.sourceforge.net/metaquast +.. _Mambaforge: https://github.com/conda-forge/miniforge#mambaforge +.. _mamba: https://mamba.readthedocs.io/en/latest/ diff --git a/docs/source/step-by-step-tutorial.rst b/docs/source/step-by-step-tutorial.rst index 28728a970..1a2dc76e9 100644 --- a/docs/source/step-by-step-tutorial.rst +++ b/docs/source/step-by-step-tutorial.rst @@ -7,7 +7,7 @@ Here is the step by step tutorial of the entire pipeline. 
This is helpful in case you have your own files or just want to run a specific step. Before running anything make sure you have activated the conda environment using -``conda activate autometa``. +``mamba activate autometa``. See the :ref:`Autometa Package Installation` page for details on setting up your conda environment. diff --git a/tests/environment.yml b/tests/environment.yml index f140f6d70..13c29b39a 100644 --- a/tests/environment.yml +++ b/tests/environment.yml @@ -11,9 +11,7 @@ dependencies: - bowtie2 - diamond>=2.0 - gdown - - hdbscan - hmmer - - joblib==1.1.0 # See https://stackoverflow.com/a/73830525/12671809 - numba>=0.47 - numpy>=1.13 - pandas>=1.1 @@ -30,8 +28,8 @@ dependencies: - rsync - samtools>=1.11 - scikit-bio - - scipy==1.8.1 #force scipy 1.8 until scikit-bio updates to 1.9, https://github.com/KwanLab/Autometa/issues/285 - - scikit-learn==0.24 # prevent error from joblib in multiprocessing distance calculations + - scipy + - scikit-learn>=1.3 - sphinx - sphinx_rtd_theme - tqdm From c8f142c81519d5743470c352f75cf76c146bcbf2 Mon Sep 17 00:00:00 2001 From: shaneroesemann <59748289+shaneroesemann@users.noreply.github.com> Date: Thu, 24 Aug 2023 15:15:56 -0500 Subject: [PATCH 07/11] :bug::shell: Fix GTDB taxon-binning workflow (#339) Append underscore to contig id to prevent partial matches See also: https://github.com/KwanLab/Autometa/pull/329#issuecomment-1594996732 --- workflows/autometa-large-data-mode.sh | 6 +++--- workflows/autometa.sh | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/workflows/autometa-large-data-mode.sh b/workflows/autometa-large-data-mode.sh index a4d65d36a..46cd58117 100644 --- a/workflows/autometa-large-data-mode.sh +++ b/workflows/autometa-large-data-mode.sh @@ -241,10 +241,10 @@ then set -x grep ">" $kingdom_fasta | \ sed 's/^>//' | \ - sed 's/$/_/' | \ - cut -f1 -d" " > $orf_prefixes + cut -f1 -d" " | \ + sed 's/$/_/' > $orf_prefixes # Retrieve ORF IDs from contig IDs - grep -f $orf_prefixes 
$orfs | sed 's/^>//' | cut -f1 -d" " > $orf_ids + grep -f $orf_prefixes $orfs | cut -f1 -d" " | sed 's/^>//' > $orf_ids # Retrieve ORF seqs from ORF IDs seqkit grep \ --pattern-file $orf_ids \ diff --git a/workflows/autometa.sh b/workflows/autometa.sh index fda8862b1..deb1ad606 100644 --- a/workflows/autometa.sh +++ b/workflows/autometa.sh @@ -231,10 +231,10 @@ then set -x grep ">" $kingdom_fasta | \ sed 's/^>//' | \ - sed 's/$/_/' | \ - cut -f1 -d" " > $orf_prefixes + cut -f1 -d" " | \ + sed 's/$/_/' > $orf_prefixes # Retrieve ORF IDs from contig IDs - grep -f $orf_prefixes $orfs | sed 's/^>//' | cut -f1 -d" " > $orf_ids + grep -f $orf_prefixes $orfs | cut -f1 -d" " | sed 's/^>//' > $orf_ids # Retrieve ORF seqs from ORF IDs seqkit grep \ --pattern-file $orf_ids \ From 5239018a87fb852fed68b7f5ddf568269c4cf910 Mon Sep 17 00:00:00 2001 From: Evan Rees <25933122+WiscEvan@users.noreply.github.com> Date: Thu, 24 Aug 2023 16:45:00 -0400 Subject: [PATCH 08/11] Update documentation (#342) * :memo: Prefix step-by-step tutorial title with 'bash' * :memo: Rename step-by-step-tutorial.rst to bash-step-by-step-tutorial.rst --- docs/source/autometa-python-api.rst | 2 +- ...-tutorial.rst => bash-step-by-step-tutorial.rst} | 13 ++++++++----- docs/source/index.rst | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) rename docs/source/{step-by-step-tutorial.rst => bash-step-by-step-tutorial.rst} (99%) diff --git a/docs/source/autometa-python-api.rst b/docs/source/autometa-python-api.rst index 389808d76..80db23b42 100644 --- a/docs/source/autometa-python-api.rst +++ b/docs/source/autometa-python-api.rst @@ -16,7 +16,7 @@ I.e. ``python -m autometa.common.kmers -h`` Autometa has many *entrypoints* available that are utilized by the :ref:`autometa-nextflow-workflow` and :ref:`autometa-bash-workflow`. If you have installed autometa, all of these entrypoints will be available to you. 
- If you would like to get a better understanding of each entrypoint, we recommend reading the :ref:`step-by-step-tutorial` section. + If you would like to get a better understanding of each entrypoint, we recommend reading the :ref:`bash-step-by-step-tutorial` section. Using Autometa's Python API ########################### diff --git a/docs/source/step-by-step-tutorial.rst b/docs/source/bash-step-by-step-tutorial.rst similarity index 99% rename from docs/source/step-by-step-tutorial.rst rename to docs/source/bash-step-by-step-tutorial.rst index 1a2dc76e9..c2ed60999 100644 --- a/docs/source/step-by-step-tutorial.rst +++ b/docs/source/bash-step-by-step-tutorial.rst @@ -1,10 +1,13 @@ -.. _step-by-step-tutorial: +.. _bash-step-by-step-tutorial: -=========================== -📓 Step by Step Tutorial 📓 -=========================== +================================ +📓 Bash Step by Step Tutorial 📓 +================================ -Here is the step by step tutorial of the entire pipeline. This is helpful in case you have your own files or just want to run a specific step. +Here is the step by step tutorial on running the entire pipeline manually through Bash. +This is helpful in case you have your own files or just want to run a specific step. + +If you would like to set up a run of the whole pipeline through Bash, see the :ref:`Bash Workflow<🐚 Bash Workflow 🐚>` section. Before running anything make sure you have activated the conda environment using ``mamba activate autometa``. 
diff --git a/docs/source/index.rst b/docs/source/index.rst index 583fa2774..288a054b6 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -17,7 +17,7 @@ Guide getting-started nextflow-workflow bash-workflow - step-by-step-tutorial + bash-step-by-step-tutorial databases examining-results benchmarking From 98ae213ebd7ba81f5d5ea96a127c06be2bf7a968 Mon Sep 17 00:00:00 2001 From: Evan Rees <25933122+WiscEvan@users.noreply.github.com> Date: Thu, 24 Aug 2023 17:36:14 -0400 Subject: [PATCH 09/11] :snake::art::fire::bug: Fix UnboundLocalError bug (#325) * :snake::art::fire::bug: Remove unnecessary nesting * Fixes #324 UnboundLocalError resulted from trying to update `binning_checkpoints` dataframe when it was actually not available (occurs when `--cache` is *not* provided). Now variable is initialized accordingly to remove this error. --- autometa/binning/large_data_mode.py | 52 ++++++++++++++++------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/autometa/binning/large_data_mode.py b/autometa/binning/large_data_mode.py index d6dc34b3b..cac25d292 100644 --- a/autometa/binning/large_data_mode.py +++ b/autometa/binning/large_data_mode.py @@ -344,30 +344,34 @@ def cluster_by_taxon_partitioning( binning_checkpoints_fpath = os.path.join( cache, "binning_checkpoints.tsv.gz" ) - if binning_checkpoints_fpath: - if os.path.exists(binning_checkpoints_fpath) and os.path.getsize(binning_checkpoints_fpath): - checkpoint_info = get_checkpoint_info(binning_checkpoints_fpath) - binning_checkpoints = checkpoint_info["binning_checkpoints"] - starting_rank = checkpoint_info["starting_rank"] - starting_rank_name_txt = checkpoint_info["starting_rank_name_txt"] - # Update datastructures to begin at checkpoint stage. 
- ## Forward fill binning annotations to most recent checkpoint and drop any contigs without bin annotations - most_recent_binning_checkpoint = ( - binning_checkpoints.fillna(axis=1, method="ffill").iloc[:, -1].dropna() - ) - clustered_contigs = set( - most_recent_binning_checkpoint.index.unique().tolist() - ) - most_recent_clustered_df = most_recent_binning_checkpoint.to_frame().rename( - columns={starting_rank_name_txt: "cluster"} - ) - num_clusters = most_recent_clustered_df.cluster.nunique() - clusters.append(most_recent_clustered_df) - else: - logger.debug( - f"Binning checkpoints not found. Writing checkpoints to {binning_checkpoints_fpath}" - ) - binning_checkpoints = pd.DataFrame() + if ( + binning_checkpoints_fpath + and os.path.exists(binning_checkpoints_fpath) + and os.path.getsize(binning_checkpoints_fpath) + ): + checkpoint_info = get_checkpoint_info(binning_checkpoints_fpath) + binning_checkpoints = checkpoint_info["binning_checkpoints"] + starting_rank = checkpoint_info["starting_rank"] + starting_rank_name_txt = checkpoint_info["starting_rank_name_txt"] + # Update datastructures to begin at checkpoint stage. + ## Forward fill binning annotations to most recent checkpoint and drop any contigs without bin annotations + most_recent_binning_checkpoint = ( + binning_checkpoints.fillna(axis=1, method="ffill").iloc[:, -1].dropna() + ) + clustered_contigs = set(most_recent_binning_checkpoint.index.unique().tolist()) + most_recent_clustered_df = most_recent_binning_checkpoint.to_frame().rename( + columns={starting_rank_name_txt: "cluster"} + ) + num_clusters = most_recent_clustered_df.cluster.nunique() + clusters.append(most_recent_clustered_df) + else: + logger_message = ( + f"Binning checkpoints not found. Writing checkpoints to {binning_checkpoints_fpath}" + if binning_checkpoints_fpath + else "Binning checkpoints not found. Initializing..." 
+ ) + logger.debug(logger_message) + binning_checkpoints = pd.DataFrame() # Subset ranks by provided (or checkpointed) starting rank starting_rank_index = canonical_ranks.index(starting_rank) From bf5272988eb52a37566d75760f03ea22e8884de9 Mon Sep 17 00:00:00 2001 From: Evan Rees <25933122+WiscEvan@users.noreply.github.com> Date: Thu, 24 Aug 2023 17:41:19 -0400 Subject: [PATCH 10/11] :bug::art::snake: Fixes #305 (#343) :art: Add UNCLASSIFIED attribute to TaxonomyDatabase class :art::bug::snake: Add logic to include blastp unaligned contigs to unclassified fasta ---------- Co-authored-by: WiscEvan Co-authored-by: kaw97 --- autometa/taxonomy/database.py | 26 +++++++++++++++----------- autometa/taxonomy/vote.py | 13 ++++++++++++- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/autometa/taxonomy/database.py b/autometa/taxonomy/database.py index b3d1a1b56..b53b21177 100644 --- a/autometa/taxonomy/database.py +++ b/autometa/taxonomy/database.py @@ -31,7 +31,7 @@ class TaxonomyDatabase(ABC): class GTDB(TaxonomyDatabase): def __init__(self, ...): self.nodes = self.parse_nodes() - self.names = self.parse_names() + self.names = self.parse_names() self.merged = self.parse_merged() self.delnodes = self.parse_delnodes() ... 
@@ -59,6 +59,7 @@ def convert_accessions_to_taxids(self, accessions): Available attributes: CANONICAL_RANKS + UNCLASSIFIED """ CANONICAL_RANKS = [ @@ -71,6 +72,7 @@ def convert_accessions_to_taxids(self, accessions): "superkingdom", "root", ] + UNCLASSIFIED = "unclassified" @abstractmethod def parse_nodes(self) -> Dict[int, Dict[str, Union[str, int]]]: @@ -100,7 +102,7 @@ def parse_names(self) -> Dict[int, str]: Returns ------- str - Name of provided `taxid` if `taxid` is found in names.dmp else 'unclassified' + Name of provided `taxid` if `taxid` is found in names.dmp else TaxonomyDatabase.UNCLASSIFIED """ @@ -237,7 +239,7 @@ def name(self, taxid: int, rank: str = None) -> str: Returns ------- str - Name of provided `taxid` if `taxid` is found in names.dmp else 'unclassified' + Name of provided `taxid` if `taxid` is found in names.dmp else TaxonomyDatabase.UNCLASSIFIED """ try: @@ -246,19 +248,19 @@ def name(self, taxid: int, rank: str = None) -> str: logger.warning(err) taxid = 0 if not rank: - return self.names.get(taxid, "unclassified") + return self.names.get(taxid, TaxonomyDatabase.UNCLASSIFIED) if rank not in set(TaxonomyDatabase.CANONICAL_RANKS): logger.warning(f"{rank} not in canonical ranks!") - return "unclassified" + return TaxonomyDatabase.UNCLASSIFIED ancestor_taxid = taxid while ancestor_taxid != 1: ancestor_rank = self.rank(ancestor_taxid) if ancestor_rank == rank: - return self.names.get(ancestor_taxid, "unclassified") + return self.names.get(ancestor_taxid, TaxonomyDatabase.UNCLASSIFIED) ancestor_taxid = self.parent(ancestor_taxid) # At this point we have not encountered a name for the taxid rank # so we will place this as unclassified. 
- return "unclassified" + return TaxonomyDatabase.UNCLASSIFIED def rank(self, taxid: int) -> str: """ @@ -272,7 +274,7 @@ def rank(self, taxid: int) -> str: Returns ------- str - rank name if taxid is found in nodes else "unclassified" + rank name if taxid is found in nodes else TaxonomyDatabase.UNCLASSIFIED """ try: @@ -280,7 +282,9 @@ def rank(self, taxid: int) -> str: except DatabaseOutOfSyncError as err: logger.warning(err) taxid = 0 - return self.nodes.get(taxid, {"rank": "unclassified"}).get("rank") + return self.nodes.get(taxid, {"rank": TaxonomyDatabase.UNCLASSIFIED}).get( + "rank" + ) def parent(self, taxid: int) -> int: """ @@ -368,7 +372,7 @@ def get_lineage_dataframe( taxids : iterable `taxids` whose lineage dataframe is being returned fillna : bool, optional - Whether to fill the empty cells with 'unclassified' or not, default True + Whether to fill the empty cells with TaxonomyDatabase.UNCLASSIFIED or not, default True Returns ------- @@ -408,5 +412,5 @@ def get_lineage_dataframe( df = pd.DataFrame(ranked_taxids).transpose() df.index.name = "taxid" if fillna: - df.fillna(value="unclassified", inplace=True) + df.fillna(value=TaxonomyDatabase.UNCLASSIFIED, inplace=True) return df diff --git a/autometa/taxonomy/vote.py b/autometa/taxonomy/vote.py index aa6211f19..c6eeae2f2 100644 --- a/autometa/taxonomy/vote.py +++ b/autometa/taxonomy/vote.py @@ -272,6 +272,12 @@ def write_ranks( if not os.path.exists(outdir): os.makedirs(outdir) assembly_records = [record for record in SeqIO.parse(assembly, "fasta")] + # Include unaligned records in unclassified fasta + unaligned_contigs = set( + record.id + for record in SeqIO.parse(assembly, "fasta") + if record.id not in taxonomy.index + ) fpaths = [] for rank_name, dff in taxonomy.groupby(rank): # First determine the file path respective to the rank name @@ -282,7 +288,12 @@ def write_ranks( rank_name_fname = ".".join([rank_name.lower(), "fna"]) rank_name_fpath = 
os.path.join(outdir, rank_name_fname) # Now retrieve and write records respective to rank - records = [record for record in assembly_records if record.id in dff.index] + # include unaligned contigs if rank is unclassified + if rank_name == TaxonomyDatabase.UNCLASSIFIED: + contig_set = set(dff.index).union(unaligned_contigs) + else: + contig_set = dff.index + records = [record for record in assembly_records if record.id in contig_set] if not records: logger.warning(f"No records to write to {rank_name_fpath}") else: From 2363789fc2547bb4bd9555d867d19e8969f40d81 Mon Sep 17 00:00:00 2001 From: WiscEvan Date: Thu, 24 Aug 2023 16:45:52 -0500 Subject: [PATCH 11/11] bump version to 2.2.1 --- VERSION | 2 +- nextflow.config | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/VERSION b/VERSION index ccbccc3dc..c043eea77 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.2.0 +2.2.1 diff --git a/nextflow.config b/nextflow.config index f36dedc27..369fc01f1 100644 --- a/nextflow.config +++ b/nextflow.config @@ -12,7 +12,7 @@ manifest { doi = "https://doi.org/10.1093/nar/gkz148" mainScript = "main.nf" nextflowVersion = ">=21.04.0" - version = "2.2.0" + version = "2.2.1" }