feat: added vsearch wrapper (#1149)

### Description  Added wrapper for VSEARCH. ### QC  * [x] I confirm that: For all wrappers added by this PR, * there is a test case which covers any introduced changes, * `input:` and `output:` file paths in the resulting rule can be changed arbitrarily, * either the wrapper can only use a single core, or the example rule contains a `threads: x` statement with `x` being a reasonable default, * rule names in the test case are in [snake_case](https://en.wikipedia.org/wiki/Snake_case) and somehow tell what the rule is about or match the tools purpose or name (e.g., `map_reads` for a step that maps reads), * all `environment.yaml` specifications follow [the respective best practices](https://stackoverflow.com/a/64594513/2352071), * wherever possible, command line arguments are inferred and set automatically (e.g. based on file extensions in `input:` or `output:`), * all fields of the example rules in the `Snakefile`s and their entries are explained via comments (`input:`/`output:`/`params:` etc.), * `stderr` and/or `stdout` are logged correctly (`log:`), depending on the wrapped tool, * temporary files are either written to a unique hidden folder in the working directory, or (better) stored where the Python function `tempfile.gettempdir()` points to (see [here](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir); this also means that using any Python `tempfile` default behavior works), * the `meta.yaml` contains a link to the documentation of the respective tool or command, * `Snakefile`s pass the linting (`snakemake --lint`), * `Snakefile`s are formatted with [snakefmt](https://github.com/snakemake/snakefmt), * Python wrapper scripts are formatted with [black](https://black.readthedocs.io). * Conda environments use a minimal amount of channels, in recommended ordering. E.g. 
for bioconda, use conda-forge, bioconda, nodefaults (conda-forge should have highest priority, and the defaults channel is usually not needed because most packages are in conda-forge nowadays).
snakemake · Mar 31, 2023 · 79342b7 · 79342b7
1 parent 59be498
commit 79342b7
Show file tree

Hide file tree

Showing 7 changed files with 182 additions and 0 deletions.
diff --git a/bio/vsearch/environment.yaml b/bio/vsearch/environment.yaml
@@ -0,0 +1,8 @@
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - vsearch =2.22.1
+  - gzip  # the wrapper pipes ".gz" outputs through gzip
+  - bzip2  # the wrapper pipes ".bz2" outputs through bzip2
diff --git a/bio/vsearch/meta.yaml b/bio/vsearch/meta.yaml
@@ -0,0 +1,14 @@
+name: VSEARCH
+url: https://github.com/torognes/vsearch
+description: |
+  Versatile open-source tool for microbiome analysis.
+authors:
+  - Filipe G. Vieira
+input:
+  - input file(s)
+output:
+  - output file(s)
+params:
+  - extra: additional program arguments
+notes: |
+  * Keys for `input` and `output` files need to match `vsearch` arguments: for input, e.g. `uchime_denovo`, `cluster_fast`, `fastx_uniques`, `maskfasta`, `fastq_convert`, or `fastq_mergepairs`; for output, e.g. `chimeras`, `fastaout`, `fastqout`, or `output`.
diff --git a/bio/vsearch/test/Snakefile b/bio/vsearch/test/Snakefile
@@ -0,0 +1,82 @@
+rule vsearch_cluster_fast:
+    input:
+        cluster_fast="reads/{sample}.fasta",  # key is passed to vsearch as --cluster_fast
+    output:
+        profile="out/cluster_fast/{sample}.profile",  # key is passed to vsearch as --profile
+    log:
+        "logs/vsearch/cluster_fast/{sample}.log",  # handed to vsearch via --log
+    params:
+        extra="--id 0.2 --sizeout --minseqlength 5",  # extra arguments, inserted verbatim
+    threads: 1  # passed to vsearch as --threads
+    wrapper:
+        "master/bio/vsearch"
+
+
+rule vsearch_maskfasta:
+    input:
+        maskfasta="reads/{sample}.fasta",  # key is passed to vsearch as --maskfasta
+    output:
+        output="out/maskfasta/{sample}.fasta",  # key is passed to vsearch as --output
+    log:
+        "logs/vsearch/maskfasta/{sample}.log",  # handed to vsearch via --log
+    params:
+        extra="--hardmask",  # extra arguments, inserted verbatim
+    threads: 1  # passed to vsearch as --threads
+    wrapper:
+        "master/bio/vsearch"
+
+
+rule vsearch_fastx_uniques:
+    input:
+        fastx_uniques="reads/{sample}.fastq",  # key is passed to vsearch as --fastx_uniques
+    output:
+        fastqout="out/fastx_uniques/{sample}.fastq",  # key is passed to vsearch as --fastqout
+    log:
+        "logs/vsearch/fastx_uniques/{sample}.log",  # handed to vsearch via --log
+    params:
+        extra="--strand both --minseqlength 5",  # extra arguments, inserted verbatim
+    threads: 2  # passed to vsearch as --threads
+    wrapper:
+        "master/bio/vsearch"
+
+
+rule vsearch_fastx_uniques_gzip:
+    input:
+        fastx_uniques="reads/{sample}.fastq",  # key is passed to vsearch as --fastx_uniques
+    output:
+        fastqout="out/fastx_uniques/{sample}.fastq.gz",  # ".gz" suffix: wrapper pipes it through gzip
+    log:
+        "logs/vsearch/fastx_uniques_gzip/{sample}.log",  # rule-specific path; handed to vsearch via --log
+    params:
+        extra="--strand both --minseqlength 5",  # extra arguments, inserted verbatim
+    threads: 2  # passed to vsearch as --threads
+    wrapper:
+        "master/bio/vsearch"
+
+
+rule vsearch_fastx_uniques_bzip2:
+    input:
+        fastx_uniques="reads/{sample}.fastq",  # key is passed to vsearch as --fastx_uniques
+    output:
+        fastqout="out/fastx_uniques/{sample}.fastq.bz2",  # ".bz2" suffix: wrapper pipes it through bzip2
+    log:
+        "logs/vsearch/fastx_uniques_bzip2/{sample}.log",  # rule-specific path; handed to vsearch via --log
+    params:
+        extra="--strand both --minseqlength 5",  # extra arguments, inserted verbatim
+    threads: 2  # passed to vsearch as --threads
+    wrapper:
+        "master/bio/vsearch"
+
+
+rule vsearch_fastq_convert:
+    input:
+        fastq_convert="reads/{sample}.fastq",  # key is passed to vsearch as --fastq_convert
+    output:
+        fastqout="out/fastq_convert/{sample}.fastq",  # key is passed to vsearch as --fastqout
+    log:
+        "logs/vsearch/fastq_convert/{sample}.log",  # handed to vsearch via --log
+    params:
+        extra="--fastq_ascii 33 --fastq_asciiout 64",  # extra arguments, inserted verbatim
+    threads: 2  # passed to vsearch as --threads
+    wrapper:
+        "master/bio/vsearch"
diff --git a/bio/vsearch/test/reads/a.fasta b/bio/vsearch/test/reads/a.fasta
@@ -0,0 +1,8 @@
+>1
+ACGGCAT
+>2
+ATGGCAT
+>1
+CGGCAT
+>3
+ATGGCA
diff --git a/bio/vsearch/test/reads/a.fastq b/bio/vsearch/test/reads/a.fastq
@@ -0,0 +1,16 @@
+@1
+ACGGCAT
++
+!!!!!!!
+@2
+ATGGCAT
++
+!!!!!!!
+@1
+NCGGCAT
++
+!!!!!!!
+@3
+ATGGCAT
++
+!!!!!!!
diff --git a/bio/vsearch/wrapper.py b/bio/vsearch/wrapper.py
@@ -0,0 +1,36 @@
+__author__ = "Filipe G. Vieira"
+__copyright__ = "Copyright 2021, Filipe G. Vieira"
+__license__ = "MIT"
+
+from snakemake.shell import shell
+
+
+# Optional user-supplied arguments, inserted into the vsearch command verbatim.
+extra = snakemake.params.get("extra", "")
+# Define `log` unconditionally: shell() cannot format "{log}" if the name is unset.
+log = f"--log {snakemake.log}" if snakemake.log else ""
+
+# Each named input becomes "--<key> <path>", so keys must be valid vsearch options.
+inputs = " ".join(f"--{key} {value}" for key, value in snakemake.input.items())
+
+# Plain outputs are passed as-is; a ".gz"/".bz2" output is streamed through stdout
+# into the matching compressor.
+plain, piped = [], []
+for key, value in snakemake.output.items():
+    if value.endswith(".gz"):
+        piped.append(f"--{key} /dev/stdout | gzip > {value}")
+    elif value.endswith(".bz2"):
+        piped.append(f"--{key} /dev/stdout | bzip2 > {value}")
+    else:
+        plain.append(f"--{key} {value}")
+
+# Only one stream fits on stdout, so at most one output may be compressed.
+# Raise instead of assert: asserts are stripped when Python runs with -O.
+if len(piped) > 1:
+    raise ValueError("only one output can be compressed")
+
+# The pipe must end the command line, so the compressed output (if any) goes last.
+output = " ".join(plain + piped)
+
+
+shell("vsearch --threads {snakemake.threads} {inputs} {extra} {log} {output}")
diff --git a/test.py b/test.py
@@ -153,6 +153,24 @@ def test_indelqual():
         ]
     )
 
+@skip_if_not_modified
+def test_vsearch():
+    run(
+        "bio/vsearch",
+        [
+            "snakemake",
+            "--cores",
+            "1",
+            "--use-conda",
+            "out/cluster_fast/a.profile",  # target of rule vsearch_cluster_fast
+            "out/maskfasta/a.fasta",  # target of rule vsearch_maskfasta
+            "out/fastx_uniques/a.fastq",  # target of rule vsearch_fastx_uniques
+            "out/fastx_uniques/a.fastq.gz",  # target of rule vsearch_fastx_uniques_gzip
+            "out/fastx_uniques/a.fastq.bz2",  # target of rule vsearch_fastx_uniques_bzip2
+            "out/fastq_convert/a.fastq",  # target of rule vsearch_fastq_convert
+        ],
+    )
+
 @skip_if_not_modified
 def test_loglog():
     run(
-Original file line number
+Diff line change
@@ -0,0 +1,8 @@
+    >1
+    ACGGCAT
+    >2
+    ATGGCAT
+    >1
+    CGGCAT
+    >3
+    ATGGCA