Merge pull request #26 from leoisl/decontamination
Adding decontamination and a few other miscellaneous changes
leoisl authored May 16, 2022
2 parents c78ce58 + e7e0b0b commit aff3098
Showing 21 changed files with 1,741 additions and 89 deletions.
18 changes: 17 additions & 1 deletion .config.yaml
@@ -9,13 +9,29 @@ mykrobe:
rasusa:
params: "-c 150 -g 4411532 -s 88"

decom_DB:
url: "ftp://ftp-private.ebi.ac.uk/tbpore/0.1.0/decontamination_db/tbpore.remove_contam.fa.gz.map-ont.mmi.gz"

minimap2:
params: "-a -L --sam-hit-only --secondary=no -x map-ont"
map_to_decom_DB:
params: "-aL2 -x map-ont"
map:
params: "-a -L --sam-hit-only --secondary=no -x map-ont"

samtools:
sort_decom_DB:
params: ""
index_decom_DB:
params: ""
sort:
params: ""

filter_contamination:
params: "--verbose --ignore-secondary"

extract_decontaminated_nanopore_reads:
params: "grep"

bcftools:
mpileup:
params: "-x -I -Q 13 -a 'INFO/SCR,FORMAT/SP,INFO/ADR,INFO/ADF' -h100 -M10000"
1 change: 1 addition & 0 deletions .gitignore
@@ -147,3 +147,4 @@ tmp/
/sample_example/
/tbpore_out/
/tests/helpers/logs/
/pipelines/snakemake/.snakemake/
8 changes: 4 additions & 4 deletions README.md
@@ -31,15 +31,15 @@ cd tbpore
conda env create -f environment.yaml && conda activate tbpore # install dependencies
just install # install tbpore
just check # checks installation is fine
# if you want to run tbpore on an example isolate
# if you want to run tbpore on an example isolate (this requires downloading the decontamination DB index)
scripts/run_sample_example.sh
```

# Performance

Benchmarked on 109 TB Madagascar ONT samples with 1 thread:
* Runtime: `1361`s avg (`550`s SD), max `2294`s, min `102`s (s = seconds);
* RAM: `1266`MB avg (`394`MB SD), max `1914` MB, min `527` MB (MB = Megabytes);
Benchmarked on 91 TB Madagascar ONT samples with 1 thread:
* Runtime: `2255`s avg, `3805`s max (s = seconds);
* RAM: `12.4`GB avg, `13.1`GB max (GB = Gigabytes);

# Usage

Binary file added data/decontamination_db/remove_contam.tsv.gz
3 changes: 2 additions & 1 deletion environment.yaml
@@ -16,4 +16,5 @@ dependencies:
- rasusa
- samtools=1.13
- bcftools=1.13
- minimap2=2.24
- minimap2=2.22
- seqkit=2.0.0
182 changes: 182 additions & 0 deletions external_scripts/filter_contamination.py
@@ -0,0 +1,182 @@
import logging
from enum import Enum
from pathlib import Path
from typing import Set

import click
import pandas as pd
import pysam

IS_CONTAM_COL = "is_contaminant"


class RequiredIf(click.Option):
def __init__(self, *args, **kwargs):
self.required_if = kwargs.pop("required_if")
assert self.required_if, "'required_if' parameter required"
kwargs["help"] = (
kwargs.get("help", "")
+ " NOTE: This argument is mutually inclusive with %s" % self.required_if
).strip()
super(RequiredIf, self).__init__(*args, **kwargs)

def handle_parse_result(self, ctx, opts, args):
we_are_present = self.name in opts
other_present = self.required_if in opts

if not other_present:
if we_are_present:
raise click.UsageError(
"Illegal usage: `%s` is mutually inclusive with `%s`"
% (self.name, self.required_if)
)

return super(RequiredIf, self).handle_parse_result(ctx, opts, args)


class Classification(Enum):
Contaminant = "contaminant"
Unmapped = "unmapped"
Keep = "keep"
Other = "other"


class Classifier:
def __init__(self, metadata_file: str):
self.metadata = pd.read_table(
metadata_file,
header=None,
names=["organism", IS_CONTAM_COL, "accession"],
index_col="accession",
dtype={IS_CONTAM_COL: "bool"},
)

def classify(self, record: pysam.AlignedSegment) -> Classification:
if record.is_unmapped:
return Classification.Unmapped

ref_id = record.reference_name
is_contam: bool = self.metadata.at[ref_id, IS_CONTAM_COL]
if is_contam:
return Classification.Contaminant

return Classification.Keep


@click.command()
@click.help_option("--help", "-h")
@click.option(
"-i",
"--samfile",
help="{B,CR,S}AM file of reads mapped to decontamination database.",
type=click.Path(exists=True, dir_okay=False),
required=True,
)
@click.option(
"-m",
"--metadata",
help=(
"TSV file containing information about each reference in the database. Column "
"1 is the category, column 2 is whether the reference is contamination, column "
"3 is the accession ID for the reference."
),
type=click.Path(exists=True, dir_okay=False),
required=True,
)
@click.option(
"-o",
"--outdir",
type=click.Path(dir_okay=True, writable=True),
help=(
"Directory to write the output files to. The files written will be named "
"unmapped.reads, contaminant.reads, and keep.reads"
),
default=".",
show_default=True,
)
@click.option(
"--ignore-secondary/--include-secondary",
help="Ignore organism assignments for secondary alignments?",
default=True,
show_default=True,
)
@click.option("-v", "--verbose", help="Turns on debug-level logging.", is_flag=True)
def main(
samfile: str,
metadata: str,
outdir: str,
ignore_secondary: bool,
verbose: bool,
):
"""This scripts classifies records in an alignment to a contamination database, with
the help of a metadata file mapping reference names to whether they are
contamination or not. It produces three files with a read identifier for each line:
- unmapped reads
- contaminated reads
- reads to keep
"""
log_level = logging.DEBUG if verbose else logging.INFO
logging.basicConfig(
format="%(asctime)s [%(levelname)s]: %(message)s", level=log_level
)
outdir = Path(outdir)
outdir.mkdir(exist_ok=True)

classifier = Classifier(metadata)
all_read_ids: Set[str] = set()
unmapped_reads: Set[str] = set()
contaminant_reads: Set[str] = set()
keep_reads: Set[str] = set()

logging.info("Classifying records in alignment file...")
with pysam.AlignmentFile(samfile) as bam:
for record in bam:
read_id = record.query_name
if read_id is None:
logging.warning(f"Got a record with no query name\n{str(record)}")
continue

all_read_ids.add(read_id)
if record.is_secondary and ignore_secondary:
logging.debug(f"{read_id} has secondary alignment. Skipping...")
continue

classification = classifier.classify(record)
if classification is Classification.Unmapped:
unmapped_reads.add(read_id)
elif classification is Classification.Keep:
keep_reads.add(read_id)
elif classification is Classification.Contaminant:
contaminant_reads.add(read_id)
else:
raise NotImplementedError(
f"Don't know how to handle classification: {classification}"
)

# if any alignment of a read is a "keeper", remove the read from contaminants
contaminant_reads -= keep_reads
# a read is only unmapped if none of its alignments mapped
unmapped_reads -= keep_reads.union(contaminant_reads)

assert all_read_ids == keep_reads.union(unmapped_reads, contaminant_reads)

logging.info(f"{len(keep_reads)} reads are to be kept")
logging.info(f"{len(contaminant_reads)} reads are contaminants")
logging.info(f"{len(unmapped_reads)} reads are unmapped")
logging.info("Writing output files...")

keep_file = outdir / "keep.reads"
keep_file.write_text("\n".join(keep_reads))
logging.info(f"Read identifiers to keep written to {keep_file}")

contaminant_file = outdir / "contaminant.reads"
contaminant_file.write_text("\n".join(contaminant_reads))
logging.info(f"Contaminant read identifiers written to {contaminant_file}")

unmapped_file = outdir / "unmapped.reads"
unmapped_file.write_text("\n".join(unmapped_reads))
logging.info(f"Unmapped read identifiers written to {unmapped_file}")


if __name__ == "__main__":
main()
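
As a toy illustration of the lookup that classify() performs, the snippet below builds the same pandas table from a two-row in-memory metadata TSV; the organisms and the contaminant accession are invented, and NC_000962.3 (the M. tuberculosis H37Rv accession) stands in as the reference to keep.

```python
# Toy, self-contained illustration of the metadata lookup behind Classifier.classify.
# The rows are invented for the example: "NC_fake_contam_1" is made up.
import io

import pandas as pd

IS_CONTAM_COL = "is_contaminant"

metadata = pd.read_table(
    io.StringIO("NTM\t1\tNC_fake_contam_1\nTB\t0\tNC_000962.3\n"),
    header=None,
    names=["organism", IS_CONTAM_COL, "accession"],
    index_col="accession",
    dtype={IS_CONTAM_COL: "bool"},
)

# A mapped record whose reference_name is NC_fake_contam_1 is classified as
# Contaminant, one hitting NC_000962.3 as Keep; unmapped records are Unmapped
# before the lookup is ever consulted.
print(metadata.at["NC_fake_contam_1", IS_CONTAM_COL])  # True
print(metadata.at["NC_000962.3", IS_CONTAM_COL])       # False
```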
4 changes: 2 additions & 2 deletions justfile
@@ -16,7 +16,7 @@ check-fmt:

# lint code with flake8
lint:
poetry run flake8 . --extend-exclude=".venv/"
poetry run flake8 . --extend-exclude=".venv/,pipelines/snakemake"

# install latest version with poetry
install:
@@ -25,7 +25,7 @@ install:

# run all tests
test opts="":
poetry run pytest {{opts}} tests/
poetry run pytest -vv {{opts}} tests/

# run tests with coverage report
coverage:
77 changes: 77 additions & 0 deletions pipelines/snakemake/Snakefile
@@ -0,0 +1,77 @@
import glob

configfile: "config.yaml"

rule all:
input:
f"{config['output_dir']}/consensus_comparison.out",
f"{config['output_dir']}/bcfs_comparison.out",
f"{config['output_dir']}/mykrobe_comparison.out",
f"{config['output_dir']}/mykrobe_comparison.simple.out",

def get_input(wildcards):
reads = glob.glob(f"{config['base_reads_dir']}/{wildcards.sample}.nanopore.fq.gz")
assert len(reads) == 1
return reads

rule run_tbpore:
input:
get_input
output:
directory(f"{config['output_dir']}/{{sample}}")
threads: 1
resources:
mem_mb=lambda wildcards, attempt: 20000 * attempt
log:
"logs/run_tbpore_{sample}.log"
shell:
"tbpore --no-cleanup -o {output} {input}"


rule compare_h2h_and_tbpore_consensus:
input:
all_tbpore_outputs = expand(f"{config['output_dir']}/{{sample}}", sample=config['samples'])
output:
consensus_comparison = f"{config['output_dir']}/consensus_comparison.out"
params:
tbpore_output = config['output_dir'],
h2h_consensus_glob_pattern = config['h2h_consensus_glob_pattern'],
threads: 1
resources:
mem_mb=lambda wildcards, attempt: 1000 * attempt
log:
"logs/compare_h2h_and_tbpore_consensus.log"
script: "scripts/compare_H2H_and_tbpore_consensus.py"


rule compare_h2h_and_tbpore_bcfs:
input:
all_tbpore_outputs = expand(f"{config['output_dir']}/{{sample}}", sample=config['samples'])
output:
bcfs_comparison = f"{config['output_dir']}/bcfs_comparison.out"
params:
tbpore_output = config['output_dir'],
h2h_bcf_glob_pattern = config['h2h_bcf_glob_pattern'],
threads: 1
resources:
mem_mb=lambda wildcards, attempt: 1000 * attempt
log:
"logs/compare_h2h_and_tbpore_bcfs.log"
script: "scripts/compare_H2H_and_tbpore_bcfs.py"


rule compare_h2h_and_tbpore_mykrobe:
input:
all_tbpore_outputs = expand(f"{config['output_dir']}/{{sample}}", sample=config['samples'])
output:
mykrobe_comparison = f"{config['output_dir']}/mykrobe_comparison.out",
mykrobe_comparison_simple = f"{config['output_dir']}/mykrobe_comparison.simple.out"
params:
tbpore_output = config['output_dir'],
h2h_mykrobe_glob_pattern = config['h2h_mykrobe_glob_pattern'],
threads: 1
resources:
mem_mb=lambda wildcards, attempt: 1000 * attempt
log:
"logs/compare_h2h_and_tbpore_mykrobe.log"
script: "scripts/compare_H2H_and_tbpore_mykrobe.py"
6 changes: 6 additions & 0 deletions pipelines/snakemake/config.yaml
@@ -0,0 +1,6 @@
base_reads_dir: "/hps/nobackup/iqbal/mbhall/tech_wars/data/madagascar/nanopore/raw_data/*/guppy_v5.0.16"
samples: ["mada_1-19", "mada_1-8", "mada_1-6", "mada_103", "mada_1-1", "mada_130", "mada_132", "mada_128", "mada_2-1", "mada_1-51", "mada_1-20", "mada_2-31", "mada_109", "mada_112", "mada_1-5", "mada_107", "mada_1-40", "mada_111", "mada_1-14", "mada_1-46", "mada_136", "mada_1-54", "mada_1-25", "mada_118", "mada_129", "mada_1-18", "mada_151", "mada_134", "mada_1-3", "mada_1-44", "mada_1-15", "mada_1-47", "mada_2-53", "mada_1-16", "mada_1-2", "mada_154", "mada_104", "mada_115", "mada_126", "mada_1-33", "mada_102", "mada_127", "mada_125", "mada_1-43", "mada_137", "mada_117", "mada_131", "mada_2-46", "mada_1-17", "mada_124", "mada_121", "mada_141", "mada_2-42", "mada_1-28", "mada_152", "mada_1-48", "mada_2-34", "mada_123", "mada_106", "mada_140", "mada_1-30", "mada_1-50", "mada_139", "mada_1-41", "mada_2-25", "mada_105", "mada_144", "mada_1-32", "mada_1-22", "mada_1-11", "mada_1-10", "mada_110", "mada_120", "mada_113", "mada_1-38", "mada_122", "mada_1-7", "mada_116", "mada_142", "mada_133", "mada_148", "mada_1-13", "mada_1-53", "mada_135", "mada_1-12", "mada_2-50", "mada_1-21", "mada_1-39", "mada_1-36", "mada_143", "mada_150"]
h2h_consensus_glob_pattern: "/hps/nobackup/iqbal/mbhall/tech_wars/analysis/baseline_variants/consensus/bcftools/madagascar/*.consensus.fa"
h2h_mykrobe_glob_pattern: "/hps/nobackup/iqbal/mbhall/tech_wars/analysis/resistance_prediction/results/mykrobe/predict/nanopore/madagascar/*/*.mykrobe.json"
h2h_bcf_glob_pattern: "/hps/nobackup/iqbal/mbhall/tech_wars/analysis/baseline_variants/nanopore/filtered_snps/madagascar/*.snps.filtered.bcf"
output_dir: "output"