Merge pull request #26 from leoisl/decontamination
Adding decontamination and a few other miscellaneous changes
leoisl authored May 16, 2022
2 parents c78ce58 + e7e0b0b commit aff3098
Showing 21 changed files with 1,741 additions and 89 deletions.
18 changes: 17 additions & 1 deletion .config.yaml
@@ -9,13 +9,29 @@ mykrobe:
rasusa:
params: "-c 150 -g 4411532 -s 88"

decom_DB:
url: "ftp://ftp-private.ebi.ac.uk/tbpore/0.1.0/decontamination_db/tbpore.remove_contam.fa.gz.map-ont.mmi.gz"

minimap2:
params: "-a -L --sam-hit-only --secondary=no -x map-ont"
map_to_decom_DB:
params: "-aL2 -x map-ont"
map:
params: "-a -L --sam-hit-only --secondary=no -x map-ont"

samtools:
sort_decom_DB:
params: ""
index_decom_DB:
params: ""
sort:
params: ""

filter_contamination:
params: "--verbose --ignore-secondary"

extract_decontaminated_nanopore_reads:
params: "grep"

bcftools:
mpileup:
params: "-x -I -Q 13 -a 'INFO/SCR,FORMAT/SP,INFO/ADR,INFO/ADF' -h100 -M10000"
1 change: 1 addition & 0 deletions .gitignore
@@ -147,3 +147,4 @@ tmp/
/sample_example/
/tbpore_out/
/tests/helpers/logs/
/pipelines/snakemake/.snakemake/
8 changes: 4 additions & 4 deletions README.md
@@ -31,15 +31,15 @@ cd tbpore
conda env create -f environment.yaml && conda activate tbpore # install dependencies
just install # install tbpore
just check # checks installation is fine
# if you want to run tbpore on an example isolate
# if you want to run tbpore on an example isolate (this requires downloading the decontamination DB index)
scripts/run_sample_example.sh
```

# Performance

Benchmarked on 109 TB Madagascar ONT samples with 1 thread:
* Runtime: `1361`s avg (`550`s SD), max `2294`s, min `102`s (s = seconds);
* RAM: `1266`MB avg (`394`MB SD), max `1914` MB, min `527` MB (MB = Megabytes);
Benchmarked on 91 TB Madagascar ONT samples with 1 thread:
* Runtime: `2255`s avg, `3805`s max (s = seconds);
* RAM: `12.4`GB avg, `13.1`GB max (GB = Gigabytes);

# Usage

Binary file added data/decontamination_db/remove_contam.tsv.gz
3 changes: 2 additions & 1 deletion environment.yaml
@@ -16,4 +16,5 @@ dependencies:
- rasusa
- samtools=1.13
- bcftools=1.13
- minimap2=2.24
- minimap2=2.22
- seqkit=2.0.0
182 changes: 182 additions & 0 deletions external_scripts/filter_contamination.py
@@ -0,0 +1,182 @@
import logging
from enum import Enum
from pathlib import Path
from typing import Set

import click
import pandas as pd
import pysam

IS_CONTAM_COL = "is_contaminant"


class RequiredIf(click.Option):
def __init__(self, *args, **kwargs):
self.required_if = kwargs.pop("required_if")
assert self.required_if, "'required_if' parameter required"
kwargs["help"] = (
kwargs.get("help", "")
+ " NOTE: This argument is mutually inclusive with %s" % self.required_if
).strip()
super(RequiredIf, self).__init__(*args, **kwargs)

def handle_parse_result(self, ctx, opts, args):
we_are_present = self.name in opts
other_present = self.required_if in opts

if not other_present:
if we_are_present:
raise click.UsageError(
"Illegal usage: `%s` is mutually inclusive with `%s`"
% (self.name, self.required_if)
)

return super(RequiredIf, self).handle_parse_result(ctx, opts, args)


class Classification(Enum):
Contaminant = "contaminant"
Unmapped = "unmapped"
Keep = "keep"
Other = "other"


class Classifier:
def __init__(self, metadata_file: str):
self.metadata = pd.read_table(
metadata_file,
header=None,
names=["organism", IS_CONTAM_COL, "accession"],
index_col="accession",
dtype={IS_CONTAM_COL: "bool"},
)

def classify(self, record: pysam.AlignedSegment) -> Classification:
if record.is_unmapped:
return Classification.Unmapped

ref_id = record.reference_name
is_contam: bool = self.metadata.at[ref_id, IS_CONTAM_COL]
if is_contam:
return Classification.Contaminant

return Classification.Keep


@click.command()
@click.help_option("--help", "-h")
@click.option(
"-i",
"--samfile",
help="{B,CR,S}AM file of reads mapped to decontamination database.",
type=click.Path(exists=True, dir_okay=False),
required=True,
)
@click.option(
"-m",
"--metadata",
help=(
"TSV file containing information about each reference in the database. Column "
"1 is the category, column 2 is whether the reference is contamination, column "
"3 is the accession ID for the reference."
),
type=click.Path(exists=True, dir_okay=False),
required=True,
)
@click.option(
"-o",
"--outdir",
type=click.Path(dir_okay=True, writable=True),
help=(
"Directory to write the output files to. The files written will be named "
"unmapped.reads, contaminant.reads, and keep.reads"
),
default=".",
show_default=True,
)
@click.option(
"--ignore-secondary/--include-secondary",
help="Ignore organism assignments for secondary alignments?",
default=True,
show_default=True,
)
@click.option("-v", "--verbose", help="Turns on debug-level logging.", is_flag=True)
def main(
samfile: str,
metadata: str,
outdir: str,
ignore_secondary: bool,
verbose: bool,
):
"""This scripts classifies records in an alignment to a contamination database, with
the help of a metadata file mapping reference names to whether they are
contamination or not. It produces three files with a read identifier for each line:
- unmapped reads
- contaminated reads
- reads to keep
"""
log_level = logging.DEBUG if verbose else logging.INFO
logging.basicConfig(
format="%(asctime)s [%(levelname)s]: %(message)s", level=log_level
)
outdir = Path(outdir)
outdir.mkdir(exist_ok=True)

classifier = Classifier(metadata)
all_read_ids: Set[str] = set()
unmapped_reads: Set[str] = set()
contaminant_reads: Set[str] = set()
keep_reads: Set[str] = set()

logging.info("Classifying records in alignment file...")
with pysam.AlignmentFile(samfile) as bam:
for record in bam:
read_id = record.query_name
if read_id is None:
logging.warning(f"Got a record with no query name\n{str(record)}")
continue

all_read_ids.add(read_id)
if record.is_secondary and ignore_secondary:
logging.debug(f"{read_id} has secondary alignment. Skipping...")
continue

classification = classifier.classify(record)
if classification is Classification.Unmapped:
unmapped_reads.add(read_id)
elif classification is Classification.Keep:
keep_reads.add(read_id)
elif classification is Classification.Contaminant:
contaminant_reads.add(read_id)
else:
raise NotImplementedError(
f"Don't know how to handle classification: {classification}"
)

# if any alignment of a read is a "keeper", remove the read from contaminants
contaminant_reads -= keep_reads
# a read is only unmapped if none of its alignments mapped
unmapped_reads -= keep_reads.union(contaminant_reads)

assert all_read_ids == keep_reads.union(unmapped_reads, contaminant_reads)

logging.info(f"{len(keep_reads)} reads are to be kept")
logging.info(f"{len(contaminant_reads)} reads are contaminants")
logging.info(f"{len(unmapped_reads)} reads are unmapped")
logging.info("Writing output files...")

keep_file = outdir / "keep.reads"
keep_file.write_text("\n".join(keep_reads))
logging.info(f"Read identifiers to keep written to {keep_file}")

contaminant_file = outdir / "contaminant.reads"
contaminant_file.write_text("\n".join(contaminant_reads))
logging.info(f"Contaminant read identifiers written to {contaminant_file}")

unmapped_file = outdir / "unmapped.reads"
unmapped_file.write_text("\n".join(unmapped_reads))
logging.info(f"Unmapped read identifiers written to {unmapped_file}")


if __name__ == "__main__":
main()
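
As a toy illustration of the lookup that classify() performs, the snippet below builds the same pandas table from a two-row in-memory metadata TSV; the organisms and the contaminant accession are invented, and NC_000962.3 (the M. tuberculosis H37Rv accession) stands in as the reference to keep.

```python
# Toy, self-contained illustration of the metadata lookup behind Classifier.classify.
# The rows are invented for the example: "NC_fake_contam_1" is made up.
import io

import pandas as pd

IS_CONTAM_COL = "is_contaminant"

metadata = pd.read_table(
    io.StringIO("NTM\t1\tNC_fake_contam_1\nTB\t0\tNC_000962.3\n"),
    header=None,
    names=["organism", IS_CONTAM_COL, "accession"],
    index_col="accession",
    dtype={IS_CONTAM_COL: "bool"},
)

# A mapped record whose reference_name is NC_fake_contam_1 is classified as
# Contaminant, one hitting NC_000962.3 as Keep; unmapped records are Unmapped
# before the lookup is ever consulted.
print(metadata.at["NC_fake_contam_1", IS_CONTAM_COL])  # True
print(metadata.at["NC_000962.3", IS_CONTAM_COL])       # False
```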
4 changes: 2 additions & 2 deletions justfile
@@ -16,7 +16,7 @@ check-fmt:

# lint code with flake8
lint:
poetry run flake8 . --extend-exclude=".venv/"
poetry run flake8 . --extend-exclude=".venv/,pipelines/snakemake"

# install latest version with poetry
install:
@@ -25,7 +25,7 @@ install:

# run all tests
test opts="":
poetry run pytest {{opts}} tests/
poetry run pytest -vv {{opts}} tests/

# run tests with coverage report
coverage:
77 changes: 77 additions & 0 deletions pipelines/snakemake/Snakefile
@@ -0,0 +1,77 @@
import glob

configfile: "config.yaml"

rule all:
input:
f"{config['output_dir']}/consensus_comparison.out",
f"{config['output_dir']}/bcfs_comparison.out",
f"{config['output_dir']}/mykrobe_comparison.out",
f"{config['output_dir']}/mykrobe_comparison.simple.out",

def get_input(wildcards):
reads = glob.glob(f"{config['base_reads_dir']}/{wildcards.sample}.nanopore.fq.gz")
assert len(reads) == 1
return reads

rule run_tbpore:
input:
get_input
output:
directory(f"{config['output_dir']}/{{sample}}")
threads: 1
resources:
mem_mb=lambda wildcards, attempt: 20000 * attempt
log:
"logs/run_tbpore_{sample}.log"
shell:
"tbpore --no-cleanup -o {output} {input}"


rule compare_h2h_and_tbpore_consensus:
input:
all_tbpore_outputs = expand(f"{config['output_dir']}/{{sample}}", sample=config['samples'])
output:
consensus_comparison = f"{config['output_dir']}/consensus_comparison.out"
params:
tbpore_output = config['output_dir'],
h2h_consensus_glob_pattern = config['h2h_consensus_glob_pattern'],
threads: 1
resources:
mem_mb=lambda wildcards, attempt: 1000 * attempt
log:
"logs/compare_h2h_and_tbpore_consensus.log"
script: "scripts/compare_H2H_and_tbpore_consensus.py"


rule compare_h2h_and_tbpore_bcfs:
input:
all_tbpore_outputs = expand(f"{config['output_dir']}/{{sample}}", sample=config['samples'])
output:
bcfs_comparison = f"{config['output_dir']}/bcfs_comparison.out"
params:
tbpore_output = config['output_dir'],
h2h_bcf_glob_pattern = config['h2h_bcf_glob_pattern'],
threads: 1
resources:
mem_mb=lambda wildcards, attempt: 1000 * attempt
log:
"logs/compare_h2h_and_tbpore_bcfs.log"
script: "scripts/compare_H2H_and_tbpore_bcfs.py"


rule compare_h2h_and_tbpore_mykrobe:
input:
all_tbpore_outputs = expand(f"{config['output_dir']}/{{sample}}", sample=config['samples'])
output:
mykrobe_comparison = f"{config['output_dir']}/mykrobe_comparison.out",
mykrobe_comparison_simple = f"{config['output_dir']}/mykrobe_comparison.simple.out"
params:
tbpore_output = config['output_dir'],
h2h_mykrobe_glob_pattern = config['h2h_mykrobe_glob_pattern'],
threads: 1
resources:
mem_mb=lambda wildcards, attempt: 1000 * attempt
log:
"logs/compare_h2h_and_tbpore_mykrobe.log"
script: "scripts/compare_H2H_and_tbpore_mykrobe.py"
6 changes: 6 additions & 0 deletions pipelines/snakemake/config.yaml
@@ -0,0 +1,6 @@
base_reads_dir: "/hps/nobackup/iqbal/mbhall/tech_wars/data/madagascar/nanopore/raw_data/*/guppy_v5.0.16"
samples: ["mada_1-19", "mada_1-8", "mada_1-6", "mada_103", "mada_1-1", "mada_130", "mada_132", "mada_128", "mada_2-1", "mada_1-51", "mada_1-20", "mada_2-31", "mada_109", "mada_112", "mada_1-5", "mada_107", "mada_1-40", "mada_111", "mada_1-14", "mada_1-46", "mada_136", "mada_1-54", "mada_1-25", "mada_118", "mada_129", "mada_1-18", "mada_151", "mada_134", "mada_1-3", "mada_1-44", "mada_1-15", "mada_1-47", "mada_2-53", "mada_1-16", "mada_1-2", "mada_154", "mada_104", "mada_115", "mada_126", "mada_1-33", "mada_102", "mada_127", "mada_125", "mada_1-43", "mada_137", "mada_117", "mada_131", "mada_2-46", "mada_1-17", "mada_124", "mada_121", "mada_141", "mada_2-42", "mada_1-28", "mada_152", "mada_1-48", "mada_2-34", "mada_123", "mada_106", "mada_140", "mada_1-30", "mada_1-50", "mada_139", "mada_1-41", "mada_2-25", "mada_105", "mada_144", "mada_1-32", "mada_1-22", "mada_1-11", "mada_1-10", "mada_110", "mada_120", "mada_113", "mada_1-38", "mada_122", "mada_1-7", "mada_116", "mada_142", "mada_133", "mada_148", "mada_1-13", "mada_1-53", "mada_135", "mada_1-12", "mada_2-50", "mada_1-21", "mada_1-39", "mada_1-36", "mada_143", "mada_150"]
h2h_consensus_glob_pattern: "/hps/nobackup/iqbal/mbhall/tech_wars/analysis/baseline_variants/consensus/bcftools/madagascar/*.consensus.fa"
h2h_mykrobe_glob_pattern: "/hps/nobackup/iqbal/mbhall/tech_wars/analysis/resistance_prediction/results/mykrobe/predict/nanopore/madagascar/*/*.mykrobe.json"
h2h_bcf_glob_pattern: "/hps/nobackup/iqbal/mbhall/tech_wars/analysis/baseline_variants/nanopore/filtered_snps/madagascar/*.snps.filtered.bcf"
output_dir: "output"