-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #26 from leoisl/decontamination
Adding decontamination and a few other misc stuff
- Loading branch information
Showing
21 changed files
with
1,741 additions
and
89 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -147,3 +147,4 @@ tmp/ | |
/sample_example/ | ||
/tbpore_out/ | ||
/tests/helpers/logs/ | ||
/pipelines/snakemake/.snakemake/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,4 +16,5 @@ dependencies: | |
- rasusa | ||
- samtools=1.13 | ||
- bcftools=1.13 | ||
- minimap2=2.24 | ||
- minimap2=2.22 | ||
- seqkit=2.0.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
import logging | ||
from enum import Enum | ||
from pathlib import Path | ||
from typing import Set | ||
|
||
import click | ||
import pandas as pd | ||
import pysam | ||
|
||
IS_CONTAM_COL = "is_contaminant" | ||
|
||
|
||
class RequiredIf(click.Option): | ||
def __init__(self, *args, **kwargs): | ||
self.required_if = kwargs.pop("required_if") | ||
assert self.required_if, "'required_if' parameter required" | ||
kwargs["help"] = ( | ||
kwargs.get("help", "") | ||
+ " NOTE: This argument is mutually inclusive with %s" % self.required_if | ||
).strip() | ||
super(RequiredIf, self).__init__(*args, **kwargs) | ||
|
||
def handle_parse_result(self, ctx, opts, args): | ||
we_are_present = self.name in opts | ||
other_present = self.required_if in opts | ||
|
||
if not other_present: | ||
if we_are_present: | ||
raise click.UsageError( | ||
"Illegal usage: `%s` is mutually inclusive with `%s`" | ||
% (self.name, self.required_if) | ||
) | ||
|
||
return super(RequiredIf, self).handle_parse_result(ctx, opts, args) | ||
|
||
|
||
class Classification(Enum): | ||
Contaminant = "contaminant" | ||
Unmaped = "unmapped" | ||
Keep = "keep" | ||
Other = "other" | ||
|
||
|
||
class Classifier: | ||
def __init__(self, metadata_file: str): | ||
self.metadata = pd.read_table( | ||
metadata_file, | ||
header=None, | ||
names=["organism", IS_CONTAM_COL, "accession"], | ||
index_col="accession", | ||
dtype={IS_CONTAM_COL: "bool"}, | ||
) | ||
|
||
def classify(self, record: pysam.AlignedSegment) -> Classification: | ||
if record.is_unmapped: | ||
return Classification.Unmaped | ||
|
||
ref_id = record.reference_name | ||
is_contam: bool = self.metadata.at[ref_id, IS_CONTAM_COL] | ||
if is_contam: | ||
return Classification.Contaminant | ||
|
||
return Classification.Keep | ||
|
||
|
||
@click.command() | ||
@click.help_option("--help", "-h") | ||
@click.option( | ||
"-i", | ||
"--samfile", | ||
help="{B,CR,S}AM file of reads mapped to decontamination database.", | ||
type=click.Path(exists=True, dir_okay=False), | ||
required=True, | ||
) | ||
@click.option( | ||
"-m", | ||
"--metadata", | ||
help=( | ||
"TSV file containing information about each reference in the database. Column " | ||
"1 is the category, column 2 is whether the reference is contamination, column " | ||
"3 is the accession ID for the reference." | ||
), | ||
type=click.Path(exists=True, dir_okay=False), | ||
required=True, | ||
) | ||
@click.option( | ||
"-o", | ||
"--outdir", | ||
type=click.Path(dir_okay=True, writable=True), | ||
help=( | ||
"Directory to write the output files to. The files written will be named " | ||
"unmapped.reads, contaminant.reads, and keep.reads" | ||
), | ||
default=".", | ||
show_default=True, | ||
) | ||
@click.option( | ||
"--ignore-secondary/--include-secondary", | ||
help="Ignore organism assignments for secondary alignments?", | ||
default=True, | ||
show_default=True, | ||
) | ||
@click.option("-v", "--verbose", help="Turns on debug-level logging.", is_flag=True) | ||
def main( | ||
samfile: str, | ||
metadata: str, | ||
outdir: str, | ||
ignore_secondary: bool, | ||
verbose: bool, | ||
): | ||
"""This scripts classifies records in an alignment to a contamination database, with | ||
the help of a metadata file mapping reference names to whether they are | ||
contamination or not. It produces three files with a read identifier for each line: | ||
- unmapped reads | ||
- contaminated reads | ||
- reads to keep | ||
""" | ||
log_level = logging.DEBUG if verbose else logging.INFO | ||
logging.basicConfig( | ||
format="%(asctime)s [%(levelname)s]: %(message)s", level=log_level | ||
) | ||
outdir = Path(outdir) | ||
outdir.mkdir(exist_ok=True) | ||
|
||
classifier = Classifier(metadata) | ||
all_read_ids: Set[str] = set() | ||
unmapped_reads: Set[str] = set() | ||
contaminant_reads: Set[str] = set() | ||
keep_reads: Set[str] = set() | ||
|
||
logging.info("Classifying records in alignment file...") | ||
with pysam.AlignmentFile(samfile) as bam: | ||
for record in bam: | ||
read_id = record.query_name | ||
if read_id is None: | ||
logging.warning(f"Got a record with no query name\n{str(record)}") | ||
continue | ||
|
||
all_read_ids.add(read_id) | ||
if record.is_secondary and ignore_secondary: | ||
logging.debug(f"{read_id} has secondary alignment. Skipping...") | ||
continue | ||
|
||
classification = classifier.classify(record) | ||
if classification is Classification.Unmaped: | ||
unmapped_reads.add(read_id) | ||
elif classification is Classification.Keep: | ||
keep_reads.add(read_id) | ||
elif classification.Contaminant: | ||
contaminant_reads.add(read_id) | ||
else: | ||
raise NotImplementedError( | ||
f"Don't know how to handle classification: {classification}" | ||
) | ||
|
||
# if any read in the pair is a "keeper" remove it from contaminants | ||
contaminant_reads -= keep_reads | ||
# reads are only unmapped if both are unmapped | ||
unmapped_reads -= keep_reads.union(contaminant_reads) | ||
|
||
assert all_read_ids == keep_reads.union(unmapped_reads, contaminant_reads) | ||
|
||
logging.info(f"{len(keep_reads)} reads are to be kept") | ||
logging.info(f"{len(contaminant_reads)} reads are contaminants") | ||
logging.info(f"{len(unmapped_reads)} reads are unmapped") | ||
logging.info("Writing output files...") | ||
|
||
keep_file = outdir / "keep.reads" | ||
keep_file.write_text("\n".join(keep_reads)) | ||
logging.info(f"Read identifiers to keep written to {keep_file}") | ||
|
||
contaminant_file = outdir / "contaminant.reads" | ||
contaminant_file.write_text("\n".join(contaminant_reads)) | ||
logging.info(f"Contaminant read identifiers written to {contaminant_file}") | ||
|
||
unmapped_file = outdir / "unmapped.reads" | ||
unmapped_file.write_text("\n".join(unmapped_reads)) | ||
logging.info(f"Unmapped read identifiers written to {unmapped_file}") | ||
|
||
|
||
if __name__ == "__main__": | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
import glob | ||
|
||
configfile: "config.yaml" | ||
|
||
rule all: | ||
input: | ||
f"{config['output_dir']}/consensus_comparison.out", | ||
f"{config['output_dir']}/bcfs_comparison.out", | ||
f"{config['output_dir']}/mykrobe_comparison.out", | ||
f"{config['output_dir']}/mykrobe_comparison.simple.out", | ||
|
||
def get_input(wildcards): | ||
reads = glob.glob(f"{config['base_reads_dir']}/{wildcards.sample}.nanopore.fq.gz") | ||
assert len(reads) == 1 | ||
return reads | ||
|
||
rule run_tbpore: | ||
input: | ||
get_input | ||
output: | ||
directory(f"{config['output_dir']}/{{sample}}") | ||
threads: 1 | ||
resources: | ||
mem_mb=lambda wildcards, attempt: 20000 * attempt | ||
log: | ||
"logs/run_tbpore_{sample}.log" | ||
shell: | ||
"tbpore --no-cleanup -o {output} {input}" | ||
|
||
|
||
rule compare_h2h_and_tbpore_consensus: | ||
input: | ||
all_tbpore_outputs = expand(f"{config['output_dir']}/{{sample}}", sample=config['samples']) | ||
output: | ||
consensus_comparison = f"{config['output_dir']}/consensus_comparison.out" | ||
params: | ||
tbpore_output = config['output_dir'], | ||
h2h_consensus_glob_pattern = config['h2h_consensus_glob_pattern'], | ||
threads: 1 | ||
resources: | ||
mem_mb=lambda wildcards, attempt: 1000 * attempt | ||
log: | ||
"logs/compare_h2h_and_tbpore_consensus.log" | ||
script: "scripts/compare_H2H_and_tbpore_consensus.py" | ||
|
||
|
||
rule compare_h2h_and_tbpore_bcfs: | ||
input: | ||
all_tbpore_outputs = expand(f"{config['output_dir']}/{{sample}}", sample=config['samples']) | ||
output: | ||
bcfs_comparison = f"{config['output_dir']}/bcfs_comparison.out" | ||
params: | ||
tbpore_output = config['output_dir'], | ||
h2h_bcf_glob_pattern = config['h2h_bcf_glob_pattern'], | ||
threads: 1 | ||
resources: | ||
mem_mb=lambda wildcards, attempt: 1000 * attempt | ||
log: | ||
"logs/compare_h2h_and_tbpore_bcfs.log" | ||
script: "scripts/compare_H2H_and_tbpore_bcfs.py" | ||
|
||
|
||
rule compare_h2h_and_tbpore_mykrobe: | ||
input: | ||
all_tbpore_outputs = expand(f"{config['output_dir']}/{{sample}}", sample=config['samples']) | ||
output: | ||
mykrobe_comparison = f"{config['output_dir']}/mykrobe_comparison.out", | ||
mykrobe_comparison_simple = f"{config['output_dir']}/mykrobe_comparison.simple.out" | ||
params: | ||
tbpore_output = config['output_dir'], | ||
h2h_mykrobe_glob_pattern = config['h2h_mykrobe_glob_pattern'], | ||
threads: 1 | ||
resources: | ||
mem_mb=lambda wildcards, attempt: 1000 * attempt | ||
log: | ||
"logs/compare_h2h_and_tbpore_mykrobe.log" | ||
script: "scripts/compare_H2H_and_tbpore_mykrobe.py" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
base_reads_dir: "/hps/nobackup/iqbal/mbhall/tech_wars/data/madagascar/nanopore/raw_data/*/guppy_v5.0.16" | ||
samples: ["mada_1-19", "mada_1-8", "mada_1-6", "mada_103", "mada_1-1", "mada_130", "mada_132", "mada_128", "mada_2-1", "mada_1-51", "mada_1-20", "mada_2-31", "mada_109", "mada_112", "mada_1-5", "mada_107", "mada_1-40", "mada_111", "mada_1-14", "mada_1-46", "mada_136", "mada_1-54", "mada_1-25", "mada_118", "mada_129", "mada_1-18", "mada_151", "mada_134", "mada_1-3", "mada_1-44", "mada_1-15", "mada_1-47", "mada_2-53", "mada_1-16", "mada_1-2", "mada_154", "mada_104", "mada_115", "mada_126", "mada_1-33", "mada_102", "mada_127", "mada_125", "mada_1-43", "mada_137", "mada_117", "mada_131", "mada_2-46", "mada_1-17", "mada_124", "mada_121", "mada_141", "mada_2-42", "mada_1-28", "mada_152", "mada_1-48", "mada_2-34", "mada_123", "mada_106", "mada_140", "mada_1-30", "mada_1-50", "mada_139", "mada_1-41", "mada_2-25", "mada_105", "mada_144", "mada_1-32", "mada_1-22", "mada_1-11", "mada_1-10", "mada_110", "mada_120", "mada_113", "mada_1-38", "mada_122", "mada_1-7", "mada_116", "mada_142", "mada_133", "mada_148", "mada_1-13", "mada_1-53", "mada_135", "mada_1-12", "mada_2-50", "mada_1-21", "mada_1-39", "mada_1-36", "mada_143", "mada_150"] | ||
h2h_consensus_glob_pattern: "/hps/nobackup/iqbal/mbhall/tech_wars/analysis/baseline_variants/consensus/bcftools/madagascar/*.consensus.fa" | ||
h2h_mykrobe_glob_pattern: "/hps/nobackup/iqbal/mbhall/tech_wars/analysis/resistance_prediction/results/mykrobe/predict/nanopore/madagascar/*/*.mykrobe.json" | ||
h2h_bcf_glob_pattern: "/hps/nobackup/iqbal/mbhall/tech_wars/analysis/baseline_variants/nanopore/filtered_snps/madagascar/*.snps.filtered.bcf" | ||
output_dir: "output" |
Oops, something went wrong.