diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 9434bb5..7f02158 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -12,6 +12,7 @@ v2.2.1-dev ********** * Add novel module, **expression**, which analyzes gene expression * Add optional input `strandedness` to the sample configuration + * Add json output file for the expression module ********** v2.1.3 diff --git a/includes/expression/Snakefile b/includes/expression/Snakefile index cc9165b..0d9199e 100644 --- a/includes/expression/Snakefile +++ b/includes/expression/Snakefile @@ -5,6 +5,7 @@ rule all: input: multiqc="multiqc_expression.html", seamless=[f"{sample.sample}/expression/seAMLess.tsv" for sample in samples], + json=[module_output.json(sample) for sample in samples], rule normalized_coverage: @@ -74,6 +75,30 @@ rule transform_counts: """ +rule json_output: + input: + coverage=module_output.normalized_expression, + src=srcdir("scripts/json-output.py"), + params: + strandedness=get_strand, + genes=config["report"], + output: + "{sample}/expression/expression-output.json", + log: + "log/expression.{sample}.txt", + container: + containers["pysam"] + shell: + """ + python3 {input.src} \ + --coverage {input.coverage} \ + --strandedness {params.strandedness} \ + --genes {params.genes} \ + --sample {wildcards.sample} \ + > {output} 2> {log} + """ + + rule merge_samples: input: counts=[module_output.normalized_expression(sample) for sample in samples], diff --git a/includes/expression/common.smk b/includes/expression/common.smk index 4eafe96..af65ef9 100644 --- a/includes/expression/common.smk +++ b/includes/expression/common.smk @@ -43,6 +43,11 @@ def set_housekeeping_genes(): config["housekeeping"] = str_to_list(housekeeping) +def set_pdf_report_genes(): + report_genes = config.get("report") + config["report"] = str_to_list(report_genes) + + ## Input functions ## def get_bam(wildcards): return pep.sample_table.loc[wildcards.sample, "bam"] @@ -79,17 +84,23 @@ def check_housekeeping(): raise RuntimeError(msg) -def check_bed_genes_of_interest(): - """Check for duplicates genes bed file and genes_of_interest""" +def _bed_names(): + """Get gene names from the Bed file""" if "bed" not in config: - return + return list() - # Get the names from the bed file names = list() with open(config["bed"]) as fin: for line in fin: spline = line.strip("\n").split("\t") names.append(spline[3]) + return names + + +def check_bed_genes_of_interest(): + """Check for duplicates genes bed file and genes_of_interest""" + # Get the names from the bed file + names = _bed_names() for gene in config["genes_of_interest"]: if gene in names: @@ -97,6 +108,14 @@ def check_bed_genes_of_interest(): raise RuntimeError(msg) +def check_genes_for_report(): + quantified_genes = _bed_names() + config["genes_of_interest"] + for gene in config["report"]: + if gene not in quantified_genes: + msg = f"{gene} is specified for the report, but not in genes_of_interest or the bed file" + raise RuntimeError(msg) + + ## Functions for module outputs ## def coverage(wildcards): return f"{wildcards.sample}/expression/coverage.csv" @@ -113,11 +132,23 @@ def multiqc_files(): return unstranded + stranded +def get_json(wildcards): + return f"{wildcards.sample}/expression/expression-output.json" + + +# Set default values, parse specified values set_genes_of_interest() set_housekeeping_genes() +set_pdf_report_genes() + +# Check the sanity of the reported settings check_housekeeping() check_bed_genes_of_interest() +check_genes_for_report() module_output = SimpleNamespace( - coverage=coverage, normalized_expression=normalized, multiqc_files=multiqc_files() + coverage=coverage, + normalized_expression=normalized, + multiqc_files=multiqc_files(), + json=get_json, ) diff --git a/includes/expression/scripts/json-output.py b/includes/expression/scripts/json-output.py new file mode 100755 index 0000000..6caed34 --- /dev/null +++ b/includes/expression/scripts/json-output.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python + +import json +import argparse +import os + +from multiqc import read_expression + +def fusion_results(fname): + # Initialise the results dictionary + with open(fname) as fin: + return json.load(fin) + +def main(args): + """ Create json output of expression results """ + + # Create results dictionary + results = { + "metadata": { + "sample_name": args.sample + }, + "genes": dict() + } + + # Read the normalized expression for the specified strand + expression = read_expression(args.coverage, args.strandedness) + + # Extract the genes of interest + genes = dict() + for gene in args.genes: + genes[gene] = expression[gene] + + results["genes"] = genes + print(json.dumps({"expression": results}, sort_keys=True, indent=2)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--coverage', help='Normalized coverage file') + parser.add_argument('--sample', help = 'Sample name') + parser.add_argument('--strandedness', help='Strandedness of the sample') + parser.add_argument('--genes', nargs='*', default=list(), help='genes to include') + + args = parser.parse_args() + main(args) + diff --git a/test/test_expression.yml b/test/test_expression.yml index 69f3f16..c528fa0 100644 --- a/test/test_expression.yml +++ b/test/test_expression.yml @@ -13,6 +13,7 @@ contains: - "rule merge_samples" - "rule transform_counts" + - "rule json_output" - name: Test error on unknown housekeeping gene tags: @@ -62,6 +63,7 @@ --config pepfile=test/pep/expression.csv bed=test/data/reference/transcripts_chrM.bed + report="MT-ND4 MT-TH" exit_code: 0 files: - path: SRR8615409/expression/coverage.csv @@ -94,6 +96,10 @@ - path: multiqc_expression.html contains: - "MT-CO2" + - path: SRR8615409/expression/expression-output.json + contains: + - "MT-ND4" + - "MT-TH" - name: Run the expression module with genes of interest tags: @@ -106,7 +112,7 @@ --configfile test/data/config/expression.json --config pepfile=test/pep/expression.csv - genes_of_interest="MT-CO2" + genes_of_interest="MT-CO2 MT-ND4 MT-TH" exit_code: 0 files: - path: SRR8615409/expression/coverage.csv @@ -126,6 +132,7 @@ tags: - functional - expression + - current command: > snakemake --snakefile includes/expression/Snakefile