diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..894a44c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,104 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..45b1e40
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2020, Broad Institute
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a4ee83d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,22 @@
+# PerMutation
+Code for Tn-Seq analysis of _E. faecalis_ MMH594. Check out the wiki for more details:
+
+1. [Processing, Mapping, Replication Bias Correction, and Normalization Across Replicates](https://github.com/broadinstitute/PerMutation/wiki/1.-Processing,-Mapping,-Replication-Bias-Correction,-and-Normalization-Across-Replicates)
+2. [Assessing the Fitness Cost of Genes For Growth on Nutrient Rich Media](https://github.com/broadinstitute/PerMutation/wiki/2.-Assessing-the-Fitness-Cost-of-Genes-For-Growth-on-Nutrient-Rich-Media)
+3. [Predicting Whether Genes Contribute to Antibiotic Resistance](https://github.com/broadinstitute/PerMutation/wiki/3.-Predicting-Whether-Genes-Contribute-to-Antibiotic-Resistance)
+
+All programs/scripts are written in Python-3; no installation is required. Python-3 libraries used include:
+
+* biopython
+* numpy
+* scipy
+* pysam
+
+For processing and mapping sequencing data to the reference genome, the following third-party tools are required:
+
+* trimmomatic
+* samtools
+* bowtie2
+* fastx_toolkit
+
+Programs should work if the user creates and uses a conda environment built from the provided yml file (e.g., `conda env create -f permutation_environment.yml`, then `conda activate PerMutation`).
diff --git a/fitness_cost_assessment/DvalCalculator.py b/fitness_cost_assessment/DvalCalculator.py
new file mode 100644
index 0000000..8c9a62a
--- /dev/null
+++ b/fitness_cost_assessment/DvalCalculator.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python
+# Rauf Salamzade
+# DvalCalculator.py
+import os
+import sys
+from sys import stderr
+import argparse
+from collections import defaultdict
+from Bio import SeqIO
+import math
+
+try: assert(sys.version_info[0] == 3)
+except AssertionError: stderr.write("Please use Python-3 to run this program. Exiting now ...\n"); sys.exit(1)
+
+def get_scaffold_lengths(fasta):
+    scaf_lens = {}
+    with open(fasta) as of:
+        for rec in SeqIO.parse(of, 'fasta'):
+            scaf_lens[rec.id] = len(str(rec.seq))
+    return scaf_lens
+
+def parse_gff(gff, upstream, trim):
+    trim_edge = (1.0-trim)/2.0
+    pos_to_gene = {}
+    genes = {}
+    exon_lens = {}
+    with open(gff) as og:
+        for line in og:
+            line = line.rstrip('\n')
+            ls = line.split('\t')
+            if not line.startswith("#") and ls[2] == 'gene':
+                gene = ls[-1].split('ID=')[1].split(';')[0]
+                scaffold = ls[0]; start = int(ls[3]); stop = int(ls[4]); strand = ls[6]
+                if strand == '+': start -= upstream
+                elif strand == '-': stop += upstream
+                glen = abs(start-stop)+1
+                # trim start locus and round up to an integer
+                start = int(math.floor(start + (glen*trim_edge)))
+                # trim stop locus and round down to an integer
+                stop = int(math.ceil(stop - (glen*trim_edge)))
+                # Please note, gene length is the length of the feature
+                # after trimming and upstream region addition are accounted for.
+                if scaffold not in pos_to_gene:
+                    pos_to_gene[scaffold] = defaultdict(set)
+                    genes[scaffold] = {}
+                    exon_lens[scaffold] = 0
+                exon_lens[scaffold] += abs(start-stop)
+                genes[scaffold][gene] = [start, stop]
+                for bp in range(start, stop+1):
+                    pos_to_gene[scaffold][bp].add(gene)
+    return genes, pos_to_gene, exon_lens
+
+def load_counts(wig_file):
+    hopcounts = {}
+    scaf = None
+    with open(wig_file) as ow:
+        for i, line in enumerate(ow):
+            line = line.rstrip('\n')
+            if line.startswith("variableStep"):
+                scaf = line.split('chrom=')[1]
+                hopcounts[scaf] = defaultdict(float)
+            else:
+                pos, insert_count = line.split('\t')
+                hopcounts[scaf][int(pos)] = float(insert_count)
+    return hopcounts
+
+def aggregate_counts(scaf_lens, pos_to_gene, hopcounts, exon_only):
+    gene_counts = {}
+    total_counts = defaultdict(float)
+    for scaf in scaf_lens:
+        gene_counts[scaf] = defaultdict(float)
+        for bp in range(1, scaf_lens[scaf] + 1):
+            if bp in hopcounts[scaf]:
+                if bp in pos_to_gene[scaf]:
+                    genes = pos_to_gene[scaf][bp]
+                    for g in genes:
+                        gene_counts[scaf][g] += hopcounts[scaf][bp]
+                    total_counts[scaf] += hopcounts[scaf][bp]
+                elif not exon_only:
+                    total_counts[scaf] += hopcounts[scaf][bp]
+    return gene_counts, total_counts
+
+def compute_dvals(output, genes, exon_lens, scaf_lens, gene_counts, total_counts, trim, upstream, exon_only):
+    for s in genes:
+        outf = open('.'.join([output, s.replace('.','-'), 'txt']), 'w')
+        outf.write('gene\tscaffold\tstart\tstop\tgene_length\ttrim\tupstream\texon_only\tobserved_insertions\tdval\n')
+        for g in genes[s]:
+            observed_count = gene_counts[s][g]
+            gstart, gend = genes[s][g]
+            glen = abs(gstart - gend)+1
+            expected_count = None
+            if exon_only: expected_count = (float(glen)/exon_lens[s])*total_counts[s]
+            else: expected_count = (float(glen)/scaf_lens[s])*total_counts[s]
+            dval = 'nan'
+            if expected_count > 0.0: dval = float(observed_count)/expected_count
+            outf.write('\t'.join([str(x) for x in [g, s, gstart, gend, glen, trim, upstream, exon_only, observed_count, dval]]) + '\n')
+        outf.close()
+
+def calc_dvalue_pipeline(wig_file, output, fasta, gff, exon_only, upstream, trim):
+    """ Main function which wraps everything together """
+    try: assert(upstream >= 0)
+    except AssertionError: sys.stderr.write('Error: the -u parameter must be a positive integer or zero. Exiting now ...\n'); sys.exit(1)
+    try: assert(trim > 0.0 and trim <= 1.0)
+    except AssertionError: sys.stderr.write('Error: the -t trim parameter must be a float greater than 0.0 and at most 1.0. Exiting now ...\n'); sys.exit(1)
+    try: assert(os.path.isfile(fasta) and os.path.isfile(wig_file) and os.path.isfile(gff))
+    except AssertionError: sys.stderr.write('Error: Unable to locate hopcounts, gff, and/or fasta input files! Please check input and try again. Exiting now ...\n'); sys.exit(1)
+    if trim != 1.0 and upstream != 0: sys.stderr.write('Warning: Using the -u and -t parameters simultaneously! Please consider if this is truly what you want. Continuing ...\n')
+
+    scaf_lens = get_scaffold_lengths(fasta)
+    genes, pos_to_gene, exon_lens = parse_gff(gff, upstream, trim)
+    pos_counts = load_counts(wig_file)
+    gene_counts, tot_counts = aggregate_counts(scaf_lens, pos_to_gene, pos_counts, exon_only)
+    compute_dvals(output, genes, exon_lens, scaf_lens, gene_counts, tot_counts, trim, upstream, exon_only)
+
+if __name__ == '__main__':
+    # Parse arguments.
+    parser = argparse.ArgumentParser(description="""
+    Program to compute the dVal for each gene, a simple metric which simultaneously normalizes for sequencing depth and gene length and quantifies fitness.
+    """)
+
+    parser.add_argument('-i', '--wig_file', help='Location of wig file with insertion information.', required=True)
+    parser.add_argument('-o', '--output', help='Location and prefix of output file with Dval stats from aggregated counts.', required=True)
+    parser.add_argument('-f', '--fasta', help='FASTA file of the genome assembly corresponding to the GFF.', required=True)
+    parser.add_argument('-g', '--gff', help='GFF file from Vesper/Calhoun. Gene ID needs to be in aliases identified by the key "ID".', required=True)
+    parser.add_argument('-e', '--exon_only', action='store_true', help='Calculate Dval accounting for only insertions which lie on genes.', required=False, default=False)
+    parser.add_argument('-t', '--trim', type=float, help='Middle proportion of genes to consider. For instance, 0.8 specifies that the first and last 10 percent of bases in the gene will be ignored.', required=False, default=1.0)
+    parser.add_argument('-u', '--upstream', type=int, help='Include the specified number of bases upstream as part of the gene to test whether including promoter regions maintains essentiality. Should generally not be used in combination with trim. Default is 0.', default=0, required=False)
+
+    args = parser.parse_args()
+
+    calc_dvalue_pipeline(args.wig_file, args.output, args.fasta, args.gff, args.exon_only, args.upstream, args.trim)
diff --git a/fitness_cost_assessment/GenicPerMutant.py b/fitness_cost_assessment/GenicPerMutant.py
new file mode 100755
index 0000000..3ebae48
--- /dev/null
+++ b/fitness_cost_assessment/GenicPerMutant.py
@@ -0,0 +1,95 @@
+import os
+import sys
+import numpy as np
+import argparse
+import random
+from scipy import stats
+import math
+
+def permutant(wig, gff, output, gene, window, genome_length, permutation):
+
+    start = None
+    stop = None
+    genic_regions = set([])
+
+    with open(gff) as ogf:
+        for line in ogf:
+            line = line.rstrip()
+            ls = line.split('\t')
+            if line.startswith('#') or len(ls) < 9: continue
+            if ls[2] != 'gene': continue
+            gene_id = ls[-1].split('ID=')[-1].split(';')[0]
+            if gene_id == gene:
+                start = int(ls[3])
+                stop = int(ls[4])
+            for p in range(int(ls[3]), int(ls[4])+1): genic_regions.add(p)
+
+    if start is None:
+        sys.stderr.write('Error: gene ' + gene + ' was not found in the GFF. Exiting now ...\n'); sys.exit(1)
+
+    glen = abs(start-stop) + 1
+    mid80_start = int(math.floor(start + (glen*0.10)))
+    mid80_stop = int(math.ceil(stop - (glen*0.10)))
+
+    window_start = [start - window, None]
+    window_stop = [stop + window, None]
+    if window_start[0] < 1:
+        alt_start = genome_length - (window - start)
+        window_start = [1, alt_start]
+        window_stop[1] = genome_length
+    elif window_stop[0] > genome_length:
+        alt_stop = window_stop[0] - genome_length
+        window_stop = [genome_length, alt_stop]
+        window_start[1] = 1
+
+    window_tot_ta = 0
+    window_sat_ta = 0
+    region_ta = 0
+    region_sum = 0
+    window_nz_counts = []
+    with open(wig) as ow:
+        for line in ow:
+            line = line.rstrip('\n')
+            if line.startswith('variable'): continue
+            ls = line.split('\t')
+            pos = int(ls[0])
+            count = float(ls[1])
+            if (pos >= window_start[0] and pos <= window_stop[0]) or (window_start[1] and pos >= window_start[1] and pos <= window_stop[1]):  # and pos in genic_regions:
+                if pos >= mid80_start and pos <= mid80_stop:
+                    region_ta += 1
+                    region_sum += count
+                window_tot_ta += 1
+                if count > 0.0:
+                    window_nz_counts.append(count)
+                    window_sat_ta += 1
+
+    p = window_sat_ta/float(window_tot_ta)
+    empirical_pvalue = 0
+
+    outf = open(output, 'w')
+    outf.write(str(region_sum) + '\n')
+    for i in range(0, permutation):
+        simulated_nz_sites = np.random.binomial(region_ta, p, 1)[0]
+        simulated_sum = 0
+        random.shuffle(window_nz_counts)
+        if simulated_nz_sites > 0: simulated_sum = sum(window_nz_counts[:simulated_nz_sites])
+        if simulated_sum <= region_sum: empirical_pvalue += 1
+        outf.write(str(simulated_sum) + '\n')
+
+    outf.write('#' + str(empirical_pvalue/permutation) + '\t' + str(p) + '\t' + str(region_ta) + '\n')
+    outf.close()
+
+if __name__ == '__main__':
+    # Parse arguments.
+    parser = argparse.ArgumentParser(description="""
+    Program to simulate localized permutations of Tn mutant abundances in a gene relative to its local surroundings, to
+    estimate the probability that the gene is depleted of mutants relative to expectation.
+    """)
+
+    parser.add_argument('-i', '--wig_file', help='Location of wig file with insertion information.', required=True)
+    parser.add_argument('-a', '--gff', help='GFF annotation file.', required=True)
+    parser.add_argument('-o', '--output', help='Location of the output file: the observed region sum, one simulated sum per permutation, and a final commented line with the empirical p-value, window saturation, and region TA count.', required=True)
+    parser.add_argument('-d', '--gene', help='Gene ID.', required=True)
+    parser.add_argument('-w', '--window', type=int, help='The window size.', required=True)
+    parser.add_argument('-g', '--genome_length', type=int, help='Length of the genome.', required=True)
+    parser.add_argument('-p', '--permutation', type=int, help='Number of permutations. Default is 100000.', required=False, default=100000)
+
+    args = parser.parse_args()
+    permutant(args.wig_file, args.gff, args.output, args.gene, args.window, args.genome_length, args.permutation)
\ No newline at end of file
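The test above builds an empirical null by drawing a binomial number of occupied TA sites (from the window saturation p) and summing shuffled non-zero window counts; the p-value is the fraction of simulated sums at or below the observed region sum. A condensed, self-contained sketch with fabricated inputs:

```python
import random
import numpy as np

# Fabricated window statistics: saturation p, number of TA sites in the
# tested region, its observed count sum, and the window's non-zero counts
# that the permutations resample from.
p_sat = 0.6
region_ta_sites = 20
observed_sum = 35.0
window_nz_counts = [float(x) for x in range(1, 101)]

hits = 0
n_perm = 10000
for _ in range(n_perm):
    sim_sites = np.random.binomial(region_ta_sites, p_sat)  # simulated occupied sites
    random.shuffle(window_nz_counts)
    sim_sum = sum(window_nz_counts[:sim_sites])             # simulated region sum
    if sim_sum <= observed_sum:
        hits += 1
print('empirical p-value:', hits / n_perm)
```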
diff --git a/fitness_cost_assessment/ModifiedDvalCalculator.py b/fitness_cost_assessment/ModifiedDvalCalculator.py
new file mode 100755
index 0000000..004273a
--- /dev/null
+++ b/fitness_cost_assessment/ModifiedDvalCalculator.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python
+# Rauf Salamzade
+# ModifiedDvalCalculator.py
+
+import os
+import sys
+from sys import stderr
+import argparse
+from collections import defaultdict
+from Bio import SeqIO
+import math
+import re
+
+try: assert(sys.version_info[0] == 3)
+except AssertionError: stderr.write("Please use Python-3 to run this program. Exiting now ...\n"); sys.exit(1)
+
+def get_scaffold_lengths(fasta, masked_TA_sites):
+
+    masked_sites = defaultdict(set)
+    if masked_TA_sites:
+        with open(masked_TA_sites) as omts:
+            for line in omts:
+                line = line.rstrip('\n')
+                ls = line.split('\t')
+                masked_sites[ls[0]].add(int(ls[1]))
+
+    scaf_lens = {}; scaf_tas = {}
+    with open(fasta) as of:
+        for rec in SeqIO.parse(of, 'fasta'):
+            scaf_lens[rec.id] = len(str(rec.seq))
+            # 1-based TA site coordinates, excluding any masked sites.
+            scaf_tas[rec.id] = sorted([x+1 for x in [m.start() for m in re.finditer('TA', str(rec.seq).upper())] if not (x+1) in masked_sites[rec.id]])
+    return scaf_lens, scaf_tas
+
+def parse_gff(gff, upstream, trim, scaf_tas):
+    trim_edge = (1.0-trim)/2.0
+    pos_to_gene = {}
+    genes = {}
+    exon_tas = {}
+    with open(gff) as og:
+        for line in og:
+            line = line.rstrip('\n')
+            ls = line.split('\t')
+            if not line.startswith("#") and ls[2] == 'gene':
+                gene = ls[-1].split('ID=')[1].split(';')[0]
+                scaffold = ls[0]; start = int(ls[3]); stop = int(ls[4]); strand = ls[6]
+                if strand == '+': start -= upstream
+                elif strand == '-': stop += upstream
+                glen = abs(start-stop)+1
+                # trim start locus and round up to an integer
+                start = int(math.floor(start + (glen*trim_edge)))
+                # trim stop locus and round down to an integer
+                stop = int(math.ceil(stop - (glen*trim_edge)))
+                # Please note, gene length is the length of the feature
+                # after trimming and upstream region addition are accounted for.
+                if scaffold not in pos_to_gene:
+                    pos_to_gene[scaffold] = defaultdict(set)
+                    genes[scaffold] = {}
+                    exon_tas[scaffold] = 0
+                gene_tas = [x for x in scaf_tas[scaffold] if x >= start and x <= stop]
+                exon_tas[scaffold] += len(gene_tas)
+                genes[scaffold][gene] = [start, stop]
+                for bp in range(start, stop+1):
+                    pos_to_gene[scaffold][bp].add(gene)
+    return genes, pos_to_gene, exon_tas
+
+def load_counts(wig_file):
+    hopcounts = {}
+    scaf = None
+    with open(wig_file) as ow:
+        for i, line in enumerate(ow):
+            line = line.rstrip('\n')
+            if line.startswith("variableStep"):
+                scaf = line.split('chrom=')[1]
+                hopcounts[scaf] = defaultdict(float)
+            else:
+                pos, insert_count = line.split('\t')
+                hopcounts[scaf][int(pos)] = float(insert_count)
+    return hopcounts
+
+def aggregate_counts(scaf_lens, pos_to_gene, hopcounts, exon_only):
+    gene_counts = {}
+    total_counts = defaultdict(float)
+    for scaf in scaf_lens:
+        gene_counts[scaf] = defaultdict(float)
+        for bp in range(1, scaf_lens[scaf] + 1):
+            if bp in hopcounts[scaf]:
+                if bp in pos_to_gene[scaf]:
+                    genes = pos_to_gene[scaf][bp]
+                    for g in genes:
+                        gene_counts[scaf][g] += hopcounts[scaf][bp]
+                    total_counts[scaf] += hopcounts[scaf][bp]
+                elif not exon_only:
+                    total_counts[scaf] += hopcounts[scaf][bp]
+    return gene_counts, total_counts
+
+def compute_dvals(output, genes, exon_tas, scaf_tas, gene_counts, total_counts, trim, upstream, exon_only):
+    for s in genes:
+        outf = open('.'.join([output, s.replace('.','-'), 'txt']), 'w')
+        outf.write('gene\tscaffold\tstart\tstop\tgene_tas\ttrim\tupstream\texon_only\tobserved_insertions\tdval\n')
+        for g in genes[s]:
+            observed_count = gene_counts[s][g]
+            gstart, gend = genes[s][g]
+            gene_tas = len([x for x in scaf_tas[s] if x >= gstart and x <= gend])
+            expected_count = None
+            if exon_only: expected_count = (float(gene_tas)/exon_tas[s])*total_counts[s]
+            else: expected_count = (float(gene_tas)/len(scaf_tas[s]))*total_counts[s]
+            dval = 'nan'
+            if expected_count > 0.0: dval = float(observed_count)/expected_count
+            outf.write('\t'.join([str(x) for x in [g, s, gstart, gend, gene_tas, trim, upstream, exon_only, observed_count, dval]]) + '\n')
+        outf.close()
+
+def calc_dvalue_pipeline(wig_file, output, masked_TA_sites, fasta, gff, exon_only, upstream, trim):
+    """ Main function which wraps everything together """
+    try: assert(upstream >= 0)
+    except AssertionError: sys.stderr.write('Error: the -u parameter must be a positive integer or zero. Exiting now ...\n'); sys.exit(1)
+    try: assert(trim > 0.0 and trim <= 1.0)
+    except AssertionError: sys.stderr.write('Error: the -t trim parameter must be a float greater than 0.0 and at most 1.0. Exiting now ...\n'); sys.exit(1)
+    try: assert(os.path.isfile(fasta) and os.path.isfile(wig_file) and os.path.isfile(gff))
+    except AssertionError: sys.stderr.write('Error: Unable to locate hopcounts, gff, and/or fasta input files! Please check input and try again. Exiting now ...\n'); sys.exit(1)
+    if trim != 1.0 and upstream != 0: sys.stderr.write('Warning: Using the -u and -t parameters simultaneously! Please consider if this is truly what you want. Continuing ...\n')
+
+    scaf_lens, scaf_tas = get_scaffold_lengths(fasta, masked_TA_sites)
+    genes, pos_to_gene, exon_tas = parse_gff(gff, upstream, trim, scaf_tas)
+    pos_counts = load_counts(wig_file)
+    gene_counts, tot_counts = aggregate_counts(scaf_lens, pos_to_gene, pos_counts, exon_only)
+    compute_dvals(output, genes, exon_tas, scaf_tas, gene_counts, tot_counts, trim, upstream, exon_only)
+
+if __name__ == '__main__':
+    # Parse arguments.
+    parser = argparse.ArgumentParser(description="""
+    Program to compute the modified dVal for each gene, a simple metric which simultaneously normalizes for sequencing depth and TA count (instead of gene length) and quantifies fitness.
+    """)
+
+    parser.add_argument('-i', '--wig_file', help='Location of wig file with insertion information.', required=True)
+    parser.add_argument('-o', '--output', help='Location and prefix of output file with Dval stats from aggregated counts.', required=True)
+    parser.add_argument('-m', '--masked_TA_sites', help='File with coordinates of masked TA sites.', required=False)
+    parser.add_argument('-f', '--fasta', help='FASTA file of the genome assembly corresponding to the GFF.', required=True)
+    parser.add_argument('-g', '--gff', help='GFF file from Vesper/Calhoun. Gene ID needs to be in aliases identified by the key "ID".', required=True)
+    parser.add_argument('-e', '--exon_only', action='store_true', help='Calculate Dval accounting for only insertions which lie on genes.', required=False, default=False)
+    parser.add_argument('-t', '--trim', type=float, help='Middle proportion of genes to consider. For instance, 0.8 specifies that the first and last 10 percent of bases in the gene will be ignored.', required=False, default=1.0)
+    parser.add_argument('-u', '--upstream', type=int, help='Include the specified number of bases upstream as part of the gene to test whether including promoter regions maintains essentiality. Should generally not be used in combination with trim. Default is 0.', default=0, required=False)
+
+    args = parser.parse_args()
+
+    calc_dvalue_pipeline(args.wig_file, args.output, args.masked_TA_sites, args.fasta, args.gff, args.exon_only, args.upstream, args.trim)
\ No newline at end of file
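The modified dVal normalizes by TA-site counts rather than raw length; TA sites are located with re.finditer and shifted to 1-based coordinates, as in get_scaffold_lengths above. A quick illustration on a toy sequence:

```python
import re

seq = 'GGTACCTTAA'.upper()
# 0-based starts of each 'TA' match, shifted to 1-based genome coordinates.
ta_sites = sorted(m.start() + 1 for m in re.finditer('TA', seq))
print(ta_sites)  # [3, 8] -> the TA dinucleotides at positions 3-4 and 8-9
```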
diff --git a/fitness_cost_assessment/MultiGenePermutant.py b/fitness_cost_assessment/MultiGenePermutant.py
new file mode 100755
index 0000000..fb55990
--- /dev/null
+++ b/fitness_cost_assessment/MultiGenePermutant.py
@@ -0,0 +1,92 @@
+import os
+import sys
+import numpy as np
+import argparse
+import random
+from scipy import stats
+import math
+
+def permutant(wig, gff, output, start, stop, window, genome_length, permutation):
+
+    genic_regions = set([])
+
+    with open(gff) as ogf:
+        for line in ogf:
+            line = line.rstrip()
+            ls = line.split('\t')
+            if line.startswith('#') or len(ls) < 9: continue
+            if ls[2] != 'gene': continue
+            for p in range(int(ls[3]), int(ls[4])+1): genic_regions.add(p)
+
+    window_start = [start - window, None]
+    window_stop = [stop + window, None]
+    if window_start[0] < 1:
+        alt_start = genome_length - (window - start)
+        window_start = [1, alt_start]
+        window_stop[1] = genome_length
+    elif window_stop[0] > genome_length:
+        alt_stop = window_stop[0] - genome_length
+        window_stop = [genome_length, alt_stop]
+        window_start[1] = 1
+
+    window_tot_ta = 0
+    window_sat_ta = 0
+    region_ta = 0
+    region_sum = 0
+    window_nz_counts = []
+    continuous_zeros = 0
+    longest_continuous_zeros = 0
+    with open(wig) as ow:
+        for line in ow:
+            line = line.rstrip('\n')
+            if line.startswith('variable'): continue
+            ls = line.split('\t')
+            pos = int(ls[0])
+            count = float(ls[1])
+            if ((pos >= window_start[0] and pos <= window_stop[0]) or (window_start[1] and pos >= window_start[1] and pos <= window_stop[1])) and pos in genic_regions:
+                if pos >= start and pos <= stop:
+                    region_ta += 1
+                    region_sum += count
+                    if count > 0.0:
+                        if continuous_zeros > longest_continuous_zeros: longest_continuous_zeros = continuous_zeros
+                        continuous_zeros = 0
+                    else:
+                        continuous_zeros += 1
+                window_tot_ta += 1
+                if count > 0.0:
+                    window_nz_counts.append(count)
+                    window_sat_ta += 1
+
+    if continuous_zeros > longest_continuous_zeros: longest_continuous_zeros = continuous_zeros
+    p = window_sat_ta/float(window_tot_ta)
+    empirical_pvalue = 0
+
+    for i in range(0, permutation):
+        simulated_nz_sites = np.random.binomial(region_ta, p, 1)[0]
+        simulated_sum = 0
+        random.shuffle(window_nz_counts)
+        if simulated_nz_sites > 0: simulated_sum = sum(window_nz_counts[:simulated_nz_sites])
+        if simulated_sum <= region_sum: empirical_pvalue += 1
+
+    outf = open(output, 'w')
+    outf.write(str(empirical_pvalue/permutation) + '\t' + str(p) + '\n')
+    outf.close()
+
+if __name__ == '__main__':
+    # Parse the arguments.
+    parser = argparse.ArgumentParser(description="""
+    Program to run the localized permutation test over a user-specified region (from -s to -t), restricted to genic
+    positions, to estimate the probability that the region is depleted of mutants relative to its local surroundings.
+    """)
+
+    parser.add_argument('-i', '--wig_file', help='Location of wig file with insertion information.', required=True)
+    parser.add_argument('-a', '--gff', help='GFF annotation file.', required=True)
+    parser.add_argument('-o', '--output', help='Location of the output file reporting the empirical p-value and window saturation.', required=True)
+    parser.add_argument('-s', '--start', type=int, help='start position', required=True)
+    parser.add_argument('-t', '--stop', type=int, help='stop position', required=True)
+    parser.add_argument('-w', '--window', type=int, help='The window size.', required=True)
+    parser.add_argument('-g', '--genome_length', type=int, help='Length of the genome.', required=True)
+    parser.add_argument('-p', '--permutation', type=int, help='Number of permutations. Default is 100000.', required=False, default=100000)
+
+    args = parser.parse_args()
+    permutant(args.wig_file, args.gff, args.output, args.start, args.stop, args.window, args.genome_length, args.permutation)
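Both permutant scripts handle windows that run off the ends of the (circular) chromosome by splitting the window into up to two linear intervals. A small sketch of the same idea expressed as a membership test, with hypothetical coordinates:

```python
def in_circular_window(pos, start, stop, genome_length):
    """True if 1-based pos falls in window [start, stop] on a circular
    genome, where start may be < 1 or stop may exceed genome_length."""
    intervals = []
    if start < 1:
        intervals.append((genome_length + start, genome_length))  # wrapped head
        intervals.append((1, stop))
    elif stop > genome_length:
        intervals.append((start, genome_length))
        intervals.append((1, stop - genome_length))  # wrapped tail
    else:
        intervals.append((start, stop))
    return any(lo <= pos <= hi for lo, hi in intervals)

# e.g. a window from -500 to 1500 on a hypothetical 3,000,000 bp chromosome:
print(in_circular_window(2_999_800, -500, 1500, 3_000_000))  # True
print(in_circular_window(2_000, -500, 1500, 3_000_000))      # False
```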
diff --git a/fitness_cost_assessment/MultiGenePermutantOri.py b/fitness_cost_assessment/MultiGenePermutantOri.py
new file mode 100755
index 0000000..962870e
--- /dev/null
+++ b/fitness_cost_assessment/MultiGenePermutantOri.py
@@ -0,0 +1,80 @@
+import os
+import sys
+import numpy as np
+import argparse
+import random
+from scipy import stats
+import math
+
+def permutant(wig, gff, output, permutation):
+
+    genic_regions = set([])
+
+    with open(gff) as ogf:
+        for line in ogf:
+            line = line.rstrip()
+            ls = line.split('\t')
+            if line.startswith('#') or len(ls) < 9: continue
+            if ls[2] != 'gene': continue
+            for p in range(int(ls[3]), int(ls[4])+1): genic_regions.add(p)
+
+    start = [1, 3150945]
+    stop = [10528, 3152103]
+    window_start = [1, 3050945]
+    window_stop = [110528, 3152103]
+
+    window_tot_ta = 0
+    window_sat_ta = 0
+    region_ta = 0
+    region_sum = 0
+    window_nz_counts = []
+    continuous_zeros = 0
+    longest_continuous_zeros = 0
+    with open(wig) as ow:
+        for line in ow:
+            line = line.rstrip('\n')
+            if line.startswith('variable'): continue
+            ls = line.split('\t')
+            pos = int(ls[0])
+            count = float(ls[1])
+            if ((pos >= window_start[0] and pos <= window_stop[0]) or (window_start[1] and pos >= window_start[1] and pos <= window_stop[1])) and pos in genic_regions:
+                if (pos >= start[0] and pos <= stop[0]) or (pos >= start[1] and pos <= stop[1]):
+                    region_ta += 1
+                    region_sum += count
+                    if count > 0.0:
+                        if continuous_zeros > longest_continuous_zeros: longest_continuous_zeros = continuous_zeros
+                        continuous_zeros = 0
+                    else:
+                        continuous_zeros += 1
+                window_tot_ta += 1
+                if count > 0.0:
+                    window_nz_counts.append(count)
+                    window_sat_ta += 1
+
+    if continuous_zeros > longest_continuous_zeros: longest_continuous_zeros = continuous_zeros
+    p = window_sat_ta/float(window_tot_ta)
+    empirical_pvalue = 0
+
+    for i in range(0, permutation):
+        simulated_nz_sites = np.random.binomial(region_ta, p, 1)[0]
+        simulated_sum = 0
+        random.shuffle(window_nz_counts)
+        if simulated_nz_sites > 0: simulated_sum = sum(window_nz_counts[:simulated_nz_sites])
+        if simulated_sum <= region_sum: empirical_pvalue += 1
+
+    outf = open(output, 'w')
+    outf.write(str(empirical_pvalue/permutation) + '\t' + str(p) + '\n')
+    outf.close()
+
+if __name__ == '__main__':
+    # Parse the arguments.
+    parser = argparse.ArgumentParser(description="""
+    Program to run the localized permutation test for two hard-coded regions flanking the replication origin
+    (coordinates are specific to the chromosome analyzed in this study).
+    """)
+
+    parser.add_argument('-i', '--wig_file', help='Location of wig file with insertion information.', required=True)
+    parser.add_argument('-a', '--gff', help='GFF annotation file.', required=True)
+    parser.add_argument('-o', '--output', help='Location of the output file reporting the empirical p-value and window saturation.', required=True)
+    parser.add_argument('-p', '--permutation', type=int, help='Number of permutations. Default is 100000.', required=False, default=100000)
+
+    args = parser.parse_args()
+    permutant(args.wig_file, args.gff, args.output, args.permutation)
\ No newline at end of file
diff --git a/permutation_environment.yml b/permutation_environment.yml
new file mode 100755
index 0000000..9eb0135
--- /dev/null
+++ b/permutation_environment.yml
@@ -0,0 +1,15 @@
+name: PerMutation
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - python=3.7
+  - biopython
+  - numpy
+  - scipy
+  - pysam
+  - trimmomatic
+  - fastx_toolkit
+  - bowtie2
+  - samtools
\ No newline at end of file
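The two PerMutation scripts below load each sample's variableStep wig file into a shared position list plus one count vector per sample. A minimal stand-alone sketch of that parsing for a single-scaffold wig file (the function name is illustrative, not from the repository):

```python
def read_wig(path):
    """Parse a single-scaffold variableStep wig file into parallel
    position and count lists."""
    positions, counts = [], []
    with open(path) as fh:
        for line in fh:
            if line.startswith('variableStep'):
                continue  # header line, e.g. 'variableStep chrom=scaffold_1'
            pos, cnt = line.rstrip('\n').split('\t')
            positions.append(int(pos))
            counts.append(float(cnt))
    return positions, counts
```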
+ """ + p = numpy.asfarray(p) + by_descend = p.argsort()[::-1] + by_orig = by_descend.argsort() + steps = float(len(p)) / numpy.arange(len(p), 0, -1) + q = numpy.minimum(1, numpy.minimum.accumulate(steps * p[by_descend])) + return q[by_orig] + +def read_in_wig_data(all_wigs): + wigdata = [] + position = [] + for i, wf in enumerate(all_wigs): + sample_counts = [] + with open(wf) as owf: + for line in owf: + line = line.strip() + ls = line.split('\t') + if line.startswith("variableStep"): continue + if i == 0: position.append(int(ls[0])) + sample_counts.append(float(ls[1])) + wigdata.append(sample_counts) + + return([position, wigdata]) + +def resampling(control_data, treatment_data, S=100000): + count_greater = 1 # to prevent p-value of 0 + + n_control = len(list(control_data.values())[0]) + control_mean = numpy.average([sum(x[1]) for x in control_data.items()]) + treatment_mean = numpy.average([sum(x[1]) for x in treatment_data.items()]) + + log2FC = (control_mean + 0.1)/(treatment_mean+0.1) + + stat_obs = 0 + test_data = [] + for ind in control_data: + test_data.append(control_data[ind] + treatment_data[ind]) + control_sum = sum(control_data[ind]) + treat_sum = sum(treatment_data[ind]) + stat_obs += control_sum - treat_sum + + for j, s in enumerate(range(S)): + stat = 0 + for pos_vals in test_data: + perm_vals = numpy.random.permutation(pos_vals) + stat += sum(perm_vals[:n_control])-sum(perm_vals[n_control:]) + if stat >= stat_obs: count_greater += 1 + pval_greater = float(count_greater) / float(S) + + return (stat_obs, log2FC, pval_greater) + +def create_parser(): + # Parse arguments. + + parser = argparse.ArgumentParser(description=""" This program runs a position aware variant of the permutation test developed by DeJesus et al 2015.""") + + parser.add_argument('-c', '--control_wigs', help='control wig files separated by comma.', required=True) + parser.add_argument('-e', '--experiment_wigs', help='experimental wig files separated by comma.', required=True) + parser.add_argument('-b', '--bed_file', help='BED file with four columns: scaffold id, start position, end position, gene id', required=True) + parser.add_argument('-s', '--scaffold', help='scaffold id', required=True) + parser.add_argument('-o', '--output', help='output file', required=True) + + args = parser.parse_args() + return args + +# parse arguments +myargs = create_parser() +control_wigs = myargs.control_wigs.split(',') +experiment_wigs = myargs.experiment_wigs.split(',') +bed_file = myargs.bed_file +output_file = myargs.output +scaffold = myargs.scaffold + +replicate_labels = ['control']*len(control_wigs) + ['experiment']*len(experiment_wigs) +all_wigs = control_wigs + experiment_wigs +position, wigdata = read_in_wig_data(all_wigs) + +pos_to_index = {} +for i, p in enumerate(position): + nz_flag = False + for r in wigdata: + if r[i] > 0.0: + nz_flag = True + if nz_flag: + pos_to_index[p] = i + +all_info = [] +pvalues = [] +with open(bed_file) as obf: + for line in obf: + line = line.rstrip('\n') + feature_scaffold, start, stop, description = line.split('\t') + relevant_indices = [] + start = min(int(start), int(stop)) + stop = max(int(start), int(stop)) + glen = abs(start - stop) + 1 + start = int(math.floor(start + (glen * 0.10))) + stop = int(math.ceil(stop - (glen * 0.10))) + for p in range(start, stop+1): + if p in pos_to_index: + relevant_indices.append(pos_to_index[p]) + if not relevant_indices: continue + control_samples = defaultdict(list) + experiment_samples = defaultdict(list) + for ind in relevant_indices: + 
for r in range(0,len(control_wigs)): + control_samples[ind].append(wigdata[r][ind]) + for r in range(len(control_wigs), (len(control_wigs)+len(experiment_wigs))): + experiment_samples[ind].append(wigdata[r][ind]) + + stat, log2FC, pval = resampling(control_samples, experiment_samples) + all_info.append([feature_scaffold, start, stop, description, log2FC, stat, pval]) + pvalues.append(pval) + +outf = open(output_file, 'w') +qvalues = p_adjust_bh(pvalues) + +for i, q in enumerate(qvalues): + outf.write('\t'.join([ str(x) for x in (all_info[i] + [q])])+ '\n') +outf.close() diff --git a/predicting_abx_resistance/PerMutation-GA.py b/predicting_abx_resistance/PerMutation-GA.py new file mode 100755 index 0000000..751616e --- /dev/null +++ b/predicting_abx_resistance/PerMutation-GA.py @@ -0,0 +1,137 @@ +import os +import sys +import argparse +from collections import defaultdict +import numpy +import math + +def p_adjust_bh(p): + """ + Using implementation from https://stackoverflow.com/questions/7450957/how-to-implement-rs-p-adjust-in-python/33532498#33532498 + Benjamini-Hochberg p-value correction for multiple hypothesis testing. + """ + p = numpy.asfarray(p) + by_descend = p.argsort()[::-1] + by_orig = by_descend.argsort() + steps = float(len(p)) / numpy.arange(len(p), 0, -1) + q = numpy.minimum(1, numpy.minimum.accumulate(steps * p[by_descend])) + return q[by_orig] + +def read_in_wig_data(all_wigs): + wigdata = [] + position = [] + for i, wf in enumerate(all_wigs): + sample_counts = [] + with open(wf) as owf: + for line in owf: + line = line.strip() + ls = line.split('\t') + if line.startswith("variableStep"): continue + if i == 0: position.append(int(ls[0])) + sample_counts.append(float(ls[1])) + wigdata.append(sample_counts) + + wigdata_tr = [] + for ls in zip(*wigdata): + wigdata_tr.append(list(ls)) + + return ([position, wigdata_tr]) + +def resampling(control_data, treatment_data, S=100000): + count_greater = 0 + + n_control = len(list(control_data.values())[0]) + control_mean = numpy.average([sum(x[1]) for x in control_data.items()]) + treatment_mean = numpy.average([sum(x[1]) for x in treatment_data.items()]) + + log2FC = (control_mean + 0.1)/(treatment_mean+0.1) + + + stat_obs = 0 + test_data = [] + for ind in control_data: + control_sum = sum(control_data[ind]) + treat_sum = sum(treatment_data[ind]) + test_data.append(control_data[ind] + treatment_data[ind]) + stat_obs += treat_sum - control_sum + + + for j, s in enumerate(range(S)): + stat = 0 + for pos_vals in test_data: + perm_vals = numpy.random.permutation(pos_vals) + stat += sum(perm_vals[n_control:]) - sum(perm_vals[:n_control]) + if stat >= stat_obs: count_greater += 1 + pval_greater = float(count_greater) / float(S) + + return (stat_obs, log2FC, pval_greater) + +def create_parser(): + # Parse arguments. 
+ + parser = argparse.ArgumentParser(description=""" This program runs a position aware variant of the permutation test developed by DeJesus et al 2015.""") + + parser.add_argument('-c', '--control_wigs', help='control wig files separated by comma.', required=True) + parser.add_argument('-e', '--experiment_wigs', help='experimental wig files separated by comma.', required=True) + parser.add_argument('-b', '--bed_file', help='BED file with four columns: scaffold id, start position, end position, gene id', required=True) + parser.add_argument('-s', '--scaffold', help='scaffold id', required=True) + parser.add_argument('-o', '--output', help='output file', required=True) + + args = parser.parse_args() + return args + +# parse arguments +myargs = create_parser() +control_wigs = myargs.control_wigs.split(',') +experiment_wigs = myargs.experiment_wigs.split(',') +bed_file = myargs.bed_file +output_file = myargs.output +scaffold = myargs.scaffold + +replicate_labels = ['control']*len(control_wigs) + ['experiment']*len(experiment_wigs) +all_wigs = control_wigs + experiment_wigs +position, wigdata = read_in_wig_data(all_wigs) + +pos_to_index = {} +for i, p in enumerate(position): + nz_flag = False + for r in wigdata: + if r[i] > 0.0: + nz_flag = True + if nz_flag: + pos_to_index[p] = i + +all_info = [] +pvalues = [] +with open(bed_file) as obf: + for line in obf: + line = line.rstrip('\n') + feature_scaffold, start, stop, description = line.split('\t') + relevant_indices = [] + start = min(int(start), int(stop)) + stop = max(int(start), int(stop)) + glen = abs(start - stop) + 1 + start = int(math.floor(start + (glen * 0.10))) + stop = int(math.ceil(stop - (glen * 0.10))) + for p in range(start, stop+1): + if p in pos_to_index: + relevant_indices.append(pos_to_index[p]) + if not relevant_indices: continue + control_samples = defaultdict(list) + experiment_samples = defaultdict(list) + for ind in relevant_indices: + for r in range(0,len(control_wigs)): + control_samples[ind].append(wigdata[r][ind]) + for r in range(len(control_wigs), (len(control_wigs)+len(experiment_wigs))): + experiment_samples[ind].append(wigdata[r][ind]) + + stat, log2FC, pval = resampling(control_samples, experiment_samples) + all_info.append([feature_scaffold, start, stop, description, log2FC, stat, pval]) + pvalues.append(pval) + +outf = open(output_file, 'w') +qvalues = p_adjust_bh(pvalues) + +for i, q in enumerate(qvalues): + outf.write('\t'.join([ str(x) for x in (all_info[i] + [q])])+ '\n') +outf.close() \ No newline at end of file diff --git a/processing/CorrectReplicationBias.py b/processing/CorrectReplicationBias.py new file mode 100644 index 0000000..8d37b60 --- /dev/null +++ b/processing/CorrectReplicationBias.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python +# Rauf Salamzade +# DvalCalculator.py +import os +import sys +import subprocess +from sys import stderr +import argparse +from collections import defaultdict +from Bio import SeqIO +import math +import operator + +try: assert(sys.version_info[0] == 3) +except: stderr.write("Please use Python-3 to run this program. Exiting now ...\n"); sys.exit(1) + +def correction_pipeline(wig_file, chrom_scaffold, out_file, zero_deflate, window_size, rscript_path): + tmp_dir = os.path.abspath(out_file + '_tmp_' + str(os.getpid())) + '/' + try: assert(not os.path.exists(tmp_dir)) + except: sys.stderr.write('Error: somehow the tmp directory path is already in existence. Please try running again. 
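Both PerMutation scripts above are position-aware resampling tests in the spirit of DeJesus et al. 2015: control/treatment labels are permuted at each insertion site, and the per-site count differences are summed into a null distribution for the observed statistic. A stripped-down sketch with fabricated counts (3 sites, 2 control and 2 treatment samples):

```python
import numpy

# Fabricated data: counts_by_site[site] = [control_1, control_2, treatment_1, treatment_2]
counts_by_site = {
    101: [5.0, 7.0, 0.0, 1.0],
    145: [3.0, 4.0, 0.0, 0.0],
    190: [6.0, 2.0, 1.0, 0.0],
}
n_control = 2

# Observed statistic: summed control-minus-treatment differences per site.
stat_obs = sum(sum(v[:n_control]) - sum(v[n_control:]) for v in counts_by_site.values())

S = 10000
count_greater = 1  # pseudo-count so the p-value is never exactly 0
for _ in range(S):
    stat = 0.0
    for vals in counts_by_site.values():
        perm = numpy.random.permutation(vals)  # shuffle sample labels at this site
        stat += sum(perm[:n_control]) - sum(perm[n_control:])
    if stat >= stat_obs:
        count_greater += 1
print('p-value:', count_greater / S)
```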
diff --git a/processing/CorrectReplicationBias.py b/processing/CorrectReplicationBias.py
new file mode 100644
index 0000000..8d37b60
--- /dev/null
+++ b/processing/CorrectReplicationBias.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+# Rauf Salamzade
+# CorrectReplicationBias.py
+import os
+import sys
+import subprocess
+from sys import stderr
+import argparse
+from collections import defaultdict
+from Bio import SeqIO
+import math
+import operator
+
+try: assert(sys.version_info[0] == 3)
+except AssertionError: stderr.write("Please use Python-3 to run this program. Exiting now ...\n"); sys.exit(1)
+
+def correction_pipeline(wig_file, chrom_scaffold, out_file, zero_deflate, window_size, rscript_path):
+    tmp_dir = os.path.abspath(out_file + '_tmp_' + str(os.getpid())) + '/'
+    try: assert(not os.path.exists(tmp_dir))
+    except AssertionError: sys.stderr.write('Error: the tmp directory path already exists. Please try running again. Exiting now ...\n'); sys.exit(1)
+    else: os.system('mkdir ' + tmp_dir)
+
+    scaffold = None
+    outf = None
+    corrected_results = defaultdict(list)
+    for i, line in enumerate(open(wig_file)):
+        line = line.rstrip('\n')
+        if line.startswith("variableStep"):
+            if i != 0:
+                if scaffold == chrom_scaffold:
+                    outf.close()
+                    rscript_cmd = ['Rscript', rscript_path, tmp_dir + scaffold + '.txt', str(window_size), tmp_dir + scaffold + '.adjusted.txt']
+                    subprocess.call(rscript_cmd)
+                    for j, l in enumerate(open(tmp_dir + scaffold + '.adjusted.txt')):
+                        if j > 0:
+                            l = l.rstrip('\n')
+                            corrected_results[scaffold].append([float(x) for x in l.split('\t')])
+            scaffold = line.split('chrom=')[1]
+            if scaffold == chrom_scaffold:
+                outf = open(tmp_dir + scaffold + '.txt', 'w')
+                outf.write('loci\tcounts\n')
+        elif scaffold == chrom_scaffold:
+            if zero_deflate and float(line.split('\t')[1]) != 0.0: outf.write(line + '\n')
+            elif not zero_deflate: outf.write(line + '\n')
+    if i > 0:
+        if scaffold == chrom_scaffold:
+            outf.close()
+            rscript_cmd = ['Rscript', rscript_path, tmp_dir + scaffold + '.txt', str(window_size), tmp_dir + scaffold + '.adjusted.txt']
+            subprocess.call(rscript_cmd)
+            for j, l in enumerate(open(tmp_dir + scaffold + '.adjusted.txt')):
+                if j > 0:
+                    l = l.strip('\n')
+                    corrected_results[scaffold].append([float(x) for x in l.split('\t')])
+
+    of = open(out_file, 'w')
+    for s in corrected_results:
+        of.write('variableStep chrom=' + s + '\n')
+        for loc in sorted(corrected_results[s], key=operator.itemgetter(0)):
+            of.write(str(int(loc[0])) + '\t' + str(loc[1]) + '\n')
+    of.close()
+
+    os.system('rm -rf ' + tmp_dir)
+
+if __name__ == '__main__':
+    # Parse arguments.
+    parser = argparse.ArgumentParser(description="""
+    This is a wrapper around a TnseqDiff R function which applies LOESS correction for replication bias. Replication bias
+    correction is only performed for the chromosome, which is assumed to have a complete representation in the assembly.
+    """)
+
+    parser.add_argument('-i', '--wig_file', type=str, help='Location of wig file with insertion information.', required=True)
+    parser.add_argument('-c', '--chrom_scaffold', type=str, help='Name of the chromosomal scaffold in the assembly/wig-file.', required=True)
+    parser.add_argument('-o', '--out_file', type=str, help='Location of the resulting wig file where counts have been corrected for replication bias.', required=True)
+    parser.add_argument('-z', '--zero_deflate', action='store_true', help='Whether to exclude zero-insertion TA sites when correcting for replication bias. Default is false.', required=False, default=False)
+    parser.add_argument('-w', '--window_size', type=int, help='The window size to use when correcting for replication bias. Default is 10,000.', required=False, default=10000)
+    parser.add_argument('-p', '--rscript_path', type=str, help='Path to the Rscript ReplicationCorrector.R for replication bias correction.', required=True)
+    args = parser.parse_args()
+
+    correction_pipeline(args.wig_file, args.chrom_scaffold, args.out_file, args.zero_deflate, args.window_size, args.rscript_path)
diff --git a/processing/HopCountsBroad.py b/processing/HopCountsBroad.py
new file mode 100755
index 0000000..b84371b
--- /dev/null
+++ b/processing/HopCountsBroad.py
@@ -0,0 +1,242 @@
+#!/usr/bin/env python
+# Rauf Salamzade
+# HopCountsBroad.py
+
+import os
+import sys
+import re
+import subprocess
+import argparse
+from collections import defaultdict
+from Bio import SeqIO
+import pysam
+
+try:
+    assert (sys.version_info[0] == 3)
+except AssertionError:
+    sys.stderr.write("Please use Python-3 to run this program. Exiting now ...\n"); sys.exit(1)
+
+# number of cores/threads to use for trimmomatic + bowtie2
+threads = 1
+
+def fastq_filter(inFastq, outputDir, uncompressFlag):
+    # Use the fastx toolkit to clip the data and the Trimmomatic software to QC filter the reads.
+
+    clippedFastq = outputDir + '.'.join(inFastq.split('/')[-1].split('.')[:-1]) + '.clipped.fastq.gz'
+    trimmedFastq = outputDir + '.'.join(inFastq.split('/')[-1].split('.')[:-1]) + '.clipped.trimmed.fastq.gz'
+    clippedQC = outputDir + "fastxClipperQC/"
+    trimmedQC = outputDir + "trimmomaticQC/"
+    createdir_cmd = ["mkdir", clippedQC, trimmedQC]
+    subprocess.call(createdir_cmd)
+
+    # Here I change the minimum length of a read to be maintained to 15, as k-mers of size 17-19 are 99%
+    # likely to be unique to a region of a given bacterial reference according to the scientific literature.
+    # Additionally, I allow reads with ambiguous 'N' characters to be retained.
+    print('Clipping adapter string from reads ... ')
+    fastx_cmd = ['fastx_clipper', '-o', clippedFastq, '-a', 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCC',
+                 '-l15', '-n', '-Q33', '-z', '-i', inFastq]
+    clipper_fastqc_cmd = ['fastqc', clippedFastq, '-o', clippedQC]
+    subprocess.call(fastx_cmd)
+    subprocess.call(clipper_fastqc_cmd)
+    print('Finished')
+
+    # NOTE: I got rid of a step to trim reads to 25 bp. To find the corresponding insertion site for each read
+    # I employ a different algorithm.
+
+    # Run Trimmomatic to filter reads based on Phred scores.
+    print('Filtering for quality ... ')
+    trimmomatic_cmd = ['trimmomatic',
+                       'SE', '-threads', str(threads), '-phred33', clippedFastq, trimmedFastq, 'LEADING:20',
+                       'TRAILING:30',
+                       'SLIDINGWINDOW:4:20', 'MINLEN:15']
+    trim_fastqc_cmd = ["fastqc", trimmedFastq, "-o", trimmedQC]
+    subprocess.call(trimmomatic_cmd)
+    subprocess.call(trim_fastqc_cmd)
+    print('Finished')
+    cleanup_cmd = ["rm", clippedFastq]
+    if uncompressFlag:
+        cleanup_cmd += [inFastq]
+    subprocess.call(cleanup_cmd)
+    return (trimmedFastq)
+
+
+def run_bowtie(inFastq, indexFile, outputDir, stringency):
+    # Original bowtie v1 command run on Tufts servers was:
+    # bowtie -q -p 4 -S -n 2 -e 70 -l 28 --maxbts 800 -k 1 -m 1 --best --phred33-quals S.aureus
+    # Sample_MHcontrol2.R1.fastq.clp.trm.qc > Sample_MHcontrol2.R1.fastq.sam
+    # Here I switch over to bowtie2:
+
+    stats_file_stdout = outputDir + "bowtie2.stdout"
+    stats_file_stderr = outputDir + "bowtie2.stderr"
+    sam_file = outputDir + '.'.join(inFastq.split('/')[-1].split('.')[:-4]) + '.sam'
+    filtered_sam_file = outputDir + '.'.join(inFastq.split('/')[-1].split('.')[:-4]) + '.filt.sam'
+    bam_file = outputDir + '.'.join(inFastq.split('/')[-1].split('.')[:-4]) + '.filt.bam'
+    bam_file_sorted = outputDir + '.'.join(inFastq.split('/')[-1].split('.')[:-4]) + '.filt.sorted.bam'
+
+    # Run bowtie2 with very sensitive settings and don't change default reporting;
+    # bowtie2's progress/log output is captured into the stderr stats file.
+    print('Running bowtie2 alignment of reads to reference index...')
+    bowtie2_cmd = ['bowtie2', '-R', '10', '--met-file', stats_file_stdout, '-p', str(threads), '--phred33',
+                   '--very-sensitive', '-x', indexFile, '-U', inFastq, '-S', sam_file]
+    with open(stats_file_stderr, 'w') as se:
+        subprocess.call(bowtie2_cmd, stderr=se)
+    print('Finished alignment')
+
+    outf = open(filtered_sam_file, 'w')
+    with open(sam_file) as osf:
+        for line in osf:
+            if line.startswith("@"): outf.write(line)
+            else:
+                if stringency.lower().strip() == 'true-uni':
+                    if not 'XS:i:' in line:
+                        ls = line.rstrip('\n').split('\t')
+                        if (ls[1] == "0" or ls[1] == "16") and (int(ls[4]) >= 3):
+                            for l in ls:
+                                if l.startswith("XM:i:"):
+                                    mismatches = l.split('XM:i:')[1]
+                                    if mismatches == "0" or mismatches == "1":
+                                        outf.write(line)
+                elif stringency.lower().strip() == 'true-multi':
+                    ls = line.rstrip('\n').split('\t')
+                    AS = None; XS = None
+                    for v in line.split():
+                        if v.startswith("AS:i:"):
+                            AS = v.split('AS:i:')[1]
+                        elif v.startswith("XS:i:"):
+                            XS = v.split('XS:i:')[1]
+                    if ls[1] != "4" and (ls[4] == "0" or ls[4] == "1") and (AS and AS == XS):
+                        outf.write(line)
+                elif stringency.lower().strip() == 'recommended':
+                    ls = line.rstrip('\n').split('\t')
+                    if (ls[1] == "0" or ls[1] == "16") and (int(ls[4]) >= 10):
+                        for l in ls:
+                            if l.startswith("XM:i:"):
+                                mismatches = l.split('XM:i:')[1]
+                                if mismatches == "0" or mismatches == "1":
+                                    outf.write(line)
+                else:
+                    sys.stderr.write("The stringency specified for the mapping was not an accepted mode. Please read the help statement for the program! Exiting now ...\n")
+                    raise RuntimeError
+    outf.close()
+
+    print('Filters applied!')
+    sam_to_bam_cmd = ["samtools", "view", "-Sb", filtered_sam_file]
+    ob = open(bam_file, 'w')
+    subprocess.call(sam_to_bam_cmd, stdout=ob)
+    ob.close()
+    sort_bam_cmd = ["samtools", "sort", "-o", bam_file_sorted, bam_file]
+    subprocess.call(sort_bam_cmd)
+    index_bam_cmd = ["samtools", "index", bam_file_sorted]
+    subprocess.call(index_bam_cmd)
+    cleanup_cmd = ["rm", inFastq, sam_file, bam_file, filtered_sam_file]
+    subprocess.call(cleanup_cmd)
+    return bam_file_sorted
+
+
+def hopcount_calc(bamFile, outputDir, refGenomeFile, resultName):
+    # Parse FASTA genome
+    reference_fasta = {}
+    ta_sites = {}
+    with open(refGenomeFile) as rGF:
+        for rec in SeqIO.parse(rGF, 'fasta'):
+            reference_fasta[rec.id] = str(rec.seq)
+            ta_sites[rec.id] = sorted([x + 1 for x in [m.start() for m in re.finditer('TA', str(rec.seq))]])
+
+    scaffolds = list(reference_fasta.keys())
+
+    print(scaffolds)
+    pysamfile = pysam.AlignmentFile(bamFile, "rb")
+    for i, scaffold in enumerate(scaffolds):
+        plusData = defaultdict(int)
+        minusData = defaultdict(int)
+        for read in pysamfile.fetch(scaffold):
+            direction = read.flag
+            location = read.reference_start + 1
+            readlength = read.reference_length
+
+            # Find the insertion TA site. If the insertion is on the plus strand, find the closest TA site to the left of the
+            # alignment's reported location. Otherwise, if the insertion is on the minus strand, find the closest TA site
+            # to the right of the alignment's reported location.
+            location = int(location)
+            if direction == 0:
+                insertion_location = location - 2
+                plusData[insertion_location] += 1
+            else:
+                insertion_location = location + readlength
+                minusData[insertion_location] += 1
+
+        # write results to output file!
+        outf_hopcounts = open(outputDir + resultName + ".tsv", 'a+')
+        outf_wigfile = open(outputDir + resultName + ".wig", 'a+')
+        outf_wigfile.write('variableStep chrom=' + scaffold + '\n')
+        if i == 0:
+            outf_hopcounts.write(
+                '\t'.join(['scaffold', 'insertion_location', 'plus_strand_counts', 'minus_strand_counts']) + '\n')
+        for loc in ta_sites[scaffold]:
+            outf_wigfile.write('\t'.join([str(loc), str(plusData[loc] + minusData[loc])]) + '\n')
+            outf_hopcounts.write('\t'.join([scaffold, str(loc), str(plusData[loc]), str(minusData[loc])]) + '\n')
+        outf_hopcounts.close()
+        outf_wigfile.close()
+
+
+def run_pipeline(fastqFile, indexName, refGenomeFile, outputDir, resultName, stringency, cores):
+    """
+    Main function that runs the pipeline by calling all three functions.
+    """
+    global threads
+    if cores != 1:
+        threads = cores
+
+    fastqFile = os.path.abspath(fastqFile)
+    indexName = os.path.abspath(indexName)
+    refGenomeFile = os.path.abspath(refGenomeFile)
+    outputDir = os.path.abspath(outputDir) + '/'
+
+    if not os.path.isdir(outputDir):
+        os.system('mkdir ' + outputDir)
+    else:
+        sys.stderr.write('Warning: output directory already exists.\n')
+
+    # Uncompress fastqFile if necessary
+    uncompress_flag = False
+    if fastqFile.endswith('.gz'):
+        fastqFile_cp = outputDir + fastqFile.split('/')[-1]
+        os.system("cp " + fastqFile + " " + fastqFile_cp)
+        os.system("gunzip " + fastqFile_cp)
+        fastqFile = outputDir + '.'.join(fastqFile.split('/')[-1].split('.')[:-1])
+        uncompress_flag = True
+
+    # Call the fastq clip, trim, filter set
+    processedFastq = fastq_filter(fastqFile, outputDir, uncompress_flag)
+
+    # Call bowtie2 to do the mapping to the reference genome
+    bamFile = run_bowtie(processedFastq, indexName, outputDir, stringency)
+
+    # Calculate TA insertion site loci and read counts for them
+    hopcount_calc(bamFile, outputDir, refGenomeFile, resultName)
+
+    print("Run has completed successfully!")
+
+if __name__ == '__main__':
+    # Parse arguments.
+
+    parser = argparse.ArgumentParser(description="""
+    Pipeline in Python to QC process TnSeq sequencing data, map reads to a reference with bowtie2, and compute \"Hop Counts\",
+    the count of insertions at each TA site.
+    """)
+    parser.add_argument('-f', '--fastq_file', help='location of the input fastq file (may be gzipped).',
+                        required=True)
+    parser.add_argument('-i', '--index_name', help='name of the bowtie2 index', required=True)
+    parser.add_argument('-r', '--ref_genome_file', help='fasta file for the reference genome', required=True)
+    parser.add_argument('-o', '--output_dir', help='path to the output directory', required=True)
+    parser.add_argument('-s', '--stringency',
+                        help='specify alignment stringency; options include: true-uni, true-multi, and recommended. Default = recommended',
+                        required=False, default="recommended")
+    parser.add_argument('-n', '--result_name', help='prefix for final result naming', required=False, default='result')
+    parser.add_argument('-c', '--cores', type=int,
+                        help='number of cores to allocate per run; used only for the trimmomatic + bowtie2 steps.',
+                        required=False, default=1)
+    args = parser.parse_args()
+
+    run_pipeline(args.fastq_file, args.index_name, args.ref_genome_file,
+                 args.output_dir, args.result_name, args.stringency, args.cores)
diff --git a/processing/ReplicationCorrector.R b/processing/ReplicationCorrector.R
new file mode 100755
index 0000000..d65094b
--- /dev/null
+++ b/processing/ReplicationCorrector.R
@@ -0,0 +1,17 @@
+# Please make sure to be using R-3.4 or later
+
+args <- commandArgs(TRUE)
+source("https://bioconductor.org/biocLite.R")
+if (!require("Tnseq")) biocLite("Tnseq")
+library(Tnseq)
+
+dat <- read.table(args[1], header=T, sep='\t')
+loci <- dat$loci
+counts <- dat$counts
+counts.adjusted <- list()
+counts.adjusted[[1]] <- loci
+biasfactor <- BiasFactor(loci, counts, window=as.numeric(args[2]))
+counts.adjusted[[2]] <- counts/biasfactor$bias.factor
+counts.adjusted.df <- data.frame(counts.adjusted)
+colnames(counts.adjusted.df) <- c("loci", "counts.adjusted")
+write.table(counts.adjusted.df, file=args[3], sep='\t', row.names=F)
\ No newline at end of file
diff --git a/processing/createTaskFileForQCandMapping.py b/processing/createTaskFileForQCandMapping.py
new file mode 100755
index 0000000..b565fc8
--- /dev/null
+++ b/processing/createTaskFileForQCandMapping.py
@@ -0,0 +1,56 @@
+import os
+import sys
+import argparse
+
+def create_parser():
+    # Parse arguments.
+    parser = argparse.ArgumentParser(description="""
+    This quick script will generate a "task" file where each line corresponds to a HopCountsBroad.py command for
+    processing and mapping a single fastq/sample file.""")
+
+    parser.add_argument('-f', '--fastq_dir', help='location of the directory with input fastq files.', required=True)
+    parser.add_argument('-i', '--index_name', help='name of the bowtie2 index', required=True)
+    parser.add_argument('-r', '--ref_genome_file', help='fasta file for the reference genome', required=True)
+    parser.add_argument('-o', '--output_dir', help='path to the general output directory', required=True)
+    parser.add_argument('-p', '--program', help='path to the Python program HopCountsBroad.py.', required=True)
+    parser.add_argument('-c', '--cores', type=int,
+                        help='number of cores to allocate per run; used only for the trimmomatic + bowtie2 steps.',
+                        required=False, default=1)
+    parser.add_argument('-m', '--mapping', help='Provide a mapping of sequencing data to sample names. First column is the sample name, second column is the sequencing file [up to .fastq or .fastq.gz].',
+                        required=False, default=None)
+    parser.add_argument('-s', '--stringency', help='specify which mapping stringency to use.', required=False, default="recommended")
+    args = parser.parse_args()
+
+    return args.fastq_dir, args.index_name, args.ref_genome_file, args.output_dir, args.cores, args.mapping, args.stringency, args.program
+
+fastq_dir, index_name, ref_genome_file, output_dir, cores, mapping_file, stringency, program = create_parser()
+
+fastq_dir = os.path.abspath(fastq_dir) + '/'
+output_dir = os.path.abspath(output_dir) + '/'
+ref_genome_file = os.path.abspath(ref_genome_file)
+index_name = os.path.abspath(index_name)
+program = os.path.abspath(program)
+if mapping_file: mapping_file = os.path.abspath(mapping_file)
+
+mapping = {}
+if mapping_file and os.path.isfile(mapping_file):
+    with open(mapping_file) as omf:
+        for line in omf:
+            line = line.rstrip('\n')
+            ls = line.split('\t')
+            mapping[ls[1].split('.fastq')[0]] = ls[0]
+
+if not os.path.isdir(output_dir):
+    os.system('mkdir ' + output_dir)
+
+for f in os.listdir(fastq_dir):
+    if f.endswith(".fastq") or f.endswith(".fastq.gz"):
+        fastqFile = fastq_dir + f
+        outDir = f.split('.fastq')[0]
+        if not f.startswith("Undetermined") and (not mapping or outDir in mapping):
+            if mapping:
+                outDir = mapping[outDir]
+            print(' '.join([program, '-f', fastqFile, '-i', index_name, '-r', ref_genome_file, '-o',
+                            output_dir + outDir, '-n', outDir, '-c', str(cores), '-s', stringency]))
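The printed task file is simply one HopCountsBroad.py command per line, so any scheduler (or GNU parallel) can consume it. As a minimal illustration, a hypothetical serial driver in Python, assuming the script's output was redirected to a file named tasks.txt:

```python
import shlex
import subprocess

# Hypothetical driver: run each generated HopCountsBroad.py command in turn.
with open('tasks.txt') as tf:
    for line in tf:
        cmd = shlex.split(line.strip())
        if cmd:
            subprocess.call(cmd)
```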