Initial commit

broadinstitute · Sep 14, 2020 · 84ff524 · 84ff524
commit 84ff524
Show file tree

Hide file tree

Showing 15 changed files with 1,375 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,104 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2020, Broad Institute
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
@@ -0,0 +1,22 @@
+# PerMutation
+Code for Tn-Seq analysis of _E. faecalis_ MMH594. Check out the wiki more details:
+
+1. [Processing, Mapping, Replication Bias Correction, and Normalization Across Replicates](https://github.com/broadinstitute/PerMutation/wiki/1.-Processing,-Mapping,-Replication-Bias-Correction,-and-Normalization-Across-Replicates)
+2. [Assessing the Fitness Cost of Genes For Growth on Nutrient Rich Media](https://github.com/broadinstitute/PerMutation/wiki/2.-Assessing-the-Fitness-Cost-of-Genes-For-Growth-on-Nutrient-Rich-Media)
+3. [Predicting Whether Genes Contribute to Antibiotic Resistance](https://github.com/broadinstitute/PerMutation/wiki/3.-Predicting-Whether-Genes-Contribute-to-Antibiotic-Resistance)
+
+All programs/scripts written in Python-3, no real installation required. Python-3 libraries used include:
+
+* biopython 
+* numpy
+* scipy
+* pysam
+
+For processing and mapping of sequencing data to the reference genome the following third-party tools are necessary:
+
+* trimmomatic
+* samtools
+* bowtie2
+* fastx_toolkit
+
+Progams should work if the user creates and uses a conda environment from the provided yml file.
diff --git a/fitness_cost_assessment/DvalCalculator.py b/fitness_cost_assessment/DvalCalculator.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python
+# Rauf Salamzade
+# DvalCalculator.py
+import os
+import sys
+from sys import stderr
+import argparse
+from collections import defaultdict
+from Bio import SeqIO
+import math
+try: assert(sys.version_info[0] == 3)
+except: stderr.write("Please use Python-3 to run this program. Exiting now ...\n"); sys.exit(1)
+
+def get_scaffold_lengths(fasta):
+	scaf_lens = {}
+	with open(fasta) as of:
+		for rec in SeqIO.parse(of, 'fasta'):
+			scaf_lens[rec.id] = len(str(rec.seq))
+	return scaf_lens
+
+def parse_gff(gff, upstream, trim):
+	trim_edge = (1.0-trim)/2.0
+	pos_to_gene = {}
+	genes = {}
+	exon_lens = {}
+	with open(gff) as og:
+		for line in og:
+			line = line.rstrip('\n')
+			ls = line.split('\t')
+			if not line.startswith("#") and ls[2] == 'gene':
+				gene = ls[-1].split('ID=')[1].split(';')[0]
+				scaffold = ls[0]; start = int(ls[3]); stop = int(ls[4]); strand = ls[6]
+				if strand == '+': start -= upstream
+				elif strand == '-': stop += upstream
+				glen = abs(start-stop)+1
+				# trim start loci and round up to integer
+				start = int(math.floor(start + (glen*trim_edge)))
+				# trim stop loci and round down to integer
+				stop = int(math.ceil(stop - (glen*trim_edge)))
+				# Please note, gene length is the length of the feature
+				# after trimming and upstream region addition are accounted
+				# for.
+				if not pos_to_gene.has_key(scaffold):
+					pos_to_gene[scaffold] = defaultdict(set)
+					genes[scaffold] = {}
+					exon_lens[scaffold] = 0
+				exon_lens[scaffold] += abs(start-stop)
+				genes[scaffold][gene] = [start, stop]
+				for bp in range(start, stop+1):
+					pos_to_gene[scaffold][bp].add(gene)
+	return genes, pos_to_gene, exon_lens
+
+def load_counts(wig_file):
+	hopcounts = {}
+	scaf = None
+	with open(wig_file) as ow:
+		for i, line in enumerate(ow):
+			line = line.rstrip('\n')
+			if line.startswith("variableStep"):
+				scaf = line.split('chrom=')[1]
+				hopcounts[scaf] = defaultdict(float)
+			else:
+				pos, insert_count = line.split('\t')
+				hopcounts[scaf][int(pos)] = float(insert_count)
+	return hopcounts
+
+def aggregate_counts(scaf_lens, pos_to_gene, hopcounts, exon_only):
+	gene_counts = {}
+	total_counts = defaultdict(float)
+	for scaf in scaf_lens:
+		gene_counts[scaf] = defaultdict(float)
+		for bp in range(1, scaf_lens[scaf] + 1):
+			if hopcounts[scaf].has_key(bp):
+				if pos_to_gene[scaf].has_key(bp):
+					genes = pos_to_gene[scaf][bp]
+					for g in genes:
+						gene_counts[scaf][g] += hopcounts[scaf][bp]
+					total_counts[scaf] += hopcounts[scaf][bp]
+				elif not exon_only:
+					total_counts[scaf] += hopcounts[scaf][bp]
+	return gene_counts, total_counts
+
+def compute_dvals(output, genes, exon_lens, scaf_lens, gene_counts, total_counts, trim, upstream, exon_only):
+	for s in genes:
+		outf = open('.'.join([output, s.replace('.','-'), 'txt']), 'w')
+		outf.write('gene\tscaffold\tstart\tstop\tgene_length\ttrim\tupstream\texon_only\tobserved_insertions\tdval\n')
+		for g in genes[s]:
+			observed_count = gene_counts[s][g]
+			gstart, gend = genes[s][g]
+			glen = abs(gstart - gend)+1
+			expected_count = None
+			if exon_only: expected_count = (float(glen)/exon_lens[s])*total_counts[s]
+			else: expected_count = (float(glen)/scaf_lens[s])*total_counts[s]
+			dval = float(observed_count)/expected_count
+			outf.write('\t'.join([str(x) for x in [g, s, gstart, gend, glen, trim, upstream, exon_only, observed_count, dval]])+ '\n')
+		outf.close()
+
+def calc_dvalue_pipeline(wig_file, output, fasta, gff, exon_only, upstream, trim):
+	""" Main function which wraps everything together """
+	try: assert(upstream >= 0)
+	except: sys.stderr.write('Error: -u parameter must be provided positive integer or zero. Exiting now ...\n'); sys.exit(1)
+	try: assert(trim > 0.0 and trim <= 1.0)
+	except: sys.stderr.write('Error: -t trim must be provided positive float greater than 0.0 and less than or equal to 1.0. Exiting now ...\n'); sys.exit(1)
+	try: assert(os.path.isfile(fasta) and os.path.isfile(wig_file) and os.path.isfile(gff))
+	except:	sys.stderr.write('Error: Unable to locate hopcounts, gff, and/or fasta input files! Please check input and try again. Exiting now ...\n'); sys.exit(1)
+	if trim != 1.0 and upstream != 0: sys.stderr.write('Warning: Using -u and -t parameters simultaneously! Please consider if this is truly what you want. Continuing ...\n')
+
+	scaf_lens = get_scaffold_lengths(fasta)
+	genes, pos_to_gene, exon_lens = parse_gff(gff, upstream, trim)
+	pos_counts = load_counts(wig_file)
+	gene_counts, tot_counts = aggregate_counts(scaf_lens, pos_to_gene, pos_counts, exon_only)
+	compute_dvals(output, genes, exon_lens, scaf_lens, gene_counts, tot_counts, trim, upstream, exon_only)
+
+if __name__ == '__main__':
+	#Parse arguments.
+	parser = argparse.ArgumentParser(description="""
+	Progam to compute the dVal for each gene, a simple metric which simultaneously normalizes for sequencing depth and gene length and quantifies fitness.
+	""")
+
+	parser.add_argument('-i', '--wig_file', help='Location of wig file with insertion information.', required=True)
+	parser.add_argument('-o', '--output', help='Location and prefix of output file with Dval stats from aggregated counts.', required=True)
+	parser.add_argument('-f', '--fasta', help='Fasta file of genome assembly corresponding to GFF', required=True)
+	parser.add_argument('-g', '--gff', help='GFF file from Vesper/Calhoun. Gene ID needs to be in aliases identified by the key "ID".', required=True)
+	parser.add_argument('-e', '--exon_only', action='store_true', help='Calculate Dval accounting for only insertions which lie on genes.', required=False, default=False)
+	parser.add_argument('-t', '--trim', type=float, help='Middle proportion of genes to consider. For instance, 0.8 specifies that the first and last 10 percent of bases in the gene will be ignored.', required=False, default=1.0) 
+	parser.add_argument('-u', '--upstream', type=int, help='Include specified number of bases upstream as part of gene to test if including promotor regions maintains essentiallity. Should generally not be used in combination with trim. Default is 0.', default=0, required=False)
+
+	args = parser.parse_args()
+
+	calc_dvalue_pipeline(args.wig_file, args.output, args.fasta, args.gff, args.exon_only, args.upstream, args.trim)
diff --git a/fitness_cost_assessment/GenicPerMutant.py b/fitness_cost_assessment/GenicPerMutant.py
@@ -0,0 +1,95 @@
+import os
+import sys
+import numpy as np
+import argparse
+import random
+from scipy import stats
+import math
+
+def permutant(wig, gff, output, gene, window, genome_length, permutation):
+
+    start = None
+    stop = None
+    genic_regions = set([])
+
+    with open(gff) as ogf:
+        for line in ogf:
+            line = line.rstrip()
+            ls = line.split('\t')
+            if ls[2] != 'gene': continue
+            gene_id =  ls[-1].split('ID=')[-1].split(';')[0]
+            if gene_id == gene:
+                start = int(ls[3])
+                stop = int(ls[4])
+            for p in range(int(ls[3]), int(ls[4])+1): genic_regions.add(p)
+
+    glen = abs(start-stop) + 1
+    mid80_start = int(math.floor(start + (glen*0.10)))
+    mid80_stop = int(math.ceil(stop - (glen*0.10)))
+
+
+    window_start = [start - window, None]
+    window_stop = [stop + window, None]
+    if window_start[0] < 1:
+        alt_start = genome_length - (window - start)
+        window_start = [1, alt_start]
+        window_stop[1] = genome_length
+    elif window_stop[0] > genome_length:
+        alt_stop = window_stop[0] - genome_length
+        window_stop = [genome_length, alt_stop]
+        window_start[1] = 1
+
+    window_tot_ta = 0
+    window_sat_ta = 0
+    region_ta = 0
+    region_sum = 0
+    window_nz_counts = []
+    with open(wig) as ow:
+        for line in ow:
+            line = line.rstrip('\n')
+            if line.startswith('variable'): continue
+            ls = line.split('\t')
+            pos = int(ls[0])
+            count = float(ls[1])
+            if (pos >= window_start[0] and pos <= window_stop[0]) or (window_start[1] and pos >= window_start[1] and pos <= window_stop[1]):# and pos in genic_regions:
+                if pos >= mid80_start and pos <= mid80_stop:
+                    region_ta += 1
+                    region_sum += count
+                window_tot_ta += 1
+                if count > 0.0:
+                    window_nz_counts.append(count)
+                    window_sat_ta += 1
+
+    p = window_sat_ta/float(window_tot_ta)
+    empirical_pvalue = 0
+
+    outf = open(output, 'w')
+    outf.write(str(region_sum) + '\n')
+    for i in range(0, permutation):
+        simulated_nz_sites = np.random.binomial(region_ta, p, 1)[0]
+        simulated_sum = 0
+        random.shuffle(window_nz_counts)
+        if simulated_nz_sites > 0: simulated_sum = sum(window_nz_counts[:simulated_nz_sites])
+        if simulated_sum <= region_sum: empirical_pvalue += 1
+        outf.write(str(simulated_sum) + '\n')
+
+    outf.write('#' + str(empirical_pvalue/permutation) + '\t' + str(p) + '\t' + str(region_ta) + '\n')
+    outf.close()
+
+if __name__ == '__main__':
+    # Parse arguments.
+    parser = argparse.ArgumentParser(description="""
+	Progam to simulate localized permutation of Tn mutant abundances in gene relative to local surroundings to estimate
+	probability that the gene is depleted in mutants to expectations.
+	""")
+
+    parser.add_argument('-i', '--wig_file', help='Location of wig file with insertion information.', required=True)
+    parser.add_argument('-a', '--gff', help='gff annotation', required=True)
+    parser.add_argument('-o', '--output', help='Location and prefix of output file with Dval stats from aggregated counts.', required=True)
+    parser.add_argument('-d', '--gene', help='gene id', required=True)
+    parser.add_argument('-w', '--window', type=int, help='The window size.', required=True)
+    parser.add_argument('-g', '--genome_length', type=int, help='Length of genome', required=True)
+    parser.add_argument('-p', '--permutation', type=int, help='permutations', required=False, default=100000)
+
+    args = parser.parse_args()
+    permutant(args.wig_file, args.gff, args.output, args.gene, args.window, args.genome_length, args.permutation)