Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
raufs committed Sep 14, 2020
0 parents commit 84ff524
Show file tree
Hide file tree
Showing 15 changed files with 1,375 additions and 0 deletions.
104 changes: 104 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
29 changes: 29 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
BSD 3-Clause License

Copyright (c) 2020, Broad Institute
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
22 changes: 22 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# PerMutation
Code for Tn-Seq analysis of _E. faecalis_ MMH594. Check out the wiki more details:

1. [Processing, Mapping, Replication Bias Correction, and Normalization Across Replicates](https://github.com/broadinstitute/PerMutation/wiki/1.-Processing,-Mapping,-Replication-Bias-Correction,-and-Normalization-Across-Replicates)
2. [Assessing the Fitness Cost of Genes For Growth on Nutrient Rich Media](https://github.com/broadinstitute/PerMutation/wiki/2.-Assessing-the-Fitness-Cost-of-Genes-For-Growth-on-Nutrient-Rich-Media)
3. [Predicting Whether Genes Contribute to Antibiotic Resistance](https://github.com/broadinstitute/PerMutation/wiki/3.-Predicting-Whether-Genes-Contribute-to-Antibiotic-Resistance)

All programs/scripts written in Python-3, no real installation required. Python-3 libraries used include:

* biopython
* numpy
* scipy
* pysam

For processing and mapping of sequencing data to the reference genome the following third-party tools are necessary:

* trimmomatic
* samtools
* bowtie2
* fastx_toolkit

Progams should work if the user creates and uses a conda environment from the provided yml file.
130 changes: 130 additions & 0 deletions fitness_cost_assessment/DvalCalculator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#!/usr/bin/env python
# Rauf Salamzade
# DvalCalculator.py
import os
import sys
from sys import stderr
import argparse
from collections import defaultdict
from Bio import SeqIO
import math
try: assert(sys.version_info[0] == 3)
except: stderr.write("Please use Python-3 to run this program. Exiting now ...\n"); sys.exit(1)

def get_scaffold_lengths(fasta):
scaf_lens = {}
with open(fasta) as of:
for rec in SeqIO.parse(of, 'fasta'):
scaf_lens[rec.id] = len(str(rec.seq))
return scaf_lens

def parse_gff(gff, upstream, trim):
trim_edge = (1.0-trim)/2.0
pos_to_gene = {}
genes = {}
exon_lens = {}
with open(gff) as og:
for line in og:
line = line.rstrip('\n')
ls = line.split('\t')
if not line.startswith("#") and ls[2] == 'gene':
gene = ls[-1].split('ID=')[1].split(';')[0]
scaffold = ls[0]; start = int(ls[3]); stop = int(ls[4]); strand = ls[6]
if strand == '+': start -= upstream
elif strand == '-': stop += upstream
glen = abs(start-stop)+1
# trim start loci and round up to integer
start = int(math.floor(start + (glen*trim_edge)))
# trim stop loci and round down to integer
stop = int(math.ceil(stop - (glen*trim_edge)))
# Please note, gene length is the length of the feature
# after trimming and upstream region addition are accounted
# for.
if not pos_to_gene.has_key(scaffold):
pos_to_gene[scaffold] = defaultdict(set)
genes[scaffold] = {}
exon_lens[scaffold] = 0
exon_lens[scaffold] += abs(start-stop)
genes[scaffold][gene] = [start, stop]
for bp in range(start, stop+1):
pos_to_gene[scaffold][bp].add(gene)
return genes, pos_to_gene, exon_lens

def load_counts(wig_file):
hopcounts = {}
scaf = None
with open(wig_file) as ow:
for i, line in enumerate(ow):
line = line.rstrip('\n')
if line.startswith("variableStep"):
scaf = line.split('chrom=')[1]
hopcounts[scaf] = defaultdict(float)
else:
pos, insert_count = line.split('\t')
hopcounts[scaf][int(pos)] = float(insert_count)
return hopcounts

def aggregate_counts(scaf_lens, pos_to_gene, hopcounts, exon_only):
gene_counts = {}
total_counts = defaultdict(float)
for scaf in scaf_lens:
gene_counts[scaf] = defaultdict(float)
for bp in range(1, scaf_lens[scaf] + 1):
if hopcounts[scaf].has_key(bp):
if pos_to_gene[scaf].has_key(bp):
genes = pos_to_gene[scaf][bp]
for g in genes:
gene_counts[scaf][g] += hopcounts[scaf][bp]
total_counts[scaf] += hopcounts[scaf][bp]
elif not exon_only:
total_counts[scaf] += hopcounts[scaf][bp]
return gene_counts, total_counts

def compute_dvals(output, genes, exon_lens, scaf_lens, gene_counts, total_counts, trim, upstream, exon_only):
for s in genes:
outf = open('.'.join([output, s.replace('.','-'), 'txt']), 'w')
outf.write('gene\tscaffold\tstart\tstop\tgene_length\ttrim\tupstream\texon_only\tobserved_insertions\tdval\n')
for g in genes[s]:
observed_count = gene_counts[s][g]
gstart, gend = genes[s][g]
glen = abs(gstart - gend)+1
expected_count = None
if exon_only: expected_count = (float(glen)/exon_lens[s])*total_counts[s]
else: expected_count = (float(glen)/scaf_lens[s])*total_counts[s]
dval = float(observed_count)/expected_count
outf.write('\t'.join([str(x) for x in [g, s, gstart, gend, glen, trim, upstream, exon_only, observed_count, dval]])+ '\n')
outf.close()

def calc_dvalue_pipeline(wig_file, output, fasta, gff, exon_only, upstream, trim):
""" Main function which wraps everything together """
try: assert(upstream >= 0)
except: sys.stderr.write('Error: -u parameter must be provided positive integer or zero. Exiting now ...\n'); sys.exit(1)
try: assert(trim > 0.0 and trim <= 1.0)
except: sys.stderr.write('Error: -t trim must be provided positive float greater than 0.0 and less than or equal to 1.0. Exiting now ...\n'); sys.exit(1)
try: assert(os.path.isfile(fasta) and os.path.isfile(wig_file) and os.path.isfile(gff))
except: sys.stderr.write('Error: Unable to locate hopcounts, gff, and/or fasta input files! Please check input and try again. Exiting now ...\n'); sys.exit(1)
if trim != 1.0 and upstream != 0: sys.stderr.write('Warning: Using -u and -t parameters simultaneously! Please consider if this is truly what you want. Continuing ...\n')

scaf_lens = get_scaffold_lengths(fasta)
genes, pos_to_gene, exon_lens = parse_gff(gff, upstream, trim)
pos_counts = load_counts(wig_file)
gene_counts, tot_counts = aggregate_counts(scaf_lens, pos_to_gene, pos_counts, exon_only)
compute_dvals(output, genes, exon_lens, scaf_lens, gene_counts, tot_counts, trim, upstream, exon_only)

if __name__ == '__main__':
#Parse arguments.
parser = argparse.ArgumentParser(description="""
Progam to compute the dVal for each gene, a simple metric which simultaneously normalizes for sequencing depth and gene length and quantifies fitness.
""")

parser.add_argument('-i', '--wig_file', help='Location of wig file with insertion information.', required=True)
parser.add_argument('-o', '--output', help='Location and prefix of output file with Dval stats from aggregated counts.', required=True)
parser.add_argument('-f', '--fasta', help='Fasta file of genome assembly corresponding to GFF', required=True)
parser.add_argument('-g', '--gff', help='GFF file from Vesper/Calhoun. Gene ID needs to be in aliases identified by the key "ID".', required=True)
parser.add_argument('-e', '--exon_only', action='store_true', help='Calculate Dval accounting for only insertions which lie on genes.', required=False, default=False)
parser.add_argument('-t', '--trim', type=float, help='Middle proportion of genes to consider. For instance, 0.8 specifies that the first and last 10 percent of bases in the gene will be ignored.', required=False, default=1.0)
parser.add_argument('-u', '--upstream', type=int, help='Include specified number of bases upstream as part of gene to test if including promotor regions maintains essentiallity. Should generally not be used in combination with trim. Default is 0.', default=0, required=False)

args = parser.parse_args()

calc_dvalue_pipeline(args.wig_file, args.output, args.fasta, args.gff, args.exon_only, args.upstream, args.trim)
95 changes: 95 additions & 0 deletions fitness_cost_assessment/GenicPerMutant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import os
import sys
import numpy as np
import argparse
import random
from scipy import stats
import math

def permutant(wig, gff, output, gene, window, genome_length, permutation):

start = None
stop = None
genic_regions = set([])

with open(gff) as ogf:
for line in ogf:
line = line.rstrip()
ls = line.split('\t')
if ls[2] != 'gene': continue
gene_id = ls[-1].split('ID=')[-1].split(';')[0]
if gene_id == gene:
start = int(ls[3])
stop = int(ls[4])
for p in range(int(ls[3]), int(ls[4])+1): genic_regions.add(p)

glen = abs(start-stop) + 1
mid80_start = int(math.floor(start + (glen*0.10)))
mid80_stop = int(math.ceil(stop - (glen*0.10)))


window_start = [start - window, None]
window_stop = [stop + window, None]
if window_start[0] < 1:
alt_start = genome_length - (window - start)
window_start = [1, alt_start]
window_stop[1] = genome_length
elif window_stop[0] > genome_length:
alt_stop = window_stop[0] - genome_length
window_stop = [genome_length, alt_stop]
window_start[1] = 1

window_tot_ta = 0
window_sat_ta = 0
region_ta = 0
region_sum = 0
window_nz_counts = []
with open(wig) as ow:
for line in ow:
line = line.rstrip('\n')
if line.startswith('variable'): continue
ls = line.split('\t')
pos = int(ls[0])
count = float(ls[1])
if (pos >= window_start[0] and pos <= window_stop[0]) or (window_start[1] and pos >= window_start[1] and pos <= window_stop[1]):# and pos in genic_regions:
if pos >= mid80_start and pos <= mid80_stop:
region_ta += 1
region_sum += count
window_tot_ta += 1
if count > 0.0:
window_nz_counts.append(count)
window_sat_ta += 1

p = window_sat_ta/float(window_tot_ta)
empirical_pvalue = 0

outf = open(output, 'w')
outf.write(str(region_sum) + '\n')
for i in range(0, permutation):
simulated_nz_sites = np.random.binomial(region_ta, p, 1)[0]
simulated_sum = 0
random.shuffle(window_nz_counts)
if simulated_nz_sites > 0: simulated_sum = sum(window_nz_counts[:simulated_nz_sites])
if simulated_sum <= region_sum: empirical_pvalue += 1
outf.write(str(simulated_sum) + '\n')

outf.write('#' + str(empirical_pvalue/permutation) + '\t' + str(p) + '\t' + str(region_ta) + '\n')
outf.close()

if __name__ == '__main__':
# Parse arguments.
parser = argparse.ArgumentParser(description="""
Progam to simulate localized permutation of Tn mutant abundances in gene relative to local surroundings to estimate
probability that the gene is depleted in mutants to expectations.
""")

parser.add_argument('-i', '--wig_file', help='Location of wig file with insertion information.', required=True)
parser.add_argument('-a', '--gff', help='gff annotation', required=True)
parser.add_argument('-o', '--output', help='Location and prefix of output file with Dval stats from aggregated counts.', required=True)
parser.add_argument('-d', '--gene', help='gene id', required=True)
parser.add_argument('-w', '--window', type=int, help='The window size.', required=True)
parser.add_argument('-g', '--genome_length', type=int, help='Length of genome', required=True)
parser.add_argument('-p', '--permutation', type=int, help='permutations', required=False, default=100000)

args = parser.parse_args()
permutant(args.wig_file, args.gff, args.output, args.gene, args.window, args.genome_length, args.permutation)
Loading

0 comments on commit 84ff524

Please sign in to comment.