Bump version: 0.0.0 → 0.0.1

stracquadaniolab · May 27, 2024 · b3cd8f3 · b3cd8f3
commit b3cd8f3
Show file tree

Hide file tree

Showing 17 changed files with 4,855 additions and 0 deletions.
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -0,0 +1,14 @@
+[bumpversion]
+current_version = 0.0.1
+commit = True
+tag = True
+
+[bumpversion:file:readme.md]
+
+[bumpversion:file:conf/base.config]
+
+[bumpversion:file:containers/Dockerfile]
+
+[bumpversion:file:.github/workflows/ci.yml]
+search = {{{{ github.repository }}}}:{current_version}
+replace = {{{{ github.repository }}}}:{new_version}
diff --git a/.devcontainer.json b/.devcontainer.json
@@ -0,0 +1,37 @@
+{
+	// setup a container to develop a python package using miniconda3
+	"name": "quickMD",
+	"dockerFile": "containers/Dockerfile",
+	"context": ".",
+	"build": {"args": {"CONDA_FILE": "containers/environment.yml"}},
+
+	// Set *default* container specific settings.json values.
+	"settings": {
+
+		"terminal.integrated.profiles.linux": {
+			"bash": {
+				"path": "/bin/bash",
+			}
+		},
+		"terminal.integrated.defaultProfile.windows": "bash",
+		"python.pythonPath": "/opt/conda/bin/python",
+		"python.formatting.provider": "black",
+		"python.formatting.blackPath": "/opt/conda/bin/black",
+		"[python]": {
+			"editor.rulers": [
+				90
+			]
+		},
+	},
+
+	// extensions for vscode
+	"extensions": [
+		"ms-python.python",
+		"stkb.rewrap",
+		"donjayamanne.githistory",
+		"aaron-bond.better-comments",
+		"gruntfuggly.todo-tree",
+		"mhutchie.git-graph"
+	],
+
+}
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,42 @@
+name: "build"
+on: [push]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v1
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1        
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v1
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GHCR_TOKEN }}
+      - name: Build Docker image
+        uses: docker/build-push-action@v2
+        with:
+          context: .
+          file: containers/Dockerfile
+          platforms: linux/amd64
+          load: true
+          push: false
+          tags: ghcr.io/${{ github.repository }}:0.0.1, ghcr.io/${{ github.repository }}:latest
+      - name: 'Install Nextflow'
+        uses: stracquadaniolab/gh-action-setup-nextflow@v0.2.0
+      - name: 'Testing Nextflow pipeline with test profile'
+        run: |
+          chmod +x bin/*
+          nextflow run . -profile test,ci,docker      
+      - name: Push image to GitHub container registry
+        if: startsWith(github.event.ref, 'refs/tags')
+        uses: docker/build-push-action@v2
+        with:
+          context: .
+          file: containers/Dockerfile
+          platforms: linux/amd64
+          push: true
+          tags: ghcr.io/${{ github.repository }}:0.0.1, ghcr.io/${{ github.repository }}:latest
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,7 @@
+work/
+results/
+logs/
+.DS_Store
+*.pyc
+.vscode
+.nextflow*
diff --git a/bin/fit.py b/bin/fit.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python3
+
+"""
+    fit.py 
+
+    Usage:
+        fit.py <inputfile>
+"""
+
+from docopt import docopt
+
+def linear(filename: str = "data.txt"):
+    print("fitting a linear model from: ", filename)
+
+if __name__ == "__main__":
+    arguments = docopt(__doc__, version='fit')
+    linear(arguments['<inputfile>'])
diff --git a/bin/openmmMD.py b/bin/openmmMD.py
@@ -0,0 +1,183 @@
+#!/usr/bin/env python3
+"""openmm-MD
+
+Minimise the potential energy of the wildtype protein structure and mutated variants, according to AMBER14 potentials 
+Usage:
+openmmMD.py [--incsv=<incsv>] [--from-col=<col>] [--in-pdb=<pdb>] [--chain=<chain>] [--pH=<pH>] [--no_restraints] 
+
+Options:
+--incsv=<incsv>    Read data from csv file
+--from-col=<col>   Select column title containing mutations
+--in-pdb=<pdb>     PDB file of protein wildtype
+--chain=<chain>    Select chain upon which to perform mutation
+--pH=<ph>          Set pH of the protein
+--no_restraints    Allow movement of all atoms
+"""
+
+import logging
+from docopt import docopt
+import pandas as pd
+import numpy as np
+import re
+import sys
+import pathlib
+import MDAnalysis as mda
+from MDAnalysis.analysis import rms, align
+from sys import stdout
+from pdbfixer import PDBFixer
+from openmm.app import *
+from openmm import *
+from openmm.unit import *
+
+def get_data(csvname: str, col: str):
+    data = pd.read_csv(csvname)
+    clipped_data=pd.DataFrame(data=data, columns=[col])
+    divided_data = clipped_data[col].str.extract(r'([a-zA-Z]+)([^a-zA-Z]+)([a-zA-Z]+)')
+    divided_cleaned = divided_data.to_string(header=None, index=False)
+    amino_index = {'G': 'GLY' , 'L': 'LEU', 'R': 'ARG', 'A': 'ALA', 'Y': 'TYR', 'S': 'SER', 'M': 'MET', 'I': 'ILE', 'T': 'THR', 'C': 'CYS', 'V': 'VAL', 'P': 'PRO', 'F': 'PHE', 'W': 'TRP', 'H': 'HIS', 'K': 'LYS', 'D': 'ASP', 'E': 'GLU', 'N': 'ASN', 'Q': 'GLN', ' ': '-'}
+    new_rep_codes = re.sub(r"[GLRAYSMITCVPFWHKDENQ ]", lambda x: amino_index[x.group(0)], divided_cleaned)
+    new_rep_cleaned = re.sub("--","-", new_rep_codes)
+    return new_rep_cleaned
+
+def reformat_data(csvname: str, new_rep_cleaned):
+    df_newcol = new_rep_cleaned.splitlines()
+    pre_df = {'name': df_newcol}
+    df1 = pd.DataFrame(pre_df)
+    df2 = pd.read_csv(csvname)
+    df3 = pd.concat([df1, df2['ΔΔG']], axis=1)
+    stem = csvname.replace(".csv","")
+    csv_reformat = str(stem + "_reformat.csv")
+    df3.to_csv(csv_reformat)
+
+def clean_wildtype(pdbname: str, pH: str):
+    pH_fl = float(pH)
+    pdb = PDBFixer(pdbname)
+    #pdb.findMissingResidues()
+    pdb.missingResidues = {}
+    #pdb.findNonstandardResidues()
+    #pdb.replaceNonstandardResidues()
+    pdb.removeHeterogens(False)
+    pdb.findMissingAtoms()
+    pdb.addMissingAtoms()
+    pdb.addMissingHydrogens(pH_fl)
+    PDBFile.writeFile(pdb.topology, pdb.positions, open("wildtype_fixed.pdb", 'w'), keepIds=True)
+    return pdb
+
+def create_mutants(pdbname: str, mutant, chain: str, pH: str):
+    pH_fl = float(pH)
+    mutpdb = PDBFixer(pdbname)
+    mutpdb.applyMutations([mutant], chain)
+    #mutpdb.findMissingResidues()
+    mutpdb.missingResidues = {}
+    #mutpdb.findNonstandardResidues()
+    #mutpdb.replaceNonstandardResidues()
+    mutpdb.removeHeterogens(False)
+    mutpdb.findMissingAtoms()
+    mutpdb.addMissingAtoms()
+    mutpdb.addMissingHydrogens(pH_fl)
+    PDBFile.writeFile(mutpdb.topology, mutpdb.positions, open(mutant + "_fixed.pdb", 'w'), keepIds=True)
+    return mutpdb
+
+def setup_forcefield():
+    forcefield = ForceField('amber14-all.xml', 'amber14/tip3pfb.xml')
+    return forcefield
+
+def setup_modeller(pdb):
+    modeller = Modeller(pdb.topology, pdb.positions)
+    return modeller
+
+def setup_system(modeller, forcefield, solvmol: str, no_restraints: bool):
+    mol=int(solvmol)
+    modeller.addSolvent(forcefield, numAdded=mol)
+    system = forcefield.createSystem(modeller.topology, nonbondedMethod=PME, nonbondedCutoff=1.0*nanometer, constraints=HBonds)
+    if not no_restraints:
+        logging.info("Using restraints on backbone")
+        for atom in modeller.topology.atoms():
+            if atom.name in ["CA","C","N"]:
+                system.setParticleMass(atom.index, 0)
+    return system
+
+def setup_simulation(modeller, system):
+    integrator = LangevinMiddleIntegrator(300*kelvin, 1/picosecond, 0.001*picoseconds)
+    simulation = Simulation(modeller.topology, system, integrator)
+    simulation.context.setPositions(modeller.positions)
+    return simulation
+
+def energy_minimization(modeller):
+    forcefield = setup_forcefield()
+    system = setup_system(modeller, forcefield, arguments['--no_restraints'])
+    simulation = setup_simulation(modeller, system)
+    init_state = simulation.context.getState(getEnergy=True, getPositions=True)
+    logging.info("Starting potential energy = %.9f kcal/mol"
+        % init_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole))
+    simulation.minimizeEnergy
+    final_state = simulation.context.getState(getEnergy=True, getPositions=True)
+    logging.info("Starting potential energy = %.9f kcal/mol"
+        % final_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole))
+    final_pe = final_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole)
+    return simulation.topology, final_state.getPositions(), final_pe, simulation, 
+
+def md_nvt(simulation, csvname: str, totalsteps: int, reprate: int, pdbname):
+    init_state = simulation.context.getState(getEnergy=True, getPositions=True)
+    init_pe = init_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole)
+    simulation.reporters.append(PDBReporter(pdbname, reprate))
+    simulation.reporters.append(StateDataReporter(stdout, reprate, step=True, potentialEnergy=True, temperature=True, volume=True))
+    prepdf = {'Step':[], 'Potential Energy (kJ/mole)':[], 'Temperature (K)':[], 'Box Volume (nm^3)':[]}
+    inidf = pd.DataFrame(prepdf)
+    inidf.to_csv(csvname, index=False)
+    simulation.reporters.append(StateDataReporter(csvname, reprate, step=True,
+        potentialEnergy=True, temperature=True, volume=True, append=True))
+    simulation.step(totalsteps)
+    final_state = simulation.context.getState(getEnergy=True, getPositions=True)
+    final_pe = final_state.getPotentialEnergy().value_in_unit(kilocalories_per_mole)
+    df = pd.read_csv(csvname)
+    av_energy = df.loc[:, 'Potential Energy (kJ/mole)'].mean()
+    return init_pe, final_pe, av_energy, simulation, simulation.topology, init_state.getPositions()
+
+#def rmsd_analysis(pdb_traj: str, pdb_ref: str):
+#    trajectory = md.load_pdb(pdb_traj)
+#    reference = md.load_pdb(pdb_ref)
+#    trajectory.superpose(reference)
+#    n_residues = trajectory.topology.n_residues
+#    rmsd_per_residue = []
+#    for i in range(n_residues):
+#        atom_indices = trajectory.topology.residue(i).atom_indices
+#        residue_rmsd = np.array([md.rmsd(trajectory, reference, frame=frame_index, atom_indices=atom_indices) for frame_index in range(trajectory.n_frames)])
+#        rmsd_per_residue.append(residue_rmsd)
+
+def rmsf_analysis(pdb_traj: str, rmsfcsv: str):
+    u = mda.Universe(pdb_traj)
+    average = align.AverageStructure(u, u, select='protein and name CA', ref_frame=0).run()
+    ref = average.results.universe
+    aligner = align.AlignTraj(u, ref, select='protein and name CA', in_memory=True).run()
+    c_alphas = u.select_atoms('protein and name CA')
+    R = rms.RMSF(c_alphas).run()
+    residue_ids = [residue.resid for residue in c_alphas.residues]
+    residue_names = [residue.resname for residue in c_alphas.residues]
+    rmsf_values = R.rmsf
+    df = pd.DataFrame({'Res_ID': residue_ids, 'Res_Name': residue_names, 'RMSF': rmsf_values })
+    df.to_csv(rmsfcsv, index=False)
+
+def main():
+    arguments = docopt(__doc__, version='openmm-fb_MD.py')
+    new_rep_cleaned = get_data(arguments['--incsv'], arguments['--from-col'])
+    reformat_data(arguments['--incsv'], new_rep_cleaned)
+    pdb = clean_wildtype(arguments['--in-pdb'], arguments['--pH'])
+    for j in range(1,2):
+        wt_pdb = energy_minimization(pdb)
+        strj = str(j)
+        wt_out = str("wt_minimised" + strj + ".pdb")
+        PDBFile.writeFile(wt_pdb[0], wt_pdb[1], open(wt_out, "w"), keepIds=True)
+        simulation = wt_pdb[3]
+        csvname = str("wt_traj" + strj + ".csv")
+        pdbname = str("wt_traj" + strj + ".pdb")
+        sim_run = md_nvt(simulation, csvname, 2000000, 10000, pdbname)
+        wt_ref = str("wt_reference" + strj + ".pdb")
+        PDBFile.writeFile(sim_run[4], sim_run[5], open(wt_ref, "w"), keepIds=True)
+        rmsfout = str("wt_rmsf" + strj + ".csv")
+        rmsf_analysis(pdbname, rmsfout)
+
+if __name__ == '__main__':
+    arguments = docopt(__doc__, version='openmm-fb_MD.py')
+    logging.getLogger().setLevel(logging.INFO)  
+    main()
diff --git a/bin/plots.py b/bin/plots.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+
+"""
+    plots.py
+
+    Usage:
+        plots.py (scatter|line) <filename>
+
+
+"""
+
+
+from docopt import docopt
+
+def scatter(filename: str):
+    '''
+        plot points
+    '''
+    print("plotting points from: ", filename)
+
+
+def line(filename: str):
+    '''
+        plot a line
+    '''
+    print("plotting lines from: ", filename)
+
+
+if __name__ == "__main__":
+    arguments = docopt(__doc__, version='plots')
+    if arguments['line']:
+        line(arguments["<filename>"])
+    else:
+        scatter(arguments["<filename>"])
+