-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhw3.py
159 lines (140 loc) · 7.66 KB
/
hw3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# In this assignment, your goal is to implement a method to find amino acid pairs that are in close
# proximity in the structure (i.e., in three dimensions) but far away in the sequence. In particular,
# your program will get input a protein structure (which may be composed of multiple amino acid
# chains) as a single PDB file and two numbers, one integer, S, to be used as the distance threshold
# in the sequence and a real number, D, to be used as a distance threshold in the stucture. You willl
# then report amino-acid pairs (for each chain of the structure) whose distance in 3-dimensions is
# less than or equal to D, and which are at least S amino acids apart in the sequence (i.e., if one
# amino acid is the ith amino acid, the other has to be i+Sth or more, or vice versa).
# Here is a step by step description of the algorithm you are expected to implement:
# 1) Get a protein structure as input from the user as a PDB (Protein Data Bank) file.
# 2) Read the PDB file and get the sequence of each chain from the SEQRES records and the
# coordinates of each aminno acid from the ATOM records. Use the alpha carbon
# coordinate of amino acids as their coordinates.
# 3) For each chain, compute pairwise distances, as the the Euclidean distance, between all
# amino acid pairs in that chain. Output amino acid pairs whose distance in 3-dimensions is
#less than or equal to D, and which are at least S amino acids apart in the sequence.
# 4) Repeat the process for all chains in the structure.
# 5) Output the results to the user.
import sys
import math
from matplotlib.pyplot import table
from tabulate import tabulate
# the following fuction takes a protein structure as input and returns the secondary structure of each chain
# However, you will only need to read the ATOM and SEQRES records of PDB files. The ATOM
# record contains the coordinates of the atoms that make up the structure. For each amino acid, you
# are only going to use the CA atom (alpha-Carbon) coordinates. The atom records look like below:
# ATOM 1 N ASP A 30 31.904 -0.904 -0.904 1.00 0.00 N
# ATOM 2 CA ASP A 30 31.904 -0.904 -0.904 1.00 0.00 C
# ATOM 3 C ASP A 30 32.904 -0.904 -0.904 1.00 0.00 C
# ATOM 4 O ASP A 30 33.904 -0.904 -0.904 1.00 0.00 O
# The SEQRES record contains the sequence of the amino acids in the chain. For each chain, you
# are only going to use the first 20 amino acids. The SEQRES record looks like below:
# SEQRES 1 A 20
# the following fuction reads the PDB file get the sequence of each chain from the SEQRES records and the coordinates of each amino acid as the ATOM records and return the amino acid sequences and the coordinates as a dictionary
def read_pdb(pdb_file):
# initialize sequences as a dictionary with chain id as the key and amino acid sequence list as the value
seq = {}
# initialize coordinates as a dictionary with amino acid id as the key and coordinates as the value
# open the pdb file
pdb_file = open(pdb_file, 'r')
index = 0
# read the pdb file line by line
for line in pdb_file:
# if the line is the SEQRES record, get the amino acid sequence
if line[0:4] == 'ATOM' and line[13:16] == 'CA ':
# for each chain id, get the amino acid id and then add a coordinate to the dictionary
chain_id = line[21]
amino_acid_residue_name = line[17:20]
amino_acid_coordinate = [
float(line[30:38]),
float(line[38:46]),
float(line[46:54])
]
if chain_id in seq:
seq[chain_id].append(
[amino_acid_residue_name, amino_acid_coordinate])
else:
seq[chain_id] = [[
amino_acid_residue_name, amino_acid_coordinate
]]
# close the pdb file
pdb_file.close()
# print the amino acid sequences and the coordinates
# print(seq['A'])
# print(coords)
# return the amino acid sequences and the coordinates
return seq
# get protien coordinates in all chains using protien name and sequnce as input
def get_protein_coordinates_in_seq(protein_sequence, chain_id):
protien_coords = []
for i in protein_sequence[chain_id]:
protien_coords.append(i[1])
return protien_coords
# the following fuction calculates the euclidian distance between two points in 3-dimensions
def distance(point1, point2):
return math.sqrt((point1[0] - point2[0])**2 + (point1[1] - point2[1])**2 +
(point1[2] - point2[2])**2)
# the follwoing fuction takes a chain id and a list of amino acid coordinates as input and returns the distance between all amino acids in the chain
def get_distance_between_all_amino_acids(protein_coordinates, chain_id, D, S):
distance_list = []
for i in range(len(protein_coordinates[chain_id])):
for j in range(i + 1, len(protein_coordinates[chain_id])):
eucladian_distance = distance(protein_coordinates[chain_id][i][1],
protein_coordinates[chain_id][j][1])
abs_distance = abs(i - j)
if eucladian_distance <= D and abs_distance >= S:
distance_list.append([
protein_coordinates[chain_id][i][0], i,
protein_coordinates[chain_id][j][0], j, eucladian_distance,
abs_distance
])
return distance_list
# the following fuction computes the pairwise distance between all amino acids in a chain whose distance is less than or equal to D and which are at least S amino acids apart in the sequence
def get_pairwise_distance(protein_sequence, S, D):
# initialize the pairwise distance as a dictionary with chain id as the key and amino acid pairwise distance as the value
pairwise_distance = {}
# initialize the pairwise distance as a dictionary with chain id as the key and amino acid pairwise distance as the value
# return the pairwise distance
return pairwise_distance
# run the program
def main():
# check if we are in the main function
# get the input arguments
pdb_file = sys.argv[1]
D = float(sys.argv[2])
S = int(sys.argv[3])
output_file = "output.txt"
# print("reading the pdb file")
# read the pdb file
seq = read_pdb(pdb_file)
match_list = {}
for chain_id in seq:
match_list[chain_id] = get_distance_between_all_amino_acids(
seq, chain_id, D, S)
# write the output to the output file
# tabluate the match_list using the chain id as the key and the amino acid pairwise distance as the value
with open("e2044725_table.out", 'w') as f:
for chain_id in match_list:
f.writelines("Chain ID: " + chain_id + "\n")
f.writelines(
tabulate(match_list[chain_id],
headers=[
'Amino Acid 1', 'Index from seq start',
'Amino Acid 2', 'Index from seq start',
'Euclidian Distance', 'Absolute Distance'
],
tablefmt='grid') + "\n")
print("Chain ID: " + chain_id)
print(
tabulate(match_list[chain_id],
headers=[
'Amino Acid 1', 'Index from seq start',
'Amino Acid 2', 'Index from seq start',
'Euclidian Distance', 'Absolute Distance'
],
tablefmt='grid') + "\n")
# run the main function
if __name__ == '__main__':
main()
print("File has been created with the name e2044725_table.out")