# read.py

import sys
import os


def extractPDBInfo(input1, input2):
    """Map the ordinal of each CA record of one chain to its residue number.

    The PDB file is parsed by splitting every line on whitespace (not by
    fixed columns).  Two cases of fused columns are handled:

    * the record name fused with the atom serial number, i.e. a first token
      that starts with ``HETATM`` and is longer than 8 characters (for
      example ``HETATM12345``), which shifts every later field left by one;
    * the chain identifier fused with the residue number (for example
      ``A1000``), in which case the residue number is the token without its
      first character.

    Args:
        input1: Path of the PDB file to read.
        input2: Chain identifier to select (compared against the chain
            token, or against its first character when fused).

    Returns:
        dict mapping 1, 2, 3, ... (order in which matching ``CA`` records
        appear in the file) to the residue number token, kept as a string.

    Raises:
        IndexError: If an ATOM/HETATM record has too few whitespace-separated
            fields to contain the atom name, chain and residue number.
    """
    residueNumbers = []

    # The context manager guarantees the handle is closed, even on error.
    with open(input1, 'r') as pdbFile:
        for line in pdbFile:
            fields = line.split()
            if not fields:
                # Blank lines carry no record; indexing fields[0] would fail.
                continue

            recordName = fields[0]
            if recordName[0:6] == 'HETATM' and len(recordName) > 8:
                # Record name and serial number share one token, so every
                # following field sits one position earlier than usual.
                shift = 1
            elif recordName == 'ATOM' or recordName == 'HETATM':
                shift = 0
            else:
                continue

            if fields[2 - shift] != 'CA':
                continue

            chainField = fields[4 - shift]
            if chainField == input2:
                residueNumbers.append(fields[5 - shift])
            elif len(chainField) > 1 and chainField[0:1] == input2:
                # Chain id fused with the residue number, e.g. "A1000".
                residueNumbers.append(chainField[1:])

    # Keys are the 1-based positions of the selected CA records.
    return dict(enumerate(residueNumbers, 1))

def extractBestAlign(input):
    """Collect alignment positions, identity scores and the chain id.

    Scans an alignment report (the original author notes it is produced by
    running blastP) line by line, after stripping surrounding whitespace.

    Args:
        input: Path of the alignment file.  (The name shadows the builtin
            but is kept so keyword callers keep working.)

    Returns:
        A tuple ``(alignIndex, alignIdentityMap, chainId)`` where

        * ``alignIndex`` is the list of 1-based line numbers of lines that
          start with ``Score``, followed by the number of the last line of
          the file (0 for an empty file);
        * ``alignIdentityMap`` pairs those line numbers, in order, with the
          identity percentages read from lines starting with ``Identities``;
          the trailing last-line entry gets a dummy identity of 0.0;
        * ``chainId`` is the five characters that follow ``"= "`` on line 4
          of the file, or ``''`` if the file has fewer than four lines.

    Raises:
        IndexError: If line 4 contains no ``=``, or an ``Identities`` line
            lacks the ``= ... (NN%)`` layout.
        ValueError: If the extracted identity text is not a number.
    """
    lineNum = 0          # stays 0 when the file is empty
    alignIndex = []
    identity = []
    chainId = ''

    # The context manager guarantees the handle is closed, even on error.
    with open(input, "r") as alignFile:
        for lineNum, line in enumerate(alignFile, 1):
            line = line.strip()

            # The PDB id plus chain letter is read from line 4 of the file.
            if lineNum == 4:
                chainId = line.split("=")[1][1:6]

            # Remember the line on which each alignment's score is reported.
            if line.startswith('Score'):
                alignIndex.append(lineNum)

            # "Identities = 98/100 (98%), ..." -> text inside the first
            # parentheses without its trailing "%)".
            if line.startswith('Identities'):
                percentText = line.split("=")[1].split(",")[0].split("(")[1][:-2]
                identity.append(float(percentText))

    # Sentinel entry: the last line of the file closes the final alignment
    # and is given a dummy identity score of 0.0.
    alignIndex.append(lineNum)
    identity.append(0.0)

    alignIdentityMap = dict(zip(alignIndex, identity))

    return alignIndex, alignIdentityMap, chainId


def extractAlignInfo(input1, input2, input3, minIdentity=96.0):
    """Map one alignment's query and subject positions to PDB residue numbers.

    Reads the ``Query``/``Sbjct`` rows of a single alignment from an
    alignment report and relates them to the residue numbers returned by
    ``extractPDBInfo`` for the chain named in ``input3``.

    Args:
        input1: Path of the alignment report.
        input2: Indexable triple describing the alignment: ``input2[0]`` is
            the line number after which the alignment starts, ``input2[1]``
            its identity score and ``input2[2]`` the line number before
            which it ends (both bounds exclusive).
        input3: Identifier whose first four characters name the PDB file
            (``<id>.pdb``, opened relative to the working directory) and
            whose fifth character is the chain.
        minIdentity: Minimum identity score required for the alignment to
            be used; defaults to the previously hard-coded 96.0.

    Returns:
        A tuple ``(qResMap, transcriptPdbMap)``:

        * ``qResMap`` maps PDB residue number -> aligned query character;
        * ``transcriptPdbMap`` maps subject position -> PDB residue number.

        Both are empty when the identity is below ``minIdentity`` or no
        ``Query`` row lies inside the line window.

    Raises:
        KeyError: If a query position has no entry in the PDB residue map.
        IndexError: If ``Query`` rows were found but no ``Sbjct`` rows, or
            a row has fewer than four whitespace-separated fields.

    NOTE(review): sequences and positions are paired purely by order with
    ``zip``; gap characters in the aligned rows are not skipped.  Confirm
    that the alignments handled here are gap-free.
    """
    qIndex = []          # start/end positions of every Query row, in order
    sIndex = []          # start/end positions of every Sbjct row, in order
    qSequence = ''
    Sequence = ''
    qResMap = {}
    transcriptPdbMap = {}

    pdbInfoMap = extractPDBInfo(input3[0:4] + '.pdb', input3[4:5])

    # Loop-invariant conversions are done once instead of on every line.
    meetsIdentity = float(input2[1]) >= minIdentity
    if meetsIdentity:
        firstLine = int(input2[0])
        lastLine = int(input2[2])

    # The context manager guarantees the handle is closed, even on error.
    with open(input1, "r") as blastFile:
        if meetsIdentity:
            for lineNum, line in enumerate(blastFile, 1):
                if lineNum >= lastLine:
                    # Past the alignment window; nothing further can match.
                    break
                if lineNum <= firstLine:
                    continue

                line = line.strip()

                # Aligned query row: "Query <start> <sequence> <end>".
                # The "Query=" header line is explicitly excluded.
                if line[0:5] == 'Query' and line[0:6] != 'Query=':
                    fields = line.split()
                    qIndex.append(int(fields[1]))
                    qIndex.append(int(fields[3]))
                    qSequence = qSequence + fields[2]

                # Aligned subject row: "Sbjct <start> <sequence> <end>".
                elif line[0:5] == 'Sbjct' and line[0:6] != 'Sbjct=':
                    fields = line.split()
                    sIndex.append(int(fields[1]))
                    sIndex.append(int(fields[3]))
                    Sequence = Sequence + fields[2]

    if qIndex:
        # Query positions from the first row's start to the last row's end.
        qKeyList = range(qIndex[0], qIndex[-1] + 1)

        # Individual characters of the aligned query sequence.
        qRes = list(qSequence.strip('[]'))

        # Subject positions: one per character of the aligned subject text.
        sKeyList = range(sIndex[0], sIndex[0] + len(Sequence))

        # PDB residue number for every query position.
        newList = [pdbInfoMap[item] for item in qKeyList]

        # PDB residue number -> aligned query character.
        qResMap = dict(zip(newList, qRes))

        # Subject position -> PDB residue number.
        transcriptPdbMap = dict(zip(sKeyList, newList))

    return qResMap, transcriptPdbMap