Added new subpackage for molecule file I/O

timbernat · Dec 11, 2024 · bbd1b85 · bbd1b85
1 parent 040a718
commit bbd1b85
Show file tree

Hide file tree

Showing 2 changed files with 76 additions and 0 deletions.
diff --git a/polymerist/molfiles/__init__.py b/polymerist/molfiles/__init__.py
@@ -0,0 +1,4 @@
+'''Utilities for reading from and writing to various molecular file formats'''
+
+__author__ = 'Timotej Bernat'
+__email__ = 'timotej.bernat@colorado.edu'
diff --git a/polymerist/molfiles/pdb.py b/polymerist/molfiles/pdb.py
@@ -0,0 +1,72 @@
+'''PDB file formatting tools'''
+
+__author__ = 'Timotej Bernat'
+__email__ = 'timotej.bernat@colorado.edu'
+
+from dataclasses import dataclass, field
+from collections import Counter
+
+
@dataclass
class SerialAtomLabeller:
    '''
    For assigning unique numbered atom names based on their
    order of appearance within a molecule and elemental class

    Useful, for example, in generating unique atom names for a PDB file

    Parameters
    ----------
    atom_label_size : int, default 4
        Exact length allotted for any generated atom label
        Labels shorter than this are right-padded with spaces,
        while labels longer than this are truncated; must be non-negative

        Default of 4 is chosen to be compatible with the PDB specification ("Atom name: columns 13-16, left-justified")
        https://www.cgl.ucsf.edu/chimera/docs/UsersGuide/tutorials/pdbintro.html
    include_elem_idx : bool, default True
        Whether to attach a numerical element-index postfix to atom labels

        E.g. with atom_label_size=4 and the default 0-based indexing, the fifth
        carbon in a topology will be labelled as "C004" with include_elem_idx=True,
        while labelled as "C   " with include_elem_idx=False
    default_elem_idx : int, default 0
        Starting index for each element category
        By default, is 0-indexed; must be non-negative

    Attributes
    ----------
    element_counter : Counter
        Maps each element symbol seen so far to the index that will be
        assigned to the NEXT atom of that element; not settable via __init__

    Raises
    ------
    ValueError
        If atom_label_size or default_elem_idx is negative

    Notes
    -----
    Labels are always cut down to atom_label_size characters, so once an index
    needs more digits than remain after the element symbol (e.g. the 1001st carbon
    with atom_label_size=4), trailing digits are dropped and labels may repeat
    '''
    atom_label_size  : int = 4
    include_elem_idx : bool = True
    default_elem_idx : int = 0

    element_counter : Counter = field(init=False, default_factory=Counter)

    def __post_init__(self) -> None:
        '''Check ranges on input values, raising ValueError for negative sizes or starting indices'''
        if self.atom_label_size < 0:
            raise ValueError(f'Must provide a non-negative atom label size (provided {self.atom_label_size})')

        if self.default_elem_idx < 0:
            raise ValueError(f'Must provide a non-negative starting index for element indices (provided {self.default_elem_idx})')

    def get_atom_label(self, elem_symbol : str) -> str:
        '''
        Obtain a numbered atom label for an atom based on its element,
        updating the underlying element context in the process

        Parameters
        ----------
        elem_symbol : str
            Symbol of the element to label (e.g. "C", "Cl"); used as the label prefix

        Returns
        -------
        atom_name : str
            Label of exactly atom_label_size characters: the element symbol, followed
            (if include_elem_idx is set) by the zero-padded running index for that
            element, right-padded with spaces or truncated to fit
        '''
        # first occurrence of an element is seeded with the configured starting index
        atom_idx = self.element_counter.setdefault(elem_symbol, self.default_elem_idx)

        atom_idx_label : str = ''
        if self.include_elem_idx:
            num_idx_digits = max(self.atom_label_size - len(elem_symbol), 0) # number of symbols left over for an atom index
            atom_idx_label = f'{atom_idx:0{num_idx_digits}d}'

        # pad with spaces if too short, or truncate if too long; result is always exactly atom_label_size long
        atom_name = f'{elem_symbol}{atom_idx_label}'.ljust(self.atom_label_size, ' ')[:self.atom_label_size]

        self.element_counter[elem_symbol] += 1 # update tally with addition of new occurrence of a particular element

        return atom_name
+