From 26897a9e639c40184dd54043e1b99b6884c42536 Mon Sep 17 00:00:00 2001
From: Patrick Kunzmann <padix.key@gmail.com>
Date: Fri, 12 Jan 2024 13:45:53 +0100
Subject: [PATCH 1/2] Use 5-character residue names

---
 python-src/fastpdb/__init__.py |  4 ++--
 src/lib.rs                     | 40 +++++++++++++++++-----------------
 2 files changed, 22 insertions(+), 22 deletions(-)
diff --git a/python-src/fastpdb/__init__.py b/python-src/fastpdb/__init__.py
index da190b7..a7b751d 100644
--- a/python-src/fastpdb/__init__.py
+++ b/python-src/fastpdb/__init__.py
@@ -131,7 +131,7 @@ def get_structure(self, model=None, altloc="first", extra_fields=None, include_b
         # Interpret uint32 arrays as unicode arrays
         chain_id  = np.frombuffer(chain_id,  dtype="U4")
         ins_code  = np.frombuffer(ins_code,  dtype="U1")
-        res_name  = np.frombuffer(res_name,  dtype="U3")
+        res_name  = np.frombuffer(res_name,  dtype="U5")
         atom_name = np.frombuffer(atom_name, dtype="U6")
         element   = np.frombuffer(element,   dtype="U2")
         altloc_id = np.frombuffer(altloc_id, dtype="U1")
@@ -256,7 +256,7 @@ def set_structure(self, atoms):
         # Convert Unicode arrays into uint32 arrays for usage in Rust
         chain_id  = np.frombuffer(atoms.chain_id,  dtype=np.uint32).reshape(-1, 4)
         ins_code  = np.frombuffer(atoms.ins_code,  dtype=np.uint32).reshape(-1, 1)
-        res_name  = np.frombuffer(atoms.res_name,  dtype=np.uint32).reshape(-1, 3)
+        res_name  = np.frombuffer(atoms.res_name,  dtype=np.uint32).reshape(-1, 5)
         atom_name = np.frombuffer(atoms.atom_name, dtype=np.uint32).reshape(-1, 6)
         element   = np.frombuffer(atoms.element,   dtype=np.uint32).reshape(-1, 2)
 
diff --git a/src/lib.rs b/src/lib.rs
index 1a8f58f..ed157a7 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -43,7 +43,7 @@ struct PDBFile {
 
 #[pymethods]
 impl PDBFile {
-    
+
     /// Create an new [`PDBFile`].
     /// The lines of text are given to `lines`.
     /// An empty `Vec` represents and empty PDB file.
@@ -99,7 +99,7 @@ impl PDBFile {
         })
     }
 
-    
+
     /// Get the number of models contained in the file.
     fn get_model_count(&self) -> usize {
         self.model_start_i.len()
@@ -180,7 +180,7 @@ impl PDBFile {
     /// Unicode *NumPy* arrays are represented by 2D *NumPy* arrays with
     /// `uint32` *dtype*.
     /// The returned tuple contains the following annotations in the given order:
-    /// 
+    ///
     /// - `chain_id`
     /// - `res_id`
     /// - `ins_code`
@@ -211,16 +211,16 @@ impl PDBFile {
                                                            Option<Py<PyArray<f64,  Ix1>>>,
                                                            Option<Py<PyArray<i64,  Ix1>>>)> {
         let atom_line_i: Vec<usize> = self.get_atom_indices(model)?;
-        
+
         let mut chain_id:  Array<u32,  Ix2> = Array::zeros((atom_line_i.len(), 4));
         let mut res_id:    Array<i64,  Ix1> = Array::zeros(atom_line_i.len());
         let mut ins_code:  Array<u32,  Ix2> = Array::zeros((atom_line_i.len(), 1));
-        let mut res_name:  Array<u32,  Ix2> = Array::zeros((atom_line_i.len(), 3));
+        let mut res_name:  Array<u32,  Ix2> = Array::zeros((atom_line_i.len(), 5));
         let mut hetero:    Array<bool, Ix1> = Array::default(atom_line_i.len());
         let mut atom_name: Array<u32,  Ix2> = Array::zeros((atom_line_i.len(), 6));
         let mut element:   Array<u32,  Ix2> = Array::zeros((atom_line_i.len(), 2));
         let mut altloc_id: Array<u32,  Ix2> = Array::zeros((atom_line_i.len(), 1));
-        
+
         let mut atom_id: Array<i64, Ix1>;
         if include_atom_id {
             atom_id = Array::zeros(atom_line_i.len());
@@ -250,7 +250,7 @@ impl PDBFile {
         else {
             charge = Array::zeros(0);
         }
-        
+
         // Iterate over ATOM and HETATM records to write annotation arrays
         for (atom_i, line_i) in atom_line_i.iter().enumerate() {
             let line = &self.lines[*line_i];
@@ -293,7 +293,7 @@ impl PDBFile {
                 }
             }
         }
-        
+
         Python::with_gil(|py| {
             Ok((
                 PyArray::from_array(py, &chain_id ).to_owned(),
@@ -326,7 +326,7 @@ impl PDBFile {
                 atom_id_to_index.insert(*id, i as u32);
             }
         });
-        
+
         // Cannot preemptively determine number of bonds
         // -> Memory allocation for all bonds is not possible
         // -> No benefit in finding 'CONECT' record lines prior to iteration
@@ -409,11 +409,11 @@ impl PDBFile {
             // This procedure aims to increase the performance is repetitive formatting is omitted
             let mut prefix: Vec<String> = Vec::new();
             let mut suffix: Vec<String> = Vec::new();
-            
+
             for i in 0..coord.shape()[1] {
                 let element_i = parse_string_from_array(&element, i)?;
                 let atom_name_i = parse_string_from_array(&atom_name, i)?;
-                
+
                 prefix.push(format!(
                     "{:6}{:>5} {:4} {:>3} {:1}{:>4}{:1}   ",
                     if hetero[i] { "HETATM" } else { "ATOM" },
@@ -449,7 +449,7 @@ impl PDBFile {
                 if is_multi_model {
                     self.lines.push(format!("MODEL {:>8}", model_i+1));
                 }
-                for atom_i in 0..coord.shape()[1] { 
+                for atom_i in 0..coord.shape()[1] {
                     let coord_string = format!(
                         "{:>8.3}{:>8.3}{:>8.3}",
                         coord[[model_i, atom_i, 0]],
@@ -471,13 +471,13 @@ impl PDBFile {
     /// array containing indices pointing to bonded atoms in the `AtomArray`.
     /// The `atom_id` annotation array is required to map the atom IDs in `CONECT` records
     /// to atom indices.
-    fn write_bonds(&mut self, 
+    fn write_bonds(&mut self,
                    bonds: Py<PyArray<i32, Ix2>>,
                    atom_id: Py<PyArray<i64, Ix1>>) -> PyResult<()> {
         Python::with_gil(|py| {
             let bonds = bonds.as_ref(py).to_owned_array();
             let atom_id = atom_id.as_ref(py).to_owned_array();
-        
+
             for (center_i, bonded_indices) in bonds.outer_iter().enumerate() {
                 let mut n_added: usize = 0;
                 let mut line: String = String::new();
@@ -552,7 +552,7 @@ impl PDBFile {
                 }
                 Ok(CoordArray::Single(coord))
             },
-            
+
             None => {
                 let length = self.get_model_length()?;
                 let mut coord = Array::zeros((self.atom_line_i.len(), 3));
@@ -591,7 +591,7 @@ impl PDBFile {
                 self.model_start_i.len(), model
             )));
         }
-        
+
         // Get the start and stop line index for this model index
         let (model_start, model_stop) = match model_i.cmp(&(self.model_start_i.len() as isize - 1)){
             Ordering::Less => (
@@ -600,13 +600,13 @@ impl PDBFile {
             ),
             // Last model -> Model reaches to end of file
             Ordering::Equal => (
-                self.model_start_i[model_i as usize], 
+                self.model_start_i[model_i as usize],
                 self.lines.len()
             ),
             // This case was excluded above
             _ => panic!("This branch should not be reached")
         };
-        
+
         // Get the atom records within these line boundaries
         Ok(
             self.atom_line_i.iter().copied()
@@ -614,7 +614,7 @@ impl PDBFile {
                 .collect()
         )
     }
-    
+
 
     /// Get the number of atoms in each model of the PDB file.
     /// A `PyErr` is returned if the number of atoms per model differ from each other.
@@ -635,7 +635,7 @@ impl PDBFile {
                 )); }
             };
         }
-    
+
         match length {
             None => panic!("Length cannot be 'None'"),
             Some(l) => Ok(l)

From 8dcfe589729bc96ac387f39efcee32158820ae53 Mon Sep 17 00:00:00 2001
From: Patrick Kunzmann <padix.key@gmail.com>
Date: Fri, 12 Jan 2024 15:19:01 +0100
Subject: [PATCH 2/2] Safer conversion of unicode arrays

---
 python-src/fastpdb/__init__.py | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/python-src/fastpdb/__init__.py b/python-src/fastpdb/__init__.py
index a7b751d..900640f 100644
--- a/python-src/fastpdb/__init__.py
+++ b/python-src/fastpdb/__init__.py
@@ -254,11 +254,11 @@ def set_structure(self, atoms):
 
         # Write 'ATOM' and 'MODEL' records
         # Convert Unicode arrays into uint32 arrays for usage in Rust
-        chain_id  = np.frombuffer(atoms.chain_id,  dtype=np.uint32).reshape(-1, 4)
-        ins_code  = np.frombuffer(atoms.ins_code,  dtype=np.uint32).reshape(-1, 1)
-        res_name  = np.frombuffer(atoms.res_name,  dtype=np.uint32).reshape(-1, 5)
-        atom_name = np.frombuffer(atoms.atom_name, dtype=np.uint32).reshape(-1, 6)
-        element   = np.frombuffer(atoms.element,   dtype=np.uint32).reshape(-1, 2)
+        chain_id  = _convert_unicode_to_uint32(atoms.chain_id)
+        ins_code  = _convert_unicode_to_uint32(atoms.ins_code)
+        res_name  = _convert_unicode_to_uint32(atoms.res_name)
+        atom_name = _convert_unicode_to_uint32(atoms.atom_name)
+        element   = _convert_unicode_to_uint32(atoms.element)
 
         categories = atoms.get_annotation_categories()
         atom_id   = atoms.atom_id   if "atom_id"   in categories else None
@@ -320,3 +320,18 @@ def _index_models_and_atoms(self):
         self._pdb_file.index_models_and_atoms()
         self._model_start_i = self._pdb_file.model_start_i
         self._atom_line_i = self._pdb_file.atom_line_i
+
+
+def _convert_unicode_to_uint32(array):
+    """
+    Convert a unicode string array into a 2D uint32 array.
+
+    The second dimension corresponds to the character position within a
+    string.
+    """
+    dtype = array.dtype
+    if not np.issubdtype(dtype, np.str_):
+        raise TypeError("Expected unicode string array")
+    length = array.shape[0]
+    n_char = dtype.itemsize // 4
+    return np.frombuffer(array, dtype=np.uint32).reshape(length, n_char)
\ No newline at end of file