Merge pull request #550 from padix-key/util-scripts

Migrate remaining content of `biotite-util` repository to `biotite`
biotite-dev · Apr 7, 2024 · 8c16fb0 · 8c16fb0
2 parents 4b0355a + c88f306
commit 8c16fb0
Show file tree

Hide file tree

Showing 5 changed files with 278 additions and 0 deletions.
diff --git a/tests/structure/data/README.rst b/tests/structure/data/README.rst
@@ -21,3 +21,12 @@ Test structures
 4P5J: Structure contains a non-canonical nucleotide
 1CRR: Multi-model structure with the first model ID not being 1
 7GSA: Contains 5-character residue name 'A1AA6'
+
+Creation
+--------
+
+To create the test files for the above entries, run the following command:
+
+.. code-block:: console
+
+    $ python create_test_structures.py -f ids.txt
diff --git a/tests/structure/data/base_pairs/create_bond_orientation_test_data.py b/tests/structure/data/base_pairs/create_bond_orientation_test_data.py
@@ -0,0 +1,77 @@
+import pandas as pd
+import argparse
+import numpy as np
+import json
+
+def process(input, output, chain):
+    data = pd.read_csv(input)
+    # Only retain rows with basepair annotation
+    data = data[data['Leontis-Westhof'].notna()]
+
+    output_list = []
+
+    for _, row in data.iterrows():
+
+        nucleotides = [row['Nucleotide 1'], row['Nucleotide 2']]
+
+        # Extract the Leontis-Westhof annotation
+        lw_string = row['Leontis-Westhof']
+
+        # Some interactions are labelled with `n` for near. These are
+        # ignored
+        if lw_string[0] == 'n':
+            continue
+
+        # Get sugar orientation from string (`c` = cis, `t` = trans)
+        sugar_orientation = lw_string[0]
+
+        # The residue ids of the nucleotides
+        res_ids = [None]*2
+
+        for i, nucleotide in enumerate(nucleotides):
+            nucleotide_list = nucleotide.split('.')
+
+            # if the nucleotide is not part of the specified chain, skip
+            # base pair
+            if nucleotide_list[1] != chain:
+                break
+
+            res_ids[i] = nucleotide_list[3]
+
+        if None in res_ids:
+            continue
+
+        if sugar_orientation == 'c':
+            sugar_orientation = 1
+        elif sugar_orientation == 't':
+            sugar_orientation = 2
+
+        this_output = sorted((int(res_ids[0]), int(res_ids[1])))
+        this_output.append(int(sugar_orientation))
+        output_list.append(this_output)
+    output_list = np.unique(output_list, axis=0).tolist()
+    with open(output, 'w') as f:
+        json.dump(output_list, f, indent=1)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Parse the glycosidic bond orientation annotations in the "
+        "NAKB-database for a specific chain. The annotations can be "
+        "downloaded in the section 'Base Pairs'."
+    )
+    parser.add_argument(
+        "infile",
+        help="The path to the input file."
+    )
+    parser.add_argument(
+        "outfile",
+        help="The path to the output JSON file."
+    )
+    parser.add_argument(
+        "chain",
+        help="The chain ID to be extracted."
+    )
+    args = parser.parse_args()
+
+    process(args.infile, args.outfile, args.chain)
+
diff --git a/tests/structure/data/base_pairs/create_interacting_edge_test_data.py b/tests/structure/data/base_pairs/create_interacting_edge_test_data.py
@@ -0,0 +1,89 @@
+import pandas as pd
+import argparse
+import json
+import numpy as np
+
+def process(input, output, chain):
+    data = pd.read_csv(input)
+    # Only retain rows with basepair annotation
+    data = data[data['Leontis-Westhof'].notna()]
+
+    output_list = []
+
+    for _, row in data.iterrows():
+        nucleotides = [row['Nucleotide 1'], row['Nucleotide 2']]
+
+        # Extract the Leontis-Westhof annotation
+        lw_string = row['Leontis-Westhof']
+
+        # Some interactions are labelled with `n` for near. These are
+        # ignored
+        if lw_string[0] == 'n':
+            continue
+
+        # Get edge annotations from string
+        edges = [lw_string[-2], lw_string[-1]]
+
+        # Dont allow unspecified edges in test data
+        if '.' in edges:
+            continue
+
+        res_ids = [None]*2
+        for i, nucleotide in enumerate(nucleotides):
+            nucleotide_list = nucleotide.split('.')
+
+            # if the nucleotide is not part of the specified chain, skip
+            # base pair
+            if nucleotide_list[1] != chain:
+                break
+
+            res_ids[i] = nucleotide_list[3]
+
+        if None in res_ids:
+            continue
+
+        for i, edge in enumerate(edges):
+            if edge == 'W':
+                edges[i] = 1
+            if edge == 'H':
+                edges[i] = 2
+            if edge == 'S':
+                edges[i] = 3
+
+        # Lower residue id on the left, higher residue id on the right
+        res_ids = np.array(res_ids, dtype=int)
+        edges = np.array(edges, dtype=int)
+        sorter = np.argsort(res_ids)
+        res_ids = res_ids[sorter]
+        edges = edges[sorter]
+
+        output_list.append(
+            (int(res_ids[0]), int(res_ids[1]), int(edges[0]), int(edges[1]))
+        )
+
+    output_list = np.unique(output_list, axis=0).tolist()
+    with open(output, 'w') as f:
+        json.dump(output_list, f, indent=1)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Parse the edge type annotations in the NAKB-database for "
+        "a specific chain. The annotations can be downloaded in the section "
+        "'Base Pairs'."
+    )
+    parser.add_argument(
+        "infile",
+        help="The path to the input file."
+    )
+    parser.add_argument(
+        "outfile",
+        help="The path to the output JSON file."
+    )
+    parser.add_argument(
+        "chain",
+        help="The chain ID to be extracted."
+    )
+    args = parser.parse_args()
+
+    process(args.infile, args.outfile, args.chain)
+
diff --git a/tests/structure/data/create_test_structures.py b/tests/structure/data/create_test_structures.py
@@ -0,0 +1,84 @@
+import argparse
+import subprocess
+from os.path import join
+import logging
+import os
+import sys
+import biotite
+from biotite.database import RequestError
+import biotite.database.rcsb as rcsb
+import biotite.structure.io as strucio
+
+
+def create(pdb_id, directory, include_gro):
+    # Create *.pdb", *.cif and *.mmtf
+    for file_format in ["pdb", "cif", "bcif", "mmtf"]:
+        try:
+            rcsb.fetch(pdb_id, file_format, directory, overwrite=True)
+        except RequestError:
+            # PDB entry is not provided in this format
+            pass
+    try:
+        array = strucio.load_structure(join(directory, pdb_id+".pdb"))
+    except biotite.InvalidFileError:
+        # Structure probably contains multiple models with different
+        # number of atoms
+        # -> Cannot load AtomArrayStack
+        # -> Skip writing GRO and NPZ file
+        return
+    # Create *.gro file
+    strucio.save_structure(join(directory, pdb_id+".npz"), array)
+    # Create *.gro files using GROMACS
+    # Clean PDB file -> remove inscodes and altlocs
+    if include_gro:
+        cleaned_file_name = biotite.temp_file("pdb")
+        strucio.save_structure(cleaned_file_name, array)
+        # Run GROMACS for file conversion
+        subprocess.run([
+            "editconf",
+            "-f", cleaned_file_name,
+            "-o", join(directory, pdb_id+".gro")
+        ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Create structure files for unit tests "
+                    "in all supported formats from PDB ID "
+                    "(excluding GROMACS trajectory files)"
+    )
+    parser.add_argument(
+        "--dir",  "-d", dest="directory", default=".",
+        help="the Biotite project directory to put the test files into"
+    )
+    parser.add_argument(
+        "--id", "-i", dest="id",
+        help="the PDB ID"
+    )
+    parser.add_argument(
+        "--file", "-f", dest="file",
+        help="read mutliple PDB IDs from text file (line break separated IDs)"
+    )
+    parser.add_argument(
+        "--gromacs", "-g", action="store_true", dest="include_gro",
+        help="Create '*.gro' files using the Gromacs software"
+    )
+    args = parser.parse_args()
+
+    if args.file is not None:
+        with open(args.file, "r") as file:
+            pdb_ids = [pdb_id.strip().lower() for pdb_id
+                       in file.read().split("\n") if len(pdb_id.strip()) != 0]
+    elif args.id is not None:
+        pdb_ids = [args.id.lower()]
+    else:
+        logging.error("Must specifiy PDB ID(s)")
+        sys.exit()
+
+    for i, pdb_id in enumerate(pdb_ids):
+        print(f"{i:2d}/{len(pdb_ids):2d}: {pdb_id}", end="\r")
+        try:
+            create(pdb_id, args.directory, args.include_gro)
+        except:
+            print()
+            raise
diff --git a/tests/structure/data/ids.txt b/tests/structure/data/ids.txt
@@ -0,0 +1,19 @@
+1aki
+1dix
+1f2n
+5zng
+1gya
+2axd
+1igy
+1l2y
+1o1z
+3o5r
+5h73
+5ugo
+5ugo
+4gxy
+2d0f
+5eil
+4p5j
+1crr
+7gsa
-Original file line number
+Diff line change
@@ -0,0 +1,19 @@
+aki
+dix
+f2n
+zng
+gya
+axd
+igy
+l2y
+o1z
+o5r
+h73
+ugo
+ugo
+gxy
+d0f
+eil
+p5j
+crr
+gsa