diff --git a/tests/structure/data/README.rst b/tests/structure/data/README.rst index 74ac9ae19..3cb2aba7a 100644 --- a/tests/structure/data/README.rst +++ b/tests/structure/data/README.rst @@ -21,3 +21,12 @@ Test structures 4P5J: Structure contains a non-canonical nucleotide 1CRR: Multi-model structure with the first model ID not being 1 7GSA: Contains 5-character residue name 'A1AA6' + +Creation +-------- + +To create the test files for the above entries, run the following command: + +.. code-block:: console + + $ python create_test_structures.py -f ids.txt \ No newline at end of file diff --git a/tests/structure/data/base_pairs/create_bond_orientation_test_data.py b/tests/structure/data/base_pairs/create_bond_orientation_test_data.py new file mode 100644 index 000000000..c81f9e050 --- /dev/null +++ b/tests/structure/data/base_pairs/create_bond_orientation_test_data.py @@ -0,0 +1,77 @@ +import pandas as pd +import argparse +import numpy as np +import json + +def process(input, output, chain): + data = pd.read_csv(input) + # Only retain rows with basepair annotation + data = data[data['Leontis-Westhof'].notna()] + + output_list = [] + + for _, row in data.iterrows(): + + nucleotides = [row['Nucleotide 1'], row['Nucleotide 2']] + + # Extract the Leontis-Westhof annotation + lw_string = row['Leontis-Westhof'] + + # Some interactions are labelled with `n` for near. These are + # ignored + if lw_string[0] == 'n': + continue + + # Get sugar orientation from string (`c` = cis, `t` = trans) + sugar_orientation = lw_string[0] + + # The residue ids of the nucleotides + res_ids = [None]*2 + + for i, nucleotide in enumerate(nucleotides): + nucleotide_list = nucleotide.split('.') + + # if the nucleotide is not part of the specified chain, skip + # base pair + if nucleotide_list[1] != chain: + break + + res_ids[i] = nucleotide_list[3] + + if None in res_ids: + continue + + if sugar_orientation == 'c': + sugar_orientation = 1 + elif sugar_orientation == 't': + sugar_orientation = 2 + + this_output = sorted((int(res_ids[0]), int(res_ids[1]))) + this_output.append(int(sugar_orientation)) + output_list.append(this_output) + output_list = np.unique(output_list, axis=0).tolist() + with open(output, 'w') as f: + json.dump(output_list, f, indent=1) + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Parse the glycosidic bond orientation annotations in the " + "NAKB-database for a specific chain. The annotations can be " + "downloaded in the section 'Base Pairs'." + ) + parser.add_argument( + "infile", + help="The path to the input file." + ) + parser.add_argument( + "outfile", + help="The path to the output JSON file." + ) + parser.add_argument( + "chain", + help="The chain ID to be extracted." + ) + args = parser.parse_args() + + process(args.infile, args.outfile, args.chain) + diff --git a/tests/structure/data/base_pairs/create_interacting_edge_test_data.py b/tests/structure/data/base_pairs/create_interacting_edge_test_data.py new file mode 100644 index 000000000..1a46eb4d3 --- /dev/null +++ b/tests/structure/data/base_pairs/create_interacting_edge_test_data.py @@ -0,0 +1,89 @@ +import pandas as pd +import argparse +import json +import numpy as np + +def process(input, output, chain): + data = pd.read_csv(input) + # Only retain rows with basepair annotation + data = data[data['Leontis-Westhof'].notna()] + + output_list = [] + + for _, row in data.iterrows(): + nucleotides = [row['Nucleotide 1'], row['Nucleotide 2']] + + # Extract the Leontis-Westhof annotation + lw_string = row['Leontis-Westhof'] + + # Some interactions are labelled with `n` for near. These are + # ignored + if lw_string[0] == 'n': + continue + + # Get edge annotations from string + edges = [lw_string[-2], lw_string[-1]] + + # Dont allow unspecified edges in test data + if '.' in edges: + continue + + res_ids = [None]*2 + for i, nucleotide in enumerate(nucleotides): + nucleotide_list = nucleotide.split('.') + + # if the nucleotide is not part of the specified chain, skip + # base pair + if nucleotide_list[1] != chain: + break + + res_ids[i] = nucleotide_list[3] + + if None in res_ids: + continue + + for i, edge in enumerate(edges): + if edge == 'W': + edges[i] = 1 + if edge == 'H': + edges[i] = 2 + if edge == 'S': + edges[i] = 3 + + # Lower residue id on the left, higher residue id on the right + res_ids = np.array(res_ids, dtype=int) + edges = np.array(edges, dtype=int) + sorter = np.argsort(res_ids) + res_ids = res_ids[sorter] + edges = edges[sorter] + + output_list.append( + (int(res_ids[0]), int(res_ids[1]), int(edges[0]), int(edges[1])) + ) + + output_list = np.unique(output_list, axis=0).tolist() + with open(output, 'w') as f: + json.dump(output_list, f, indent=1) + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Parse the edge type annotations in the NAKB-database for " + "a specific chain. The annotations can be downloaded in the section " + "'Base Pairs'." + ) + parser.add_argument( + "infile", + help="The path to the input file." + ) + parser.add_argument( + "outfile", + help="The path to the output JSON file." + ) + parser.add_argument( + "chain", + help="The chain ID to be extracted." + ) + args = parser.parse_args() + + process(args.infile, args.outfile, args.chain) + diff --git a/tests/structure/data/create_test_structures.py b/tests/structure/data/create_test_structures.py new file mode 100644 index 000000000..cbe2216ab --- /dev/null +++ b/tests/structure/data/create_test_structures.py @@ -0,0 +1,84 @@ +import argparse +import subprocess +from os.path import join +import logging +import os +import sys +import biotite +from biotite.database import RequestError +import biotite.database.rcsb as rcsb +import biotite.structure.io as strucio + + +def create(pdb_id, directory, include_gro): + # Create *.pdb", *.cif and *.mmtf + for file_format in ["pdb", "cif", "bcif", "mmtf"]: + try: + rcsb.fetch(pdb_id, file_format, directory, overwrite=True) + except RequestError: + # PDB entry is not provided in this format + pass + try: + array = strucio.load_structure(join(directory, pdb_id+".pdb")) + except biotite.InvalidFileError: + # Structure probably contains multiple models with different + # number of atoms + # -> Cannot load AtomArrayStack + # -> Skip writing GRO and NPZ file + return + # Create *.gro file + strucio.save_structure(join(directory, pdb_id+".npz"), array) + # Create *.gro files using GROMACS + # Clean PDB file -> remove inscodes and altlocs + if include_gro: + cleaned_file_name = biotite.temp_file("pdb") + strucio.save_structure(cleaned_file_name, array) + # Run GROMACS for file conversion + subprocess.run([ + "editconf", + "-f", cleaned_file_name, + "-o", join(directory, pdb_id+".gro") + ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Create structure files for unit tests " + "in all supported formats from PDB ID " + "(excluding GROMACS trajectory files)" + ) + parser.add_argument( + "--dir", "-d", dest="directory", default=".", + help="the Biotite project directory to put the test files into" + ) + parser.add_argument( + "--id", "-i", dest="id", + help="the PDB ID" + ) + parser.add_argument( + "--file", "-f", dest="file", + help="read mutliple PDB IDs from text file (line break separated IDs)" + ) + parser.add_argument( + "--gromacs", "-g", action="store_true", dest="include_gro", + help="Create '*.gro' files using the Gromacs software" + ) + args = parser.parse_args() + + if args.file is not None: + with open(args.file, "r") as file: + pdb_ids = [pdb_id.strip().lower() for pdb_id + in file.read().split("\n") if len(pdb_id.strip()) != 0] + elif args.id is not None: + pdb_ids = [args.id.lower()] + else: + logging.error("Must specifiy PDB ID(s)") + sys.exit() + + for i, pdb_id in enumerate(pdb_ids): + print(f"{i:2d}/{len(pdb_ids):2d}: {pdb_id}", end="\r") + try: + create(pdb_id, args.directory, args.include_gro) + except: + print() + raise \ No newline at end of file diff --git a/tests/structure/data/ids.txt b/tests/structure/data/ids.txt new file mode 100644 index 000000000..ec9fcc5ab --- /dev/null +++ b/tests/structure/data/ids.txt @@ -0,0 +1,19 @@ +1aki +1dix +1f2n +5zng +1gya +2axd +1igy +1l2y +1o1z +3o5r +5h73 +5ugo +5ugo +4gxy +2d0f +5eil +4p5j +1crr +7gsa