Skip to content

Commit

Permalink
Merge pull request #550 from padix-key/util-scripts
Browse files Browse the repository at this point in the history
Migrate remaining content of `biotite-util` repository to `biotite`
  • Loading branch information
padix-key authored Apr 7, 2024
2 parents 4b0355a + c88f306 commit 8c16fb0
Show file tree
Hide file tree
Showing 5 changed files with 278 additions and 0 deletions.
9 changes: 9 additions & 0 deletions tests/structure/data/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,12 @@ Test structures
4P5J: Structure contains a non-canonical nucleotide
1CRR: Multi-model structure with the first model ID not being 1
7GSA: Contains 5-character residue name 'A1AA6'

Creation
--------

To create the test files for the above entries, run the following command:

.. code-block:: console
$ python create_test_structures.py -f ids.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import pandas as pd
import argparse
import numpy as np
import json

def process(input, output, chain):
data = pd.read_csv(input)
# Only retain rows with basepair annotation
data = data[data['Leontis-Westhof'].notna()]

output_list = []

for _, row in data.iterrows():

nucleotides = [row['Nucleotide 1'], row['Nucleotide 2']]

# Extract the Leontis-Westhof annotation
lw_string = row['Leontis-Westhof']

# Some interactions are labelled with `n` for near. These are
# ignored
if lw_string[0] == 'n':
continue

# Get sugar orientation from string (`c` = cis, `t` = trans)
sugar_orientation = lw_string[0]

# The residue ids of the nucleotides
res_ids = [None]*2

for i, nucleotide in enumerate(nucleotides):
nucleotide_list = nucleotide.split('.')

# if the nucleotide is not part of the specified chain, skip
# base pair
if nucleotide_list[1] != chain:
break

res_ids[i] = nucleotide_list[3]

if None in res_ids:
continue

if sugar_orientation == 'c':
sugar_orientation = 1
elif sugar_orientation == 't':
sugar_orientation = 2

this_output = sorted((int(res_ids[0]), int(res_ids[1])))
this_output.append(int(sugar_orientation))
output_list.append(this_output)
output_list = np.unique(output_list, axis=0).tolist()
with open(output, 'w') as f:
json.dump(output_list, f, indent=1)

if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Parse the glycosidic bond orientation annotations in the "
"NAKB-database for a specific chain. The annotations can be "
"downloaded in the section 'Base Pairs'."
)
parser.add_argument(
"infile",
help="The path to the input file."
)
parser.add_argument(
"outfile",
help="The path to the output JSON file."
)
parser.add_argument(
"chain",
help="The chain ID to be extracted."
)
args = parser.parse_args()

process(args.infile, args.outfile, args.chain)

Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import pandas as pd
import argparse
import json
import numpy as np

def process(input, output, chain):
data = pd.read_csv(input)
# Only retain rows with basepair annotation
data = data[data['Leontis-Westhof'].notna()]

output_list = []

for _, row in data.iterrows():
nucleotides = [row['Nucleotide 1'], row['Nucleotide 2']]

# Extract the Leontis-Westhof annotation
lw_string = row['Leontis-Westhof']

# Some interactions are labelled with `n` for near. These are
# ignored
if lw_string[0] == 'n':
continue

# Get edge annotations from string
edges = [lw_string[-2], lw_string[-1]]

# Dont allow unspecified edges in test data
if '.' in edges:
continue

res_ids = [None]*2
for i, nucleotide in enumerate(nucleotides):
nucleotide_list = nucleotide.split('.')

# if the nucleotide is not part of the specified chain, skip
# base pair
if nucleotide_list[1] != chain:
break

res_ids[i] = nucleotide_list[3]

if None in res_ids:
continue

for i, edge in enumerate(edges):
if edge == 'W':
edges[i] = 1
if edge == 'H':
edges[i] = 2
if edge == 'S':
edges[i] = 3

# Lower residue id on the left, higher residue id on the right
res_ids = np.array(res_ids, dtype=int)
edges = np.array(edges, dtype=int)
sorter = np.argsort(res_ids)
res_ids = res_ids[sorter]
edges = edges[sorter]

output_list.append(
(int(res_ids[0]), int(res_ids[1]), int(edges[0]), int(edges[1]))
)

output_list = np.unique(output_list, axis=0).tolist()
with open(output, 'w') as f:
json.dump(output_list, f, indent=1)

if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Parse the edge type annotations in the NAKB-database for "
"a specific chain. The annotations can be downloaded in the section "
"'Base Pairs'."
)
parser.add_argument(
"infile",
help="The path to the input file."
)
parser.add_argument(
"outfile",
help="The path to the output JSON file."
)
parser.add_argument(
"chain",
help="The chain ID to be extracted."
)
args = parser.parse_args()

process(args.infile, args.outfile, args.chain)

84 changes: 84 additions & 0 deletions tests/structure/data/create_test_structures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import argparse
import subprocess
from os.path import join
import logging
import os
import sys
import biotite
from biotite.database import RequestError
import biotite.database.rcsb as rcsb
import biotite.structure.io as strucio


def create(pdb_id, directory, include_gro):
# Create *.pdb", *.cif and *.mmtf
for file_format in ["pdb", "cif", "bcif", "mmtf"]:
try:
rcsb.fetch(pdb_id, file_format, directory, overwrite=True)
except RequestError:
# PDB entry is not provided in this format
pass
try:
array = strucio.load_structure(join(directory, pdb_id+".pdb"))
except biotite.InvalidFileError:
# Structure probably contains multiple models with different
# number of atoms
# -> Cannot load AtomArrayStack
# -> Skip writing GRO and NPZ file
return
# Create *.gro file
strucio.save_structure(join(directory, pdb_id+".npz"), array)
# Create *.gro files using GROMACS
# Clean PDB file -> remove inscodes and altlocs
if include_gro:
cleaned_file_name = biotite.temp_file("pdb")
strucio.save_structure(cleaned_file_name, array)
# Run GROMACS for file conversion
subprocess.run([
"editconf",
"-f", cleaned_file_name,
"-o", join(directory, pdb_id+".gro")
], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Create structure files for unit tests "
"in all supported formats from PDB ID "
"(excluding GROMACS trajectory files)"
)
parser.add_argument(
"--dir", "-d", dest="directory", default=".",
help="the Biotite project directory to put the test files into"
)
parser.add_argument(
"--id", "-i", dest="id",
help="the PDB ID"
)
parser.add_argument(
"--file", "-f", dest="file",
help="read mutliple PDB IDs from text file (line break separated IDs)"
)
parser.add_argument(
"--gromacs", "-g", action="store_true", dest="include_gro",
help="Create '*.gro' files using the Gromacs software"
)
args = parser.parse_args()

if args.file is not None:
with open(args.file, "r") as file:
pdb_ids = [pdb_id.strip().lower() for pdb_id
in file.read().split("\n") if len(pdb_id.strip()) != 0]
elif args.id is not None:
pdb_ids = [args.id.lower()]
else:
logging.error("Must specifiy PDB ID(s)")
sys.exit()

for i, pdb_id in enumerate(pdb_ids):
print(f"{i:2d}/{len(pdb_ids):2d}: {pdb_id}", end="\r")
try:
create(pdb_id, args.directory, args.include_gro)
except:
print()
raise
19 changes: 19 additions & 0 deletions tests/structure/data/ids.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
1aki
1dix
1f2n
5zng
1gya
2axd
1igy
1l2y
1o1z
3o5r
5h73
5ugo
5ugo
4gxy
2d0f
5eil
4p5j
1crr
7gsa

0 comments on commit 8c16fb0

Please sign in to comment.