Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate remaining content of biotite-util repository to biotite #550

Merged
merged 2 commits into from
Apr 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions tests/structure/data/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,12 @@ Test structures
4P5J: Structure contains a non-canonical nucleotide
1CRR: Multi-model structure with the first model ID not being 1
7GSA: Contains 5-character residue name 'A1AA6'

Creation
--------

To create the test files for the above entries, run the following command:

.. code-block:: console

$ python create_test_structures.py -f ids.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import pandas as pd
import argparse
import numpy as np
import json

def process(input, output, chain):
data = pd.read_csv(input)
# Only retain rows with basepair annotation
data = data[data['Leontis-Westhof'].notna()]

output_list = []

for _, row in data.iterrows():

nucleotides = [row['Nucleotide 1'], row['Nucleotide 2']]

# Extract the Leontis-Westhof annotation
lw_string = row['Leontis-Westhof']

# Some interactions are labelled with `n` for near. These are
# ignored
if lw_string[0] == 'n':
continue

# Get sugar orientation from string (`c` = cis, `t` = trans)
sugar_orientation = lw_string[0]

# The residue ids of the nucleotides
res_ids = [None]*2

for i, nucleotide in enumerate(nucleotides):
nucleotide_list = nucleotide.split('.')

# if the nucleotide is not part of the specified chain, skip
# base pair
if nucleotide_list[1] != chain:
break

res_ids[i] = nucleotide_list[3]

if None in res_ids:
continue

if sugar_orientation == 'c':
sugar_orientation = 1
elif sugar_orientation == 't':
sugar_orientation = 2

this_output = sorted((int(res_ids[0]), int(res_ids[1])))
this_output.append(int(sugar_orientation))
output_list.append(this_output)
output_list = np.unique(output_list, axis=0).tolist()
with open(output, 'w') as f:
json.dump(output_list, f, indent=1)

if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Parse the glycosidic bond orientation annotations in the "
"NAKB-database for a specific chain. The annotations can be "
"downloaded in the section 'Base Pairs'."
)
parser.add_argument(
"infile",
help="The path to the input file."
)
parser.add_argument(
"outfile",
help="The path to the output JSON file."
)
parser.add_argument(
"chain",
help="The chain ID to be extracted."
)
args = parser.parse_args()

process(args.infile, args.outfile, args.chain)

Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import pandas as pd
import argparse
import json
import numpy as np

def process(input, output, chain):
data = pd.read_csv(input)
# Only retain rows with basepair annotation
data = data[data['Leontis-Westhof'].notna()]

output_list = []

for _, row in data.iterrows():
nucleotides = [row['Nucleotide 1'], row['Nucleotide 2']]

# Extract the Leontis-Westhof annotation
lw_string = row['Leontis-Westhof']

# Some interactions are labelled with `n` for near. These are
# ignored
if lw_string[0] == 'n':
continue

# Get edge annotations from string
edges = [lw_string[-2], lw_string[-1]]

# Dont allow unspecified edges in test data
if '.' in edges:
continue

res_ids = [None]*2
for i, nucleotide in enumerate(nucleotides):
nucleotide_list = nucleotide.split('.')

# if the nucleotide is not part of the specified chain, skip
# base pair
if nucleotide_list[1] != chain:
break

res_ids[i] = nucleotide_list[3]

if None in res_ids:
continue

for i, edge in enumerate(edges):
if edge == 'W':
edges[i] = 1
if edge == 'H':
edges[i] = 2
if edge == 'S':
edges[i] = 3

# Lower residue id on the left, higher residue id on the right
res_ids = np.array(res_ids, dtype=int)
edges = np.array(edges, dtype=int)
sorter = np.argsort(res_ids)
res_ids = res_ids[sorter]
edges = edges[sorter]

output_list.append(
(int(res_ids[0]), int(res_ids[1]), int(edges[0]), int(edges[1]))
)

output_list = np.unique(output_list, axis=0).tolist()
with open(output, 'w') as f:
json.dump(output_list, f, indent=1)

if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Parse the edge type annotations in the NAKB-database for "
"a specific chain. The annotations can be downloaded in the section "
"'Base Pairs'."
)
parser.add_argument(
"infile",
help="The path to the input file."
)
parser.add_argument(
"outfile",
help="The path to the output JSON file."
)
parser.add_argument(
"chain",
help="The chain ID to be extracted."
)
args = parser.parse_args()

process(args.infile, args.outfile, args.chain)

84 changes: 84 additions & 0 deletions tests/structure/data/create_test_structures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import argparse
import subprocess
from os.path import join
import logging
import os
import sys
import biotite
from biotite.database import RequestError
import biotite.database.rcsb as rcsb
import biotite.structure.io as strucio


def create(pdb_id, directory, include_gro):
# Create *.pdb", *.cif and *.mmtf
for file_format in ["pdb", "cif", "bcif", "mmtf"]:
try:
rcsb.fetch(pdb_id, file_format, directory, overwrite=True)
except RequestError:
# PDB entry is not provided in this format
pass
try:
array = strucio.load_structure(join(directory, pdb_id+".pdb"))
except biotite.InvalidFileError:
# Structure probably contains multiple models with different
# number of atoms
# -> Cannot load AtomArrayStack
# -> Skip writing GRO and NPZ file
return
# Create *.gro file
strucio.save_structure(join(directory, pdb_id+".npz"), array)
# Create *.gro files using GROMACS
# Clean PDB file -> remove inscodes and altlocs
if include_gro:
cleaned_file_name = biotite.temp_file("pdb")
strucio.save_structure(cleaned_file_name, array)
# Run GROMACS for file conversion
subprocess.run([
"editconf",
"-f", cleaned_file_name,
"-o", join(directory, pdb_id+".gro")
], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Create structure files for unit tests "
"in all supported formats from PDB ID "
"(excluding GROMACS trajectory files)"
)
parser.add_argument(
"--dir", "-d", dest="directory", default=".",
help="the Biotite project directory to put the test files into"
)
parser.add_argument(
"--id", "-i", dest="id",
help="the PDB ID"
)
parser.add_argument(
"--file", "-f", dest="file",
help="read mutliple PDB IDs from text file (line break separated IDs)"
)
parser.add_argument(
"--gromacs", "-g", action="store_true", dest="include_gro",
help="Create '*.gro' files using the Gromacs software"
)
args = parser.parse_args()

if args.file is not None:
with open(args.file, "r") as file:
pdb_ids = [pdb_id.strip().lower() for pdb_id
in file.read().split("\n") if len(pdb_id.strip()) != 0]
elif args.id is not None:
pdb_ids = [args.id.lower()]
else:
logging.error("Must specifiy PDB ID(s)")
sys.exit()

for i, pdb_id in enumerate(pdb_ids):
print(f"{i:2d}/{len(pdb_ids):2d}: {pdb_id}", end="\r")
try:
create(pdb_id, args.directory, args.include_gro)
except:
print()
raise
19 changes: 19 additions & 0 deletions tests/structure/data/ids.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
1aki
1dix
1f2n
5zng
1gya
2axd
1igy
1l2y
1o1z
3o5r
5h73
5ugo
5ugo
4gxy
2d0f
5eil
4p5j
1crr
7gsa
Loading