-
Notifications
You must be signed in to change notification settings - Fork 103
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #550 from padix-key/util-scripts
Migrate remaining content of `biotite-util` repository to `biotite`
- Loading branch information
Showing
5 changed files
with
278 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
77 changes: 77 additions & 0 deletions
77
tests/structure/data/base_pairs/create_bond_orientation_test_data.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
import pandas as pd | ||
import argparse | ||
import numpy as np | ||
import json | ||
|
||
def process(input, output, chain): | ||
data = pd.read_csv(input) | ||
# Only retain rows with basepair annotation | ||
data = data[data['Leontis-Westhof'].notna()] | ||
|
||
output_list = [] | ||
|
||
for _, row in data.iterrows(): | ||
|
||
nucleotides = [row['Nucleotide 1'], row['Nucleotide 2']] | ||
|
||
# Extract the Leontis-Westhof annotation | ||
lw_string = row['Leontis-Westhof'] | ||
|
||
# Some interactions are labelled with `n` for near. These are | ||
# ignored | ||
if lw_string[0] == 'n': | ||
continue | ||
|
||
# Get sugar orientation from string (`c` = cis, `t` = trans) | ||
sugar_orientation = lw_string[0] | ||
|
||
# The residue ids of the nucleotides | ||
res_ids = [None]*2 | ||
|
||
for i, nucleotide in enumerate(nucleotides): | ||
nucleotide_list = nucleotide.split('.') | ||
|
||
# if the nucleotide is not part of the specified chain, skip | ||
# base pair | ||
if nucleotide_list[1] != chain: | ||
break | ||
|
||
res_ids[i] = nucleotide_list[3] | ||
|
||
if None in res_ids: | ||
continue | ||
|
||
if sugar_orientation == 'c': | ||
sugar_orientation = 1 | ||
elif sugar_orientation == 't': | ||
sugar_orientation = 2 | ||
|
||
this_output = sorted((int(res_ids[0]), int(res_ids[1]))) | ||
this_output.append(int(sugar_orientation)) | ||
output_list.append(this_output) | ||
output_list = np.unique(output_list, axis=0).tolist() | ||
with open(output, 'w') as f: | ||
json.dump(output_list, f, indent=1) | ||
|
||
if __name__ == "__main__": | ||
parser = argparse.ArgumentParser( | ||
description="Parse the glycosidic bond orientation annotations in the " | ||
"NAKB-database for a specific chain. The annotations can be " | ||
"downloaded in the section 'Base Pairs'." | ||
) | ||
parser.add_argument( | ||
"infile", | ||
help="The path to the input file." | ||
) | ||
parser.add_argument( | ||
"outfile", | ||
help="The path to the output JSON file." | ||
) | ||
parser.add_argument( | ||
"chain", | ||
help="The chain ID to be extracted." | ||
) | ||
args = parser.parse_args() | ||
|
||
process(args.infile, args.outfile, args.chain) | ||
|
89 changes: 89 additions & 0 deletions
89
tests/structure/data/base_pairs/create_interacting_edge_test_data.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
import pandas as pd | ||
import argparse | ||
import json | ||
import numpy as np | ||
|
||
def process(input, output, chain): | ||
data = pd.read_csv(input) | ||
# Only retain rows with basepair annotation | ||
data = data[data['Leontis-Westhof'].notna()] | ||
|
||
output_list = [] | ||
|
||
for _, row in data.iterrows(): | ||
nucleotides = [row['Nucleotide 1'], row['Nucleotide 2']] | ||
|
||
# Extract the Leontis-Westhof annotation | ||
lw_string = row['Leontis-Westhof'] | ||
|
||
# Some interactions are labelled with `n` for near. These are | ||
# ignored | ||
if lw_string[0] == 'n': | ||
continue | ||
|
||
# Get edge annotations from string | ||
edges = [lw_string[-2], lw_string[-1]] | ||
|
||
# Dont allow unspecified edges in test data | ||
if '.' in edges: | ||
continue | ||
|
||
res_ids = [None]*2 | ||
for i, nucleotide in enumerate(nucleotides): | ||
nucleotide_list = nucleotide.split('.') | ||
|
||
# if the nucleotide is not part of the specified chain, skip | ||
# base pair | ||
if nucleotide_list[1] != chain: | ||
break | ||
|
||
res_ids[i] = nucleotide_list[3] | ||
|
||
if None in res_ids: | ||
continue | ||
|
||
for i, edge in enumerate(edges): | ||
if edge == 'W': | ||
edges[i] = 1 | ||
if edge == 'H': | ||
edges[i] = 2 | ||
if edge == 'S': | ||
edges[i] = 3 | ||
|
||
# Lower residue id on the left, higher residue id on the right | ||
res_ids = np.array(res_ids, dtype=int) | ||
edges = np.array(edges, dtype=int) | ||
sorter = np.argsort(res_ids) | ||
res_ids = res_ids[sorter] | ||
edges = edges[sorter] | ||
|
||
output_list.append( | ||
(int(res_ids[0]), int(res_ids[1]), int(edges[0]), int(edges[1])) | ||
) | ||
|
||
output_list = np.unique(output_list, axis=0).tolist() | ||
with open(output, 'w') as f: | ||
json.dump(output_list, f, indent=1) | ||
|
||
if __name__ == "__main__": | ||
parser = argparse.ArgumentParser( | ||
description="Parse the edge type annotations in the NAKB-database for " | ||
"a specific chain. The annotations can be downloaded in the section " | ||
"'Base Pairs'." | ||
) | ||
parser.add_argument( | ||
"infile", | ||
help="The path to the input file." | ||
) | ||
parser.add_argument( | ||
"outfile", | ||
help="The path to the output JSON file." | ||
) | ||
parser.add_argument( | ||
"chain", | ||
help="The chain ID to be extracted." | ||
) | ||
args = parser.parse_args() | ||
|
||
process(args.infile, args.outfile, args.chain) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
import argparse | ||
import subprocess | ||
from os.path import join | ||
import logging | ||
import os | ||
import sys | ||
import biotite | ||
from biotite.database import RequestError | ||
import biotite.database.rcsb as rcsb | ||
import biotite.structure.io as strucio | ||
|
||
|
||
def create(pdb_id, directory, include_gro): | ||
# Create *.pdb", *.cif and *.mmtf | ||
for file_format in ["pdb", "cif", "bcif", "mmtf"]: | ||
try: | ||
rcsb.fetch(pdb_id, file_format, directory, overwrite=True) | ||
except RequestError: | ||
# PDB entry is not provided in this format | ||
pass | ||
try: | ||
array = strucio.load_structure(join(directory, pdb_id+".pdb")) | ||
except biotite.InvalidFileError: | ||
# Structure probably contains multiple models with different | ||
# number of atoms | ||
# -> Cannot load AtomArrayStack | ||
# -> Skip writing GRO and NPZ file | ||
return | ||
# Create *.gro file | ||
strucio.save_structure(join(directory, pdb_id+".npz"), array) | ||
# Create *.gro files using GROMACS | ||
# Clean PDB file -> remove inscodes and altlocs | ||
if include_gro: | ||
cleaned_file_name = biotite.temp_file("pdb") | ||
strucio.save_structure(cleaned_file_name, array) | ||
# Run GROMACS for file conversion | ||
subprocess.run([ | ||
"editconf", | ||
"-f", cleaned_file_name, | ||
"-o", join(directory, pdb_id+".gro") | ||
], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) | ||
|
||
|
||
if __name__ == "__main__": | ||
parser = argparse.ArgumentParser( | ||
description="Create structure files for unit tests " | ||
"in all supported formats from PDB ID " | ||
"(excluding GROMACS trajectory files)" | ||
) | ||
parser.add_argument( | ||
"--dir", "-d", dest="directory", default=".", | ||
help="the Biotite project directory to put the test files into" | ||
) | ||
parser.add_argument( | ||
"--id", "-i", dest="id", | ||
help="the PDB ID" | ||
) | ||
parser.add_argument( | ||
"--file", "-f", dest="file", | ||
help="read mutliple PDB IDs from text file (line break separated IDs)" | ||
) | ||
parser.add_argument( | ||
"--gromacs", "-g", action="store_true", dest="include_gro", | ||
help="Create '*.gro' files using the Gromacs software" | ||
) | ||
args = parser.parse_args() | ||
|
||
if args.file is not None: | ||
with open(args.file, "r") as file: | ||
pdb_ids = [pdb_id.strip().lower() for pdb_id | ||
in file.read().split("\n") if len(pdb_id.strip()) != 0] | ||
elif args.id is not None: | ||
pdb_ids = [args.id.lower()] | ||
else: | ||
logging.error("Must specifiy PDB ID(s)") | ||
sys.exit() | ||
|
||
for i, pdb_id in enumerate(pdb_ids): | ||
print(f"{i:2d}/{len(pdb_ids):2d}: {pdb_id}", end="\r") | ||
try: | ||
create(pdb_id, args.directory, args.include_gro) | ||
except: | ||
print() | ||
raise |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
1aki | ||
1dix | ||
1f2n | ||
5zng | ||
1gya | ||
2axd | ||
1igy | ||
1l2y | ||
1o1z | ||
3o5r | ||
5h73 | ||
5ugo | ||
5ugo | ||
4gxy | ||
2d0f | ||
5eil | ||
4p5j | ||
1crr | ||
7gsa |