Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

V1.5.1 #201

Merged
merged 12 commits into from
Aug 1, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,15 @@

* [Patch] - [#187](https://github.com/a-r-j/graphein/pull/187) updates sequence retrieval due to UniProt API changes.
* [Patch] - [#189](https://github.com/a-r-j/graphein/pull/189) fixes bug where chains and PDB identifiers were not properly aligned in `ml.ProteinGraphDataset`.
* [Patch] - [#201](https://github.com/a-r-j/graphein/pull/201) Adds missing `MSE` to `graphein.protein.resi_atoms.RESI_NAMES` and `graphein.protein.resi_atoms.RESI_THREE_TO_1`. [#200](https://github.com/a-r-j/graphein/issues/200)
* [Patch] - [#201](https://github.com/a-r-j/graphein/pull/201) Fixes bug where check for same-chain always evaluates as False. [#199](https://github.com/a-r-j/graphein/issues/199)
* [Patch] - [#201](https://github.com/a-r-j/graphein/pull/201) Fixes bug where deprotonation would only remove hydrogens based on `atom_name` rather than `element_symbol`. [#198](https://github.com/a-r-j/graphein/issues/198)
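* [Patch] - [#201](https://github.com/a-r-j/graphein/pull/201) Fixes bug in `ProteinGraphDataset` input validation: the length checks for graph labels, node labels and chain selections are now skipped when the corresponding argument is `None`.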

#### Breaking Changes

* [#189](https://github.com/a-r-j/graphein/pull/189/) refactors the PDB download util. It now returns the path to the downloaded file and no longer accepts a config object; instead, it receives the output directory path directly.


### 1.5.0

#### Protein
Expand Down
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
author = "Arian Jamasb"

# The full version, including alpha/beta/rc tags
release = "1.5.0"
release = "1.5.1"


# -- General configuration ---------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion graphein/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from .testing import *

__author__ = "Arian Jamasb <arian@jamasb.io>"
__version__ = "1.5.0"
__version__ = "1.5.1"


logger.configure(
Expand Down
61 changes: 34 additions & 27 deletions graphein/ml/datasets/torch_geometric_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import logging as log
import os
from pathlib import Path
from typing import Callable, Dict, List, Optional
from typing import Callable, Dict, Generator, List, Optional

import networkx as nx
from tqdm import tqdm
Expand Down Expand Up @@ -414,9 +414,9 @@ def __init__(
if chain_selections is not None:
self.chain_selection_map = dict(enumerate(chain_selections))
else:
self.graph_label_map = None
self.chain_selection_map = None
self.validate_input()
self.bad_pdbs: List[str] = []
self.bad_pdbs: List[str] = []

# Configs
self.config = graphein_config
Expand Down Expand Up @@ -451,23 +451,26 @@ def processed_file_names(self) -> List[str]:
return [f"{pdb}.pt" for pdb in self.structures]

def validate_input(self):
assert len(self.structures) == len(
self.graph_label_map
), "Number of proteins and graph labels must match"
assert len(self.structures) == len(
self.node_label_map
), "Number of proteins and node labels must match"
assert len(self.structures) == len(
self.chain_selection_map
), "Number of proteins and chain selections must match"
assert len(
{
f"{pdb}_{chain}"
for pdb, chain in zip(
self.structures, self.chain_selection_map
)
}
) == len(self.structures), "Duplicate protein/chain combinations"
if self.graph_label_map is not None:
assert len(self.structures) == len(
self.graph_label_map
), "Number of proteins and graph labels must match"
if self.node_label_map is not None:
assert len(self.structures) == len(
self.node_label_map
), "Number of proteins and node labels must match"
if self.chain_selection_map is not None:
assert len(self.structures) == len(
self.chain_selection_map
), "Number of proteins and chain selections must match"
assert len(
{
f"{pdb}_{chain}"
for pdb, chain in zip(
self.structures, self.chain_selection_map
)
}
) == len(self.structures), "Duplicate protein/chain combinations"

def download(self):
"""Download the PDB files from RCSB or Alphafold."""
Expand Down Expand Up @@ -530,7 +533,7 @@ def process(self):
# Chunk dataset for parallel processing
chunk_size = 128

def divide_chunks(l: List[str], n: int = 2) -> List[List[str]]:
def divide_chunks(l: List[str], n: int = 2) -> Generator:
for i in range(0, len(l), n):
yield l[i : i + n]

Expand Down Expand Up @@ -584,12 +587,16 @@ def divide_chunks(l: List[str], n: int = 2) -> List[List[str]]:
data_list = [self.pre_transform(data) for data in data_list]

for i, (pdb, chain) in enumerate(zip(pdbs, chain_selections)):

torch.save(
data_list[i],
os.path.join(self.processed_dir, f"{pdb}_{chain}.pt"),
)
idx += 1
if self.chain_selection_map is None:
torch.save(
data_list[i],
os.path.join(self.processed_dir, f"{pdb}.pt"),
)
else:
torch.save(
data_list[i],
os.path.join(self.processed_dir, f"{pdb}_{chain}.pt"),
)

def get(self, idx: int):
"""
Expand Down
2 changes: 1 addition & 1 deletion graphein/protein/edges/atomic.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def add_atomic_edges(G: nx.Graph, tolerance: float = 0.56) -> nx.Graph:
continue

# Check atoms are in the same chain
if not (chain_1 and chain_2):
if chain_1 != chain_2:
continue

if G.has_edge(node_1, node_2):
Expand Down
2 changes: 1 addition & 1 deletion graphein/protein/graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ def deprotonate_structure(df: pd.DataFrame) -> pd.DataFrame:
"Deprotonating protein. This removes H atoms from the pdb_df dataframe"
)
return filter_dataframe(
df, by_column="atom_name", list_of_values=["H"], boolean=False
df, by_column="element_symbol", list_of_values=["H"], boolean=False
)


Expand Down
2 changes: 2 additions & 0 deletions graphein/protein/resi_atoms.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,7 @@
"LYS",
"MET",
"MLE",
"MSE",
"MVA",
"NH2",
"NLE",
Expand Down Expand Up @@ -434,6 +435,7 @@
"LYS": "K",
"MET": "M",
"MLE": "L",
"MSE": "M",
"MVA": "V",
"NH2": "X",
"NLE": "L",
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def run(self):

setup(
name="graphein",
version="1.5.0",
version="1.5.1",
# versioneer.get_version(),
# cmdclass=versioneer.get_cmdclass(),
description="Protein & Interactomic Graph Construction for Machine Learning",
Expand Down