From af3e981d0937808a6c7f4ab44d0a1f72b690f18d Mon Sep 17 00:00:00 2001 From: max Date: Mon, 14 Oct 2024 16:54:48 +0200 Subject: [PATCH] Revert "Embeddings esm (#93)" This reverts commit 50ce1f18ffbed0d20d73987c3b1c3f81375b6138. --- README.md | 12 +- docs/examples/ids.json | 69 +- docs/model_diagram.json | 247 --- pyeed/__init__.py | 22 +- pyeed/adapter/primary_db_adapter.py | 206 --- pyeed/adapter/uniprot_mapper.py | 70 - pyeed/{old => }/align/__init__.py | 0 pyeed/{old => }/align/abstract_aligner.py | 0 pyeed/{old => }/align/hmm.py | 0 pyeed/{old => }/align/msa.py | 0 pyeed/{old => }/align/pairwise.py | 0 pyeed/{old => }/cluster/__init__.py | 0 pyeed/{old => }/cluster/cluster.py | 0 pyeed/{old => }/cluster/mmseqs2.py | 0 pyeed/{old => }/core/__init__.py | 0 pyeed/{old => }/core/abstractannotation.py | 0 pyeed/{old => }/core/alignmentresult.py | 0 pyeed/{old => }/core/annotation.py | 0 pyeed/{old => }/core/blastdata.py | 0 pyeed/{old => }/core/clustalomegaresult.py | 0 pyeed/{old => }/core/cluster.py | 0 pyeed/{old => }/core/dnarecord.py | 0 pyeed/{old => }/core/numberedsequence.py | 0 pyeed/{old => }/core/ontology.py | 0 pyeed/{old => }/core/organism.py | 0 .../{old => }/core/pairwisealignmentresult.py | 0 pyeed/{old => }/core/proteinrecord.py | 3 +- pyeed/{old => }/core/region.py | 0 pyeed/{old => }/core/regionset.py | 0 pyeed/{old => }/core/sequence.py | 0 pyeed/{old => }/core/sequencerecord.py | 0 pyeed/{old => }/core/sequencetype.py | 0 pyeed/{old => }/core/site.py | 0 pyeed/{old => }/core/standardnumbering.py | 0 pyeed/dbconnect.py | 185 --- pyeed/embedding.py | 54 - pyeed/{adapter => fetch}/__init__.py | 0 pyeed/{old => }/fetch/blast.py | 5 +- pyeed/{old => }/fetch/dbsort.py | 0 pyeed/{old => }/fetch/dnafetcher.py | 15 +- pyeed/{old => }/fetch/ncbidnamapper.py | 0 pyeed/{old => }/fetch/ncbiproteinmapper.py | 0 pyeed/{old => }/fetch/pdbmapper.py | 0 pyeed/{old => }/fetch/proteinfetcher.py | 18 +- pyeed/fetch/requester.py | 256 +++ pyeed/{old => 
}/fetch/taxonomymapper.py | 0 pyeed/fetch/uniprotmapper.py | 124 ++ pyeed/model.py | 226 --- pyeed/{old => }/network/__init__.py | 0 pyeed/{old => }/network/network.py | 0 pyeed/old/fetch/__init__.py | 0 pyeed/old/fetch/requester.py | 206 --- pyeed/old/schemes/DNARecord.json | 182 --- pyeed/old/schemes/ProteinRecord.json | 241 --- pyeed/old/schemes/proteinrecord.shex | 125 -- pyeed/pyeed.py | 106 -- pyeed/{old => }/schemes/pyeed_schema.md | 0 pyeed/{old => }/tools/__init__.py | 0 pyeed/{old => }/tools/abstract_tool.py | 0 pyeed/{old => }/tools/clustalo.py | 0 pyproject.toml | 16 +- .../sequence_record.md | 164 +- test.ipynb | 1424 ----------------- 63 files changed, 588 insertions(+), 3388 deletions(-) delete mode 100644 docs/model_diagram.json delete mode 100644 pyeed/adapter/primary_db_adapter.py delete mode 100644 pyeed/adapter/uniprot_mapper.py rename pyeed/{old => }/align/__init__.py (100%) rename pyeed/{old => }/align/abstract_aligner.py (100%) rename pyeed/{old => }/align/hmm.py (100%) rename pyeed/{old => }/align/msa.py (100%) rename pyeed/{old => }/align/pairwise.py (100%) rename pyeed/{old => }/cluster/__init__.py (100%) rename pyeed/{old => }/cluster/cluster.py (100%) rename pyeed/{old => }/cluster/mmseqs2.py (100%) rename pyeed/{old => }/core/__init__.py (100%) rename pyeed/{old => }/core/abstractannotation.py (100%) rename pyeed/{old => }/core/alignmentresult.py (100%) rename pyeed/{old => }/core/annotation.py (100%) rename pyeed/{old => }/core/blastdata.py (100%) rename pyeed/{old => }/core/clustalomegaresult.py (100%) rename pyeed/{old => }/core/cluster.py (100%) rename pyeed/{old => }/core/dnarecord.py (100%) rename pyeed/{old => }/core/numberedsequence.py (100%) rename pyeed/{old => }/core/ontology.py (100%) rename pyeed/{old => }/core/organism.py (100%) rename pyeed/{old => }/core/pairwisealignmentresult.py (100%) rename pyeed/{old => }/core/proteinrecord.py (99%) rename pyeed/{old => }/core/region.py (100%) rename pyeed/{old => 
}/core/regionset.py (100%) rename pyeed/{old => }/core/sequence.py (100%) rename pyeed/{old => }/core/sequencerecord.py (100%) rename pyeed/{old => }/core/sequencetype.py (100%) rename pyeed/{old => }/core/site.py (100%) rename pyeed/{old => }/core/standardnumbering.py (100%) delete mode 100644 pyeed/dbconnect.py delete mode 100644 pyeed/embedding.py rename pyeed/{adapter => fetch}/__init__.py (100%) rename pyeed/{old => }/fetch/blast.py (99%) rename pyeed/{old => }/fetch/dbsort.py (100%) rename pyeed/{old => }/fetch/dnafetcher.py (92%) rename pyeed/{old => }/fetch/ncbidnamapper.py (100%) rename pyeed/{old => }/fetch/ncbiproteinmapper.py (100%) rename pyeed/{old => }/fetch/pdbmapper.py (100%) rename pyeed/{old => }/fetch/proteinfetcher.py (96%) create mode 100644 pyeed/fetch/requester.py rename pyeed/{old => }/fetch/taxonomymapper.py (100%) create mode 100644 pyeed/fetch/uniprotmapper.py delete mode 100644 pyeed/model.py rename pyeed/{old => }/network/__init__.py (100%) rename pyeed/{old => }/network/network.py (100%) delete mode 100644 pyeed/old/fetch/__init__.py delete mode 100644 pyeed/old/fetch/requester.py delete mode 100644 pyeed/old/schemes/DNARecord.json delete mode 100644 pyeed/old/schemes/ProteinRecord.json delete mode 100644 pyeed/old/schemes/proteinrecord.shex delete mode 100644 pyeed/pyeed.py rename pyeed/{old => }/schemes/pyeed_schema.md (100%) rename pyeed/{old => }/tools/__init__.py (100%) rename pyeed/{old => }/tools/abstract_tool.py (100%) rename pyeed/{old => }/tools/clustalo.py (100%) rename {pyeed/old/specifications => specifications}/sequence_record.md (71%) delete mode 100644 test.ipynb diff --git a/README.md b/README.md index 44b0be34..b7bc078c 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ [![Documentation](https://github.com/PyEED/pyeed/actions/workflows/make_docs.yaml/badge.svg)](https://github.com/PyEED/pyeed/actions/workflows/make_docs.yaml) ## About 📖 -pyeed is a toolkit enabling object-oriented analysis of protein sequences, 
instead of working with sequences in a file-oriented fashion. This will enable the user to easily access and manipulate sequence information and to perform analyses on the sequence data. +pyEED is a toolkit enabling object-oriented analysis of protein sequences, instead of working with sequences in a file-oriented fashion. This will enable the user to easily access and manipulate sequence information and to perform analyses on the sequence data. This library is currently under development and thus the API is subject to change. @@ -20,7 +20,9 @@ pip install git+https://github.com/PyEED/pyeed.git ## Quick start 🚀 -### Launch Neo4j database via Docker and mount to a local directory -```bash -docker run --name pyeed-neo4j -p 7474:7474 -p 7687:7687 -v $PWD/data:/data -v $PWD/logs:/logs -v $PWD/import:/var/lib/neo4j/import -v $PWD/plugins:/plugins -e NEO4J_AUTH=neo4j/test -d neo4j -``` +The library is currently being refactored; the quick start will be updated soon! + +## Documentation 📘 + +Check out the [documentation](https://pyeed.github.io/pyeed/) for in-depth information on how to set up `pyeed`, +use the built-in tools, and store sequence data in databases.
diff --git a/docs/examples/ids.json b/docs/examples/ids.json index c1f159b4..92b5550b 100644 --- a/docs/examples/ids.json +++ b/docs/examples/ids.json @@ -1 +1,68 @@ -["Q9YBK2", "A6UQS6", "A1RSD7", "Q46CW6", "Q4JAL1", "Q8TU57", "P26498", "Q980S9", "B1YC36", "C3MJ04", "Q9HM12", "A3MY01", "A9A923", "Q5V2S5", "A4G0T1", "Q8TZW1", "Q6L123", "Q2FN14", "Q8PWS4", "Q18H49", "Q8ZYP7", "A8MD44", "P0DF56", "B0R5A8", "B6YUL1", "O30186", "A5UMW7", "Q2NEB3", "A6VHQ4", "Q12WC8", "A7I771", "Q5JF22", "Q8TV85", "O67275", "A2BIZ8", "Q3IQF5", "Q976F3", "P0CW63", "P0CW62", "C5A4B7", "O27429", "C3NF87", "Q97CT6", "Q58605", "Q9V1P7", "A6UU75", "O59488", "A0B742", "RUM32465.1", "NPA53530.1", "RLF67685.1", "MCL4350655.1", "NOZ59931.1", "WP_248897180.1", "MCL4451275.1", "RLE93954.1", "MBU7022768.1", "NHV96297.1", "PLJ76955.1", "WCN31494.1", "WP_211530438.1", "MBP7070042.1", "MCD6512403.1", "MCI2415466.1", "TMJ11138.1", "NQE45598.1"] \ No newline at end of file +[ + "WP_211530438", + "RUM32465", + "NOZ59931", + "MCI2415466", + "MCD6512403", + "MCL4350655", + "RLF67685", + "RLE93954", + "TMJ11138", + "WP_248897180", + "MBU7022768", + "NPA53530", + "MCL4451275", + "MBP7070042", + "NHV96297", + "NQE45598", + "PLJ76955", + "Q6L123", + "Q8TZW1", + "Q8ZYP7", + "Q46CW6", + "Q18H49", + "C3MJ04", + "Q8TV85", + "A6UU75", + "Q97CT6", + "O59488", + "WCN31494", + "O27429", + "Q2FN14", + "P0DF56", + "A0B742", + "Q2NEB3", + "Q9HM12", + "A5UMW7", + "Q9YBK2", + "A4G0T1", + "Q12WC8", + "A2BIZ8", + "O30186", + "C3NF87", + "A6UQS6", + "A1RSD7", + "A8MD44", + "P0CW62", + "A6VHQ4", + "A9A923", + "B6YUL1", + "Q5V2S5", + "Q4JAL1", + "B1YC36", + "P0CW63", + "Q980S9", + "Q3IQF5", + "Q9V1P7", + "Q8PWS4", + "Q5JF22", + "Q8TU57", + "C5A4B7", + "B0R5A8", + "P26498", + "O67275", + "A7I771", + "Q976F3", + "A3MY01", + "Q58605" +] \ No newline at end of file diff --git a/docs/model_diagram.json b/docs/model_diagram.json deleted file mode 100644 index 192fb3a4..00000000 --- a/docs/model_diagram.json +++ /dev/null @@ -1,247 
+0,0 @@ -{ - "style": { - "node-color": "#ffffff", - "border-color": "#000000", - "caption-color": "#000000", - "arrow-color": "#000000", - "label-background-color": "#ffffff", - "directionality": "directed", - "arrow-width": 5 - }, - "nodes": [ - { - "id": "n0", - "position": { - "x": 0, - "y": 0 - }, - "caption": "", - "style": {}, - "labels": [ - "StrictStructuredNode" - ], - "properties": {} - }, - { - "id": "n1", - "position": { - "x": 346.4101615137755, - "y": 199.99999999999997 - }, - "caption": "", - "style": {}, - "labels": [ - "Organism" - ], - "properties": { - "taxonomy_id": "int - required", - "name": "str", - "domain": "str", - "kingdom": "str", - "phylum": "str", - "tax_class": "str", - "order": "str", - "family": "str", - "genus": "str", - "species": "str" - } - }, - { - "id": "n2", - "position": { - "x": 2.4492935982947064e-14, - "y": 400.0 - }, - "caption": "", - "style": {}, - "labels": [ - "Site" - ], - "properties": { - "site_id": "id - unique", - "name": "str", - "positions": "list[int] - required", - "annotation": "str - required" - } - }, - { - "id": "n3", - "position": { - "x": -346.4101615137754, - "y": 200.00000000000014 - }, - "caption": "", - "style": {}, - "labels": [ - "Region" - ], - "properties": { - "region_id": "id - unique", - "start": "int - required", - "end": "int - required", - "annotation": "str - required" - } - }, - { - "id": "n4", - "position": { - "x": -346.4101615137755, - "y": -199.99999999999991 - }, - "caption": "", - "style": {}, - "labels": [ - "GOAnnotation" - ], - "properties": { - "go_id": "str - required", - "term": "str", - "definition": "str" - } - }, - { - "id": "n5", - "position": { - "x": -7.347880794884119e-14, - "y": -400.0 - }, - "caption": "", - "style": {}, - "labels": [ - "Protein" - ], - "properties": { - "accession_id": "str - required", - "sequence": "str - required", - "name": "str", - "seq_length": "int - required", - "mol_weight": "float", - "ec_number": "str", - "nucleotide_id": "str", - 
"locus_tag": "str", - "structure_ids": "list[str]", - "go_terms": "list[str]", - "embedding": "list[float]" - } - }, - { - "id": "n6", - "position": { - "x": 346.41016151377534, - "y": -200.00000000000017 - }, - "caption": "", - "style": {}, - "labels": [ - "DNA" - ], - "properties": { - "accession_id": "str - required", - "sequence": "str - required", - "name": "str", - "seq_length": "int - required", - "go_terms": "list[str]", - "embedding": "list[float]", - "gc_content": "float" - } - } - ], - "relationships": [ - { - "id": "e0", - "type": "ORIGINATES_FROM", - "style": {}, - "properties": {}, - "fromId": "n5", - "toId": "n1" - }, - { - "id": "e1", - "type": "ORIGINATES_FROM", - "style": {}, - "properties": {}, - "fromId": "n6", - "toId": "n1" - }, - { - "id": "e2", - "type": "ASSOCIATED_WITH", - "style": {}, - "properties": {}, - "fromId": "n4", - "toId": "n5" - }, - { - "id": "e3", - "type": "ASSOCIATED_WITH", - "style": {}, - "properties": {}, - "fromId": "n4", - "toId": "n6" - }, - { - "id": "e4", - "type": "ORIGINATES_FROM", - "style": {}, - "properties": {}, - "fromId": "n5", - "toId": "n1" - }, - { - "id": "e5", - "type": "HAS_SITE", - "style": {}, - "properties": {}, - "fromId": "n5", - "toId": "n2" - }, - { - "id": "e6", - "type": "HAS_REGION", - "style": {}, - "properties": {}, - "fromId": "n5", - "toId": "n3" - }, - { - "id": "e7", - "type": "ASSOCIATED_WITH", - "style": {}, - "properties": {}, - "fromId": "n5", - "toId": "n4" - }, - { - "id": "e8", - "type": "ORIGINATES_FROM", - "style": {}, - "properties": {}, - "fromId": "n6", - "toId": "n1" - }, - { - "id": "e9", - "type": "HAS_SITE", - "style": {}, - "properties": {}, - "fromId": "n6", - "toId": "n2" - }, - { - "id": "e10", - "type": "HAS_REGION", - "style": {}, - "properties": {}, - "fromId": "n6", - "toId": "n3" - }, - { - "id": "e11", - "type": "ASSOCIATED_WITH", - "style": {}, - "properties": {}, - "fromId": "n6", - "toId": "n4" - } - ] -} \ No newline at end of file diff --git 
a/pyeed/__init__.py b/pyeed/__init__.py index 38ebcff3..558e3ea7 100644 --- a/pyeed/__init__.py +++ b/pyeed/__init__.py @@ -1 +1,21 @@ -from pyeed.pyeed import Pyeed +import os + +from .core.abstractannotation import AbstractAnnotation +from .core.alignmentresult import AlignmentResult +from .core.annotation import Annotation +from .core.blastdata import BlastData +from .core.clustalomegaresult import ClustalOmegaResult +from .core.cluster import Cluster +from .core.dnarecord import DNARecord +from .core.numberedsequence import NumberedSequence +from .core.ontology import Ontology +from .core.organism import Organism +from .core.pairwisealignmentresult import PairwiseAlignmentResult +from .core.proteinrecord import ProteinRecord +from .core.region import Region +from .core.regionset import RegionSet +from .core.sequence import Sequence +from .core.sequencerecord import SequenceRecord +from .core.sequencetype import SequenceType +from .core.site import Site +from .core.standardnumbering import StandardNumbering diff --git a/pyeed/adapter/primary_db_adapter.py b/pyeed/adapter/primary_db_adapter.py deleted file mode 100644 index eba8138e..00000000 --- a/pyeed/adapter/primary_db_adapter.py +++ /dev/null @@ -1,206 +0,0 @@ -from typing import Any, Coroutine, Generic, NamedTuple, TypeVar - -import aiometer -import tenacity -from httpx import ( - AsyncClient, - Limits, - RequestError, - Response, - TimeoutException, -) -from loguru import logger -from rich.progress import Progress, TaskID - -from pyeed.adapter.uniprot_mapper import PrimaryDBtoPyeed - -T = TypeVar("T") - - -class RequestPayload(NamedTuple): - """Holds the request client, URL, and parameters for an HTTP GET request.""" - - client: AsyncClient - url: str - params: dict[str, str] - - -class PrimaryDBAdapter(Generic[T]): - """ - Orchestrates the asynchronous HTTP GET requests to a primary sequence database. 
- Mapper classes are injected to map the responses to the pyeed graph object model and - save them to the database. - """ - - def __init__( - self, - ids: list[str], - ids_attr_name: str, - url: str, - rate_limit: int, - n_concurrent: int, - batch_size: int, - data_mapper: "PrimaryDBtoPyeed[T]", - timeout: int = 120, - progress: Progress | None = None, - task_id: TaskID | None = None, - request_params: dict[str, str] = {}, - ): - self.ids = ids - self.ids_attr_name = ids_attr_name - self.url = url - self.batch_size = batch_size - self.rate_limit = rate_limit - self.n_concurrent = n_concurrent - self.progress = progress - self.task_id = task_id - self.data_mapper = data_mapper - self.timeout = timeout - self.request_params = request_params - - if self.batch_size > 1: - self.ids = self.make_batches() - - if not self.progress: - self._create_progress() - - def _create_progress(self): - """ - Creates a dummy progress bar for tracking the progress of the HTTP - requests if not provided. - """ - self.progress = Progress(disable=True) - self.task_id = self.progress.add_task("Requesting data...", total=len(self.ids)) - - def make_batches(self) -> list[str]: - """ - Groups the IDs into batches of the specified batch size. - - Returns: - list[str]: The list of batches, where each batch is a comma-separated - string of IDs. - """ - batches = [] - for i in range(0, len(self.ids), self.batch_size): - batch = self.ids[i : i + self.batch_size] - batch_string = ",".join(map(str, batch)) - batches.append(batch_string) - return batches - - def build_request_payload(self, client: AsyncClient, id_: str) -> RequestPayload: - """Combines the client, URL, and parameters into a RequestPayload object. - Adds the id with the key specified by ids_attr_name to the request parameters. 
- - Args: - client (AsyncClient): AsyncClient object for making HTTP requests - id_ (str): ID to be added to the request parameters - - Returns: - RequestPayload: RequestPayload object with the client, URL, and parameters - """ - params = self.request_params.copy() - params[self.ids_attr_name] = id_ - - return RequestPayload(client, self.url, params=params) - - @tenacity.retry( - wait=tenacity.wait_fixed(1), - stop=tenacity.stop_after_attempt(3), - retry=tenacity.retry_if_exception_type((RequestError, TimeoutException)), - ) - async def send_request( - self, - args: RequestPayload, - ) -> Coroutine[None, None, Response]: - """ - Sends an asynchronous HTTP GET request to the specified URL using the provided - AsyncClient. - """ - client = args.client - url = args.url - params = args.params - - logger.debug(f"Sending request to {url} with parameters: {params}") - return client.get(url, params=params, timeout=self.timeout) - - async def make_request(self): - """ - Makes asynchronous HTTP GET requests to the specified URL using the provided - AsyncClient, handling rate limiting and concurrency. 
- """ - - def update_progress(): - if self.progress and self.task_id: - self.progress.update(self.task_id, advance=1) # type: ignore - - async with AsyncClient( - limits=Limits(max_connections=self.n_concurrent), - ) as client: - # Build the list of request arguments (this prepares the coroutine tasks) - requests = [self.build_request_payload(client, id) for id in self.ids] - - logger.debug( - f"Sending {len(self.ids)} requests in batches of {self.batch_size}" - ) - - # Using aiometer to handle rate-limiting and concurrency - async with aiometer.amap( - self.send_request, - requests, - max_per_second=self.rate_limit, - max_at_once=self.n_concurrent, - ) as response_coroutines: - async for response_coroutine in response_coroutines: - res = await response_coroutine - sanitized_response = self.sanitize_response(res) - [self.map_and_add_to_db(entry) for entry in sanitized_response] - - update_progress() - - def sanitize_response(self, response: Response) -> list[dict[str, Any]]: - """ - Sanitizes the response from the HTTP GET request by checking the status code - and formatting the JSON response as a list of dictionaries. - - Returns: - Optional[List[Dict[str, Any]]]: The JSON response as a list of dictionaries, - or None if the response is invalid. 
- """ - if response.status_code != 200: - logger.warning( - f"Request to {response.url} failed with status code {response.status_code}" - ) - return [] - - try: - response_json = response.json() - if not response_json: - logger.warning(f"Empty response from {response.url}") - return [] - - # If the response is a dictionary, wrap it in a list - if isinstance(response_json, dict): - response_json = [response_json] - - # Ensure the response is a list of dictionaries - if not isinstance(response_json, list) or not all( - isinstance(item, dict) for item in response_json - ): - logger.warning(f"Unexpected response format from {response.url}") - return [] - - except ValueError as e: - logger.warning(f"Failed to parse JSON response from {response.url}: {e}") - return [] - - return response_json - - def map_and_add_to_db(self, response: dict[str, Any] | None): - """ - Handles the response from the HTTP GET request by passing it to the data mapper. - This adds the mapped data to the database. - """ - - if response is None: - return None - self.data_mapper.add_to_db(response) diff --git a/pyeed/adapter/uniprot_mapper.py b/pyeed/adapter/uniprot_mapper.py deleted file mode 100644 index 2fde60e5..00000000 --- a/pyeed/adapter/uniprot_mapper.py +++ /dev/null @@ -1,70 +0,0 @@ -from abc import abstractmethod -from collections import defaultdict -from typing import Generic, TypeVar - -from pyeed.model import Annotation, GOAnnotation, Organism, Protein, Site - -T = TypeVar("T") - - -class PrimaryDBtoPyeed(Generic[T]): - @abstractmethod - def add_to_db(self, data: dict): - pass - - -class UniprotToPyeed(PrimaryDBtoPyeed[Protein]): - def add_to_db(self, data: dict): - # Organism information - taxonomy_id = data["organism"]["taxonomy"] - organism = Organism.get_or_save( - taxonomy_id=taxonomy_id, - name=data["organism"]["names"][0]["value"], - ) - - try: - ec_number = data["protein"]["recommendedName"]["ecNumber"][0]["value"] - except KeyError: - ec_number = None - - protein = 
Protein.get_or_save( - accession_id=data["accession"], - sequence=data["sequence"]["sequence"], - mol_weight=float(data["sequence"]["mass"]), - ec_number=ec_number, - name=data["protein"]["recommendedName"]["fullName"]["value"], - seq_length=len(data["sequence"]["sequence"]), - ) - - protein.organism.connect(organism) - organism.protein.connect(protein) - - self.add_sites(data, protein) - self.add_go(data, protein) - - def add_sites(self, data: dict, protein: Protein): - ligand_dict = defaultdict(list) - - for feature in data.get("features", []): - if feature["type"] == "BINDING": - for position in range(int(feature["begin"]), int(feature["end"]) + 1): - ligand_dict[feature["ligand"]["name"]].append(position) - - for ligand, positions in ligand_dict.items(): - site = Site( - name=ligand, - positions=positions, - annotation=Annotation.BINDING_SITE.value, - ).save() - - protein.site.connect(site) - - def add_go(self, data: dict, protein: Protein): - for reference in data["dbReferences"]: - if reference["type"] == "GO": - go_annotation = GOAnnotation.get_or_save( - go_id=reference["id"], - term=reference["properties"]["term"], - ) - - protein.go_annotation.connect(go_annotation) diff --git a/pyeed/old/align/__init__.py b/pyeed/align/__init__.py similarity index 100% rename from pyeed/old/align/__init__.py rename to pyeed/align/__init__.py diff --git a/pyeed/old/align/abstract_aligner.py b/pyeed/align/abstract_aligner.py similarity index 100% rename from pyeed/old/align/abstract_aligner.py rename to pyeed/align/abstract_aligner.py diff --git a/pyeed/old/align/hmm.py b/pyeed/align/hmm.py similarity index 100% rename from pyeed/old/align/hmm.py rename to pyeed/align/hmm.py diff --git a/pyeed/old/align/msa.py b/pyeed/align/msa.py similarity index 100% rename from pyeed/old/align/msa.py rename to pyeed/align/msa.py diff --git a/pyeed/old/align/pairwise.py b/pyeed/align/pairwise.py similarity index 100% rename from pyeed/old/align/pairwise.py rename to 
pyeed/align/pairwise.py diff --git a/pyeed/old/cluster/__init__.py b/pyeed/cluster/__init__.py similarity index 100% rename from pyeed/old/cluster/__init__.py rename to pyeed/cluster/__init__.py diff --git a/pyeed/old/cluster/cluster.py b/pyeed/cluster/cluster.py similarity index 100% rename from pyeed/old/cluster/cluster.py rename to pyeed/cluster/cluster.py diff --git a/pyeed/old/cluster/mmseqs2.py b/pyeed/cluster/mmseqs2.py similarity index 100% rename from pyeed/old/cluster/mmseqs2.py rename to pyeed/cluster/mmseqs2.py diff --git a/pyeed/old/core/__init__.py b/pyeed/core/__init__.py similarity index 100% rename from pyeed/old/core/__init__.py rename to pyeed/core/__init__.py diff --git a/pyeed/old/core/abstractannotation.py b/pyeed/core/abstractannotation.py similarity index 100% rename from pyeed/old/core/abstractannotation.py rename to pyeed/core/abstractannotation.py diff --git a/pyeed/old/core/alignmentresult.py b/pyeed/core/alignmentresult.py similarity index 100% rename from pyeed/old/core/alignmentresult.py rename to pyeed/core/alignmentresult.py diff --git a/pyeed/old/core/annotation.py b/pyeed/core/annotation.py similarity index 100% rename from pyeed/old/core/annotation.py rename to pyeed/core/annotation.py diff --git a/pyeed/old/core/blastdata.py b/pyeed/core/blastdata.py similarity index 100% rename from pyeed/old/core/blastdata.py rename to pyeed/core/blastdata.py diff --git a/pyeed/old/core/clustalomegaresult.py b/pyeed/core/clustalomegaresult.py similarity index 100% rename from pyeed/old/core/clustalomegaresult.py rename to pyeed/core/clustalomegaresult.py diff --git a/pyeed/old/core/cluster.py b/pyeed/core/cluster.py similarity index 100% rename from pyeed/old/core/cluster.py rename to pyeed/core/cluster.py diff --git a/pyeed/old/core/dnarecord.py b/pyeed/core/dnarecord.py similarity index 100% rename from pyeed/old/core/dnarecord.py rename to pyeed/core/dnarecord.py diff --git a/pyeed/old/core/numberedsequence.py 
b/pyeed/core/numberedsequence.py similarity index 100% rename from pyeed/old/core/numberedsequence.py rename to pyeed/core/numberedsequence.py diff --git a/pyeed/old/core/ontology.py b/pyeed/core/ontology.py similarity index 100% rename from pyeed/old/core/ontology.py rename to pyeed/core/ontology.py diff --git a/pyeed/old/core/organism.py b/pyeed/core/organism.py similarity index 100% rename from pyeed/old/core/organism.py rename to pyeed/core/organism.py diff --git a/pyeed/old/core/pairwisealignmentresult.py b/pyeed/core/pairwisealignmentresult.py similarity index 100% rename from pyeed/old/core/pairwisealignmentresult.py rename to pyeed/core/pairwisealignmentresult.py diff --git a/pyeed/old/core/proteinrecord.py b/pyeed/core/proteinrecord.py similarity index 99% rename from pyeed/old/core/proteinrecord.py rename to pyeed/core/proteinrecord.py index 1d076db1..dac53677 100644 --- a/pyeed/old/core/proteinrecord.py +++ b/pyeed/core/proteinrecord.py @@ -25,6 +25,7 @@ class ProteinRecord( SequenceRecord, + search_mode="unordered", ): """A protein sequence and associated metadata.""" @@ -341,7 +342,7 @@ def get_dna(self): return DNARecord.get_id(self.coding_sequence[0].id) except Exception as e: - print("The DNA sequence could not be retrieved. The error is: ", e) + print('The DNA sequence could not be retrieved. 
The error is: ', e) return def _nblast(sequence: str, n_hits: int = None) -> List["ProteinRecord"]: diff --git a/pyeed/old/core/region.py b/pyeed/core/region.py similarity index 100% rename from pyeed/old/core/region.py rename to pyeed/core/region.py diff --git a/pyeed/old/core/regionset.py b/pyeed/core/regionset.py similarity index 100% rename from pyeed/old/core/regionset.py rename to pyeed/core/regionset.py diff --git a/pyeed/old/core/sequence.py b/pyeed/core/sequence.py similarity index 100% rename from pyeed/old/core/sequence.py rename to pyeed/core/sequence.py diff --git a/pyeed/old/core/sequencerecord.py b/pyeed/core/sequencerecord.py similarity index 100% rename from pyeed/old/core/sequencerecord.py rename to pyeed/core/sequencerecord.py diff --git a/pyeed/old/core/sequencetype.py b/pyeed/core/sequencetype.py similarity index 100% rename from pyeed/old/core/sequencetype.py rename to pyeed/core/sequencetype.py diff --git a/pyeed/old/core/site.py b/pyeed/core/site.py similarity index 100% rename from pyeed/old/core/site.py rename to pyeed/core/site.py diff --git a/pyeed/old/core/standardnumbering.py b/pyeed/core/standardnumbering.py similarity index 100% rename from pyeed/old/core/standardnumbering.py rename to pyeed/core/standardnumbering.py diff --git a/pyeed/dbconnect.py b/pyeed/dbconnect.py deleted file mode 100644 index 7e0f930a..00000000 --- a/pyeed/dbconnect.py +++ /dev/null @@ -1,185 +0,0 @@ -import subprocess - -from neo4j import Driver, GraphDatabase -from neomodel import db as neomodel_db - -from pyeed.model import Protein - - -class DatabaseConnector: - def __init__(self, uri: str, user: str | None, password: str | None): - """ - Initializes the connection to the Neo4j database using a self-managed driver. 
- """ - self._uri = uri - self.driver = self._get_driver(uri, user, password) - neomodel_db.set_connection(driver=self.driver) # patch db for neomodel - - if not self._constraints_exist(): - print( - "Pyeed Graph Object Mapping constraints not defined. Use _install_labels() to set up model constraints." - ) - print("📡 Connected to database.") - - def close(self): - """ - Closes the connection to the Neo4j database. - """ - self.driver.close() - print("🔌 Connection closed.") - - def execute_read(self, query: str, parameters=None): - """ - Executes a read (MATCH) query using the Neo4j driver directly. - """ - with self.driver.session() as session: - return session.execute_read(self._run_query, query, parameters) - - def execute_write(self, query: str, parameters=None): - """ - Executes a write (CREATE, DELETE, etc.) query using the Neo4j driver directly. - """ - with self.driver.session() as session: - return session.execute_write(self._run_query, query, parameters) - - def add_protein(self, protein_record: Protein): - """ - Placeholder for adding a Protein to the database via Neomodel. - """ - # Here you can add logic to store protein_record using Neomodel models - pass - - def stats(self) -> dict: - """ - Returns the number of nodes and relationships in the database. - """ - node_count_query = "MATCH (n) RETURN count(n) AS node_count" - relationship_count_query = ( - "MATCH ()-[r]->() RETURN count(r) AS relationship_count" - ) - - node_count = self.execute_read(node_count_query)[0]["node_count"] - relationship_count = self.execute_read(relationship_count_query)[0][ - "relationship_count" - ] - - return {"nodes": node_count, "relationships": relationship_count} - - def _initialize_db_constraints( - self, - user: str | None, - password: str | None, - models_path: str = "pyeed/model.py", - ): - """ - Run the neomodel_install_labels script to set up indexes and constraints on labels - of Object-Graph Mapping (OGM) models. 
- """ - try: - # Construct connection string based on whether user/password are provided - if user and password: - connection_url = f"bolt://{user}:{password}@{self._uri.split('//')[1]}" - else: - connection_url = self.insert_after_second_slash( - self._uri, "neo4j:neo4j@" - ) - - subprocess.run( - [ - "neomodel_install_labels", - models_path, - "--db", - connection_url, - ], - check=True, - ) - print( - "✅ Databse constraints and indexes set up according to Pyeed Graph Object Model." - ) - except subprocess.CalledProcessError as e: - print(f"Failed to install labels: {str(e)}") - - def _constraints_exist(self) -> bool: - """Check and if constraints exist in the database. Return True if constraints exist.""" - query = """ - SHOW CONSTRAINTS YIELD name, type - RETURN count(*) AS constraint_count - """ - - results = self.execute_read(query) - return True if results[0]["constraint_count"] > 0 else False - - def _remove_db_constraints( - self, - user: str | None, - password: str | None, - ): - """ - Run the neomodel_remove_labels script to drop all indexes and constraints - from labels in the Neo4j database. - """ - try: - if user and password: - connection_url = f"bolt://{user}:{password}@{self._uri.split('//')[1]}" - else: - connection_url = self.insert_after_second_slash( - self._uri, "neo4j:neo4j@" - ) - - subprocess.run( - [ - "neomodel_remove_labels", - "--db", - connection_url, - ], - check=True, - ) - print("All constraints and indexes have been removed from the database.") - except subprocess.CalledProcessError as e: - print(f"Failed to remove labels: {str(e)}") - - def generate_model_diagram( - self, - models_path: str = "pyeed/model.py", - ): - subprocess.run( - [ - "neomodel_generate_diagram", - models_path, - ] - ) - - def _wipe_database(self): - """ - Deletes all nodes and relationships in the database. 
- """ - delete_query = """ - MATCH (n) - DETACH DELETE n - """ - self.execute_write(delete_query) - print("All data has been wiped from the database.") - - @staticmethod - def _run_query(tx, query, parameters): - """ - Executes a Cypher query in the provided transaction. - """ - result = tx.run(query, parameters) - return [record.data() for record in result] - - @staticmethod - def _get_driver(uri: str, user: str | None, password: str | None) -> Driver: - """ - Creates a new Neo4j driver instance. - """ - auth = (user, password) if user and password else None - return GraphDatabase.driver(uri, auth=auth) - - @staticmethod - def insert_after_second_slash(uri: str, to_insert: str) -> str: - # Split the string at '//' into two parts - scheme, rest = uri.split("//", 1) - - # Insert the new content after the second '//' - return f"{scheme}//{to_insert}{rest}" diff --git a/pyeed/embedding.py b/pyeed/embedding.py deleted file mode 100644 index a6331918..00000000 --- a/pyeed/embedding.py +++ /dev/null @@ -1,54 +0,0 @@ -import gc - -import torch -from transformers import EsmModel, EsmTokenizer - - -def get_batch_embeddings(sequences: list[str], batch_size: int = 16): - # Load the ESM2 model and tokenizer - model_name = "facebook/esm2_t33_650M_UR50D" - model = EsmModel.from_pretrained(model_name) - tokenizer = EsmTokenizer.from_pretrained(model_name) - - # Check if MPS (Metal Performance Shaders) is available and use it - device = ( - torch.device("mps") if torch.backends.mps.is_built() else torch.device("cpu") - ) - model = model.to(device) - - embedding_list = [] - model.eval() - - with torch.no_grad(): - # Process sequences in batches - for i in range(0, len(sequences), batch_size): - batch = sequences[i : i + batch_size] - - # Tokenize the input sequences (must be a list of strings) - inputs = tokenizer( - batch, padding=True, truncation=True, return_tensors="pt" - ).to(device) - - # Get model outputs - outputs = model(**inputs) - embeddings = outputs.last_hidden_state - 
- # Process each sequence in the batch - for j in range(len(batch)): - valid_token_mask = inputs["attention_mask"][j].bool() - seq_embeddings = embeddings[j][valid_token_mask].mean(dim=0).cpu() - embedding_list.append(seq_embeddings) - - return embedding_list - - -def free_memory(): - gc.collect() # Python garbage collection - if torch.backends.mps.is_built(): - torch.mps.empty_cache() - elif torch.cuda.is_available(): - torch.cuda.empty_cache() - - -if __name__ == "__main__": - free_memory() diff --git a/pyeed/adapter/__init__.py b/pyeed/fetch/__init__.py similarity index 100% rename from pyeed/adapter/__init__.py rename to pyeed/fetch/__init__.py diff --git a/pyeed/old/fetch/blast.py b/pyeed/fetch/blast.py similarity index 99% rename from pyeed/old/fetch/blast.py rename to pyeed/fetch/blast.py index 09590bc1..2785a773 100644 --- a/pyeed/old/fetch/blast.py +++ b/pyeed/fetch/blast.py @@ -86,6 +86,7 @@ def run(self, program: str, ncbi_db: str) -> io.StringIO: ), f"Invalid database: {ncbi_db}, valid databases: {NCBIDataBase}" if program == BlastProgram.BLASTP.value: + return NCBIWWW.qblast( program, ncbi_db, @@ -94,8 +95,9 @@ def run(self, program: str, ncbi_db: str) -> io.StringIO: matrix_name=self.matrix, hitlist_size=self.n_hits, ) - + elif program == BlastProgram.BLASTN.value: + return NCBIWWW.qblast( program=program, database=ncbi_db, @@ -110,6 +112,7 @@ async def async_run( program: str, foreign_executor: Optional[ThreadPoolExecutor] = None, ) -> io.StringIO: + if not foreign_executor: executor = ThreadPoolExecutor() else: diff --git a/pyeed/old/fetch/dbsort.py b/pyeed/fetch/dbsort.py similarity index 100% rename from pyeed/old/fetch/dbsort.py rename to pyeed/fetch/dbsort.py diff --git a/pyeed/old/fetch/dnafetcher.py b/pyeed/fetch/dnafetcher.py similarity index 92% rename from pyeed/old/fetch/dnafetcher.py rename to pyeed/fetch/dnafetcher.py index fca3beeb..48849871 100644 --- a/pyeed/old/fetch/dnafetcher.py +++ b/pyeed/fetch/dnafetcher.py @@ -1,4 +1,5 @@ 
import asyncio +import json import logging from typing import List @@ -6,7 +7,7 @@ from rich.console import Console from rich.progress import Progress -from pyeed.adapter.primary_db_adapter import AsyncRequester +from pyeed.fetch.requester import AsyncRequester, AsyncParamRequester from .ncbidnamapper import NCBIDNAMapper @@ -40,10 +41,10 @@ async def fetch(self, **console_kwargs): console=Console(**console_kwargs), ) as progress: requesters: List[AsyncRequester] = [] - - # + + # task_id = progress.add_task( - "Requesting sequences from NCBI...", total=len(self.ids) + f"Requesting sequences from NCBI...", total=len(self.ids) ) requesters.append( AsyncRequester( @@ -57,10 +58,12 @@ async def fetch(self, **console_kwargs): ) ) + responses = await asyncio.gather( *[requester.make_request() for requester in requesters] ) + # in case of multiple databases, identify the source of the data ncbi_responses, uniprot_response = self.identify_data_source(responses) @@ -69,6 +72,7 @@ async def fetch(self, **console_kwargs): return ncbi_entries + def identify_data_source(self, responses: List[str]) -> tuple: """ Identifies the source of the data based on the response content. 
@@ -83,3 +87,6 @@ def identify_data_source(self, responses: List[str]) -> tuple: uniprot_response.append(response) return ncbi_responses, uniprot_response + + + diff --git a/pyeed/old/fetch/ncbidnamapper.py b/pyeed/fetch/ncbidnamapper.py similarity index 100% rename from pyeed/old/fetch/ncbidnamapper.py rename to pyeed/fetch/ncbidnamapper.py diff --git a/pyeed/old/fetch/ncbiproteinmapper.py b/pyeed/fetch/ncbiproteinmapper.py similarity index 100% rename from pyeed/old/fetch/ncbiproteinmapper.py rename to pyeed/fetch/ncbiproteinmapper.py diff --git a/pyeed/old/fetch/pdbmapper.py b/pyeed/fetch/pdbmapper.py similarity index 100% rename from pyeed/old/fetch/pdbmapper.py rename to pyeed/fetch/pdbmapper.py diff --git a/pyeed/old/fetch/proteinfetcher.py b/pyeed/fetch/proteinfetcher.py similarity index 96% rename from pyeed/old/fetch/proteinfetcher.py rename to pyeed/fetch/proteinfetcher.py index 8d79c97c..dee697fa 100644 --- a/pyeed/old/fetch/proteinfetcher.py +++ b/pyeed/fetch/proteinfetcher.py @@ -1,24 +1,26 @@ import asyncio import json -from logging import Logger +import logging from typing import List import nest_asyncio from rich.console import Console from rich.progress import Progress -from pyeed.adapter.primary_db_adapter import AsyncParamRequester, AsyncRequester -from pyeed.dbconnect import DatabaseConnector from pyeed.fetch.dbsort import DBPattern, SortIDs from pyeed.fetch.ncbiproteinmapper import NCBIProteinMapper from pyeed.fetch.pdbmapper import PDBMapper +from pyeed.fetch.requester import AsyncParamRequester, AsyncRequester from pyeed.fetch.taxonomymapper import TaxonomyMapper +from pyeed.fetch.uniprotmapper import UniprotMapper + +LOGGER = logging.getLogger(__name__) class ProteinFetcher: - def __init__(self, ids: List[str], db: DatabaseConnector): + def __init__(self, ids: List[str]): self.ids = ids - self.db = db + # self.ncbi_key = ncbi_key #TODO: Add NCBI key to NCBI requester nest_asyncio.apply() async def fetch(self, **console_kwargs): @@ -34,15 
+36,15 @@ async def fetch(self, **console_kwargs): Raises: Exception: If there is an error during the fetching process. - """ + """ db_entries = SortIDs.sort(self.ids) param_requester = None with Progress( console=Console(**console_kwargs), ) as progress: - requesters = [] + requesters: List[AsyncRequester] = [] for db_name, db_ids in db_entries.items(): if db_name == DBPattern.UNIPROT.name: task_id = progress.add_task( @@ -254,7 +256,7 @@ def identify_data_source(self, responses: List[List[str]]): json.loads(entry)[0] for entry in response ] else: - Logger.warning(f"Response could not be mapped to mapper: {response[0]}") + LOGGER.warning(f"Response could not be mapped to mapper: {response[0]}") if not ncbi: ncbi = [] diff --git a/pyeed/fetch/requester.py b/pyeed/fetch/requester.py new file mode 100644 index 00000000..87259edb --- /dev/null +++ b/pyeed/fetch/requester.py @@ -0,0 +1,256 @@ +import asyncio +import logging +from typing import Dict, List, NamedTuple, Optional + +import aiometer +import tenacity +from httpx import AsyncClient, Limits, Response +from rich.progress import Progress, TaskID + +LOGGER = logging.getLogger(__name__) + + +class RequestArgs(NamedTuple): + """Holds the arguments for an HTTP GET request.""" + + client: AsyncClient + url: str + params: Optional[dict] = None + + +class AsyncRequester: + def __init__( + self, + ids: List[str], + url: str, + batch_size: int, + rate_limit: int, + n_concurrent: int, + progress: Optional[Progress] = None, + task_id: Optional[TaskID] = None, + ): + self.ids = ids + self.url = url + self.batch_size = batch_size + self.rate_limit = rate_limit + self.n_concurrent = n_concurrent + self.progress = progress + self.task_id = task_id + + if self.batch_size: + self.ids = self.make_batches() + + if not self.progress: + self._create_progress() + + def _create_progress(self): + """ + Creates a dummy progress bar for tracking the progress of the HTTP + requests if not provided. 
+ """ + + self.progress = Progress(disable=True) + self.task_id = self.progress.add_task("Requesting data...", total=len(self.ids)) + + @tenacity.retry( + wait=tenacity.wait_fixed(1), + stop=tenacity.stop_after_attempt(3), + ) + async def send_request(self, args: RequestArgs) -> str: + """ + Sends an asynchronous HTTP GET request to the specified URL using the provided + AsyncClient. + + Parameters: + args (RequestArgs): The arguments for the request, including the client and + the URL. + + Returns: + str: The response text from the request. + """ + client = args.client + url = args.url + + LOGGER.debug(f"Sending request to {url}") + response = await client.get(url, timeout=120) + + LOGGER.debug(f"Received response from {url}. Code: {response.status_code}") + + if response.status_code != 200: + LOGGER.warning( + f"Request to {url} failed with status code {response.status_code}" + ) + LOGGER.warning(f"Response: {response.text}") + + if response.status_code == 429: + LOGGER.warning("Rate limit exceeded. Waiting for 1 second...") + await asyncio.sleep(1) + return await self.send_request(args) + + return response.text + + @tenacity.retry( + wait=tenacity.wait_fixed(0.5), + stop=tenacity.stop_after_attempt(3), + ) + async def make_request(self) -> List[str]: + """ + Makes asynchronous HTTP GET requests to the specified URL using the provided + AsyncClient. + + Returns: + List[str]: The response texts from the requests. + + Notes: + - If the response status code is not 200, a warning message is logged. + - If the response status code is 429 (rate limit exceeded), the method waits + for 1 second and then retries the request. 
+ """ + + all_responses = [] + + async def update_progress(response: Response): + if self.progress: + self.progress.update(self.task_id, advance=self.batch_size) # type: ignore + + async with AsyncClient( + event_hooks={"response": [update_progress]}, + limits=Limits(max_connections=self.n_concurrent), + ) as client: + LOGGER.debug(f"Creating {len(self.ids)} tasks") + + tasks = [RequestArgs(client, f"{self.url}{id}") for id in self.ids] + + LOGGER.debug(f"Sending {len(self.ids)} requests") + async with aiometer.amap( + self.send_request, + tasks, + max_per_second=self.rate_limit, + max_at_once=self.n_concurrent, + ) as responses: + async for res in responses: + all_responses.append(res) + + return all_responses + + def make_batches(self) -> List[str]: + """ + Creates batches of IDs for making HTTP requests. + + Returns: + List[str]: The list of batches, where each batch is a comma-separated + string of IDs. + + """ + + batches = [] + for i in range(0, len(self.ids), self.batch_size): + batch = self.ids[i : i + self.batch_size] + if len(batch) > 1: + batch_string = ",".join(batch) + else: + batch_string = str(batch[0]) + batches.append(batch_string) + self.ids = batches + return batches + + +class AsyncParamRequester: + """Updated Requester utilizing parameters as dict for the request""" + + def __init__( + self, + params: Dict[str, str], + url: str, + ids: List[str], + rate_limit: int, + n_concurrent: int, + progress: Optional[Progress] = None, + task_id: Optional[TaskID] = None, + batch_size: int = 1, + ): + self.params = params + self.url = url + self.ids = ids + self.batch_size = batch_size + self.rate_limit = rate_limit + self.n_concurrent = n_concurrent + self.progress = progress + self.task_id = task_id + + if not self.progress: + self._create_progress() + + def _create_progress(self): + """ + Creates a dummy progress bar for tracking the progress of the HTTP + requests if not provided. 
+ """ + + self.progress = Progress(disable=True) + self.task_id = self.progress.add_task("Requesting data...", total=len(self.ids)) + + async def send_request(self, args: RequestArgs) -> str: + """ + Sends an asynchronous HTTP GET request to the specified URL using the provided + AsyncClient. + + Parameters: + args (RequestArgs): The arguments for the request, including the client and + the URL. + + Returns: + str: The response text from the request. + """ + client = args.client + url = args.url + params = args.params + + LOGGER.debug(f"Sending request to {url}") + response = await client.get(url, params=params, timeout=120) + + LOGGER.debug(f"Received response from {url}. Code: {response.status_code}") + + if response.status_code != 200: + LOGGER.warning( + f"Request to {url} failed with status code {response.status_code}" + ) + LOGGER.warning(f"Response: {response.text}") + + if response.status_code == 429: + LOGGER.warning("Rate limit exceeded. Waiting for 1 second...") + await asyncio.sleep(0.5) + return await self.send_request(args) + + return response.text + + async def make_request(self) -> List[str]: + """Handles the asynchronous HTTP GET and configures rate limits and progress bar.""" + + all_responses = [] + + async def update_progress(response: Response): + if self.progress: + self.progress.update(self.task_id, advance=self.batch_size) # type: ignore + + async with AsyncClient( + event_hooks={"response": [update_progress]}, + limits=Limits(max_connections=self.n_concurrent, keepalive_expiry=30), + ) as client: + tasks = [] + for id in self.ids: + params = self.params.copy() + params["query"] = params["query"].replace("SEQUENCE_ID", str(id)) + tasks.append(RequestArgs(client, self.url, params)) + + LOGGER.debug(f"Sending {len(self.ids)} requests") + async with aiometer.amap( + self.send_request, + tasks, + max_per_second=self.rate_limit, + max_at_once=self.n_concurrent, + ) as responses: + async for res in responses: + all_responses.append(res) + + return 
all_responses diff --git a/pyeed/old/fetch/taxonomymapper.py b/pyeed/fetch/taxonomymapper.py similarity index 100% rename from pyeed/old/fetch/taxonomymapper.py rename to pyeed/fetch/taxonomymapper.py diff --git a/pyeed/fetch/uniprotmapper.py b/pyeed/fetch/uniprotmapper.py new file mode 100644 index 00000000..2f16bcfa --- /dev/null +++ b/pyeed/fetch/uniprotmapper.py @@ -0,0 +1,124 @@ +from __future__ import annotations + +import logging +import re +from typing import TYPE_CHECKING + +from pyeed.core import Organism +from pyeed.core.ontology import Ontology +from pyeed.core.region import Region + +if TYPE_CHECKING: + from pyeed.core import ProteinRecord + +LOGGER = logging.getLogger(__name__) + + +class UniprotMapper: + def __init__(self): + pass + + def map_uniprot_data(self, uniprot_data: dict) -> ProteinRecord: + from pyeed.core import ProteinRecord + + # Organism information + organism = Organism( + taxonomy_id=uniprot_data["organism"]["taxonomy"], + ) + + try: + ec_number = uniprot_data["protein"]["recommendedName"]["ecNumber"][0][ + "value" + ] + except KeyError: + ec_number = None + + protein_info = ProteinRecord( + id=uniprot_data["accession"], + sequence=uniprot_data["sequence"]["sequence"], + name=uniprot_data["protein"]["recommendedName"]["fullName"]["value"], + ec_number=ec_number, + mol_weight=uniprot_data["sequence"]["mass"], + organism=organism, + ) + + global_go_annotations = [ + Ontology.GO.value + go_annotation["id"] + for go_annotation in uniprot_data["dbReferences"] + if go_annotation["type"] == "GO" + ] + + return protein_info + + def map( + self, + uniprot: dict, + interpro: dict, + ) -> ProteinRecord: + """Maps the sequence information from Uniprot and annotations from InterPro + records to a ProteinInfo object.""" + + from pyeed.core import ProteinRecord + + assert ( + interpro["results"][0]["proteins"][0]["accession"].upper() + == uniprot["accession"].upper() + ) + + organism = Organism( + taxonomy_id=uniprot["organism"]["taxonomy"], + ) + + 
try: + ec_number = uniprot["protein"]["recommendedName"]["ecNumber"][0]["value"] + except KeyError: + ec_number = None + + protein_info = ProteinRecord( + id=uniprot["accession"], + sequence=uniprot["sequence"]["sequence"], + name=uniprot["protein"]["recommendedName"]["fullName"]["value"], + ec_number=ec_number, + mol_weight=uniprot["sequence"]["mass"], + organism=organism, + ) + for reference in uniprot["dbReferences"]: + if reference["type"].upper() == "REFSEQ": + try: + protein_info.coding_sequence.append( + Region( + id=reference["properties"]["nucleotide sequence ID"], + ) + ) + except KeyError: + LOGGER.debug( + f"Could not find the coding sequence reference for {protein_info.id}" + ) + + protein_info = self.map_interpro(interpro, protein_info) + + return protein_info + + def map_interpro( + self, interpro: dict, protein_info: ProteinRecord + ) -> ProteinRecord: + """Maps the InterPro records to a ProteinInfo object.""" + + interpro_pattern = re.compile(r"IPR\d{6}") + # pfam_pattern = re.compile(r"PF\d{5}") + # panther_pattern = re.compile(r"PTHR\d{5}") + + for annotation in interpro["results"]: + if interpro_pattern.search(annotation["metadata"]["accession"]): + region = protein_info.add_to_regions( + name=annotation["metadata"]["name"], + cross_reference=annotation["metadata"]["accession"], + start=annotation["proteins"][0]["entry_protein_locations"][0][ + "fragments" + ][0]["start"], + end=annotation["proteins"][0]["entry_protein_locations"][0][ + "fragments" + ][0]["end"], + ) + + return protein_info diff --git a/pyeed/model.py b/pyeed/model.py deleted file mode 100644 index 877283e8..00000000 --- a/pyeed/model.py +++ /dev/null @@ -1,226 +0,0 @@ -from enum import Enum - -from neomodel import ( - ArrayProperty, - FloatProperty, - IntegerProperty, - RelationshipFrom, - RelationshipTo, - StringProperty, - StructuredNode, - UniqueIdProperty, - UniqueProperty, - VectorIndex, -) - - -class StrictStructuredNode(StructuredNode): - """A StructuredNode subclass 
that raises an error if an invalid property is provided.""" - - __abstract_node__ = True - - def __init__(self, *args, **kwargs): - # Get the defined properties of the model - allowed_properties = set(self.__class__._class_properties()) - - # Check if any provided properties are not in the allowed set - for key in kwargs: - if key not in allowed_properties: - raise AttributeError( - f"'{key}' is not a valid property for {self.__class__.__name__}" - ) - - super().__init__(*args, **kwargs) - - @classmethod - def _class_properties(cls): - """Retrieve all allowed properties (fields) defined on the class.""" - return { - k - for k, v in cls.__dict__.items() - if isinstance( - v, - ( - StringProperty, - IntegerProperty, - FloatProperty, - ArrayProperty, - UniqueIdProperty, - ), - ) - } - - def save(self, *args, **kwargs): - """Validates the properties and then saves the node.""" - allowed_properties = self.__class__._class_properties() - - # Only validate properties defined in the model schema - for field, prop in self.__dict__.items(): - if field not in allowed_properties: - continue # Skip non-class properties (like internal Neo4j fields) - - if prop is None or callable(prop): - continue - - try: - neo_type = getattr(self.__class__, field) - except AttributeError: - raise AttributeError( - f"'{self.__class__.__name__}' has no attribute '{field}'" - ) - - # Skip validation for UniqueIdProperty - if isinstance(neo_type, UniqueIdProperty): - continue - - # Validate StringProperty - if isinstance(neo_type, StringProperty) and not isinstance(prop, str): - raise TypeError( - f"Expected a string for '{field}', got {type(prop).__name__}" - ) - - # Validate IntegerProperty - elif isinstance(neo_type, IntegerProperty) and not isinstance(prop, int): - raise TypeError( - f"Expected an integer for '{field}', got {type(prop).__name__}" - ) - - # Validate FloatProperty - elif isinstance(neo_type, FloatProperty) and not isinstance(prop, float): - raise TypeError( - f"Expected a float 
for '{field}', got {type(prop).__name__}" - ) - - # Validate ArrayProperty - elif isinstance(neo_type, ArrayProperty): - if not isinstance(prop, list): - raise TypeError( - f"Expected a list for '{field}', got {type(prop).__name__}" - ) - - # Validate list of integers, strings, or floats - base_property = neo_type.base_property - if isinstance(base_property, StringProperty): - if not all(isinstance(item, str) for item in prop): - raise TypeError(f"All items in '{field}' must be strings") - elif isinstance(base_property, IntegerProperty): - if not all(isinstance(item, int) for item in prop): - raise TypeError(f"All items in '{field}' must be integers") - elif isinstance(base_property, FloatProperty): - if not all(isinstance(item, float) for item in prop): - raise TypeError(f"All items in '{field}' must be floats") - - return super().save(*args, **kwargs) - - @classmethod - def get_or_save(cls, **kwargs): - """Attempts to save the node first, and if it already exists (due to unique constraint), retrieves it.""" - try: - # Attempt to create and save a new node - instance = cls(**kwargs) - instance.save() - return instance - except UniqueProperty: - # If a unique constraint error occurs, retrieve the existing node - return cls.nodes.get(**kwargs) - - -class Annotation(Enum): - ACTIVE_SITE = "active site" - ALLOSTERIC_SITE = "allosteric site" - ALPHAHELIX = "alpha helix" - BETASTRAND = "beta strand" - BINDING_SITE = "binding site" - CODING_SEQ = "coding sequence" - DNA = "DNA" - DOMAIN = "domain" - FAMILY = "family" - MOTIVE = "motive" - PROTEIN = "protein" - - -class Organism(StrictStructuredNode): - taxonomy_id: int = IntegerProperty(required=True, unique_index=True) - name = StringProperty() - domain = StringProperty() - kingdom = StringProperty() - phylum = StringProperty() - tax_class = StringProperty() - order = StringProperty() - family = StringProperty() - genus = StringProperty() - species = StringProperty() - - # Relationships - protein = 
RelationshipFrom("Protein", "ORIGINATES_FROM") - dna = RelationshipFrom("DNA", "ORIGINATES_FROM") - - -class Site(StrictStructuredNode): - site_id = UniqueIdProperty() - name = StringProperty() - positions = ArrayProperty(IntegerProperty(), required=True) - annotation = StringProperty( - choices=[(e.value, e.name) for e in Annotation], required=True - ) - - -class Region(StrictStructuredNode): - region_id = UniqueIdProperty() - start = IntegerProperty(required=True) - end = IntegerProperty(required=True) - annotation = StringProperty( - choices=[(e.value, e.name) for e in Annotation], required=True - ) - - -class GOAnnotation(StrictStructuredNode): - go_id = StringProperty(unique_index=True, required=True) - term = StringProperty() - definition = StringProperty() - - @property - def name(self): - return self.term - - -class Protein(StrictStructuredNode): - accession_id = StringProperty(unique_index=True, required=True) - sequence = StringProperty(required=True) - name = StringProperty() - seq_length = IntegerProperty(required=True) - mol_weight = FloatProperty() - ec_number = StringProperty() - nucleotide_id = StringProperty() - locus_tag = StringProperty() - structure_ids = ArrayProperty(StringProperty()) - go_terms = ArrayProperty(StringProperty()) - embedding = ArrayProperty( - FloatProperty(), - vector_index=VectorIndex(dimensions=1048), - ) - - # Relationships - organism = RelationshipTo("Organism", "ORIGINATES_FROM") - site = RelationshipTo("Site", "HAS_SITE") - region = RelationshipTo("Region", "HAS_REGION") - go_annotation = RelationshipTo("GOAnnotation", "ASSOCIATED_WITH") - - -class DNA(StrictStructuredNode): - accession_id = StringProperty(unique_index=True, required=True) - sequence = StringProperty(required=True) - name = StringProperty() - seq_length = IntegerProperty(required=True) - go_terms = ArrayProperty(StringProperty()) - embedding = ArrayProperty( - FloatProperty(), - vector_index=VectorIndex(dimensions=1048), - ) - gc_content = 
FloatProperty() - - # Relationships - organism = RelationshipTo("Organism", "ORIGINATES_FROM") - site = RelationshipTo("Site", "HAS_SITE") - region = RelationshipTo("Region", "HAS_REGION") - go_annotation = RelationshipTo("GOAnnotation", "ASSOCIATED_WITH") diff --git a/pyeed/old/network/__init__.py b/pyeed/network/__init__.py similarity index 100% rename from pyeed/old/network/__init__.py rename to pyeed/network/__init__.py diff --git a/pyeed/old/network/network.py b/pyeed/network/network.py similarity index 100% rename from pyeed/old/network/network.py rename to pyeed/network/network.py diff --git a/pyeed/old/fetch/__init__.py b/pyeed/old/fetch/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/pyeed/old/fetch/requester.py b/pyeed/old/fetch/requester.py deleted file mode 100644 index eba8138e..00000000 --- a/pyeed/old/fetch/requester.py +++ /dev/null @@ -1,206 +0,0 @@ -from typing import Any, Coroutine, Generic, NamedTuple, TypeVar - -import aiometer -import tenacity -from httpx import ( - AsyncClient, - Limits, - RequestError, - Response, - TimeoutException, -) -from loguru import logger -from rich.progress import Progress, TaskID - -from pyeed.adapter.uniprot_mapper import PrimaryDBtoPyeed - -T = TypeVar("T") - - -class RequestPayload(NamedTuple): - """Holds the request client, URL, and parameters for an HTTP GET request.""" - - client: AsyncClient - url: str - params: dict[str, str] - - -class PrimaryDBAdapter(Generic[T]): - """ - Orchestrates the asynchronous HTTP GET requests to a primary sequence database. - Mapper classes are injected to map the responses to the pyeed graph object model and - save them to the database. 
- """ - - def __init__( - self, - ids: list[str], - ids_attr_name: str, - url: str, - rate_limit: int, - n_concurrent: int, - batch_size: int, - data_mapper: "PrimaryDBtoPyeed[T]", - timeout: int = 120, - progress: Progress | None = None, - task_id: TaskID | None = None, - request_params: dict[str, str] = {}, - ): - self.ids = ids - self.ids_attr_name = ids_attr_name - self.url = url - self.batch_size = batch_size - self.rate_limit = rate_limit - self.n_concurrent = n_concurrent - self.progress = progress - self.task_id = task_id - self.data_mapper = data_mapper - self.timeout = timeout - self.request_params = request_params - - if self.batch_size > 1: - self.ids = self.make_batches() - - if not self.progress: - self._create_progress() - - def _create_progress(self): - """ - Creates a dummy progress bar for tracking the progress of the HTTP - requests if not provided. - """ - self.progress = Progress(disable=True) - self.task_id = self.progress.add_task("Requesting data...", total=len(self.ids)) - - def make_batches(self) -> list[str]: - """ - Groups the IDs into batches of the specified batch size. - - Returns: - list[str]: The list of batches, where each batch is a comma-separated - string of IDs. - """ - batches = [] - for i in range(0, len(self.ids), self.batch_size): - batch = self.ids[i : i + self.batch_size] - batch_string = ",".join(map(str, batch)) - batches.append(batch_string) - return batches - - def build_request_payload(self, client: AsyncClient, id_: str) -> RequestPayload: - """Combines the client, URL, and parameters into a RequestPayload object. - Adds the id with the key specified by ids_attr_name to the request parameters. 
- - Args: - client (AsyncClient): AsyncClient object for making HTTP requests - id_ (str): ID to be added to the request parameters - - Returns: - RequestPayload: RequestPayload object with the client, URL, and parameters - """ - params = self.request_params.copy() - params[self.ids_attr_name] = id_ - - return RequestPayload(client, self.url, params=params) - - @tenacity.retry( - wait=tenacity.wait_fixed(1), - stop=tenacity.stop_after_attempt(3), - retry=tenacity.retry_if_exception_type((RequestError, TimeoutException)), - ) - async def send_request( - self, - args: RequestPayload, - ) -> Coroutine[None, None, Response]: - """ - Sends an asynchronous HTTP GET request to the specified URL using the provided - AsyncClient. - """ - client = args.client - url = args.url - params = args.params - - logger.debug(f"Sending request to {url} with parameters: {params}") - return client.get(url, params=params, timeout=self.timeout) - - async def make_request(self): - """ - Makes asynchronous HTTP GET requests to the specified URL using the provided - AsyncClient, handling rate limiting and concurrency. 
- """ - - def update_progress(): - if self.progress and self.task_id: - self.progress.update(self.task_id, advance=1) # type: ignore - - async with AsyncClient( - limits=Limits(max_connections=self.n_concurrent), - ) as client: - # Build the list of request arguments (this prepares the coroutine tasks) - requests = [self.build_request_payload(client, id) for id in self.ids] - - logger.debug( - f"Sending {len(self.ids)} requests in batches of {self.batch_size}" - ) - - # Using aiometer to handle rate-limiting and concurrency - async with aiometer.amap( - self.send_request, - requests, - max_per_second=self.rate_limit, - max_at_once=self.n_concurrent, - ) as response_coroutines: - async for response_coroutine in response_coroutines: - res = await response_coroutine - sanitized_response = self.sanitize_response(res) - [self.map_and_add_to_db(entry) for entry in sanitized_response] - - update_progress() - - def sanitize_response(self, response: Response) -> list[dict[str, Any]]: - """ - Sanitizes the response from the HTTP GET request by checking the status code - and formatting the JSON response as a list of dictionaries. - - Returns: - Optional[List[Dict[str, Any]]]: The JSON response as a list of dictionaries, - or None if the response is invalid. 
- """ - if response.status_code != 200: - logger.warning( - f"Request to {response.url} failed with status code {response.status_code}" - ) - return [] - - try: - response_json = response.json() - if not response_json: - logger.warning(f"Empty response from {response.url}") - return [] - - # If the response is a dictionary, wrap it in a list - if isinstance(response_json, dict): - response_json = [response_json] - - # Ensure the response is a list of dictionaries - if not isinstance(response_json, list) or not all( - isinstance(item, dict) for item in response_json - ): - logger.warning(f"Unexpected response format from {response.url}") - return [] - - except ValueError as e: - logger.warning(f"Failed to parse JSON response from {response.url}: {e}") - return [] - - return response_json - - def map_and_add_to_db(self, response: dict[str, Any] | None): - """ - Handles the response from the HTTP GET request by passing it to the data mapper. - This adds the mapped data to the database. - """ - - if response is None: - return None - self.data_mapper.add_to_db(response) diff --git a/pyeed/old/schemes/DNARecord.json b/pyeed/old/schemes/DNARecord.json deleted file mode 100644 index 95751e10..00000000 --- a/pyeed/old/schemes/DNARecord.json +++ /dev/null @@ -1,182 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema", - "title": "DNARecord", - "type": "object", - "properties": { - "id": { - "title": "id", - "description": "Unique identifier of the sequence.", - "term": "sio:SIO_000729", - "type": "string" - }, - "name": { - "title": "name", - "description": "Arbitrary name of the sequence.", - "term": "sio:SIO_000116", - "type": "string" - }, - "organism": { - "term": "sio:SIO_010000", - "$ref": "#/definitions/Organism" - }, - "sequence": { - "title": "sequence", - "description": "The letter sequence of the macromolecule.", - "term": "sio:SIO_000030", - "type": "string" - }, - "seq_length": { - "title": "seq_length", - "description": "Length of the sequence.", - 
"term": "sio:SIO_000041", - "type": "integer" - }, - "embedding": { - "title": "embedding", - "description": "1D embedding vector of the protein sequence.", - "type": "array", - "items": { - "type": "number" - } - }, - "sites": { - "type": "array", - "items": { - "$ref": "#/definitions/Site" - } - }, - "regions": { - "type": "array", - "items": { - "$ref": "#/definitions/Region" - } - }, - "region_sets": { - "type": "array", - "items": { - "$ref": "#/definitions/RegionSet" - } - }, - "gc_content": { - "title": "gc_content", - "description": "GC content of the sequence.", - "type": "number" - } - }, - "description": "A nucleic acid sequence and associated metadata 🧬", - "term": "sio:SIO_010008", - "definitions": { - "Region": { - "title": "Region", - "type": "object", - "properties": { - "start": { - "title": "start", - "description": "Start position of the site.", - "term": "sio:SIO_000943", - "type": "integer" - }, - "end": { - "title": "end", - "description": "End position of the site.", - "term": "sio:SIO_000953", - "type": "integer" - } - }, - "description": "Regional annotation of a feature within a sequence." - }, - "Site": { - "title": "Site", - "type": "object", - "properties": { - "positions": { - "title": "positions", - "description": "Position of the site(s) within the sequence.", - "term": "sio:SIO_000056", - "type": "array", - "items": { - "type": "integer" - } - } - }, - "description": "Position(s) constituting a site within a sequence." 
- }, - "Organism": { - "title": "Organism", - "type": "object", - "properties": { - "taxonomy_id": { - "title": "taxonomy_id", - "description": "A stable unique identifier for each taxon (for a species, a family, an order, or any other group in the NCBI taxonomy database.", - "term": "edam:data_1179", - "type": "integer" - }, - "name": { - "title": "name", - "description": "The name of an organism (or group of organisms).", - "term": "edam:data_2909", - "type": "string" - }, - "domain": { - "title": "domain", - "description": "Domain of the organism", - "type": "string" - }, - "kingdom": { - "title": "kingdom", - "description": "Kingdom of the organism", - "term": "edam:data_1044", - "type": "string" - }, - "phylum": { - "title": "phylum", - "description": "Phylum of the organism", - "type": "string" - }, - "tax_class": { - "title": "tax_class", - "description": "Class of the organism", - "type": "string" - }, - "order": { - "title": "order", - "description": "Order of the organism", - "type": "string" - }, - "family": { - "title": "family", - "description": "The name of a family of organism.", - "term": "edam:data_2732", - "type": "string" - }, - "genus": { - "title": "genus", - "description": "The name of a genus of organism.", - "term": "edam:data_1870", - "type": "string" - }, - "species": { - "title": "species", - "description": "The name of a species (typically a taxonomic group) of organism.", - "term": "edam:data_1045", - "type": "string" - } - }, - "description": "Description of an organism 🦠." - }, - "RegionSet": { - "title": "RegionSet", - "type": "object", - "properties": { - "regions": { - "type": "array", - "items": { - "$ref": "#/definitions/Region" - } - } - }, - "description": "A set of regions forming a higher order structure. 
For example, a set of exons in a gene, or a set of secondary structures forming a super-secondary structure.", - "term": "sio:SIO_000370" - } - } -} \ No newline at end of file diff --git a/pyeed/old/schemes/ProteinRecord.json b/pyeed/old/schemes/ProteinRecord.json deleted file mode 100644 index 14b93398..00000000 --- a/pyeed/old/schemes/ProteinRecord.json +++ /dev/null @@ -1,241 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema", - "title": "ProteinRecord", - "type": "object", - "properties": { - "id": { - "title": "id", - "description": "Unique identifier of the sequence.", - "term": "sio:SIO_000729", - "type": "string" - }, - "name": { - "title": "name", - "description": "Arbitrary name of the sequence.", - "term": "sio:SIO_000116", - "type": "string" - }, - "organism": { - "term": "sio:SIO_010000", - "$ref": "#/definitions/Organism" - }, - "sequence": { - "title": "sequence", - "description": "The letter sequence of the macromolecule.", - "term": "sio:SIO_000030", - "type": "string" - }, - "embedding": { - "title": "embedding", - "description": "1D embedding vector of the protein sequence.", - "type": "array", - "items": { - "type": "number" - } - }, - "seq_length": { - "title": "seq_length", - "description": "Length of the sequence.", - "term": "sio:SIO_000041", - "type": "integer" - }, - "nucleotide_id": { - "title": "nucleotide_id", - "description": "Identifier of the nucleotide sequence.", - "type": "string" - }, - "locus_tag": { - "title": "locus_tag", - "description": "Locus tag of the protein within the nucleotide sequence.", - "type": "string" - }, - "sites": { - "type": "array", - "items": { - "$ref": "#/definitions/Site" - } - }, - "regions": { - "type": "array", - "items": { - "$ref": "#/definitions/Region" - } - }, - "structure_ids": { - "title": "structure_ids", - "description": "Identifiers of the structures of the protein.", - "term": "sio:SIO_000729", - "type": "array", - "items": { - "type": "string" - } - }, - "ec_number": { - 
"title": "ec_number", - "description": "An Enzyme Commission (EC) number of an enzyme.", - "term": "edam:data_1011", - "type": "string" - }, - "mol_weight": { - "title": "mol_weight", - "description": "Calculated molecular weight of the protein based on the sequence.", - "term": "edam:data_1505", - "type": "number" - }, - "annotations": { - "title": "annotations", - "$ref": "#/definitions/Annotation" - }, - "go_terms": { - "title": "go_terms", - "description": "Gene Ontology terms associated with the protein.", - "type": "array", - "items": { - "type": "string" - } - } - }, - "description": "A protein sequence and associated metadata.", - "term": "sio:SIO_010043", - "definitions": { - "Organism": { - "title": "Organism", - "type": "object", - "properties": { - "taxonomy_id": { - "title": "taxonomy_id", - "description": "A stable unique identifier for each taxon for a species, a family, an order, or any other group in the NCBI taxonomy database.", - "term": "edam:data_1179", - "type": "integer" - }, - "name": { - "title": "name", - "description": "The name of an organism (or group of organisms).", - "term": "edam:data_2909", - "type": "string" - }, - "domain": { - "title": "domain", - "description": "Domain of the organism", - "type": "string" - }, - "kingdom": { - "title": "kingdom", - "description": "Kingdom of the organism", - "term": "edam:data_1044", - "type": "string" - }, - "phylum": { - "title": "phylum", - "description": "Phylum of the organism", - "type": "string" - }, - "tax_class": { - "title": "tax_class", - "description": "Class of the organism", - "type": "string" - }, - "order": { - "title": "order", - "description": "Order of the organism", - "type": "string" - }, - "family": { - "title": "family", - "description": "The name of a family of organism.", - "term": "edam:data_2732", - "type": "string" - }, - "genus": { - "title": "genus", - "description": "The name of a genus of organism.", - "term": "edam:data_1870", - "type": "string" - }, - 
"species": { - "title": "species", - "description": "The name of a species (typically a taxonomic group) of organism.", - "term": "edam:data_1045", - "type": "string" - } - }, - "description": "Description of an organism 🦠." - }, - "Site": { - "title": "Site", - "type": "object", - "properties": { - "name": { - "title": "name", - "description": "Name of the site.", - "type": "string" - }, - "annotation": { - "title": "annotation", - "$ref": "#/definitions/Annotation" - }, - "positions": { - "title": "positions", - "description": "Position of the site(s) within the sequence.", - "term": "sio:SIO_000056", - "type": "array", - "items": { - "type": "integer" - } - } - }, - "description": "Position(s) constituting a site within a sequence.", - "term": "sio:sio:010049" - }, - "Annotation": { - "title": "Annotation", - "type": "string", - "enum": [ - "http://semanticscience.org/resource/SIO_010041", - "http://semanticscience.org/resource/SIO_010050", - "http://semanticscience.org/resource/SIO_010468", - "http://semanticscience.org/resource/SIO_010469", - "http://semanticscience.org/resource/SIO_010040", - "http://semanticscience.org/resource/SIO_001276", - "http://semanticscience.org/resource/SIO_010018", - "http://semanticscience.org/resource/SIO_001379", - "http://semanticscience.org/resource/SIO_001380", - "http://semanticscience.org/resource/SIO_000131", - "http://semanticscience.org/resource/SIO_010015" - ] - }, - "Region": { - "title": "Region", - "type": "object", - "properties": { - "id": { - "title": "id", - "description": "Unique identifier of the site.", - "type": "string" - }, - "name": { - "title": "name", - "description": "Name of the site.", - "type": "string" - }, - "annotation": { - "title": "annotation", - "$ref": "#/definitions/Annotation" - }, - "start": { - "title": "start", - "description": "Start position of the site.", - "term": "sio:SIO_000943", - "type": "integer" - }, - "end": { - "title": "end", - "description": "End position of the site.", - 
"term": "sio:SIO_000953", - "type": "integer" - } - }, - "description": "Regional annotation of a feature within a sequence." - } - } -} \ No newline at end of file diff --git a/pyeed/old/schemes/proteinrecord.shex b/pyeed/old/schemes/proteinrecord.shex deleted file mode 100644 index b46fd7d7..00000000 --- a/pyeed/old/schemes/proteinrecord.shex +++ /dev/null @@ -1,125 +0,0 @@ -PREFIX xsd: -PREFIX md: -PREFIX edam: -PREFIX sio: - -md:ProteinRecord { - sio:SIO_000729 xsd:string { - shex:annotation [ - shex:label "id" - ] - }; - sio:SIO_000116 xsd:string? { - shex:annotation [ - shex:label "name" - ] - }; - sio:SIO_010000 @md:Organism? { - shex:annotation [ - shex:label "organism" - ] - }; - sio:SIO_000030 xsd:string { - shex:annotation [ - shex:label "sequence" - ] - }; - sio:SIO_000041 xsd:integer? { - shex:annotation [ - shex:label "seq_length" - ] - }; - sio:SIO_000729 xsd:string* { - shex:annotation [ - shex:label "structure_ids" - ] - }; - edam:data_1011 xsd:string? { - shex:annotation [ - shex:label "ec_number" - ] - }; - edam:data_1505 xsd:double? { - shex:annotation [ - shex:label "mol_weight" - ] - }; -} -md:DNARecord { - sio:SIO_000729 xsd:string { - shex:annotation [ - shex:label "id" - ] - }; - sio:SIO_000116 xsd:string? { - shex:annotation [ - shex:label "name" - ] - }; - sio:SIO_010000 @md:Organism? { - shex:annotation [ - shex:label "organism" - ] - }; - sio:SIO_000030 xsd:string { - shex:annotation [ - shex:label "sequence" - ] - }; - sio:SIO_000041 xsd:integer? { - shex:annotation [ - shex:label "seq_length" - ] - }; -} -md:Site { - sio:SIO_000056 xsd:integer+ { - shex:annotation [ - shex:label "positions" - ] - }; -} -md:Region { - sio:SIO_000943 xsd:integer { - shex:annotation [ - shex:label "start" - ] - }; - sio:SIO_000953 xsd:integer { - shex:annotation [ - shex:label "end" - ] - }; -} -md:Organism { - edam:data_1179 xsd:integer { - shex:annotation [ - shex:label "taxonomy_id" - ] - }; - edam:data_2909 xsd:string? 
{ - shex:annotation [ - shex:label "name" - ] - }; - edam:data_1044 xsd:string? { - shex:annotation [ - shex:label "kingdom" - ] - }; - edam:data_2732 xsd:string? { - shex:annotation [ - shex:label "family" - ] - }; - edam:data_1870 xsd:string? { - shex:annotation [ - shex:label "genus" - ] - }; - edam:data_1045 xsd:string? { - shex:annotation [ - shex:label "species" - ] - }; -} \ No newline at end of file diff --git a/pyeed/pyeed.py b/pyeed/pyeed.py deleted file mode 100644 index 7b3b4023..00000000 --- a/pyeed/pyeed.py +++ /dev/null @@ -1,106 +0,0 @@ -import asyncio - -import nest_asyncio -from loguru import logger - -from pyeed.adapter.primary_db_adapter import PrimaryDBAdapter -from pyeed.adapter.uniprot_mapper import UniprotToPyeed -from pyeed.dbconnect import DatabaseConnector -from pyeed.embedding import free_memory, get_batch_embeddings -from pyeed.model import Protein - - -class Pyeed: - def __init__( - self, - uri: str, - user: str | None = None, - password: str | None = None, - ): - self.db = DatabaseConnector(uri, user, password) - - def fetch_from_primary_db(self, ids: list[str]): - """ - Fetches sequences and corresponding annotations from primary sequence databases - and adds them to local database. - """ - nest_asyncio.apply() - - if isinstance(ids, str): - ids = [ids] - - params_template = { - "format": "json", - } - - adapter = PrimaryDBAdapter( - ids=ids, - ids_attr_name="accession", - url="https://www.ebi.ac.uk/proteins/api/proteins", - rate_limit=10, - n_concurrent=5, - batch_size=5, - data_mapper=UniprotToPyeed(), - progress=None, - task_id=None, - request_params=params_template, - ) - - asyncio.run(adapter.make_request()) - - def calculate_sequence_embeddings(self): - """ - Calculates embeddings for all sequences in the database that do not have embeddings. 
- """ - - proteins = Protein.nodes.filter(embedding__isnull=True) - logger.debug(f"Found {len(proteins)} proteins without embeddings.") - accessions = [protein.accession_id for protein in proteins] - sequences = [protein.sequence for protein in proteins] - - logger.debug(f"Calculating embeddings for {len(sequences)} sequences.") - embeddings = get_batch_embeddings(sequences) - - for i, protein in enumerate(proteins): - if not protein.accession_id == accessions[i]: - raise ValueError("Protein accessions do not match.") - protein.embedding = embeddings[i].tolist() - protein.save() - - free_memory() - - -if __name__ == "__main__": - eedb = Pyeed("bolt://127.0.0.1:7687") - - search = False - if search: - eedb.db._wipe_database() - - eedb.fetch_from_primary_db( - [ - "P04182", - "Q6QDP7", - "P04182", - "P29758", - "A0A851UXD9", - "A0A8C6HVU6", - "A0A8C6GQ10", - "A0A1U7QEB0", - "A0A6I9L5L6", - "G3HVE0", - "A0A8J6G992", - "A0A8C6W4W5", - "A0A8B9YUY7", - "L8I4V3", - "A0A6P3IYQ1", - "A0A452EKJ3", - "A0A6P5B7Q0", - "F1MYG0", - "A0A5J5MK22", - "A0A6J0Y425", - "Q3ZCF5", - ] - ) - - print(eedb.db.stats()) diff --git a/pyeed/old/schemes/pyeed_schema.md b/pyeed/schemes/pyeed_schema.md similarity index 100% rename from pyeed/old/schemes/pyeed_schema.md rename to pyeed/schemes/pyeed_schema.md diff --git a/pyeed/old/tools/__init__.py b/pyeed/tools/__init__.py similarity index 100% rename from pyeed/old/tools/__init__.py rename to pyeed/tools/__init__.py diff --git a/pyeed/old/tools/abstract_tool.py b/pyeed/tools/abstract_tool.py similarity index 100% rename from pyeed/old/tools/abstract_tool.py rename to pyeed/tools/abstract_tool.py diff --git a/pyeed/old/tools/clustalo.py b/pyeed/tools/clustalo.py similarity index 100% rename from pyeed/old/tools/clustalo.py rename to pyeed/tools/clustalo.py diff --git a/pyproject.toml b/pyproject.toml index b4ebd886..945a5955 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pyeed" -version = "0.4.1" +version = 
"0.3.7" description = "Toolkit to create, annotate, and analyze sequence data" authors = ["haeussma <83341109+haeussma@users.noreply.github.com>"] license = "MIT" @@ -12,6 +12,7 @@ python = ">=3.10,<3.13" biopython = ">=1.81,<1.84" networkx = "^3.2.1" plotly = "^5.18.0" +nbformat = "^5.9.2" scipy = "^1.11.3" pyhmmer = "^0.10.11" httpx = "^0.27.0" @@ -20,18 +21,13 @@ rich = "^13.0.0" aiometer = "^0.5.0" joblib = "^1.4.0" requests = "^2.31.0" +sdrdm = { git = "https://github.com/JR-1991/software-driven-rdm.git" } matplotlib = "^3.9.0" pymsaviz = "^0.4.2" +py4cytoscape = "^1.9.0" tenacity = "^8.3.0" -neo4j = "5.19.*" +neo4j = "^5.20.0" bio = "^1.7.1" -loguru = "^0.7.2" -neomodel = "^5.3.3" -shapely = "^2.0.6" -torch = "^2.4.1" -transformers = "^4.45.2" -scikit-learn = "^1.5.2" -numpy = "^2.1.2" [tool.poetry.group.dev.dependencies] mkdocs-material = "^9.5.9" @@ -42,6 +38,8 @@ ruff = "^0.4.1" mkdocs-jupyter = "^0.24.7" jupyter-contrib-nbextensions = "^0.7.0" notebook = "^7.1.3" +ipython = "^8.24.0" +nbconvert = "^7.16.4" [build-system] requires = ["poetry-core"] diff --git a/pyeed/old/specifications/sequence_record.md b/specifications/sequence_record.md similarity index 71% rename from pyeed/old/specifications/sequence_record.md rename to specifications/sequence_record.md index 9131e988..4e50f5b8 100644 --- a/pyeed/old/specifications/sequence_record.md +++ b/specifications/sequence_record.md @@ -8,11 +8,11 @@ prefixes: ## Macromolecules -### ProteinRecord (sio:SIO_010043) +### SequenceRecord -A protein sequence and associated metadata. +A molecular sequence and associated annotation data. -- **id** +- id - Type: string - Description: Unique identifier of the sequence. - Term: sio:SIO_000729 @@ -28,29 +28,32 @@ A protein sequence and associated metadata. - Type: string - Description: The letter sequence of the macromolecule. - Term: sio:SIO_000030 -- embedding - - Type: float[] - - Description: 1D embedding vector of the protein sequence. 
- seq_length - Type: integer - Description: Length of the sequence. - Term: sio:SIO_000041 -- nucleotide_id - - Type: string - - Description: Identifier of the nucleotide sequence. -- locus_tag - - Type: string - - Description: Locus tag of the protein within the nucleotide sequence. - sites - Type: Site[] - Description: Defines sites within the nucleotide sequence. - regions - Type: Region[] - Description: Defines regions within the nucleotide sequence. -- structure_ids - - Type: string[] - - Description: Identifiers of the structures of the protein. +- region_sets + - Type: RegionSet[] + - Description: Multiple regions forming a higher order structure or feature of a sequence. + +### ProteinRecord(_SequenceRecord_) (sio:SIO_010043) + +A protein sequence and associated metadata. + +- structure_id + - Type: string + - Description: Protein Data Bank (PDB) identifier. - Term: sio:SIO_000729 +- coding_sequence + - Type: Region[] + - Description: Defines the coding sequence of the protein + - Term: sio:SIO_001390 - ec_number - Type: string - Description: An Enzyme Commission (EC) number of an enzyme. @@ -59,100 +62,66 @@ A protein sequence and associated metadata. - Type: float - Description: Calculated molecular weight of the protein based on the sequence. - Term: edam:data_1505 -- annotations - - Type: Annotation[] - - Description: Annotations of the protein sequence. -- go_terms - - Type: string[] - - Description: Gene Ontology terms associated with the protein. -### DNARecord (sio:SIO_010008) +### DNARecord(_SequenceRecord_) (sio:SIO_010008) A nucleic acid sequence and associated metadata 🧬 -- **id** - - Type: string - - Description: Unique identifier of the sequence. - - Term: sio:SIO_000729 -- name - - Type: string - - Description: Arbitrary name of the sequence. - - Term: sio:SIO_000116 -- organism - - Type: Organism - - Description: The organism from which the sequence was obtained. 
- - Term: sio:SIO_010000 -- **sequence** - - Type: string - - Description: The letter sequence of the macromolecule. - - Term: sio:SIO_000030 -- seq_length - - Type: integer - - Description: Length of the sequence. - - Term: sio:SIO_000041 -- embedding - - Type: float[] - - Description: 1D embedding vector of the protein sequence. -- sites - - Type: Site[] - - Description: Defines sites within the nucleotide sequence. -- regions - - Type: Region[] - - Description: Defines regions within the nucleotide sequence. - gc_content - Type: float - Description: GC content of the sequence. -- annotations - - Type: Annotation[] - - Description: Annotations of the DNA sequence. -- go_terms - - Type: string[] - - Description: Gene Ontology terms associated with the DNA. -### Site (sio:sio:010049) - -Position(s) constituting a site within a sequence. +### AbstractAnnotation +- url + - Type: string + - Description: URI of the annotation. + - Term: sio:SIO_000811 +- accession_id + - Type: string + - Description: Accession ID of the annotation. + - Term: sio:SIO_000675 - name - Type: string - - Description: Name of the site. -- **annotation** - - Type: Annotation - - Description: Annotation of the site. -- **positions** + - Description: A name of a sequence feature, e.g. the name of a feature + +### Site(_AbstractAnnotation_) (sio:sio:010049) + +Position(s) constituting a site within a sequence. + +- positions - Type: integer[] - Description: Position of the site(s) within the sequence. - Term: sio:SIO_000056 -### Region ( _AbstractAnnotation_ ) (sio:SIO_000370) +### Region(_AbstractAnnotation_) (sio:SIO_000370) -Regional annotation of a feature within a sequence. +Regional annotation of a feature within a sequence. The direction of the region is defined by the start and end positions. -- **id** - - Type: string - - Description: Unique identifier of the site. -- name - - Type: string - - Description: Name of the site. 
-- **annotation** - - Type: Annotation - - Description: Annotation of the site. -- **start** +- start - Type: integer - Description: Start position of the site. - Term: sio:SIO_000943 -- **end** +- end - Type: integer - Description: End position of the site. - Term: sio:SIO_000953 +### RegionSet (sio:SIO_000370) + +A set of regions forming a higher order structure. For example, a set of exons in a gene, or a set of secondary structures forming a super-secondary structure. + +- regions + - Type: Region[] + - Description: Regions of the cluster. + ### Organism Description of an organism 🦠. - **taxonomy_id** - Type: integer - - Description: A stable unique identifier for each taxon for a species, a family, an order, or any other group in the NCBI taxonomy database. + - Description: A stable unique identifier for each taxon (for a species, a family, an order, or any other group in the NCBI taxonomy database. - Term: edam:data_1179 - name - Type: string @@ -200,7 +169,7 @@ Description of an organism 🦠. - Description: Expectation value (E) to safe hits. - Default: 10.0 - n_hits - - Type: integer + - Type: int - Description: Number of hits to return. - Default: 100 - substitution_matrix @@ -208,7 +177,7 @@ Description of an organism 🦠. - Description: Substitution matrix to use. - Default: "BLOSUM62" - word_size - - Type: integer + - Type: int - Description: Word size of the initial match. - Default: 3 - Inclusivminimum: 2 @@ -255,7 +224,7 @@ Description of an organism 🦠. - Type: StandardNumbering - Description: Standard numbering of the aligned sequences. -### PairwiseAlignmentResult (_AlignmentResult_) +### PairwiseAlignmentResult(_AlignmentResult_) - score - Type: float @@ -267,16 +236,16 @@ Description of an organism 🦠. 
- Type: float - Description: Ratio of similar residues in the alignment - gaps - - Type: integer + - Type: int - Description: Number of gaps in the alignment - mismatches - - Type: integer + - Type: int - Description: Number of mismatches in the alignment ### StandardNumbering - reference_id - - Type: string + - Type: str - Description: Standard numbering of the reference sequence - numberd_sequences - Type: NumberedSequence[] @@ -291,8 +260,24 @@ Description of an organism 🦠. - Type: string[] - Description: Standard numbering of the aligned sequence +### ClustalOmegaResult(_AlignmentResult_) + +- version + - Type: string + - Description: Version of the Clustal Omega software + ## Enumerations +### Ontology + +Ontology endpoints for different types of sequences. + +```python +GO = "https://amigo.geneontology.org/amigo/term/" +SIO = "http://semanticscience.org/resource/" +ECO = "https://www.evidenceontology.org/term/" +``` + ### Annotation Ontology terms for different sections of a sequence. @@ -307,6 +292,13 @@ MOTIVE = "http://semanticscience.org/resource/SIO_000131" CODING_SEQ = "http://semanticscience.org/resource/SIO_001276" ALPHAHELIX = "http://semanticscience.org/resource/SIO_010468" BETASTRAND = "http://semanticscience.org/resource/SIO_010469" +``` + +### SequenceType + +Ontology terms for different types of sequences. + +```python DNA = "http://semanticscience.org/resource/SIO_010018" PROTEIN = "http://semanticscience.org/resource/SIO_010015" -``` \ No newline at end of file +``` diff --git a/test.ipynb b/test.ipynb deleted file mode 100644 index 55507470..00000000 --- a/test.ipynb +++ /dev/null @@ -1,1424 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/max/Library/Caches/pypoetry/virtualenvs/pyeed-iiMJg_Qc-py3.11/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. 
Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "from pyeed import Pyeed\n", - "from pyeed.model import GOAnnotation, Protein" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Create Neo4j DB\n", - "\n", - "Create local Neo4j DB without authentication.\n", - "Graph data science plugin is not installed.\n", - "\n", - "```bash\n", - "docker run -it --name pyeed-neo4j \\\n", - " -p 7474:7474 \\--user=\"$(id -u):$(id -g)\" \\\n", - " -e NEO4J_AUTH=none \\\n", - " -p 7687:7687 \\\n", - " -v $HOME/Documents/db/data:/data \\\n", - " -v $HOME/Documents/db/logs:/logs \\\n", - " -v $HOME/Documents/db/import:/var/lib/neo4j/import \\\n", - " -v $HOME/Documents/db/plugins:/plugins \\\n", - " -e NEO4J_AUTH=neo4j/test \\\n", - " -e NEO4JLABS_PLUGINS='[\"apoc\"]' \\\n", - " -e NEO4J_dbms_security_procedures_unrestricted=\"apoc.*\" \\\n", - " -d neo4j:latest\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Connect to DB\n", - "\n", - "Neo4j DB is hosted locally via Docker.\n", - "Also possible to use free hosted Neo4j Sandbox (not tested)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "📡 Connected to database.\n", - "All data has been wiped from the database.\n", - "\n" - ] - } - ], - "source": [ - "uri = \"bolt://127.0.0.1:7687\"\n", - "user = None\n", - "password = None\n", - "\n", - "# Create a Pyeed object, automatically connecting to the database\n", - "eedb = Pyeed(uri)\n", - "eedb.db._wipe_database()\n", - "\n", - "# DB connector is a property of the Pyeed object\n", - "print(eedb.db)\n", - "\n", - "# If this is the first time you are running this script, the pyeed graph model needs to be initialized\n", - "first_time = False\n", - "if first_time:\n", - " eedb.db._initialize_db_constraints(user=user, password=password)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[32m2024-10-14 16:18:48.472\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36mmake_request\u001b[0m:\u001b[36m142\u001b[0m - \u001b[34m\u001b[1mSending 9 requests in batches of 5\u001b[0m\n", - "\u001b[32m2024-10-14 16:18:48.476\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'P04182,Q6QDP7,P04182,P29758,A0A851UXD9'}\u001b[0m\n", - "\u001b[32m2024-10-14 16:18:48.577\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 
'A0A8C6HVU6,A0A8C6GQ10,A0A1U7QEB0,A0A6I9L5L6,G3HVE0'}\u001b[0m\n", - "\u001b[32m2024-10-14 16:18:48.806\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'A0A8J6G992,A0A8C6W4W5,A0A8B9YUY7,L8I4V3,A0A6P3IYQ1'}\u001b[0m\n", - "\u001b[32m2024-10-14 16:18:48.985\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'A0A452EKJ3,A0A6P5B7Q0,F1MYG0,A0A5J5MK22,A0A6J0Y425'}\u001b[0m\n", - "\u001b[32m2024-10-14 16:18:49.249\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'Q3ZCF5,P00330,J8LIG6,A0AA35J9C9,P00331'}\u001b[0m\n", - "\u001b[32m2024-10-14 16:18:49.470\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'J8Q680,J5PRJ1,A0A1X7R1I9,Q6FQA4,C5DNB7'}\u001b[0m\n", - "\u001b[32m2024-10-14 16:18:49.573\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'Q9P4C2,C5DHM6,Q757I1,A0A7H9HSD9,P20369'}\u001b[0m\n", - 
"\u001b[32m2024-10-14 16:18:49.838\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'H2AXS6,G0W4V9,A0A1G4M9V8,A0A1G4KF85,A0A1G4JJF2'}\u001b[0m\n", - "\u001b[32m2024-10-14 16:18:49.967\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'G8ZTZ5,A0A1G4MBD6,A0A7H9HSJ3,J7SA96,G0VK69'}\u001b[0m\n", - "\u001b[32m2024-10-14 16:18:50.503\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.pyeed\u001b[0m:\u001b[36mcalculate_sequence_embeddings\u001b[0m:\u001b[36m57\u001b[0m - \u001b[34m\u001b[1mFound 44 proteins without embeddings.\u001b[0m\n", - "\u001b[32m2024-10-14 16:18:50.509\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.pyeed\u001b[0m:\u001b[36mcalculate_sequence_embeddings\u001b[0m:\u001b[36m61\u001b[0m - \u001b[34m\u001b[1mCalculating embeddings for 44 sequences.\u001b[0m\n", - "Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t33_650M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", - "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. 
Default to no truncation.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'nodes': 135, 'relationships': 293}\n" - ] - } - ], - "source": [ - "ids = [\n", - " \"P04182\",\n", - " \"Q6QDP7\",\n", - " \"P04182\",\n", - " \"P29758\",\n", - " \"A0A851UXD9\",\n", - " \"A0A8C6HVU6\",\n", - " \"A0A8C6GQ10\",\n", - " \"A0A1U7QEB0\",\n", - " \"A0A6I9L5L6\",\n", - " \"G3HVE0\",\n", - " \"A0A8J6G992\",\n", - " \"A0A8C6W4W5\",\n", - " \"A0A8B9YUY7\",\n", - " \"L8I4V3\",\n", - " \"A0A6P3IYQ1\",\n", - " \"A0A452EKJ3\",\n", - " \"A0A6P5B7Q0\",\n", - " \"F1MYG0\",\n", - " \"A0A5J5MK22\",\n", - " \"A0A6J0Y425\",\n", - " \"Q3ZCF5\",\n", - " \"P00330\", # ADH\n", - " \"J8LIG6\",\n", - " \"A0AA35J9C9\",\n", - " \"P00331\",\n", - " \"J8Q680\",\n", - " \"J5PRJ1\",\n", - " \"A0A1X7R1I9\",\n", - " \"Q6FQA4\",\n", - " \"C5DNB7\",\n", - " \"Q9P4C2\",\n", - " \"C5DHM6\",\n", - " \"Q757I1\",\n", - " \"A0A7H9HSD9\",\n", - " \"P20369\",\n", - " \"H2AXS6\",\n", - " \"G0W4V9\",\n", - " \"A0A1G4M9V8\",\n", - " \"A0A1G4KF85\",\n", - " \"A0A1G4JJF2\",\n", - " \"G8ZTZ5\",\n", - " \"A0A1G4MBD6\",\n", - " \"A0A7H9HSJ3\",\n", - " \"J7SA96\",\n", - " \"G0VK69\",\n", - "]\n", - "\n", - "# Fetch proteins from primary database\n", - "eedb.fetch_from_primary_db(ids)\n", - "eedb.calculate_sequence_embeddings()\n", - "\n", - "# number of nodes and edges in db\n", - "print(eedb.db.stats())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To use the web interface, open a browser and go to `http://localhost:7474/`." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Query DB" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Number of proteins in database: 44\n", - "{'accession_id': 'P04182', 'sequence': 'MLSKLASLQTVAALRRGLRTSVASATSVATKKTEQGPPSSEYIFERESKYGAHNYHPLPVALERGKGIYMWDVEGRQYFDFLSAYGAVSQGHCHPKIIEAMKSQVDKLTLTSRAFYNNVLGEYEEYITKLFNYNKVLPMNTGVEAGETACKLARRWGYTVKGIQKYKAKIVFAVGNFWGRTLSAVSSSTDPTSYDGFGPFMPGFETIPYNDLPALERALQDPNVAAFMVEPIQGEAGVIVPDPGYLTGVRELCTRHQVLFIADEIQTGLARTGRWLAVDHENVRPDIVLLGKALSGGLYPVSAVLCDDDIMLTIKPGEHGSTYGGNPLGCRIAIAALEVLEEEHLAENADKMGAILRKELMKLPSDVVTAVRGKGLLNAIVIRETKDCDAWKVCLRLRDNGLLAKPTHGDIIRLAPPLVIKEDEIRESVEIINKTILSF', 'name': 'Ornithine aminotransferase, mitochondrial', 'seq_length': 439, 'mol_weight': 48333.0, 'ec_number': '2.6.1.13', 'nucleotide_id': None, 'locus_tag': None, 'structure_ids': None, 'go_terms': None, 'embedding': [0.02024856023490429, -0.10120689868927002, -0.054875459522008896, 0.05940677598118782, -0.08618494868278503, -0.029551653191447258, 0.09230533987283707, -0.051247984170913696, -0.14654004573822021, 0.02974345162510872, 0.057191673666238785, -0.08889872580766678, 0.14024749398231506, 0.17638203501701355, 0.03450097143650055, 0.08764003217220306, 0.024486850947141647, 0.1010783389210701, -0.03536701574921608, 0.010831189341843128, 0.13994470238685608, -0.008208542130887508, 0.08778387308120728, -0.06917404383420944, -0.14067070186138153, 0.012566761113703251, 0.007692092098295689, -0.0831497460603714, 0.02742845192551613, -0.22349904477596283, 0.02116805873811245, 0.0315057709813118, 0.13410906493663788, 0.023250417783856392, -0.005980873480439186, 0.08939117193222046, -0.11512637883424759, 0.07771635055541992, 0.0008585193427279592, 0.019618116319179535, 0.03975960984826088, -0.034718647599220276, 0.20850276947021484, -0.14582329988479614, -0.016126694157719612, -0.05384471267461777, 
-0.03726068139076233, 0.12207911163568497, -0.011061558499932289, -0.11828935891389847, -0.20746827125549316, 0.02503921277821064, 0.0532228983938694, 0.144418403506279, 0.029017755761742592, -0.0142306387424469, -0.0313822440803051, -0.06383419781923294, -7.995564374141395e-05, 0.13239522278308868, -0.11705058813095093, 0.028009044006466866, -0.06976060569286346, 0.09761255234479904, -0.063786581158638, 0.05692310631275177, 0.15504327416419983, -0.018106607720255852, 0.11691496521234512, -0.0655292272567749, 0.02884158119559288, -0.05444662272930145, 0.018265506252646446, 0.02579304203391075, -0.04605093225836754, 0.014390230178833008, -0.012364896014332771, 0.23866473138332367, -0.009956937283277512, 0.0681992843747139, 0.014134470373392105, 0.02794279344379902, -0.10487978160381317, 0.08972960710525513, -0.0912482962012291, -0.09848162531852722, 0.02055089920759201, 0.029058776795864105, 0.14218510687351227, 0.02496551349759102, 0.031931325793266296, 0.16415734589099884, 0.04803087189793587, 0.006804416421800852, -0.008162754587829113, 0.04275595396757126, 0.015986235812306404, -0.03268745541572571, 0.08824224770069122, 0.09635967016220093, 0.1823301464319229, -0.07360901683568954, -0.09456747025251389, -0.07724674046039581, -0.16656287014484406, -0.14528372883796692, -0.08138983696699142, 0.061651408672332764, 0.0021833048667758703, -0.04141315817832947, 0.0076423645950853825, -0.04459812864661217, 0.07975580543279648, 0.02211489900946617, -0.029687168076634407, 0.005693844519555569, -0.17859789729118347, -0.06285973638296127, -0.04598310589790344, 0.17897313833236694, -0.08883719891309738, -0.006319078616797924, 0.06714661419391632, -0.04161020368337631, -0.10419580340385437, -0.07310382276773453, 0.146962970495224, 0.08816196024417877, -0.10586261749267578, -0.031728193163871765, -0.04375051334500313, -0.047378286719322205, 0.030666152015328407, -0.1353800892829895, -0.014605922624468803, 0.1563176065683365, 0.15227675437927246, 0.05652746930718422, 
0.02366151660680771, -0.08338523656129837, 0.09168574959039688, -0.15236665308475494, -0.06580774486064911, 0.022683361545205116, 0.18279708921909332, 0.30025309324264526, -0.005162137560546398, 0.04054735228419304, -0.11110367625951767, 0.1477659046649933, 0.14293374121189117, -0.17026466131210327, 0.0051727802492678165, 0.04558205232024193, 0.010130445472896099, -0.09954708069562912, 0.05359900742769241, 0.15307672321796417, 0.04194203019142151, -0.04382958263158798, -0.1927614063024521, 0.11217568814754486, 0.07116460055112839, -0.07825478911399841, 0.07008392363786697, 0.10073629766702652, 0.16728831827640533, 0.08653615415096283, 0.08844856172800064, 0.06653032451868057, -0.1302960067987442, -0.06850302964448929, 0.16061058640480042, -0.14906901121139526, -0.11815643310546875, -0.0945635512471199, -0.07603438198566437, 0.08288383483886719, 0.04238039627671242, 0.004362911451607943, -0.01701570488512516, -0.09059951454401016, 0.09787600487470627, 0.03224221244454384, -0.011840535327792168, -0.11788751929998398, -0.11228067427873611, 0.01578008197247982, 0.08044200390577316, -0.050935640931129456, 0.006653961725533009, 0.04129105806350708, -0.016114138066768646, -0.1804916113615036, 0.12004414200782776, 0.08699070662260056, 0.05172829329967499, 0.01498870924115181, -0.1566164195537567, -0.12350823730230331, 0.030897095799446106, 0.1268661618232727, -0.14114680886268616, -0.0042837243527174, -0.07116500288248062, -0.05182066559791565, 0.017959758639335632, -0.02647753804922104, -0.2292952835559845, -0.1586570292711258, -0.0003755104844458401, 0.10735882818698883, 0.06963561475276947, -0.013053910806775093, -0.1742338240146637, 0.05144227296113968, 0.042464517056941986, -0.06541845947504044, 0.20838643610477448, -0.060216788202524185, 0.0571635402739048, -0.14732177555561066, 0.24624955654144287, -0.11695057153701782, 0.08509759604930878, -0.051088809967041016, -0.13435448706150055, 0.06139412522315979, -0.26016560196876526, 0.017035098746418953, 
0.15031199157238007, 0.04677288606762886, -0.07068074494600296, -0.048508986830711365, 0.29367393255233765, 0.15618152916431427, 0.2820050120353699, 0.043994076550006866, -0.160979762673378, -0.24123862385749817, -0.12286628037691116, 0.15596705675125122, 0.002191054867580533, 0.04166155681014061, 0.01566813886165619, 0.0059838066808879375, 0.10580437630414963, 0.0005123030277900398, -0.05302106589078903, 0.08531664311885834, -0.3055320382118225, 0.11194870620965958, -0.010545526631176472, -0.02240646444261074, -0.1403033435344696, 0.029825836420059204, 0.09574121981859207, -0.010166076011955738, -0.07026774436235428, -0.26879456639289856, -0.025891803205013275, 0.10971372574567795, 0.03595172241330147, 0.09800836443901062, -0.13005897402763367, -0.09532411396503448, -0.16931308805942535, -0.08578775823116302, 0.05837809666991234, -0.005076675675809383, -0.0056008100509643555, 0.06914830952882767, -0.08532405644655228, 0.09586989134550095, -0.020517682656645775, -0.014993557706475258, 0.03339797630906105, 0.08349625766277313, -0.01848652958869934, 0.045515935868024826, 0.08933272212743759, -0.03875561058521271, 0.17289817333221436, -0.02203812077641487, 0.028600890189409256, -0.3312375247478485, -0.017497489228844643, 0.18198943138122559, 0.13468331098556519, 0.11163417249917984, 0.02053936757147312, 0.0741971880197525, 0.05844476819038391, -0.02745579369366169, -0.050529636442661285, 0.01913490891456604, 0.04597477242350578, 0.13102567195892334, 0.1383875161409378, -0.013494573533535004, 0.21210713684558868, 0.10059105604887009, -0.1482100784778595, 0.14469875395298004, 0.02646399848163128, 0.08720823377370834, -0.003809984540566802, 0.0763804018497467, -0.06920450925827026, -0.06119345501065254, -0.06375177949666977, -0.03699449077248573, 0.011271377094089985, -0.06528067588806152, -0.009433823637664318, 0.0014158852864056826, -0.07885332405567169, -0.050813715904951096, 0.030872460454702377, -0.024577734991908073, -0.010174094699323177, -0.02980596385896206, 
-0.026072382926940918, -0.04333676025271416, -0.22810889780521393, -0.0939566120505333, 0.10181492567062378, 0.026783794164657593, -0.014762178994715214, -0.013456735759973526, -0.1313604712486267, -0.043617136776447296, -0.02049385942518711, -0.049815379083156586, -0.07851074635982513, -0.022455329075455666, 0.19660161435604095, -0.10905441641807556, 0.038022320717573166, 0.0014186608605086803, 0.0923498347401619, 0.040489763021469116, -0.06285952031612396, -0.02051447331905365, -0.06787365674972534, -0.15376047790050507, -0.095383420586586, -0.0003280747914686799, -0.03407712280750275, -0.02108774147927761, 0.006160557735711336, 0.019009821116924286, 0.00033684735535643995, -0.1021285280585289, 0.30446287989616394, -0.09730297327041626, 0.07469581812620163, -0.13798987865447998, 0.01917666383087635, 0.04283057525753975, -0.0536748506128788, 0.04248344898223877, -0.1026068925857544, 0.0017098721582442522, -0.013081393204629421, -0.01233616378158331, 0.08622084558010101, -0.08948462456464767, 0.1467512547969818, -0.14721159636974335, 0.1255239099264145, 0.08868531882762909, -0.036404799669981, -0.09460549056529999, 0.0028824806213378906, 0.08929192274808884, 0.033463481813669205, 0.1493336409330368, 0.18243302404880524, 0.005216342397034168, 0.01778189279139042, 0.059227216988801956, -0.009260527789592743, -0.08512048423290253, 0.1051655113697052, 0.020001472905278206, -0.08737093955278397, 0.02480717934668064, -0.07241638749837875, 0.04852200672030449, 0.0024197744205594063, -0.06279566884040833, 0.047167401760816574, 0.018892256543040276, 0.0023336068261414766, -0.10302535444498062, 0.10976292192935944, 0.11765879392623901, 0.11515920609235764, -0.08366940915584564, 0.019063729792833328, -0.002999610500410199, -0.1782609224319458, 0.15108954906463623, -0.013570889830589294, 0.11654751747846603, -0.025835838168859482, -0.07280769944190979, -0.016377035528421402, -0.11234492808580399, 0.010014514438807964, -0.027213219553232193, 0.17620645463466644, 
-0.15494410693645477, 0.08090037107467651, 0.12706249952316284, 0.1470927596092224, 0.17929023504257202, 0.19450244307518005, -0.02966410294175148, -0.05005091801285744, 0.08243829011917114, -0.09022480249404907, 0.04065537452697754, -0.002519000554457307, -0.025284478440880775, -0.04351834952831268, 0.04616590961813927, 0.12006843090057373, 0.012185708619654179, -0.009103317745029926, -0.04189823940396309, -0.041814591735601425, 0.08847274631261826, 0.06028532236814499, 0.14623475074768066, 0.01387952733784914, -0.010984296910464764, -0.14735060930252075, -0.015200859867036343, -0.02784755825996399, -0.020713338628411293, 0.12101811170578003, -0.17540663480758667, 0.057228293269872665, -0.1106448695063591, 0.06890348345041275, -0.03264918923377991, 0.10605201125144958, -0.01991543360054493, 0.11312338709831238, 0.029462451115250587, 0.06109760329127312, -0.06062428653240204, -0.041553620249032974, 0.12487564980983734, 0.06535398960113525, 0.14045560359954834, -0.10569396615028381, 0.2600948214530945, 0.0807153657078743, 0.049442924559116364, 0.030626796185970306, -0.029005402699112892, 0.022074809297919273, -0.08597365766763687, 0.00031049398239701986, 0.022770436480641365, -0.013262610882520676, 0.09789551049470901, 0.01812072843313217, 0.05302220955491066, 0.1267828643321991, -0.060110487043857574, 0.10897479206323624, -0.06584092974662781, -0.07712274044752121, 0.025456175208091736, -0.050237443298101425, -0.07959134131669998, -0.1272030770778656, -0.09985269606113434, -0.09860970079898834, 0.07897760719060898, 0.06900321692228317, 0.026292139664292336, 0.01792265474796295, 0.037446245551109314, 0.03919033333659172, -0.0352974459528923, -0.019194146618247032, -0.02049831673502922, -0.08958612382411957, 0.04343271255493164, 0.06868553161621094, 0.08785471320152283, 0.08740116655826569, -0.05674044042825699, 0.05336346849799156, -0.10339788347482681, -0.23855671286582947, -0.15420715510845184, -0.15423451364040375, -0.015500390902161598, -0.021716145798563957, 
0.04922948032617569, 0.09897596389055252, 0.26406827569007874, 0.06140316650271416, 0.10921572893857956, 0.01940695196390152, -0.017130140215158463, 0.004089604131877422, -0.03490280732512474, 0.12443830817937851, -0.11853078007698059, 0.05865868553519249, -0.17225712537765503, 0.06290682405233383, -0.2054528445005417, 0.060116905719041824, -0.1378956139087677, -0.0359942764043808, -0.04071856290102005, -0.11661553382873535, -0.17280980944633484, -0.011577999219298363, 0.020944803953170776, 0.1294027864933014, -0.11107338219881058, 0.06643210351467133, 0.10235550999641418, -0.03726346418261528, -0.14067699015140533, -0.1344561129808426, -0.08124163746833801, 0.035732902586460114, -0.15031319856643677, -0.3293832838535309, -0.004662099294364452, -0.09775374084711075, -0.12812276184558868, 0.03596089035272598, 0.026845093816518784, -0.07316321134567261, -0.0490727499127388, 0.042702775448560715, 0.07023901492357254, 0.05402340739965439, -0.08028438687324524, -0.03499436751008034, 0.06712958961725235, -0.15439504384994507, 0.23653313517570496, -0.002085133222863078, -0.14339089393615723, -0.17206282913684845, -0.016279302537441254, 0.034319330006837845, 0.07099718600511551, -0.1098177507519722, 0.11373815685510635, 0.055597491562366486, 0.0683194100856781, 0.19847805798053741, -0.10339944809675217, 0.10560564696788788, 0.07138531655073166, 0.17929230630397797, 0.003437547944486141, -0.1428922861814499, 0.3423954248428345, -0.0706791952252388, 0.04645683616399765, -0.06315120309591293, 0.14840300381183624, 0.028263572603464127, 0.1167556568980217, -0.03096133843064308, 0.02880556881427765, -0.05438739433884621, -0.08203216642141342, 0.04271448031067848, 0.12751607596874237, 0.043448831886053085, 0.0807381123304367, 0.0013076410396024585, 0.1797882467508316, -0.12761518359184265, 0.10751351714134216, -0.09137621521949768, -0.1855749785900116, 0.1176833063364029, 0.09625124931335449, 0.1433810144662857, 0.1432649791240692, -0.0004332279204390943, -0.06642164289951324, 
0.02638934552669525, -0.053056661039590836, -0.04425548017024994, 0.09958036243915558, 0.023525893688201904, -0.08794252574443817, -0.017259882763028145, -0.03994952514767647, -0.006985538173466921, -0.0741734579205513, 0.020498478785157204, 0.028516877442598343, -0.006524977274239063, 0.10634642094373703, -0.12504811584949493, 0.07255450636148453, -0.0423094667494297, -0.009143682196736336, -0.03481057658791542, -0.06569904088973999, 0.040476784110069275, -0.05309542268514633, -0.09613602608442307, -0.03770593926310539, -0.03438562899827957, 0.04392457753419876, -0.0481325201690197, 0.1407165229320526, 0.03972296044230461, -0.033222101628780365, 0.17605775594711304, -0.13402539491653442, 0.0006810897029936314, 0.019407112151384354, 0.00377766415476799, 0.09363320469856262, 0.06214385852217674, -0.07423675805330276, -0.05837290361523628, -0.008451418951153755, 0.015009402297437191, -0.02300109900534153, -0.1797635704278946, -0.13766421377658844, 0.14759740233421326, -0.08850917965173721, -0.01113794557750225, -0.11841753125190735, 0.11984574794769287, -0.0011192884994670749, -0.06350966542959213, 0.06972872465848923, 0.06312360614538193, 0.28447049856185913, 0.09141506254673004, -0.09978833794593811, 0.13340063393115997, 0.0003196934994775802, 0.019351491704583168, 0.18355481326580048, -0.22981563210487366, 0.22609420120716095, 0.023419765755534172, -0.09055270254611969, -0.13085849583148956, -0.06363873928785324, 0.18052780628204346, -0.05926336720585823, -0.12221534550189972, 0.1034654751420021, 0.0133915850892663, -0.0024950499646365643, 0.12711454927921295, -0.10337857156991959, 0.028463860973715782, -0.15540336072444916, -0.04307795315980911, 0.04374640807509422, -0.030178451910614967, -0.03271753340959549, 0.009815776720643044, 0.008050522767007351, 0.09385637938976288, 0.025749146938323975, -0.1621808111667633, 0.2813403904438019, 0.04740709438920021, -0.1749933958053589, 0.03495623916387558, 0.16400285065174103, -0.0312114879488945, -0.07321140915155411, 
-0.1341012716293335, -0.014672468416392803, -0.010986143723130226, -0.06743758916854858, 0.012777184136211872, 0.05768841505050659, 0.004030787386000156, -0.13529808819293976, 0.09876207262277603, -0.0468611866235733, 0.16974088549613953, 0.02501581609249115, 0.1264452338218689, 0.06082111969590187, -0.04366388916969299, -0.1348801702260971, 0.11086985468864441, -0.04927532374858856, 0.04213939607143402, 0.0022172981407493353, -0.0407545305788517, -0.1574772745370865, -0.04986068978905678, -0.0014988631010055542, 0.078058160841465, 0.1313028633594513, 0.057681307196617126, 0.1949898898601532, -0.0628645196557045, -0.0016831703251227736, 0.08270734548568726, -0.017322279512882233, 0.07633494585752487, 0.03549951687455177, -0.08475852757692337, -0.010466239415109158, 0.020711777731776237, 0.08277484029531479, 0.19449098408222198, 0.06682382524013519, -0.021322594955563545, -0.06364545226097107, 0.15940390527248383, 0.10968450456857681, 0.006011195480823517, 0.15889807045459747, 0.05654590576887131, -0.030204879119992256, -0.09032057970762253, -0.09764868766069412, -0.047035444527864456, 0.05938642472028732, 0.038441140204668045, -0.07379613816738129, -0.06486465781927109, -0.051326218992471695, -0.007158554159104824, -0.08362764865159988, 0.1434265673160553, 0.065452940762043, 0.022280436009168625, 0.06983362883329391, 0.0655921995639801, -0.12224975228309631, -0.033968400210142136, -0.03560259938240051, 0.15075121819972992, 0.013390030711889267, -0.022397074848413467, -0.030516093596816063, -0.28465282917022705, 0.15296955406665802, 0.16366440057754517, 0.05877356976270676, -0.08421339839696884, 0.045836590230464935, -0.10028616338968277, -0.015635836869478226, -0.017035694792866707, 0.056484222412109375, 0.07420908659696579, 0.025918789207935333, -0.03058515302836895, -0.05792619660496712, -0.022607674822211266, 0.25674837827682495, -0.0014167989138513803, -0.004049188923090696, 0.12579216063022614, -0.07458031177520752, -0.17107318341732025, 0.047931741923093796, 
-0.18749310076236725, 0.04970903694629669, -0.061829593032598495, 0.027982240542769432, 0.1599223017692566, -0.00226485263556242, 0.001498253783211112, 0.077382892370224, 0.10719826072454453, 0.07760629802942276, -0.014595226384699345, 0.12043289095163345, 0.00364903686568141, -0.018542015925049782, 0.09836103022098541, -0.0463043749332428, -0.03663811460137367, 0.0602332204580307, 0.09662344306707382, 0.17383702099323273, -0.014050626195967197, 0.04072811082005501, -0.0599776916205883, -0.0003507315705064684, 0.07573600858449936, -0.016957415267825127, -0.06819498538970947, -0.023668017238378525, 0.07168107479810715, -0.10938413441181183, 0.11158768087625504, 0.04963001608848572, 0.06237021088600159, -0.07346033304929733, 0.07000505179166794, -0.08804061263799667, 0.12148215621709824, 0.02390368841588497, -0.013349214568734169, -0.05994154140353203, -0.08506876975297928, 0.07834086567163467, -0.1575133353471756, 0.05813775956630707, 0.005945154000073671, -0.2141299843788147, 0.13088612258434296, 0.01053778175264597, -0.08737921714782715, -0.04595981538295746, -0.09423398971557617, 0.041714493185281754, 0.006146503612399101, 0.06865673512220383, 0.1765589863061905, 0.04925873503088951, -0.1008756086230278, 0.120145782828331, -0.08902676403522491, -0.1649080365896225, 0.005595429800450802, 0.0852993056178093, -0.02678864076733589, 0.154261514544487, -0.12273611128330231, -0.11140090227127075, 0.09432768821716309, -0.08696961402893066, 0.07798225432634354, -0.09729574620723724, 0.0619821697473526, -0.06626961380243301, -0.040821559727191925, -0.00639294134452939, 0.07503759860992432, -0.07308249175548553, 0.03398127108812332, -0.09971911460161209, 0.09805792570114136, -0.06189515069127083, -0.009234662167727947, -0.10652638971805573, -0.10740005970001221, -0.02302333153784275, 0.17406855523586273, -0.1372072547674179, 0.01680213212966919, 0.05663597956299782, -0.11098883301019669, 0.04227234050631523, 0.044499147683382034, -0.001520995399914682, -0.05206606164574623, 
0.02838118001818657, 0.048317231237888336, 0.015240950509905815, 0.11754842102527618, -0.0352788046002388, 0.21274851262569427, -0.0329473577439785, -0.028921179473400116, -0.07112666964530945, -0.09715057164430618, -0.06778454035520554, 0.022032231092453003, 0.018970537930727005, 0.03126441314816475, 0.06691600382328033, -0.10067760944366455, 0.03824927657842636, -0.03312929347157478, 0.11001989990472794, -0.05194339156150818, 0.0930628702044487, -0.05583773925900459, -0.11217235773801804, -0.19513151049613953, 0.07856069505214691, 0.10589566826820374, 0.027995122596621513, 0.02027960494160652, 0.048165787011384964, -0.06600058823823929, 0.10910913348197937, -0.15872815251350403, 0.041132859885692596, 0.06503879278898239, -0.16081373393535614, -0.13546857237815857, 0.06100233271718025, -0.014645378105342388, -0.05948017165064812, -0.030159970745444298, -0.15036892890930176, -0.07771341502666473, 0.05175530165433884, 0.06613273173570633, 0.18414521217346191, 0.09452458471059799, -0.008265281096100807, -0.023393195122480392, 0.14289632439613342, -0.18488292396068573, -0.021262196823954582, 0.09066349267959595, 0.01606258936226368, -0.007634471170604229, 0.00475526787340641, 0.18113818764686584, -0.047970250248909, -0.14326009154319763, -0.17971082031726837, -0.10171552002429962, 0.03649020567536354, 0.15469276905059814, 0.20238709449768066, 0.0016547990962862968, 0.0719163715839386, -0.013998174108564854, -0.21352717280387878, 0.17589367926120758, 0.1794709712266922, -0.10998624563217163, -0.005287290550768375, 0.07963693886995316, -0.07185697555541992, -0.12464912235736847, -0.03830643743276596, 0.12657229602336884, -0.011505347676575184, 0.1710480898618698, 0.009055320173501968, -0.054782915860414505, -0.20316065847873688, -0.04156111553311348, -0.01828072965145111, -0.094750314950943, 0.05611417442560196, 0.06745430082082748, 0.12387123703956604, -0.11713898181915283, -0.11337531358003616, 0.05419988930225372, 0.06234319880604744, -0.04764193296432495, 
-0.08524003624916077, 0.09431871771812439, -0.023813113570213318, -0.14496110379695892, 0.21064816415309906, 0.12100528925657272, 0.034939274191856384, 0.15224230289459229, 0.08137591928243637, -0.026566853746771812, 0.08891071379184723, 0.08593989163637161, -0.042530518025159836, -0.1674778312444687, -0.12656015157699585, -0.014471560716629028, -0.06510739773511887, -0.03426086902618408, 0.07466757297515869, -0.2082493156194687, -0.08307965844869614, -0.10420498996973038, 0.1558913290500641, 0.028072470799088478, -0.09716740250587463, -0.0400727279484272, 0.026112109422683716, -0.09331346303224564, -0.16026845574378967, -0.06261994689702988, -0.06983047723770142, 0.17970888316631317, 0.03440011665225029, -0.2055584192276001, -0.038036469370126724, -0.062006875872612, -0.11172258853912354, -0.01173026580363512, 0.048689160495996475, -0.15363647043704987, 0.01403244212269783, -0.04444508999586105, -0.11052147299051285, 0.17785683274269104, -0.11111737787723541, 0.01503710262477398, -0.07455936074256897, 0.130684033036232, 0.03330593928694725, -0.07816138118505478, 0.025988135486841202, -0.03797400742769241, 0.17918355762958527, -0.20728078484535217, 0.1412011831998825, 0.13217709958553314, -0.014316472224891186, 0.005081279203295708, -0.1542651206254959, -0.1471462845802307, -0.09261956065893173, 0.002631417941302061, -0.18974906206130981, 0.04395926743745804, 0.0046070897951722145, 0.02943548560142517, 0.10677950084209442, -0.04952944070100784, 0.18688569962978363, 0.017092207446694374, 0.09805809706449509, 0.0049245948903262615, 0.03453328087925911, -0.06016426533460617, 0.025776885449886322, -0.013974452391266823, -0.03190900757908821, -0.009159940294921398, -0.0430099219083786, 0.03210628032684326, 0.1152113825082779, 0.012184173800051212, -0.17191091179847717, -0.07570149004459381, 0.012117799371480942, 0.07418178021907806, -0.00985792838037014, -0.0775870755314827, 0.00496249133720994, -0.0061594764702022076, 0.11624157428741455, -0.07058001309633255, 
-0.15174758434295654, -0.1581566035747528, 0.056423790752887726, -0.05476456135511398, 0.0769076943397522, -0.16390331089496613, -0.01811409741640091, -0.08917003124952316, 0.05810399726033211, -0.009724590927362442, 0.02164386957883835, -0.0851968303322792, -0.15616726875305176, 0.06831739842891693, -0.0495242178440094, -0.013792694546282291, -0.05393576622009277, 0.017160745337605476, 0.15392270684242249, -0.11543647199869156, 0.033591095358133316, 0.08896441757678986, -0.012205970473587513, -0.18523526191711426, -0.10642052441835403, 0.09048060327768326, 0.0031809716019779444, -0.016715984791517258, 0.12602239847183228, 0.1810091733932495, 0.04803398251533508, -0.02864702232182026, 0.034848134964704514, 0.16351750493049622, -0.08177510648965836, 0.0799950361251831, 0.049287762492895126, -0.21479704976081848, -0.05787177383899689, 0.20870274305343628, 0.19670717418193817, -0.02917899191379547, -0.014673368073999882, 0.1077510267496109, -0.11776024103164673, -0.050609856843948364, -0.12978774309158325, 0.04686860740184784, 0.21039901673793793, -0.08759512007236481, -0.05787206441164017, -0.04313012585043907, 0.19960099458694458, -0.07932645827531815, -0.08118028193712234, 0.013904622755944729, 0.005269734188914299, 0.03049347922205925, 0.13643129169940948, -0.03759344294667244, -0.03999306261539459, 0.08956252038478851, -0.25976264476776123, -0.12035997956991196, -0.1726505160331726, 0.07922554016113281, -0.08997832983732224, -0.08604609221220016, -0.010965113528072834, 0.04433104023337364, -0.020485447719693184, 0.00030001599225215614, -0.0828426256775856, 0.06677427142858505, 0.15722711384296417, 0.039878327399492264, -0.11692765355110168, -0.09322967380285263, -0.0693441852927208, -0.049696072936058044, -0.0005302872741594911, -0.06479810923337936, -0.12493802607059479, -0.0847337618470192, 0.03811774030327797, -0.12163253128528595, 0.07494144886732101, 0.14497092366218567, -0.04469547048211098, -0.07968832552433014, 0.11526643484830856, -0.08222935348749161, 
-0.12995383143424988, -0.02542913518846035, -0.05531290918588638, -0.0524732731282711, -0.0205096323043108, -0.043371353298425674, 0.06826610118150711, -0.1230267658829689, 0.09968044608831406, -2.8656516075134277, -0.14637483656406403, 0.1458456814289093, -0.07855939120054245, 0.003973464947193861, 0.04395592212677002, -0.14763695001602173, -0.1054278165102005, 0.25260257720947266, -0.06251044571399689, 0.13749021291732788, -0.04347370192408562, 0.022073261439800262, -0.031100915744900703, -0.1201740950345993, 0.04671158641576767, 0.038935620337724686, 0.11523193120956421, -0.012500127777457237, 0.043766628950834274, 0.07818104326725006, -0.03578841686248779, 0.017005236819386482, 0.3377397656440735, 0.05754644051194191, -0.07806985080242157, 0.06679005175828934, 0.07263384014368057, 0.1570320725440979, 0.15458165109157562, -0.07175267487764359, 0.014109877869486809, 0.020472025498747826, 0.18863946199417114, 0.07830576598644257, 0.1482079178094864, -0.0048288521356880665, 0.03671528398990631, 0.13236811757087708, -0.02578289993107319, 0.3087286651134491, -0.2583500146865845, -0.06534243375062943, -0.01825057528913021, 0.078977070748806, -0.04472985118627548, -0.18951094150543213, -0.15289969742298126, 0.15358944237232208, -0.05776436626911163, 0.09396965056657791, -0.011777237057685852, 0.05286586657166481, -0.10848789662122726, 0.011509508825838566, -0.046027541160583496, -0.06410644203424454, 0.17234528064727783, 0.097799152135849, 0.20909744501113892, 0.03900535777211189, 0.0025442042388021946, -0.058702096343040466, 0.07223135977983475, 0.11101941019296646, -0.11970861256122589, -0.2834143340587616, -0.12498477846384048, -0.03989219665527344, -0.061618901789188385, 0.12398082762956619, -0.12197677791118622, -0.1326797902584076, 0.05142227187752724, 0.008867129683494568, 0.02931891195476055, -0.1950065642595291, -0.24886098504066467, 0.175762340426445, 0.006455547176301479, -0.2528555691242218, -0.014882724732160568, -0.11596345901489258, 0.1824953854084015, 
0.19246917963027954, -0.004538584966212511, 0.07001838833093643, 0.03681835159659386, -0.054071009159088135, 0.02595287747681141, 0.13316476345062256, 0.13073943555355072, 0.04011424258351326, -0.018131745979189873, 0.07026994973421097, -0.06842922419309616, -0.06123876944184303, -0.03567569702863693, 0.07594582438468933, 0.09212852269411087, 0.11816873401403427, 0.041473619639873505, 0.0419541634619236, 0.10429178178310394, 0.15900875627994537, -0.07231968641281128, -0.000486445554997772, -0.0281683262437582, -0.0066808899864554405, 0.06101728975772858, 0.011451605707406998, -0.06977646797895432, -0.08920690417289734, -0.03161873668432236, -0.021749207749962807, -0.044524289667606354, 0.04088452085852623, -0.30895310640335083, 0.07990328967571259, 0.14182297885417938], 'element_id_property': '4:2dbbe7d3-51e1-4903-a514-4dd4aed7696d:138'}\n", - "Number of proteins associated with GO:0005739: 11\n", - "Number of organisms with at least two proteins: 9\n" - ] - } - ], - "source": [ - "## Query using pyeed graph objects\n", - "# Get all proteins\n", - "proteins = Protein.nodes.all()\n", - "print(\"Number of proteins in database: \", len(proteins))\n", - "\n", - "# Get protein with id P04182\n", - "protein = Protein.nodes.get(accession_id=\"P04182\")\n", - "print(protein)\n", - "\n", - "# Get all protein which are accociated with GO term GO:0005739 (mitochondrion)\n", - "go_annotation = GOAnnotation.nodes.get(go_id=\"GO:0005739\")\n", - "mito_proteins = protein.go_annotation.all()\n", - "print(\"Number of proteins associated with GO:0005739: \", len(mito_proteins))\n", - "\n", - "\n", - "## Or execute cypher query\n", - "# Get all organisms that have at least two connected proteins\n", - "query = \"\"\"\n", - "MATCH (o:Organism)<-[:ORIGINATES_FROM]-(p:Protein)\n", - "WITH o, COUNT(p) AS proteinCount\n", - "WHERE proteinCount >= 2\n", - "RETURN o\n", - "\"\"\"\n", - "\n", - "organisms = eedb.db.execute_read(query)\n", - "print(\"Number of organisms with at least two 
proteins: \", len(organisms))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.plotly.v1+json": { - "config": { - "plotlyServerURL": "https://plot.ly" - }, - "data": [ - { - "marker": { - "color": "red", - "size": 8 - }, - "mode": "markers", - "name": "ADH", - "text": [ - "accession_id: Q6FQA4
name: alcohol dehydrogenase
seq_length: 352
mol_weight: 37545.0
ec_number: 1.1.1.1", - "accession_id: A0A7H9HSD9
name: alcohol dehydrogenase
seq_length: 348
mol_weight: 36974.0
ec_number: 1.1.1.1", - "accession_id: C5DHM6
name: alcohol dehydrogenase
seq_length: 348
mol_weight: 36599.0
ec_number: 1.1.1.1", - "accession_id: P20369
name: Alcohol dehydrogenase 1
seq_length: 350
mol_weight: 37261.0
ec_number: 1.1.1.1", - "accession_id: Q757I1
name: alcohol dehydrogenase
seq_length: 351
mol_weight: 37158.0
ec_number: 1.1.1.1", - "accession_id: Q9P4C2
name: Alcohol dehydrogenase 2
seq_length: 348
mol_weight: 36968.0
ec_number: 1.1.1.1", - "accession_id: A0A1X7R1I9
name: alcohol dehydrogenase
seq_length: 348
mol_weight: 36784.0
ec_number: 1.1.1.1", - "accession_id: A0A1G4JJF2
name: alcohol dehydrogenase
seq_length: 351
mol_weight: 37147.0
ec_number: 1.1.1.1", - "accession_id: A0A1G4KF85
name: alcohol dehydrogenase
seq_length: 351
mol_weight: 37390.0
ec_number: 1.1.1.1", - "accession_id: C5DNB7
name: alcohol dehydrogenase
seq_length: 351
mol_weight: 37069.0
ec_number: 1.1.1.1", - "accession_id: A0A1G4M9V8
name: alcohol dehydrogenase
seq_length: 351
mol_weight: 37261.0
ec_number: 1.1.1.1", - "accession_id: G0W4V9
name: alcohol dehydrogenase
seq_length: 351
mol_weight: 37144.0
ec_number: 1.1.1.1", - "accession_id: H2AXS6
name: alcohol dehydrogenase
seq_length: 351
mol_weight: 37171.0
ec_number: 1.1.1.1", - "accession_id: A0A1G4MBD6
name: alcohol dehydrogenase
seq_length: 351
mol_weight: 37009.0
ec_number: 1.1.1.1", - "accession_id: G0VK69
name: alcohol dehydrogenase
seq_length: 351
mol_weight: 37045.0
ec_number: 1.1.1.1", - "accession_id: P00330
name: Alcohol dehydrogenase 1
seq_length: 348
mol_weight: 36849.0
ec_number: 1.1.1.1", - "accession_id: J8Q680
name: alcohol dehydrogenase
seq_length: 348
mol_weight: 36673.0
ec_number: 1.1.1.1", - "accession_id: P00331
name: Alcohol dehydrogenase 2
seq_length: 348
mol_weight: 36732.0
ec_number: 1.1.1.1" - ], - "type": "scatter", - "x": [ - -186.77227783203125, - 23.778718948364258, - 40.66227722167969, - -193.2668914794922, - -8.318355560302734, - -130.4921417236328, - -151.20591735839844, - 134.99868774414062, - -63.87946319580078, - 103.29740905761719, - -30.65559959411621, - -28.87839698791504, - 131.7727508544922, - 6.913041114807129, - -108.14468383789062, - 208.11013793945312, - 110.35122680664062, - 137.19406127929688 - ], - "y": [ - -341.30322265625, - -499.9996337890625, - -648.2156982421875, - -296.458740234375, - -686.1292114257812, - -260.89154052734375, - -83.85225677490234, - -284.53155517578125, - -467.99359130859375, - -218.72593688964844, - -244.74966430664062, - -324.8351135253906, - -173.20262145996094, - -268.6771545410156, - -98.4156723022461, - -467.0985107421875, - -619.5795288085938, - -525.6121826171875 - ] - }, - { - "marker": { - "color": "blue", - "size": 8 - }, - "mode": "markers", - "name": "proline biosynthesis", - "text": [ - "accession_id: A0A8B9YUY7
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48104.0
ec_number: 2.6.1.13", - "accession_id: A0A8C6W4W5
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48284.0
ec_number: 2.6.1.13", - "accession_id: A0A8J6G992
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48239.0
ec_number: 2.6.1.13", - "accession_id: L8I4V3
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48104.0
ec_number: 2.6.1.13", - "accession_id: A0A1U7QEB0
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48328.0
ec_number: 2.6.1.13", - "accession_id: A0A8C6GQ10
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48314.0
ec_number: 2.6.1.13", - "accession_id: A0A452EKJ3
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48132.0
ec_number: 2.6.1.13", - "accession_id: A0A8C6HVU6
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48370.0
ec_number: 2.6.1.13", - "accession_id: A0A6I9L5L6
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48331.0
ec_number: 2.6.1.13", - "accession_id: A0A5J5MK22
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48074.0
ec_number: 2.6.1.13", - "accession_id: A0A6J0Y425
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48090.0
ec_number: 2.6.1.13", - "accession_id: A0A6P5B7Q0
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48118.0
ec_number: 2.6.1.13", - "accession_id: F1MYG0
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48118.0
ec_number: 2.6.1.13", - "accession_id: A0A851UXD9
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48355.0
ec_number: 2.6.1.13", - "accession_id: P04182
name: Ornithine aminotransferase, mitochondrial
seq_length: 439
mol_weight: 48333.0
ec_number: 2.6.1.13", - "accession_id: G3HVE0
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48431.0
ec_number: 2.6.1.13", - "accession_id: P29758
name: Ornithine aminotransferase, mitochondrial
seq_length: 439
mol_weight: 48355.0
ec_number: 2.6.1.13", - "accession_id: A0A6P3IYQ1
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48104.0
ec_number: 2.6.1.13", - "accession_id: Q3ZCF5
name: Ornithine aminotransferase, mitochondrial
seq_length: 439
mol_weight: 48075.0
ec_number: 2.6.1.13" - ], - "type": "scatter", - "x": [ - -410.7798767089844, - 204.80616760253906, - 473.3436279296875, - -457.86761474609375, - 396.9206848144531, - 379.79766845703125, - -311.87481689453125, - 345.0708312988281, - 420.676025390625, - -385.58782958984375, - -354.5928039550781, - -404.9271545410156, - -404.9271545410156, - 410.7213134765625, - 271.6178894042969, - 327.4793395996094, - 410.7213134765625, - -468.85711669921875, - -484.5874938964844 - ], - "y": [ - 530.3589477539062, - 669.9793090820312, - 732.4609985351562, - 498.2153625488281, - 757.3943481445312, - 550.4390258789062, - 597.9725952148438, - 619.3108520507812, - 712.5540161132812, - 691.67138671875, - 489.0978088378906, - 598.8034057617188, - 598.8034057617188, - 607.118896484375, - 651.4642333984375, - 773.041259765625, - 607.118896484375, - 557.7855834960938, - 663.695068359375 - ] - }, - { - "marker": { - "color": "green", - "size": 8 - }, - "mode": "markers", - "name": "no annotation", - "text": [ - "accession_id: A0A7H9HSJ3
name: alcohol dehydrogenase
seq_length: 350
mol_weight: 36998.0
ec_number: 1.1.1.1", - "accession_id: J5PRJ1
name: alcohol dehydrogenase
seq_length: 348
mol_weight: 36645.0
ec_number: 1.1.1.1", - "accession_id: G8ZTZ5
name: alcohol dehydrogenase
seq_length: 351
mol_weight: 37303.0
ec_number: 1.1.1.1", - "accession_id: A0AA35J9C9
name: alcohol dehydrogenase
seq_length: 348
mol_weight: 36808.0
ec_number: 1.1.1.1", - "accession_id: J8LIG6
name: alcohol dehydrogenase
seq_length: 348
mol_weight: 36765.0
ec_number: 1.1.1.1", - "accession_id: J7SA96
name: alcohol dehydrogenase
seq_length: 350
mol_weight: 37091.0
ec_number: 1.1.1.1", - "accession_id: Q6QDP7
name: Cyclic AMP-responsive element-binding protein 3-like protein 2
seq_length: 521
mol_weight: 57379.0
ec_number: None" - ], - "type": "scatter", - "x": [ - 46.96210479736328, - 89.03892517089844, - 290.4442138671875, - 183.9209442138672, - 233.75531005859375, - 192.45892333984375, - 293.7549743652344 - ], - "y": [ - -434.9825439453125, - -574.7026977539062, - -258.2119140625, - -415.0147705078125, - -411.8081970214844, - -148.38223266601562, - -182.1365203857422 - ] - } - ], - "layout": { - "height": 600, - "template": { - "data": { - "bar": [ - { - "error_x": { - "color": "#2a3f5f" - }, - "error_y": { - "color": "#2a3f5f" - }, - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "bar" - } - ], - "barpolar": [ - { - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "barpolar" - } - ], - "carpet": [ - { - "aaxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "baxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "type": "carpet" - } - ], - "choropleth": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "choropleth" - } - ], - "contour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "contour" - } - ], - "contourcarpet": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "contourcarpet" 
- } - ], - "heatmap": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmap" - } - ], - "heatmapgl": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmapgl" - } - ], - "histogram": [ - { - "marker": { - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "histogram" - } - ], - "histogram2d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2d" - } - ], - "histogram2dcontour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 
0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2dcontour" - } - ], - "mesh3d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "mesh3d" - } - ], - "parcoords": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "parcoords" - } - ], - "pie": [ - { - "automargin": true, - "type": "pie" - } - ], - "scatter": [ - { - "fillpattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - }, - "type": "scatter" - } - ], - "scatter3d": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter3d" - } - ], - "scattercarpet": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattercarpet" - } - ], - "scattergeo": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergeo" - } - ], - "scattergl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergl" - } - ], - "scattermapbox": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattermapbox" - } - ], - "scatterpolar": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolar" - } - ], - "scatterpolargl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolargl" - } - ], - "scatterternary": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterternary" - } - ], - "surface": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 
0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "surface" - } - ], - "table": [ - { - "cells": { - "fill": { - "color": "#EBF0F8" - }, - "line": { - "color": "white" - } - }, - "header": { - "fill": { - "color": "#C8D4E3" - }, - "line": { - "color": "white" - } - }, - "type": "table" - } - ] - }, - "layout": { - "annotationdefaults": { - "arrowcolor": "#2a3f5f", - "arrowhead": 0, - "arrowwidth": 1 - }, - "autotypenumbers": "strict", - "coloraxis": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "colorscale": { - "diverging": [ - [ - 0, - "#8e0152" - ], - [ - 0.1, - "#c51b7d" - ], - [ - 0.2, - "#de77ae" - ], - [ - 0.3, - "#f1b6da" - ], - [ - 0.4, - "#fde0ef" - ], - [ - 0.5, - "#f7f7f7" - ], - [ - 0.6, - "#e6f5d0" - ], - [ - 0.7, - "#b8e186" - ], - [ - 0.8, - "#7fbc41" - ], - [ - 0.9, - "#4d9221" - ], - [ - 1, - "#276419" - ] - ], - "sequential": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "sequentialminus": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ] - }, - 
"colorway": [ - "#636efa", - "#EF553B", - "#00cc96", - "#ab63fa", - "#FFA15A", - "#19d3f3", - "#FF6692", - "#B6E880", - "#FF97FF", - "#FECB52" - ], - "font": { - "color": "#2a3f5f" - }, - "geo": { - "bgcolor": "white", - "lakecolor": "white", - "landcolor": "#E5ECF6", - "showlakes": true, - "showland": true, - "subunitcolor": "white" - }, - "hoverlabel": { - "align": "left" - }, - "hovermode": "closest", - "mapbox": { - "style": "light" - }, - "paper_bgcolor": "white", - "plot_bgcolor": "#E5ECF6", - "polar": { - "angularaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "radialaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "scene": { - "xaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "yaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "zaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - } - }, - "shapedefaults": { - "line": { - "color": "#2a3f5f" - } - }, - "ternary": { - "aaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "baxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "caxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "title": { - "x": 0.05 - }, - "xaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - }, - "yaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": 
"white", - "zerolinewidth": 2 - } - } - }, - "title": { - "text": "2D t-SNE Visualization of Protein Embeddings" - }, - "width": 900, - "xaxis": { - "title": { - "text": "t-SNE Dimension 1" - } - }, - "yaxis": { - "title": { - "text": "t-SNE Dimension 2" - } - } - } - } - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import numpy as np\n", - "from sklearn.manifold import TSNE\n", - "import plotly.graph_objects as go\n", - "\n", - "# Annotations of interest (Alcohol dehydrogenase annotation and proline biosynthesis annotation --> two different protein families)\n", - "adh_go_id = \"GO:0004022\"\n", - "ploline_biosynthesis_go_id = \"GO:0055129\"\n", - "\n", - "# Query to get all proteins with embeddings and get the label based on the annotations\n", - "query = \"\"\"\n", - "MATCH (p:Protein)\n", - "OPTIONAL MATCH (p)-[:ASSOCIATED_WITH]-(g:GOAnnotation)\n", - "WITH p, collect(g.go_id) AS go_ids\n", - "RETURN p.accession_id AS protein_id, \n", - " p.embedding AS embedding,\n", - " CASE \n", - " WHEN 'GO:0055129' IN go_ids THEN 'proline biosynthesis'\n", - " WHEN 'GO:0004022' IN go_ids THEN 'ADH'\n", - " ELSE 'no annotation'\n", - " END AS label\n", - "\"\"\"\n", - "\n", - "result = eedb.db.execute_read(query)\n", - "\n", - "# Prepare data for visualization\n", - "data = dict(\n", - " protein_id=[],\n", - " embedding=[],\n", - " label=[],\n", - ")\n", - "for record in result:\n", - " data[\"protein_id\"].append(record[\"protein_id\"])\n", - " data[\"embedding\"].append(record[\"embedding\"])\n", - " data[\"label\"].append(record[\"label\"])\n", - "\n", - "protein_ids, embeddings, labels = (\n", - " data[\"protein_id\"],\n", - " np.array(data[\"embedding\"]),\n", - " data[\"label\"],\n", - ")\n", - "\n", - "colors = []\n", - "for label in labels:\n", - " if label == \"ADH\":\n", - " colors.append(\"red\")\n", - " elif label == \"proline biosynthesis\":\n", - " colors.append(\"blue\")\n", - " else:\n", - " colors.append(\"green\")\n", - 
"\n", - "hover_texts = [\n", - " \"
\".join(\n", - " [\n", - " f\"{key}: {value}\"\n", - " for key, value in Protein.nodes.get(\n", - " accession_id=protein_id\n", - " ).__dict__.items()\n", - " if key\n", - " in [\n", - " \"accession_id\",\n", - " \"mol_weight\",\n", - " \"ec_number\",\n", - " \"seq_length\",\n", - " \"mol_weight\",\n", - " \"name\",\n", - " ]\n", - " ]\n", - " )\n", - " for protein_id in protein_ids\n", - "]\n", - "\n", - "\n", - "# Apply t-SNE to Reduce Embeddings to 2D\n", - "tsne = TSNE(n_components=2, random_state=42, perplexity=5, max_iter=3000)\n", - "embeddings_2d = tsne.fit_transform(embeddings)\n", - "\n", - "unique_labels = set(labels) # Find the unique labels\n", - "traces = []\n", - "\n", - "for label in unique_labels:\n", - " indices = [i for i, l in enumerate(labels) if l == label]\n", - " trace = go.Scatter(\n", - " x=[embeddings_2d[i, 0] for i in indices],\n", - " y=[embeddings_2d[i, 1] for i in indices],\n", - " mode=\"markers\",\n", - " marker=dict(\n", - " size=8,\n", - " color=colors[indices[0]],\n", - " ),\n", - " name=label,\n", - " text=[hover_texts[i] for i in indices],\n", - " )\n", - " traces.append(trace)\n", - "\n", - "layout = go.Layout(\n", - " title=\"2D t-SNE Visualization of Protein Embeddings\",\n", - " xaxis_title=\"t-SNE Dimension 1\",\n", - " yaxis_title=\"t-SNE Dimension 2\",\n", - " width=900,\n", - " height=600,\n", - ")\n", - "\n", - "fig = go.Figure(data=traces, layout=layout)\n", - "\n", - "fig.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "🔌 Connection closed.\n" - ] - } - ], - "source": [ - "# close connection\n", - "eedb.db.close()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pyeed-iiMJg_Qc-py3.11", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": 
"python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}