diff --git a/docs/model_diagram.json b/docs/model_diagram.json new file mode 100644 index 00000000..192fb3a4 --- /dev/null +++ b/docs/model_diagram.json @@ -0,0 +1,247 @@ +{ + "style": { + "node-color": "#ffffff", + "border-color": "#000000", + "caption-color": "#000000", + "arrow-color": "#000000", + "label-background-color": "#ffffff", + "directionality": "directed", + "arrow-width": 5 + }, + "nodes": [ + { + "id": "n0", + "position": { + "x": 0, + "y": 0 + }, + "caption": "", + "style": {}, + "labels": [ + "StrictStructuredNode" + ], + "properties": {} + }, + { + "id": "n1", + "position": { + "x": 346.4101615137755, + "y": 199.99999999999997 + }, + "caption": "", + "style": {}, + "labels": [ + "Organism" + ], + "properties": { + "taxonomy_id": "int - required", + "name": "str", + "domain": "str", + "kingdom": "str", + "phylum": "str", + "tax_class": "str", + "order": "str", + "family": "str", + "genus": "str", + "species": "str" + } + }, + { + "id": "n2", + "position": { + "x": 2.4492935982947064e-14, + "y": 400.0 + }, + "caption": "", + "style": {}, + "labels": [ + "Site" + ], + "properties": { + "site_id": "id - unique", + "name": "str", + "positions": "list[int] - required", + "annotation": "str - required" + } + }, + { + "id": "n3", + "position": { + "x": -346.4101615137754, + "y": 200.00000000000014 + }, + "caption": "", + "style": {}, + "labels": [ + "Region" + ], + "properties": { + "region_id": "id - unique", + "start": "int - required", + "end": "int - required", + "annotation": "str - required" + } + }, + { + "id": "n4", + "position": { + "x": -346.4101615137755, + "y": -199.99999999999991 + }, + "caption": "", + "style": {}, + "labels": [ + "GOAnnotation" + ], + "properties": { + "go_id": "str - required", + "term": "str", + "definition": "str" + } + }, + { + "id": "n5", + "position": { + "x": -7.347880794884119e-14, + "y": -400.0 + }, + "caption": "", + "style": {}, + "labels": [ + "Protein" + ], + "properties": { + "accession_id": "str - required", + "sequence": "str - required", + "name": "str", + "seq_length": "int - required", + "mol_weight": "float", + "ec_number": "str", + "nucleotide_id": "str", + "locus_tag": "str", + "structure_ids": "list[str]", + "go_terms": "list[str]", + "embedding": "list[float]" + } + }, + { + "id": "n6", + "position": { + "x": 346.41016151377534, + "y": -200.00000000000017 + }, + "caption": "", + "style": {}, + "labels": [ + "DNA" + ], + "properties": { + "accession_id": "str - required", + "sequence": "str - required", + "name": "str", + "seq_length": "int - required", + "go_terms": "list[str]", + "embedding": "list[float]", + "gc_content": "float" + } + } + ], + "relationships": [ + { + "id": "e0", + "type": "ORIGINATES_FROM", + "style": {}, + "properties": {}, + "fromId": "n5", + "toId": "n1" + }, + { + "id": "e1", + "type": "ORIGINATES_FROM", + "style": {}, + "properties": {}, + "fromId": "n6", + "toId": "n1" + }, + { + "id": "e2", + "type": "ASSOCIATED_WITH", + "style": {}, + "properties": {}, + "fromId": "n4", + "toId": "n5" + }, + { + "id": "e3", + "type": "ASSOCIATED_WITH", + "style": {}, + "properties": {}, + "fromId": "n4", + "toId": "n6" + }, + { + "id": "e4", + "type": "ORIGINATES_FROM", + "style": {}, + "properties": {}, + "fromId": "n5", + "toId": "n1" + }, + { + "id": "e5", + "type": "HAS_SITE", + "style": {}, + "properties": {}, + "fromId": "n5", + "toId": "n2" + }, + { + "id": "e6", + "type": "HAS_REGION", + "style": {}, + "properties": {}, + "fromId": "n5", + "toId": "n3" + }, + { + "id": "e7", + "type": "ASSOCIATED_WITH", + "style": {}, + "properties": {}, + "fromId": "n5", + "toId": "n4" + }, + { + "id": "e8", + "type": "ORIGINATES_FROM", + "style": {}, + "properties": {}, + "fromId": "n6", + "toId": "n1" + }, + { + "id": "e9", + "type": "HAS_SITE", + "style": {}, + "properties": {}, + "fromId": "n6", + "toId": "n2" + }, + { + "id": "e10", + "type": "HAS_REGION", + "style": {}, + "properties": {}, + "fromId": "n6", + "toId": "n3" + }, + { + "id": "e11", + "type": "ASSOCIATED_WITH", + "style": {}, + "properties": {}, + "fromId": "n6", + "toId": "n4" + } + ] +} \ No newline at end of file diff --git a/pyeed/__init__.py b/pyeed/__init__.py index e69de29b..38ebcff3 100644 --- a/pyeed/__init__.py +++ b/pyeed/__init__.py @@ -0,0 +1 @@ +from pyeed.pyeed import Pyeed diff --git a/pyeed/fetch/__init__.py b/pyeed/adapter/__init__.py similarity index 100% rename from pyeed/fetch/__init__.py rename to pyeed/adapter/__init__.py diff --git a/pyeed/adapter/primary_db_adapter.py b/pyeed/adapter/primary_db_adapter.py new file mode 100644 index 00000000..eba8138e --- /dev/null +++ b/pyeed/adapter/primary_db_adapter.py @@ -0,0 +1,206 @@ +from typing import Any, Coroutine, Generic, NamedTuple, TypeVar + +import aiometer +import tenacity +from httpx import ( + AsyncClient, + Limits, + RequestError, + Response, + TimeoutException, +) +from loguru import logger +from rich.progress import Progress, TaskID + +from pyeed.adapter.uniprot_mapper import PrimaryDBtoPyeed + +T = TypeVar("T") + + +class RequestPayload(NamedTuple): + """Holds the request client, URL, and parameters for an HTTP GET request.""" + + client: AsyncClient + url: str + params: dict[str, str] + + +class PrimaryDBAdapter(Generic[T]): + """ + Orchestrates the asynchronous HTTP GET requests to a primary sequence database. + Mapper classes are injected to map the responses to the pyeed graph object model and + save them to the database. + """ + + def __init__( + self, + ids: list[str], + ids_attr_name: str, + url: str, + rate_limit: int, + n_concurrent: int, + batch_size: int, + data_mapper: "PrimaryDBtoPyeed[T]", + timeout: int = 120, + progress: Progress | None = None, + task_id: TaskID | None = None, + request_params: dict[str, str] = {}, + ): + self.ids = ids + self.ids_attr_name = ids_attr_name + self.url = url + self.batch_size = batch_size + self.rate_limit = rate_limit + self.n_concurrent = n_concurrent + self.progress = progress + self.task_id = task_id + self.data_mapper = data_mapper + self.timeout = timeout + self.request_params = request_params + + if self.batch_size > 1: + self.ids = self.make_batches() + + if not self.progress: + self._create_progress() + + def _create_progress(self): + """ + Creates a dummy progress bar for tracking the progress of the HTTP + requests if not provided. + """ + self.progress = Progress(disable=True) + self.task_id = self.progress.add_task("Requesting data...", total=len(self.ids)) + + def make_batches(self) -> list[str]: + """ + Groups the IDs into batches of the specified batch size. + + Returns: + list[str]: The list of batches, where each batch is a comma-separated + string of IDs. + """ + batches = [] + for i in range(0, len(self.ids), self.batch_size): + batch = self.ids[i : i + self.batch_size] + batch_string = ",".join(map(str, batch)) + batches.append(batch_string) + return batches + + def build_request_payload(self, client: AsyncClient, id_: str) -> RequestPayload: + """Combines the client, URL, and parameters into a RequestPayload object. + Adds the id with the key specified by ids_attr_name to the request parameters. + + Args: + client (AsyncClient): AsyncClient object for making HTTP requests + id_ (str): ID to be added to the request parameters + + Returns: + RequestPayload: RequestPayload object with the client, URL, and parameters + """ + params = self.request_params.copy() + params[self.ids_attr_name] = id_ + + return RequestPayload(client, self.url, params=params) + + @tenacity.retry( + wait=tenacity.wait_fixed(1), + stop=tenacity.stop_after_attempt(3), + retry=tenacity.retry_if_exception_type((RequestError, TimeoutException)), + ) + async def send_request( + self, + args: RequestPayload, + ) -> Coroutine[None, None, Response]: + """ + Sends an asynchronous HTTP GET request to the specified URL using the provided + AsyncClient. + """ + client = args.client + url = args.url + params = args.params + + logger.debug(f"Sending request to {url} with parameters: {params}") + return client.get(url, params=params, timeout=self.timeout) + + async def make_request(self): + """ + Makes asynchronous HTTP GET requests to the specified URL using the provided + AsyncClient, handling rate limiting and concurrency. + """ + + def update_progress(): + if self.progress and self.task_id: + self.progress.update(self.task_id, advance=1) # type: ignore + + async with AsyncClient( + limits=Limits(max_connections=self.n_concurrent), + ) as client: + # Build the list of request arguments (this prepares the coroutine tasks) + requests = [self.build_request_payload(client, id) for id in self.ids] + + logger.debug( + f"Sending {len(self.ids)} requests in batches of {self.batch_size}" + ) + + # Using aiometer to handle rate-limiting and concurrency + async with aiometer.amap( + self.send_request, + requests, + max_per_second=self.rate_limit, + max_at_once=self.n_concurrent, + ) as response_coroutines: + async for response_coroutine in response_coroutines: + res = await response_coroutine + sanitized_response = self.sanitize_response(res) + [self.map_and_add_to_db(entry) for entry in sanitized_response] + + update_progress() + + def sanitize_response(self, response: Response) -> list[dict[str, Any]]: + """ + Sanitizes the response from the HTTP GET request by checking the status code + and formatting the JSON response as a list of dictionaries. + + Returns: + Optional[List[Dict[str, Any]]]: The JSON response as a list of dictionaries, + or None if the response is invalid. + """ + if response.status_code != 200: + logger.warning( + f"Request to {response.url} failed with status code {response.status_code}" + ) + return [] + + try: + response_json = response.json() + if not response_json: + logger.warning(f"Empty response from {response.url}") + return [] + + # If the response is a dictionary, wrap it in a list + if isinstance(response_json, dict): + response_json = [response_json] + + # Ensure the response is a list of dictionaries + if not isinstance(response_json, list) or not all( + isinstance(item, dict) for item in response_json + ): + logger.warning(f"Unexpected response format from {response.url}") + return [] + + except ValueError as e: + logger.warning(f"Failed to parse JSON response from {response.url}: {e}") + return [] + + return response_json + + def map_and_add_to_db(self, response: dict[str, Any] | None): + """ + Handles the response from the HTTP GET request by passing it to the data mapper. + This adds the mapped data to the database. + """ + + if response is None: + return None + self.data_mapper.add_to_db(response) diff --git a/pyeed/fetch/mapper.py b/pyeed/adapter/uniprot_mapper.py similarity index 64% rename from pyeed/fetch/mapper.py rename to pyeed/adapter/uniprot_mapper.py index fb67b993..2fde60e5 100644 --- a/pyeed/fetch/mapper.py +++ b/pyeed/adapter/uniprot_mapper.py @@ -2,7 +2,7 @@ from collections import defaultdict from typing import Generic, TypeVar -from pyeed.model import Annotation, Organism, Protein, Site +from pyeed.model import Annotation, GOAnnotation, Organism, Protein, Site T = TypeVar("T") @@ -17,31 +17,35 @@ class UniprotToPyeed(PrimaryDBtoPyeed[Protein]): def add_to_db(self, data: dict): # Organism information taxonomy_id = data["organism"]["taxonomy"] - organism = Organism(taxonomy_id=taxonomy_id).save() + organism = Organism.get_or_save( + taxonomy_id=taxonomy_id, + name=data["organism"]["names"][0]["value"], + ) try: ec_number = data["protein"]["recommendedName"]["ecNumber"][0]["value"] except KeyError: ec_number = None - protein = Protein( + protein = Protein.get_or_save( accession_id=data["accession"], sequence=data["sequence"]["sequence"], mol_weight=float(data["sequence"]["mass"]), ec_number=ec_number, name=data["protein"]["recommendedName"]["fullName"]["value"], + seq_length=len(data["sequence"]["sequence"]), ) - protein.seq_length = len(protein.sequence) - protein.save() protein.organism.connect(organism) organism.protein.connect(protein) self.add_sites(data, protein) + self.add_go(data, protein) def add_sites(self, data: dict, protein: Protein): ligand_dict = defaultdict(list) - for feature in data["features"]: + + for feature in data.get("features", []): if feature["type"] == "BINDING": for position in range(int(feature["begin"]), int(feature["end"]) + 1): ligand_dict[feature["ligand"]["name"]].append(position) @@ -53,4 +57,14 @@ def add_sites(self, data: dict, protein: Protein): annotation=Annotation.BINDING_SITE.value, ).save() - protein.sites.connect(site) + protein.site.connect(site) + + def add_go(self, data: dict, protein: Protein): + for reference in data["dbReferences"]: + if reference["type"] == "GO": + go_annotation = GOAnnotation.get_or_save( + go_id=reference["id"], + term=reference["properties"]["term"], + ) + + protein.go_annotation.connect(go_annotation) diff --git a/pyeed/dbconnect.py b/pyeed/dbconnect.py index d7ba417f..7e0f930a 100644 --- a/pyeed/dbconnect.py +++ b/pyeed/dbconnect.py @@ -138,6 +138,17 @@ def _remove_db_constraints( except subprocess.CalledProcessError as e: print(f"Failed to remove labels: {str(e)}") + def generate_model_diagram( + self, + models_path: str = "pyeed/model.py", + ): + subprocess.run( + [ + "neomodel_generate_diagram", + models_path, + ] + ) + def _wipe_database(self): """ Deletes all nodes and relationships in the database. diff --git a/pyeed/model.py b/pyeed/model.py index 0c9a54f6..b3476ca7 100644 --- a/pyeed/model.py +++ b/pyeed/model.py @@ -9,6 +9,7 @@ StringProperty, StructuredNode, UniqueIdProperty, + UniqueProperty, VectorIndex, ) @@ -16,6 +17,8 @@ class StrictStructuredNode(StructuredNode): """A StructuredNode subclass that raises an error if an invalid property is provided.""" + __abstract_node__ = True + def __init__(self, *args, **kwargs): # Get the defined properties of the model allowed_properties = set(self.__class__._class_properties()) @@ -103,6 +106,18 @@ def save(self, *args, **kwargs): return super().save(*args, **kwargs) + @classmethod + def get_or_save(cls, **kwargs): + """Attempts to save the node first, and if it already exists (due to unique constraint), retrieves it.""" + try: + # Attempt to create and save a new node + instance = cls(**kwargs) + instance.save() + return instance + except UniqueProperty: + # If a unique constraint error occurs, retrieve the existing node + return cls.nodes.get(**kwargs) + class Annotation(Enum): ACTIVE_SITE = "active site" @@ -134,16 +149,6 @@ class Organism(StrictStructuredNode): protein = RelationshipFrom("Protein", "ORIGINATES_FROM") dna = RelationshipFrom("DNA", "ORIGINATES_FROM") - @classmethod - def add_or_skip(cls, **kwargs): - """Add an organism if it does not already exist.""" - taxonomy_id = kwargs.get("taxonomy_id") - organism = cls.nodes.get_or_none(taxonomy_id=taxonomy_id) - if organism is None: - organism = cls(**kwargs).save() - - return organism - class Site(StrictStructuredNode): site_id = UniqueIdProperty() @@ -165,12 +170,12 @@ class Region(StrictStructuredNode): class GOAnnotation(StrictStructuredNode): go_id = StringProperty(unique_index=True, required=True) - name = StringProperty() + term = StringProperty() definition = StringProperty() - # Relationships - proteins = RelationshipTo("Protein", "ASSOCIATED_WITH") - dnas = RelationshipTo("DNA", "ASSOCIATED_WITH") + @property + def name(self): + return self.term class Protein(StrictStructuredNode): @@ -191,9 +196,9 @@ class Protein(StrictStructuredNode): # Relationships organism = RelationshipTo("Organism", "ORIGINATES_FROM") - sites = RelationshipTo("Site", "HAS_SITE") - regions = RelationshipTo("Region", "HAS_REGION") - go_annotations = RelationshipTo("GOAnnotation", "ASSOCIATED_WITH") + site = RelationshipTo("Site", "HAS_SITE") + region = RelationshipTo("Region", "HAS_REGION") + go_annotation = RelationshipTo("GOAnnotation", "ASSOCIATED_WITH") class DNA(StrictStructuredNode): @@ -210,6 +215,6 @@ class DNA(StrictStructuredNode): # Relationships organism = RelationshipTo("Organism", "ORIGINATES_FROM") - sites = RelationshipTo("Site", "HAS_SITE") - regions = RelationshipTo("Region", "HAS_REGION") - go_annotations = RelationshipTo("GOAnnotation", "ASSOCIATED_WITH") + site = RelationshipTo("Site", "HAS_SITE") + region = RelationshipTo("Region", "HAS_REGION") + go_annotation = RelationshipTo("GOAnnotation", "ASSOCIATED_WITH") diff --git a/pyeed/align/__init__.py b/pyeed/old/align/__init__.py similarity index 100% rename from pyeed/align/__init__.py rename to pyeed/old/align/__init__.py diff --git a/pyeed/align/abstract_aligner.py b/pyeed/old/align/abstract_aligner.py similarity index 100% rename from pyeed/align/abstract_aligner.py rename to pyeed/old/align/abstract_aligner.py diff --git a/pyeed/align/hmm.py b/pyeed/old/align/hmm.py similarity index 100% rename from pyeed/align/hmm.py rename to pyeed/old/align/hmm.py diff --git a/pyeed/align/msa.py b/pyeed/old/align/msa.py similarity index 100% rename from pyeed/align/msa.py rename to pyeed/old/align/msa.py diff --git a/pyeed/align/pairwise.py b/pyeed/old/align/pairwise.py similarity index 100% rename from pyeed/align/pairwise.py rename to pyeed/old/align/pairwise.py diff --git a/pyeed/cluster/__init__.py b/pyeed/old/cluster/__init__.py similarity index 100% rename from pyeed/cluster/__init__.py rename to pyeed/old/cluster/__init__.py diff --git a/pyeed/cluster/cluster.py b/pyeed/old/cluster/cluster.py similarity index 100% rename from pyeed/cluster/cluster.py rename to pyeed/old/cluster/cluster.py diff --git a/pyeed/cluster/mmseqs2.py b/pyeed/old/cluster/mmseqs2.py similarity index 100% rename from pyeed/cluster/mmseqs2.py rename to pyeed/old/cluster/mmseqs2.py diff --git a/pyeed/core/__init__.py b/pyeed/old/core/__init__.py similarity index 100% rename from pyeed/core/__init__.py rename to pyeed/old/core/__init__.py diff --git a/pyeed/core/abstractannotation.py b/pyeed/old/core/abstractannotation.py similarity index 100% rename from pyeed/core/abstractannotation.py rename to pyeed/old/core/abstractannotation.py diff --git a/pyeed/core/alignmentresult.py b/pyeed/old/core/alignmentresult.py similarity index 100% rename from pyeed/core/alignmentresult.py rename to pyeed/old/core/alignmentresult.py diff --git a/pyeed/core/annotation.py b/pyeed/old/core/annotation.py similarity index 100% rename from pyeed/core/annotation.py rename to pyeed/old/core/annotation.py diff --git a/pyeed/core/blastdata.py b/pyeed/old/core/blastdata.py similarity index 100% rename from pyeed/core/blastdata.py rename to pyeed/old/core/blastdata.py diff --git a/pyeed/core/clustalomegaresult.py b/pyeed/old/core/clustalomegaresult.py similarity index 100% rename from pyeed/core/clustalomegaresult.py rename to pyeed/old/core/clustalomegaresult.py diff --git a/pyeed/core/cluster.py b/pyeed/old/core/cluster.py similarity index 100% rename from pyeed/core/cluster.py rename to pyeed/old/core/cluster.py diff --git a/pyeed/core/dnarecord.py b/pyeed/old/core/dnarecord.py similarity index 100% rename from pyeed/core/dnarecord.py rename to pyeed/old/core/dnarecord.py diff --git a/pyeed/core/numberedsequence.py b/pyeed/old/core/numberedsequence.py similarity index 100% rename from pyeed/core/numberedsequence.py rename to pyeed/old/core/numberedsequence.py diff --git a/pyeed/core/ontology.py b/pyeed/old/core/ontology.py similarity index 100% rename from pyeed/core/ontology.py rename to pyeed/old/core/ontology.py diff --git a/pyeed/core/organism.py b/pyeed/old/core/organism.py similarity index 100% rename from pyeed/core/organism.py rename to pyeed/old/core/organism.py diff --git a/pyeed/core/pairwisealignmentresult.py b/pyeed/old/core/pairwisealignmentresult.py similarity index 100% rename from pyeed/core/pairwisealignmentresult.py rename to pyeed/old/core/pairwisealignmentresult.py diff --git a/pyeed/core/proteinrecord.py b/pyeed/old/core/proteinrecord.py similarity index 100% rename from pyeed/core/proteinrecord.py rename to pyeed/old/core/proteinrecord.py diff --git a/pyeed/core/region.py b/pyeed/old/core/region.py similarity index 100% rename from pyeed/core/region.py rename to pyeed/old/core/region.py diff --git a/pyeed/core/regionset.py b/pyeed/old/core/regionset.py similarity index 100% rename from pyeed/core/regionset.py rename to pyeed/old/core/regionset.py diff --git a/pyeed/core/sequence.py b/pyeed/old/core/sequence.py similarity index 100% rename from pyeed/core/sequence.py rename to pyeed/old/core/sequence.py diff --git a/pyeed/core/sequencerecord.py b/pyeed/old/core/sequencerecord.py similarity index 100% rename from pyeed/core/sequencerecord.py rename to pyeed/old/core/sequencerecord.py diff --git a/pyeed/core/sequencetype.py b/pyeed/old/core/sequencetype.py similarity index 100% rename from pyeed/core/sequencetype.py rename to pyeed/old/core/sequencetype.py diff --git a/pyeed/core/site.py b/pyeed/old/core/site.py similarity index 100% rename from pyeed/core/site.py rename to pyeed/old/core/site.py diff --git a/pyeed/core/standardnumbering.py b/pyeed/old/core/standardnumbering.py similarity index 100% rename from pyeed/core/standardnumbering.py rename to pyeed/old/core/standardnumbering.py diff --git a/pyeed/old/fetch/__init__.py b/pyeed/old/fetch/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pyeed/fetch/blast.py b/pyeed/old/fetch/blast.py similarity index 100% rename from pyeed/fetch/blast.py rename to pyeed/old/fetch/blast.py diff --git a/pyeed/fetch/dbsort.py b/pyeed/old/fetch/dbsort.py similarity index 100% rename from pyeed/fetch/dbsort.py rename to pyeed/old/fetch/dbsort.py diff --git a/pyeed/fetch/dnafetcher.py b/pyeed/old/fetch/dnafetcher.py similarity index 97% rename from pyeed/fetch/dnafetcher.py rename to pyeed/old/fetch/dnafetcher.py index 92674449..fca3beeb 100644 --- a/pyeed/fetch/dnafetcher.py +++ b/pyeed/old/fetch/dnafetcher.py @@ -6,7 +6,7 @@ from rich.console import Console from rich.progress import Progress -from pyeed.fetch.requester import AsyncRequester +from pyeed.adapter.primary_db_adapter import AsyncRequester from .ncbidnamapper import NCBIDNAMapper diff --git a/pyeed/fetch/ncbidnamapper.py b/pyeed/old/fetch/ncbidnamapper.py similarity index 100% rename from pyeed/fetch/ncbidnamapper.py rename to pyeed/old/fetch/ncbidnamapper.py diff --git a/pyeed/fetch/ncbiproteinmapper.py b/pyeed/old/fetch/ncbiproteinmapper.py similarity index 100% rename from pyeed/fetch/ncbiproteinmapper.py rename to pyeed/old/fetch/ncbiproteinmapper.py diff --git a/pyeed/fetch/pdbmapper.py b/pyeed/old/fetch/pdbmapper.py similarity index 100% rename from pyeed/fetch/pdbmapper.py rename to pyeed/old/fetch/pdbmapper.py diff --git a/pyeed/fetch/proteinfetcher.py b/pyeed/old/fetch/proteinfetcher.py similarity index 99% rename from pyeed/fetch/proteinfetcher.py rename to pyeed/old/fetch/proteinfetcher.py index d9b034b3..8d79c97c 100644 --- a/pyeed/fetch/proteinfetcher.py +++ b/pyeed/old/fetch/proteinfetcher.py @@ -7,11 +7,11 @@ from rich.console import Console from rich.progress import Progress +from pyeed.adapter.primary_db_adapter import AsyncParamRequester, AsyncRequester from pyeed.dbconnect import DatabaseConnector from pyeed.fetch.dbsort import DBPattern, SortIDs from pyeed.fetch.ncbiproteinmapper import NCBIProteinMapper from pyeed.fetch.pdbmapper import PDBMapper -from pyeed.fetch.requester import AsyncParamRequester, AsyncRequester from pyeed.fetch.taxonomymapper import TaxonomyMapper diff --git a/pyeed/fetch/requester.py b/pyeed/old/fetch/requester.py similarity index 98% rename from pyeed/fetch/requester.py rename to pyeed/old/fetch/requester.py index 0115a67a..eba8138e 100644 --- a/pyeed/fetch/requester.py +++ b/pyeed/old/fetch/requester.py @@ -12,7 +12,7 @@ from loguru import logger from rich.progress import Progress, TaskID -from pyeed.fetch.mapper import PrimaryDBtoPyeed +from pyeed.adapter.uniprot_mapper import PrimaryDBtoPyeed T = TypeVar("T") @@ -25,7 +25,7 @@ class RequestPayload(NamedTuple): params: dict[str, str] -class PrimaryDBRequester(Generic[T]): +class PrimaryDBAdapter(Generic[T]): """ Orchestrates the asynchronous HTTP GET requests to a primary sequence database. Mapper classes are injected to map the responses to the pyeed graph object model and diff --git a/pyeed/fetch/taxonomymapper.py b/pyeed/old/fetch/taxonomymapper.py similarity index 100% rename from pyeed/fetch/taxonomymapper.py rename to pyeed/old/fetch/taxonomymapper.py diff --git a/pyeed/network/__init__.py b/pyeed/old/network/__init__.py similarity index 100% rename from pyeed/network/__init__.py rename to pyeed/old/network/__init__.py diff --git a/pyeed/network/network.py b/pyeed/old/network/network.py similarity index 100% rename from pyeed/network/network.py rename to pyeed/old/network/network.py diff --git a/pyeed/schemes/DNARecord.json b/pyeed/old/schemes/DNARecord.json similarity index 100% rename from pyeed/schemes/DNARecord.json rename to pyeed/old/schemes/DNARecord.json diff --git a/pyeed/schemes/ProteinRecord.json b/pyeed/old/schemes/ProteinRecord.json similarity index 100% rename from pyeed/schemes/ProteinRecord.json rename to pyeed/old/schemes/ProteinRecord.json diff --git a/pyeed/schemes/proteinrecord.shex b/pyeed/old/schemes/proteinrecord.shex similarity index 100% rename from pyeed/schemes/proteinrecord.shex rename to pyeed/old/schemes/proteinrecord.shex diff --git a/pyeed/schemes/pyeed_schema.md b/pyeed/old/schemes/pyeed_schema.md similarity index 100% rename from pyeed/schemes/pyeed_schema.md rename to pyeed/old/schemes/pyeed_schema.md diff --git a/specifications/sequence_record.md b/pyeed/old/specifications/sequence_record.md similarity index 100% rename from specifications/sequence_record.md rename to pyeed/old/specifications/sequence_record.md diff --git a/pyeed/tools/__init__.py b/pyeed/old/tools/__init__.py similarity index 100% rename from pyeed/tools/__init__.py rename to pyeed/old/tools/__init__.py diff --git a/pyeed/tools/abstract_tool.py b/pyeed/old/tools/abstract_tool.py similarity index 100% rename from pyeed/tools/abstract_tool.py rename to pyeed/old/tools/abstract_tool.py diff --git a/pyeed/tools/clustalo.py b/pyeed/old/tools/clustalo.py similarity index 100% rename from pyeed/tools/clustalo.py rename to pyeed/old/tools/clustalo.py diff --git a/pyeed/main.py b/pyeed/pyeed.py similarity index 52% rename from pyeed/main.py rename to pyeed/pyeed.py index 4db83ef6..5971b610 100644 --- a/pyeed/main.py +++ b/pyeed/pyeed.py @@ -2,9 +2,9 @@ import nest_asyncio +from pyeed.adapter.primary_db_adapter import PrimaryDBAdapter +from pyeed.adapter.uniprot_mapper import UniprotToPyeed from pyeed.dbconnect import DatabaseConnector -from pyeed.fetch.mapper import UniprotToPyeed -from pyeed.fetch.requester import PrimaryDBRequester class Pyeed: @@ -30,25 +30,53 @@ def fetch_from_primary_db(self, ids: list[str]): "format": "json", } - requester = PrimaryDBRequester( + adapter = PrimaryDBAdapter( ids=ids, ids_attr_name="accession", url="https://www.ebi.ac.uk/proteins/api/proteins", rate_limit=10, n_concurrent=5, - batch_size=1, + batch_size=5, data_mapper=UniprotToPyeed(), progress=None, task_id=None, request_params=params_template, ) - asyncio.run(requester.make_request()) + asyncio.run(adapter.make_request()) if __name__ == "__main__": eedb = Pyeed("bolt://127.0.0.1:7687") - eedb.db._wipe_database() - eedb.fetch_from_primary_db(["P12345", "P67890", "P05062"]) + search = False + if search: + eedb.db._wipe_database() + + eedb.fetch_from_primary_db( + [ + "P04182", + "Q6QDP7", + "P04182", + "P29758", + "A0A851UXD9", + "A0A8C6HVU6", + "A0A8C6GQ10", + "A0A1U7QEB0", + "A0A6I9L5L6", + "G3HVE0", + "A0A8J6G992", + "A0A8C6W4W5", + "A0A8B9YUY7", + "L8I4V3", + "A0A6P3IYQ1", + "A0A452EKJ3", + "A0A6P5B7Q0", + "F1MYG0", + "A0A5J5MK22", + "A0A6J0Y425", + "Q3ZCF5", + ] + ) + print(eedb.db.stats()) diff --git a/pyeed/templates/protein_cypher_template.j2 b/pyeed/templates/protein_cypher_template.j2 deleted file mode 100644 index df2c2cde..00000000 --- a/pyeed/templates/protein_cypher_template.j2 +++ /dev/null @@ -1,88 +0,0 @@ -{% set protein_node_label = "ProteinRecord" %} -{% set protein_identifier = protein_data.get('id', 'protein_id') %} - -MERGE (protein:{{ protein_node_label }} {id: '{{ protein_identifier }}' }) - -{% if protein_data.get('name') %} -SET protein.name = '{{ protein_data['name'] | replace("'", "\\'") }}' -{% endif %} - -{% if protein_data.get('sequence') %} -SET protein.sequence = '{{ protein_data['sequence'] | replace("'", "\\'") }}' -{% endif %} - -{% if protein_data.get('seq_length') is not none %} -SET protein.seq_length = {{ protein_data['seq_length'] }} -{% endif %} - -{% if protein_data.get('ec_number') %} -SET protein.ec_number = '{{ protein_data['ec_number'] | replace("'", "\\'") }}' -{% endif %} - -{% if protein_data.get('mol_weight') is not none %} -SET protein.mol_weight = {{ protein_data['mol_weight'] }} -{% endif %} - -{% if protein_data.get('embedding') %} -SET protein.embedding = {{ protein_data['embedding'] }} -{% endif %} - -{% if protein_data.get('structure_ids') %} -{% for structure_id in protein_data['structure_ids'] %} -MERGE (structure_{{ loop.index }}:Structure {id: '{{ structure_id | replace("'", "\\'") }}' }) -MERGE (protein)-[:HAS_STRUCTURE]->(structure_{{ loop.index }}) -{% endfor %} -{% endif %} - -{% if protein_data.get('organism') %} -MERGE (organism:Organism {taxonomy_id: {{ protein_data['organism'].get('taxonomy_id', 0) }}, name: '{{ protein_data['organism'].get('name', '') | replace("'", "\\'") }}' }) -MERGE (protein)-[:ORIGINATES_FROM]->(organism) -{% for key, value in protein_data['organism'].items() %} -{% if value %} -SET organism.{{ key }} = '{{ value | replace("'", "\\'") }}' -{% endif %} -{% endfor %} -{% endif %} - -{% if protein_data.get('sites') %} -{% for site in protein_data['sites'] %} -MERGE (site_{{ loop.index }}:Site {id: '{{ protein_identifier }}_site_{{ loop.index }}' }) -SET site_{{ loop.index }}.positions = {{ site.get('positions', []) }} -MERGE (protein)-[:HAS_SITE]->(site_{{ loop.index }}) -{% endfor %} -{% endif %} - -{% if protein_data.get('regions') %} -{% for region in protein_data['regions'] %} -MERGE (region_{{ loop.index }}:Region {id: '{{ protein_identifier }}_region_{{ loop.index }}' }) -SET region_{{ loop.index }}.start = {{ region.get('start', 0) }}, - region_{{ loop.index }}.end = {{ region.get('end', 0) }} -MERGE (protein)-[:HAS_REGION]->(region_{{ loop.index }}) -{% endfor %} -{% endif %} - -{% if protein_data.get('region_sets') %} -{% for region_set in protein_data['region_sets'] %} -MERGE (region_set_{{ loop.index }}:RegionSet {id: '{{ protein_identifier }}_regionset_{{ loop.index }}' }) -{% if region_set.get('regions') %} -{% for region in region_set['regions'] %} -MERGE (region_{{ loop.index }}_{{ loop.index0 }}:Region {id: '{{ protein_identifier }}_region_{{ loop.index }}_{{ loop.index0 }}' }) -SET region_{{ loop.index }}_{{ loop.index0 }}.start = {{ region.get('start', 0) }}, - region_{{ loop.index }}_{{ loop.index0 }}.end = {{ region.get('end', 0) }} -MERGE (region_set_{{ loop.index }})-[:CONTAINS_REGION]->(region_{{ loop.index }}_{{ loop.index0 }}) -{% endfor %} -{% endif %} -MERGE (protein)-[:HAS_REGION_SET]->(region_set_{{ loop.index }}) -{% endfor %} -{% endif %} - -{% if protein_data.get('coding_sequence') %} -{% for region in protein_data['coding_sequence'] %} -MERGE (coding_region_{{ loop.index }}:Region {id: '{{ protein_identifier }}_coding_sequence_{{ loop.index }}' }) -SET coding_region_{{ loop.index }}.start = {{ region.get('start', 0) }}, - coding_region_{{ loop.index }}.end = {{ region.get('end', 0) }} -MERGE (protein)-[:HAS_CODING_SEQUENCE]->(coding_region_{{ loop.index }}) -{% endfor %} -{% endif %} - -RETURN protein diff --git a/pyproject.toml b/pyproject.toml index e35ede47..7228765a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pyeed" -version = "0.3.7" +version = "0.4.0" description = "Toolkit to create, annotate, and analyze sequence data" authors = ["haeussma <83341109+haeussma@users.noreply.github.com>"] license = "MIT" @@ -12,7 +12,6 @@ python = ">=3.10,<3.13" biopython = ">=1.81,<1.84" networkx = "^3.2.1" plotly = "^5.18.0" -nbformat = "^5.9.2" scipy = "^1.11.3" pyhmmer = "^0.10.11" httpx = "^0.27.0" @@ -23,12 +22,12 @@ joblib = "^1.4.0" requests = "^2.31.0" matplotlib = "^3.9.0" pymsaviz = "^0.4.2" -py4cytoscape = "^1.9.0" tenacity = "^8.3.0" neo4j = "5.19.*" bio = "^1.7.1" loguru = "^0.7.2" neomodel = "^5.3.3" +shapely = "^2.0.6" [tool.poetry.group.dev.dependencies] mkdocs-material = "^9.5.9" @@ -39,8 +38,6 @@ ruff = "^0.4.1" mkdocs-jupyter = "^0.24.7" jupyter-contrib-nbextensions = "^0.7.0" notebook = "^7.1.3" -ipython = "^8.24.0" -nbconvert = "^7.16.4" [build-system] requires = ["poetry-core"] diff --git a/seq.json b/seq.json deleted file mode 100644 index 3094b0e1..00000000 --- a/seq.json +++ /dev/null @@ -1 +0,0 @@ -{"id":"P12345","sequence":"MALLHSARVLSGVASAFHPGLAAAASARASSWWAHVEMGPPDPILGVTEAYKRDTNSKKMNLGVGAYRDDNGKPYVLPSVRKAEAQIAAKGLDKEYLPIGGLAEFCRASAELALGENSEVVKSGRFVTVQTISGTGALRIGASFLQRFFKFSRDVFLPKPSWGNHTPIFRDAGMQLQSYRYYDPKTCGFDFTGALEDISKIPEQSVLLLHACAHNPTGVDPRPEQWKEIATVVKKRNLFAFFDMAYQGFASGDGDKDAWAVRHFIEQGINVCLCQSYAKNMGLYGERVGAFTVICKDADEAKRVESQLKILIRPMYSNPPIHGARIASTILTSPDLRKQWLQEVKGMADRIIGMRTQLVSNLKKEGSTHSWQHITDQIGMFCFTGLKPEQVERLTKEFSIYMTKDGRISVAGVTSGNVGYLAHAIHQVTK","name":"Aspartate aminotransferase, mitochondrial","organism":{"taxonomy_id":9986,"name":null,"domain":null,"kingdom":null,"phylum":null,"tax_class":null,"order":null,"family":null,"genus":null,"species":null,"ld_id":"md:Organism/68d3aa08-ecd9-48ad-ae1e-ce723e1dddc1","ld_type":["md:Organism"],"ld_context":{"md":"http://mdmodel.net/","sio":"http://semanticscience.org/resource/","edam":"http://edamontology.org/","taxonomy_id":"edam:data_1179","name":"edam:data_2909","kingdom":"edam:data_1044","family":"edam:data_2732","genus":"edam:data_1870","species":"edam:data_1045"}},"embedding":[],"seq_length":null,"nucleotide_id":null,"locus_tag":null,"sites":[{"annotation":"http://semanticscience.org/resource/SIO_010040","name":"substrate","positions":[65,162,215,407],"ld_id":"md:Site/e61c1393-ed04-4133-9218-54d85586ab97","ld_type":["md:Site","sio:sio:010049"],"ld_context":{"md":"http://mdmodel.net/","sio":"http://semanticscience.org/resource/","edam":"http://edamontology.org/","positions":"sio:SIO_000056"}}],"regions":[],"structure_ids":[],"ec_number":"2.6.1.1","mol_weight":47409.0,"annotations":[],"go_terms":[],"ld_id":"md:ProteinRecord/e7b867b8-a335-4c93-a5ef-1133a9a66514","ld_type":["md:ProteinRecord","sio:SIO_010043"],"ld_context":{"md":"http://mdmodel.net/","sio":"http://semanticscience.org/resource/","edam":"http://edamontology.org/","id":"sio:SIO_000729","sequence":"sio:SIO_000030","name":"sio:SIO_000116","organism":"sio:SIO_010000","seq_length":"sio:SIO_000041","structure_ids":"sio:SIO_000729","ec_number":"edam:data_1011","mol_weight":"edam:data_1505"}} \ No newline at end of file diff --git a/test.ipynb b/test.ipynb index 481cdf17..103198c6 100644 --- a/test.ipynb +++ b/test.ipynb @@ -2,38 +2,37 @@ "cells": [ { "cell_type": "code", - "execution_count": 8, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "%reload_ext autoreload\n", - "%autoreload 2\n", - "from pyeed.main import Pyeed\n", - "from pyeed.model import Organism, Protein" + "from pyeed import Pyeed\n", + "from pyeed.model import GOAnnotation, Protein" ] }, { - "cell_type": "code", - "execution_count": 11, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "pr = Protein(accession_id=\"P0A8V2\", sequence=\"MKKTAIAIAVALAGFATVAQA\")\n", - "pr.mol_weight = 2.22\n", + "# Create Neo4j DB\n", "\n", + "Create local Neo4j DB without authentication.\n", + "Graph data science plugin is not installed.\n", "\n", - "pr" + "```bash\n", + "docker run -it --name pyeed-neo4j \\\n", + " -p 7474:7474 \\--user=\"$(id -u):$(id -g)\" \\\n", + " -e NEO4J_AUTH=none \\\n", + " -p 7687:7687 \\\n", + " -v $HOME/Documents/db/data:/data \\\n", + " -v $HOME/Documents/db/logs:/logs \\\n", + " -v $HOME/Documents/db/import:/var/lib/neo4j/import \\\n", + " -v $HOME/Documents/db/plugins:/plugins \\\n", + " -e NEO4J_AUTH=neo4j/test \\\n", + " -e NEO4JLABS_PLUGINS='[\"apoc\"]' \\\n", + " -e NEO4J_dbms_security_procedures_unrestricted=\"apoc.*\" \\\n", + " -d neo4j:latest\n", + "```" ] }, { @@ -42,7 +41,8 @@ "source": [ "# Connect to DB\n", "\n", - "Neo4j DB is hosted locally via Docker." + "Neo4j DB is hosted locally via Docker.\n", + "Also possible to use free hosted Neo4j Sandbox (not tested)." ] }, { @@ -54,37 +54,149 @@ "name": "stdout", "output_type": "stream", "text": [ - "📡 Connected to database.\n" + "📡 Connected to database.\n", + "All data has been wiped from the database.\n", + "\n" ] } ], "source": [ "uri = \"bolt://127.0.0.1:7687\"\n", - "eedb = Pyeed(uri)" + "user = None\n", + "password = None\n", + "\n", + "# Create a Pyeed object, automatically connecting to the database\n", + "eedb = Pyeed(uri)\n", + "eedb.db._wipe_database()\n", + "\n", + "# DB connector is a property of the Pyeed object\n", + "print(eedb.db)\n", + "\n", + "# If this is the first time you are running this script, the pyeed graph model needs to be initialized\n", + "first_time = False\n", + "if first_time:\n", + " eedb.db._initialize_db_constraints(user=user, password=password)" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[32m2024-10-14 00:25:34.005\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36mmake_request\u001b[0m:\u001b[36m142\u001b[0m - \u001b[34m\u001b[1mSending 5 requests in batches of 5\u001b[0m\n", + "\u001b[32m2024-10-14 00:25:34.009\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'P04182,Q6QDP7,P04182,P29758,A0A851UXD9'}\u001b[0m\n", + "\u001b[32m2024-10-14 00:25:34.109\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'A0A8C6HVU6,A0A8C6GQ10,A0A1U7QEB0,A0A6I9L5L6,G3HVE0'}\u001b[0m\n", + "\u001b[32m2024-10-14 00:25:34.209\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'A0A8J6G992,A0A8C6W4W5,A0A8B9YUY7,L8I4V3,A0A6P3IYQ1'}\u001b[0m\n", + "\u001b[32m2024-10-14 00:25:34.453\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'A0A452EKJ3,A0A6P5B7Q0,F1MYG0,A0A5J5MK22,A0A6J0Y425'}\u001b[0m\n", + "\u001b[32m2024-10-14 00:25:34.553\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'Q3ZCF5'}\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'nodes': 69, 'relationships': 179}\n" + ] + } + ], + "source": [ + "ids = [\n", + " \"P04182\",\n", + " \"Q6QDP7\",\n", + " \"P04182\",\n", + " \"P29758\",\n", + " \"A0A851UXD9\",\n", + " \"A0A8C6HVU6\",\n", + " \"A0A8C6GQ10\",\n", + " \"A0A1U7QEB0\",\n", + " \"A0A6I9L5L6\",\n", + " \"G3HVE0\",\n", + " \"A0A8J6G992\",\n", + " \"A0A8C6W4W5\",\n", + " \"A0A8B9YUY7\",\n", + " \"L8I4V3\",\n", + " \"A0A6P3IYQ1\",\n", + " \"A0A452EKJ3\",\n", + " \"A0A6P5B7Q0\",\n", + " \"F1MYG0\",\n", + " \"A0A5J5MK22\",\n", + " \"A0A6J0Y425\",\n", + " \"Q3ZCF5\",\n", + "]\n", + "\n", + "# Fetch proteins from primary database\n", + "eedb.fetch_from_primary_db(ids)\n", + "\n", + "# number of nodes and edges in db\n", + "print(eedb.db.stats())" + ] + }, + { + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "ids = [\"P12345\", \"P67890\", \"P54321\"]" + "To use the web interface, open a browser and go to `http://localhost:7474/`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Query DB" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of proteins in database: 20\n", + "{'accession_id': 'P04182', 'sequence': 'MLSKLASLQTVAALRRGLRTSVASATSVATKKTEQGPPSSEYIFERESKYGAHNYHPLPVALERGKGIYMWDVEGRQYFDFLSAYGAVSQGHCHPKIIEAMKSQVDKLTLTSRAFYNNVLGEYEEYITKLFNYNKVLPMNTGVEAGETACKLARRWGYTVKGIQKYKAKIVFAVGNFWGRTLSAVSSSTDPTSYDGFGPFMPGFETIPYNDLPALERALQDPNVAAFMVEPIQGEAGVIVPDPGYLTGVRELCTRHQVLFIADEIQTGLARTGRWLAVDHENVRPDIVLLGKALSGGLYPVSAVLCDDDIMLTIKPGEHGSTYGGNPLGCRIAIAALEVLEEEHLAENADKMGAILRKELMKLPSDVVTAVRGKGLLNAIVIRETKDCDAWKVCLRLRDNGLLAKPTHGDIIRLAPPLVIKEDEIRESVEIINKTILSF', 'name': 'Ornithine aminotransferase, mitochondrial', 'seq_length': 439, 'mol_weight': 48333.0, 'ec_number': '2.6.1.13', 'nucleotide_id': None, 'locus_tag': None, 'structure_ids': None, 'go_terms': None, 'embedding': None, 'element_id_property': '4:2dbbe7d3-51e1-4903-a514-4dd4aed7696d:203'}\n", + "Number of proteins associated with GO:0005739: 11\n", + "Number of organisms with at least two proteins: 3\n" + ] + } + ], "source": [ - "taxonomy_id = 222222\n", - "organism = Organism.get_or_create(taxonomy_id=taxonomy_id)" + "## Query using pyeed graph objects\n", + "# Get all proteins\n", + "proteins = Protein.nodes.all()\n", + "print(\"Number of proteins in database: \", len(proteins))\n", + "\n", + "# Get protein with id P04182\n", + "protein = Protein.nodes.get(accession_id=\"P04182\")\n", + "print(protein)\n", + "\n", + "# Get all protein which are accociated with GO term GO:0005739 (mitochondrion)\n", + "go_annotation = GOAnnotation.nodes.get(go_id=\"GO:0005739\")\n", + "mito_proteins = protein.go_annotation.all()\n", + "print(\"Number of proteins associated with GO:0005739: \", len(mito_proteins))\n", + "\n", + "\n", + "## Or execute cypher query\n", + "# Get all organisms that have at least two connected proteins\n", + "query = \"\"\"\n", + "MATCH (o:Organism)<-[:ORIGINATES_FROM]-(p:Protein)\n", + "WITH o, COUNT(p) AS proteinCount\n", + "WHERE proteinCount >= 2\n", + "RETURN o\n", + "\"\"\"\n", + "\n", + "organisms = eedb.db.execute_read(query)\n", + "print(\"Number of organisms with at least two proteins: \", len(organisms))" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -96,6 +208,7 @@ } ], "source": [ + "# close connection\n", "eedb.db.close()" ] }