diff --git a/docs/examples/basics.ipynb b/docs/examples/basics.ipynb index 8f45ba36..3154ebc8 100644 --- a/docs/examples/basics.ipynb +++ b/docs/examples/basics.ipynb @@ -37,13 +37,13 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "5352b6f5fa4c46119cd65bfc4de44c0d", + "model_id": "68d545568bf04841be62573b15776e10", "version_major": 2, "version_minor": 0 }, @@ -106,7 +106,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -122,7 +122,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f759e3211ef44d2b81a0f820d2f49e2d", + "model_id": "93adf4bcad844963b3815a8288e2083b", "version_major": 2, "version_minor": 0 }, @@ -178,7 +178,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -186,28 +186,41 @@ "output_type": "stream", "text": [ "\u001b[4mProteinRecord\u001b[0m\n", - "├── \u001b[94mid\u001b[0m = WP_068323110.1\n", - "├── \u001b[94mname\u001b[0m = methionine adenosyltransferase\n", + "├── \u001b[94mid\u001b[0m = Q9YBK2\n", + "├── \u001b[94mname\u001b[0m = S-adenosylmethionine synthase\n", "├── \u001b[94morganism\u001b[0m\n", "│ └── \u001b[4mOrganism\u001b[0m\n", - "│ ├── \u001b[94mid\u001b[0m = 95652fe4-fb9a-40e7-b98a-1dfad46d8e56\n", - "│ ├── \u001b[94mtaxonomy_id\u001b[0m = 1609559\n", - "│ ├── \u001b[94mname\u001b[0m = Pyrococcus kukulkanii\n", + "│ ├── \u001b[94mid\u001b[0m = cb2acb39-692e-4424-a4ed-1b97f2351a83\n", + "│ ├── \u001b[94mtaxonomy_id\u001b[0m = 272557\n", + "│ ├── \u001b[94mname\u001b[0m = Aeropyrum pernix K1\n", "│ ├── \u001b[94mdomain\u001b[0m = Archaea\n", - "│ ├── \u001b[94mphylum\u001b[0m = Euryarchaeota\n", - "│ ├── \u001b[94mtax_class\u001b[0m = Thermococci\n", - "│ ├── \u001b[94morder\u001b[0m = Thermococcales\n", - "│ ├── \u001b[94mfamily\u001b[0m = 
Thermococcaceae\n", - "│ └── \u001b[94mgenus\u001b[0m = Pyrococcus\n", - "├── \u001b[94msequence\u001b[0m = MARNIVVEEIVRTPVEMQKVELVERKGIGHPDSIADGIAEAVSRALCREYIKRYGVILHHNTDQVEVVGGRAYPKFGGGEVVKPIYILLSGRAVELVDQELFPVHEVAIRAAKEYLKKNIRHLDVENHVVIDSRIGQGSVDLVSVFNKAKENPIPLANDTSFGVGFAPLTETERLVLETERLLNSEKFKKEYPAVGEDIKVMGLRKGDEIDLTIAAAIVDSEVANPKEYMEVKDKIKETVEELAKDITSRKVNIYVNTADDPKKDIYYITVTGTSAEAGDDGSVGRGNRVNGLITPNRHMSMEAAAGKNPVSHVGKIYNILAMFIANDIAKALPVEEVYVRILSQIGKPIDQPLVASIQVIPKQGHTVKEFEKDAYAIADEWLANITKIQKMILEDKITVF\n", + "│ ├── \u001b[94mphylum\u001b[0m = Thermoproteota\n", + "│ ├── \u001b[94mtax_class\u001b[0m = Thermoprotei\n", + "│ ├── \u001b[94morder\u001b[0m = Desulfurococcales\n", + "│ ├── \u001b[94mfamily\u001b[0m = Desulfurococcaceae\n", + "│ └── \u001b[94mgenus\u001b[0m = Aeropyrum\n", + "├── \u001b[94msequence\u001b[0m = MARRIVVESYPYPRVEDLQVELVERKGLGHPDTICDAAAEAVSRELSKYYLERFGKILHHNVDKVLLVGGQAAPRLGGGEVLQPIYILVSGRVTTEVRTGGGVESVPVGPIILRAVKNYIRENFRFLDPEEHVIVDYRVGRGSVDLVGIFEAEDKVPLANDTSIGSGHAPLSTLERLVLETERILNSRETKERLPAVGEDVKVMGVRDGKSITLTVAMAVVSSQVGSVSDYLAVKEEAESLILDLASRIAPDYDVRVNINTGDIPEKKILYLTVTGTSAEHGDDGATGRGNRVNGLITPMRPMSMEAAAGKNPVNHVGKIYNVVANEMAALIHREVKGVEEVYVKLVSQIGKPIDRPRIVDVKVRMEGGREVTADAKREIEAIANSVLDGITGYTEKLVRGDITVY\n", "├── \u001b[94mregions\u001b[0m\n", - "│ └── 0\n", + "│ ├── 0\n", + "│ │ └── \u001b[4mRegion\u001b[0m\n", + "│ │ ├── \u001b[94mid\u001b[0m = 0f3c9ab0-dde3-4cb1-aa12-c25c5fbb514c\n", + "│ │ ├── \u001b[94mname\u001b[0m = S-adenosylmethionine synthetase, archaea\n", + "│ │ ├── \u001b[94mstart\u001b[0m = 2\n", + "│ │ └── \u001b[94mend\u001b[0m = 406\n", + "│ ├── 1\n", + "│ │ └── \u001b[4mRegion\u001b[0m\n", + "│ │ ├── \u001b[94mid\u001b[0m = 4d0151b8-22bb-4695-ac0b-f70f0e7bb7c9\n", + "│ │ ├── \u001b[94mname\u001b[0m = S-adenosylmethionine synthase\n", + "│ │ ├── \u001b[94mstart\u001b[0m = 3\n", + "│ │ └── \u001b[94mend\u001b[0m = 406\n", + "│ └── 2\n", "│ └── \u001b[4mRegion\u001b[0m\n", - "│ ├── \u001b[94mid\u001b[0m = MetK2\n", - "│ ├── 
\u001b[94mstart\u001b[0m = 2\n", - "│ └── \u001b[94mend\u001b[0m = 401\n", + "│ ├── \u001b[94mid\u001b[0m = a9a6dce1-248d-4b51-bb37-69a6e1b1ed52\n", + "│ ├── \u001b[94mname\u001b[0m = S-adenosylmethionine synthetase, domain 3\n", + "│ ├── \u001b[94mstart\u001b[0m = 144\n", + "│ └── \u001b[94mend\u001b[0m = 248\n", "├── \u001b[94mec_number\u001b[0m = 2.5.1.6\n", - "└── \u001b[94mmol_weight\u001b[0m = 44273.0\n", + "└── \u001b[94mmol_weight\u001b[0m = 44235.0\n", "\n" ] } @@ -215,6 +228,186 @@ "source": [ "print(blast_results[3])" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "💾 Sequence saved to protein.fasta\n" + ] + } + ], + "source": [ + "from pyeed.core import ProteinRecord\n", + "\n", + "protein = ProteinRecord(name=\"test_protein\", sequence=\"MTEITAAMVKELREDKAVQLLREKGLGK\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "from neo4j import GraphDatabase\n", + "\n", + "# import environment variables\n", + "import os\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()\n", + "\n", + "\n", + "# URI examples: \"neo4j://localhost\", \"neo4j+s://xxx.databases.neo4j.io\"\n", + "URI = \"neo4j+s://ecd986f5.databases.neo4j.io\"\n", + "AUTH = (os.getenv(\"NEO4J_USER\"), os.getenv(\"NEO4J_PASSWORD\"))\n", + "\n", + "with GraphDatabase.driver(URI, auth=AUTH) as driver:\n", + " driver.verify_connectivity()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "def run_query(query):\n", + " with driver.session() as session:\n", + " result = session.run(query)\n", + " records = list(result) # Fetch all records within 
the session context\n", + " return records" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GO terms and their relationships have been created.\n" + ] + } + ], + "source": [ + "# Sample GO terms data\n", + "go_terms = [\n", + " {\n", + " \"id\": \"GO:0008150\",\n", + " \"name\": \"biological_process\",\n", + " \"namespace\": \"biological_process\",\n", + " },\n", + " {\n", + " \"id\": \"GO:0003674\",\n", + " \"name\": \"molecular_function\",\n", + " \"namespace\": \"molecular_function\",\n", + " },\n", + " {\n", + " \"id\": \"GO:0005575\",\n", + " \"name\": \"cellular_component\",\n", + " \"namespace\": \"cellular_component\",\n", + " },\n", + " {\"id\": \"GO:0007049\", \"name\": \"cell cycle\", \"namespace\": \"biological_process\"},\n", + " {\"id\": \"GO:0009987\", \"name\": \"cellular process\", \"namespace\": \"biological_process\"},\n", + "]\n", + "\n", + "# Create GO Term nodes\n", + "for term in go_terms:\n", + " query = f\"\"\"\n", + " CREATE (g:GO_Term {{id: '{term['id']}', name: '{term['name']}', namespace: '{term['namespace']}'}})\n", + " \"\"\"\n", + " run_query(query)\n", + "\n", + "# Define relationships between GO terms\n", + "relationships = [\n", + " {\"parent_id\": \"GO:0008150\", \"child_id\": \"GO:0007049\", \"type\": \"IS_A\"},\n", + " {\"parent_id\": \"GO:0008150\", \"child_id\": \"GO:0009987\", \"type\": \"IS_A\"},\n", + "]\n", + "\n", + "# Create relationships\n", + "for rel in relationships:\n", + " query = f\"\"\"\n", + " MATCH (p:GO_Term {{id: '{rel['parent_id']}'}}), (c:GO_Term {{id: '{rel['child_id']}'}})\n", + " CREATE (c)-[:{rel['type']}]->(p)\n", + " \"\"\"\n", + " run_query(query)\n", + "\n", + "print(\"GO terms and their relationships have been created.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"Proteins and their relationships to GO terms have been created.\n" + ] + } + ], + "source": [ + "# Sample proteins data\n", + "proteins = [\n", + " {\"name\": \"BRCA1 Protein\", \"sequence\": \"M1...1863\", \"length\": 1863},\n", + " {\"name\": \"TP53 Protein\", \"sequence\": \"M1...393\", \"length\": 393},\n", + "]\n", + "\n", + "# Create Protein nodes\n", + "for protein in proteins:\n", + " query = f\"\"\"\n", + " CREATE (p:Protein {{name: '{protein['name']}', sequence: '{protein['sequence']}', length: {protein['length']}}})\n", + " \"\"\"\n", + " run_query(query)\n", + "\n", + "# Define relationships between proteins and GO terms\n", + "protein_go_relationships = [\n", + " {\n", + " \"protein_name\": \"BRCA1 Protein\",\n", + " \"go_id\": \"GO:0007049\",\n", + " \"relationship\": \"INVOLVED_IN\",\n", + " },\n", + " {\n", + " \"protein_name\": \"TP53 Protein\",\n", + " \"go_id\": \"GO:0009987\",\n", + " \"relationship\": \"INVOLVED_IN\",\n", + " },\n", + "]\n", + "\n", + "# Create relationships\n", + "for rel in protein_go_relationships:\n", + " query = f\"\"\"\n", + " MATCH (p:Protein {{name: '{rel['protein_name']}'}}), (g:GO_Term {{id: '{rel['go_id']}'}})\n", + " CREATE (p)-[:{rel['relationship']}]->(g)\n", + " \"\"\"\n", + " run_query(query)\n", + "\n", + "print(\"Proteins and their relationships to GO terms have been created.\")" + ] } ], "metadata": { diff --git a/docs/quick_start/basics.md b/docs/quick_start/basics.md index af116b0c..04552c92 100644 --- a/docs/quick_start/basics.md +++ b/docs/quick_start/basics.md @@ -7,17 +7,20 @@ A sequence object can be created by passing a sequence string to the constructor === "Protein" ``` py - from pyeed.core import ProteinInfo + from pyeed.core import ProteinRecord - protein = ProteinInfo(sequence="MTEITAAMVKELREDKAVQLLREKGLGK") + protein = ProteinRecord( + name="My Protein", + sequence="MTEITAAMVKELREDKAVQLLREKGLGK" + ) ``` === "DNA" ``` py - from pyeed.core import DNAInfo + from pyeed.core import DNARecord - dna = 
DNAInfo(sequence="ATGCGTACGTCGATCGATCGATCGATCGATCGATCGATCGATCGTAGTC") + dna = DNARecord(sequence="ATGCGTACGTCGATCGATCGATCGATCGATCGATCGATCGATCGTAGTC") ``` @@ -28,27 +31,13 @@ Besides adding sequence information manually, PyEED also allows searching for se === "Protein" ``` py - protein = ProteinInfo.get_id("UCS38941.1") + protein = ProteinRecord.get_id("UCS38941.1") ``` === "DNA" ``` py - dna = DNAInfo.get_id("NC_000913.3") - ``` - -Alternatively, the sequence can be initiated from a sequence string, triggering a BLAST search in the NCBI database. If the sequence is found, the sequence object is filled with the corresponding information. - -=== "Protein" - - ``` py - # Not yet implemented - ``` - -=== "DNA" - - ``` py - # Not yet implemented + # Not implemented ``` ## ⬇️ Save a sequence @@ -58,12 +47,6 @@ Alternatively, the sequence can be initiated from a sequence string, triggering The sequence can be stored in a `FASTA`, `JSON`, `YAML`, or `XML` file format. Therefore, the respective method can be used. The file path is passed as an argument to the method. -=== "FASTA" - - ``` py - protein.to_fasta("protein.fasta") - ``` - === "JSON" ``` py @@ -82,8 +65,14 @@ The file path is passed as an argument to the method. protein.to_xml("protein.xml") ``` +=== "FASTA" + + ``` py + protein.to_fasta("protein.fasta") + ``` + ### To database -Alternatively, sequence data can be stored in a `PostgreSQL` database. Therefore, the `to_db()` method can be used. +Alternatively, sequence data can be stored in a graph database. Therefore, the `to_db()` method can be used. ```py # Feature is currently implemented diff --git a/docs/quick_start/blast.md b/docs/quick_start/blast.md index 72fce36d..47ca8336 100644 --- a/docs/quick_start/blast.md +++ b/docs/quick_start/blast.md @@ -1,34 +1,23 @@ # Using BLAST ## Using NCBI BLAST -NCBI offers a web interface for blasting. With PyEED this can be programmatically accessed. 
A BLAST search can be initiated by calling the `ncbi_blast()` method on a `ProteinInfo` object. The method returns the found sequences as a list of `ProteinInfo` objects. +NCBI offers a web interface for blasting. With PyEED this can be programmatically accessed. A BLAST search can be initiated by calling the `ncbi_blast()` method on a `ProteinRecord` object. The method returns the found sequences as a list of `ProteinRecord` objects. As additional parameters, the `ncbi_blast()` method accepts the following arguments: + +- n_hits (int): The number of hits to return. +- e_value (float): The e-value threshold for the search. +- db (str): The database to search in. The default is `swissprot`. +- matrix (str): The matrix to use for the search. The default is `BLOSUM62`. +- identity (float): The minimum identity percentage for the search. The default is `0.0`. ``` py -from pyEED.core import ProteinInfo +from pyeed.core import ProteinRecord -# Create a ProteinInfo object +# Create a ProteinRecord object -protein = ProteinInfo.get_id("UCS38941.1") +protein = ProteinRecord.get_id("UCS38941.1") # Perform a BLAST search -blast_results = protein.ncbi_blastp() +blast_results = protein.ncbi_blast() ``` !!! info "NCBI BLAST performance" Due to server-side limitations of NCBI, BLAST searches might be slowed down or even be blocked, if multiple searches are performed in a short time. - - - - -## Using BLAST with a local database - -Building a local BLAST database is a good way to speed up BLAST searches. PyEED allows BLAST searches against local databases. The `blastp()` method can be called on a `ProteinInfo` object. The method returns the found sequences as a list of `ProteinInfo` objects. 
- -``` py - - blast_results = protein.blastp( - db_path="/PATH/TO/LOCAL/BLAST/DB", - n_hits=200, - e_value=0.001, - word_size=3, - ) -``` \ No newline at end of file diff --git a/docs/quick_start/index.md b/docs/quick_start/index.md index cf2a9da2..c79d637a 100644 --- a/docs/quick_start/index.md +++ b/docs/quick_start/index.md @@ -10,13 +10,7 @@ hide: - :material-walk: __[Basics]__ – How to work with sequence data - :octicons-search-16: __[Search Sequences]__ – How to search for individual sequences or search using BLAST -- :fontawesome-solid-align-justify: __[Alignments]__ – How to make different alignments -- :material-select-group: __[Clustering]__ – How to cluster sequences -- :material-graph-outline: __[Networks]__ – How to construct sequence networks [Basics]: basics.md [Search Sequences]: blast.md - [Alignments]: alignments.md - [Clustering]: clustering.md - [Networks]: networks.md \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index a1eb91e4..0ba50a6d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -14,9 +14,9 @@ nav: - quick_start/index.md - The Sequence objects: quick_start/basics.md - Finding Sequences: quick_start/blast.md - - Aligning Sequences: quick_start/alignments.md - - Clustering Sequences: quick_start/clustering.md - - Creating Sequence Networks: quick_start/networks.md + #- Aligning Sequences: quick_start/alignments.md + #- Clustering Sequences: quick_start/clustering.md + #- Creating Sequence Networks: quick_start/networks.md - ⬇️ Installation: - PyEED Docker Service: installation/docker.md - via PIP: installation/via_pip.md diff --git a/pyeed/core/abstractannotation.py b/pyeed/core/abstractannotation.py index 55e0a080..b318d275 100644 --- a/pyeed/core/abstractannotation.py +++ b/pyeed/core/abstractannotation.py @@ -49,7 +49,7 @@ class AbstractAnnotation( _repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed") _commit: Optional[str] = PrivateAttr( - default="c4dc30f3647be7da5ea591f8946893ffad69d647" + 
default="ad73396d3a347dd8d413a3cbe77883edb2777380" ) _raw_xml_data: Dict = PrivateAttr(default_factory=dict) diff --git a/pyeed/core/alignmentresult.py b/pyeed/core/alignmentresult.py index f1a07feb..6249d23a 100644 --- a/pyeed/core/alignmentresult.py +++ b/pyeed/core/alignmentresult.py @@ -61,7 +61,7 @@ class AlignmentResult( _repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed") _commit: Optional[str] = PrivateAttr( - default="c4dc30f3647be7da5ea591f8946893ffad69d647" + default="ad73396d3a347dd8d413a3cbe77883edb2777380" ) _raw_xml_data: Dict = PrivateAttr(default_factory=dict) diff --git a/pyeed/core/blastdata.py b/pyeed/core/blastdata.py index c6d36ab0..76adc572 100644 --- a/pyeed/core/blastdata.py +++ b/pyeed/core/blastdata.py @@ -90,7 +90,7 @@ class BlastData( _repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed") _commit: Optional[str] = PrivateAttr( - default="c4dc30f3647be7da5ea591f8946893ffad69d647" + default="ad73396d3a347dd8d413a3cbe77883edb2777380" ) _raw_xml_data: Dict = PrivateAttr(default_factory=dict) diff --git a/pyeed/core/clustalomegaresult.py b/pyeed/core/clustalomegaresult.py index b66d5686..fda99d2e 100644 --- a/pyeed/core/clustalomegaresult.py +++ b/pyeed/core/clustalomegaresult.py @@ -32,7 +32,7 @@ class ClustalOmegaResult( _repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed") _commit: Optional[str] = PrivateAttr( - default="c4dc30f3647be7da5ea591f8946893ffad69d647" + default="ad73396d3a347dd8d413a3cbe77883edb2777380" ) _raw_xml_data: Dict = PrivateAttr(default_factory=dict) diff --git a/pyeed/core/dnarecord.py b/pyeed/core/dnarecord.py index 11443801..777f5596 100644 --- a/pyeed/core/dnarecord.py +++ b/pyeed/core/dnarecord.py @@ -32,7 +32,7 @@ class DNARecord( _repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed") _commit: Optional[str] = PrivateAttr( - default="c4dc30f3647be7da5ea591f8946893ffad69d647" + 
default="ad73396d3a347dd8d413a3cbe77883edb2777380" ) _raw_xml_data: Dict = PrivateAttr(default_factory=dict) diff --git a/pyeed/core/numberedsequence.py b/pyeed/core/numberedsequence.py index ab9ff028..b76aa8c9 100644 --- a/pyeed/core/numberedsequence.py +++ b/pyeed/core/numberedsequence.py @@ -40,7 +40,7 @@ class NumberedSequence( _repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed") _commit: Optional[str] = PrivateAttr( - default="c4dc30f3647be7da5ea591f8946893ffad69d647" + default="ad73396d3a347dd8d413a3cbe77883edb2777380" ) _raw_xml_data: Dict = PrivateAttr(default_factory=dict) diff --git a/pyeed/core/organism.py b/pyeed/core/organism.py index b3ffff74..78cb1158 100644 --- a/pyeed/core/organism.py +++ b/pyeed/core/organism.py @@ -108,7 +108,7 @@ class Organism( _repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed") _commit: Optional[str] = PrivateAttr( - default="c4dc30f3647be7da5ea591f8946893ffad69d647" + default="ad73396d3a347dd8d413a3cbe77883edb2777380" ) _raw_xml_data: Dict = PrivateAttr(default_factory=dict) diff --git a/pyeed/core/pairwisealignmentresult.py b/pyeed/core/pairwisealignmentresult.py index c661b18a..a281aaa6 100644 --- a/pyeed/core/pairwisealignmentresult.py +++ b/pyeed/core/pairwisealignmentresult.py @@ -60,7 +60,7 @@ class PairwiseAlignmentResult( _repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed") _commit: Optional[str] = PrivateAttr( - default="c4dc30f3647be7da5ea591f8946893ffad69d647" + default="ad73396d3a347dd8d413a3cbe77883edb2777380" ) _raw_xml_data: Dict = PrivateAttr(default_factory=dict) diff --git a/pyeed/core/proteinrecord.py b/pyeed/core/proteinrecord.py index 6a27cd72..ed4f434c 100644 --- a/pyeed/core/proteinrecord.py +++ b/pyeed/core/proteinrecord.py @@ -75,7 +75,7 @@ class ProteinRecord( _repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed") _commit: Optional[str] = PrivateAttr( - default="c4dc30f3647be7da5ea591f8946893ffad69d647" + 
default="ad73396d3a347dd8d413a3cbe77883edb2777380" ) _raw_xml_data: Dict = PrivateAttr(default_factory=dict) diff --git a/pyeed/core/region.py b/pyeed/core/region.py index 9a6e13cc..2b6ade9c 100644 --- a/pyeed/core/region.py +++ b/pyeed/core/region.py @@ -43,7 +43,7 @@ class Region( _repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed") _commit: Optional[str] = PrivateAttr( - default="c4dc30f3647be7da5ea591f8946893ffad69d647" + default="ad73396d3a347dd8d413a3cbe77883edb2777380" ) _raw_xml_data: Dict = PrivateAttr(default_factory=dict) diff --git a/pyeed/core/regionset.py b/pyeed/core/regionset.py index e78df3e1..fb1ced44 100644 --- a/pyeed/core/regionset.py +++ b/pyeed/core/regionset.py @@ -35,7 +35,7 @@ class RegionSet( _repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed") _commit: Optional[str] = PrivateAttr( - default="c4dc30f3647be7da5ea591f8946893ffad69d647" + default="ad73396d3a347dd8d413a3cbe77883edb2777380" ) _object_terms: Set[str] = PrivateAttr( diff --git a/pyeed/core/sequence.py b/pyeed/core/sequence.py index 2a3e1278..28df01a8 100644 --- a/pyeed/core/sequence.py +++ b/pyeed/core/sequence.py @@ -38,7 +38,7 @@ class Sequence( _repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed") _commit: Optional[str] = PrivateAttr( - default="c4dc30f3647be7da5ea591f8946893ffad69d647" + default="ad73396d3a347dd8d413a3cbe77883edb2777380" ) _raw_xml_data: Dict = PrivateAttr(default_factory=dict) diff --git a/pyeed/core/sequencerecord.py b/pyeed/core/sequencerecord.py index 0f7c65cf..4587b579 100644 --- a/pyeed/core/sequencerecord.py +++ b/pyeed/core/sequencerecord.py @@ -103,7 +103,7 @@ class SequenceRecord( _repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed") _commit: Optional[str] = PrivateAttr( - default="c4dc30f3647be7da5ea591f8946893ffad69d647" + default="ad73396d3a347dd8d413a3cbe77883edb2777380" ) _raw_xml_data: Dict = PrivateAttr(default_factory=dict) diff --git 
a/pyeed/core/site.py b/pyeed/core/site.py index 240d0ea2..4613011b 100644 --- a/pyeed/core/site.py +++ b/pyeed/core/site.py @@ -35,7 +35,7 @@ class Site( _repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed") _commit: Optional[str] = PrivateAttr( - default="c4dc30f3647be7da5ea591f8946893ffad69d647" + default="ad73396d3a347dd8d413a3cbe77883edb2777380" ) _raw_xml_data: Dict = PrivateAttr(default_factory=dict) diff --git a/pyeed/core/standardnumbering.py b/pyeed/core/standardnumbering.py index 95729740..b88629cf 100644 --- a/pyeed/core/standardnumbering.py +++ b/pyeed/core/standardnumbering.py @@ -42,7 +42,7 @@ class StandardNumbering( _repo: Optional[str] = PrivateAttr(default="https://github.com/PyEED/pyeed") _commit: Optional[str] = PrivateAttr( - default="c4dc30f3647be7da5ea591f8946893ffad69d647" + default="ad73396d3a347dd8d413a3cbe77883edb2777380" ) _raw_xml_data: Dict = PrivateAttr(default_factory=dict) diff --git a/pyeed/fetch/requester.py b/pyeed/fetch/requester.py index 9807b3c4..87259edb 100644 --- a/pyeed/fetch/requester.py +++ b/pyeed/fetch/requester.py @@ -1,8 +1,9 @@ import asyncio import logging -from typing import List, NamedTuple, Optional, Dict +from typing import Dict, List, NamedTuple, Optional import aiometer +import tenacity from httpx import AsyncClient, Limits, Response from rich.progress import Progress, TaskID @@ -51,6 +52,10 @@ def _create_progress(self): self.progress = Progress(disable=True) self.task_id = self.progress.add_task("Requesting data...", total=len(self.ids)) + @tenacity.retry( + wait=tenacity.wait_fixed(1), + stop=tenacity.stop_after_attempt(3), + ) async def send_request(self, args: RequestArgs) -> str: """ Sends an asynchronous HTTP GET request to the specified URL using the provided @@ -84,6 +89,10 @@ async def send_request(self, args: RequestArgs) -> str: return response.text + @tenacity.retry( + wait=tenacity.wait_fixed(0.5), + stop=tenacity.stop_after_attempt(3), + ) async def make_request(self) -> 
List[str]: """ Makes asynchronous HTTP GET requests to the specified URL using the provided @@ -102,7 +111,7 @@ async def make_request(self) -> List[str]: async def update_progress(response: Response): if self.progress: - self.progress.update(self.task_id, advance=self.batch_size) # type: ignore + self.progress.update(self.task_id, advance=self.batch_size) # type: ignore async with AsyncClient( event_hooks={"response": [update_progress]}, @@ -145,6 +154,7 @@ def make_batches(self) -> List[str]: self.ids = batches return batches + class AsyncParamRequester: """Updated Requester utilizing parameters as dict for the request""" @@ -209,24 +219,24 @@ async def send_request(self, args: RequestArgs) -> str: if response.status_code == 429: LOGGER.warning("Rate limit exceeded. Waiting for 1 second...") - await asyncio.sleep(1) + await asyncio.sleep(0.5) return await self.send_request(args) return response.text async def make_request(self) -> List[str]: + """Handles the asynchronous HTTP GET and configures rate limits and progress bar.""" all_responses = [] async def update_progress(response: Response): if self.progress: - self.progress.update(self.task_id, advance=self.batch_size) # type: ignore + self.progress.update(self.task_id, advance=self.batch_size) # type: ignore async with AsyncClient( event_hooks={"response": [update_progress]}, - limits=Limits(max_connections=self.n_concurrent), + limits=Limits(max_connections=self.n_concurrent, keepalive_expiry=30), ) as client: - tasks = [] for id in self.ids: params = self.params.copy() diff --git a/pyproject.toml b/pyproject.toml index cf6aefb7..4c2643b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "pyeed" -version = "0.3.4" +version = "0.3.5" description = "Toolkit to create, annotate, and analyze sequence data" authors = ["haeussma <83341109+haeussma@users.noreply.github.com>"] license = "MIT" @@ -25,6 +25,8 @@ sdrdm = { git = "https://github.com/JR-1991/software-driven-rdm.git" } 
matplotlib = "^3.9.0" pymsaviz = "^0.4.2" py4cytoscape = "^1.9.0" +tenacity = "^8.3.0" +neo4j = "^5.20.0" [tool.poetry.group.dev.dependencies] mkdocs-material = "^9.5.9"