Skip to content

Commit

Permalink
Fix organism error (#84)
Browse files Browse the repository at this point in the history
* fixed retries upon failed requests and improved docs

* API update

---------

Co-authored-by: sdRDM Bot <sdRDM@bot.com>
  • Loading branch information
haeussma and sdRDM Bot authored Jun 10, 2024
1 parent 725b8c9 commit a0e9dcc
Show file tree
Hide file tree
Showing 22 changed files with 277 additions and 100 deletions.
235 changes: 214 additions & 21 deletions docs/examples/basics.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,13 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5352b6f5fa4c46119cd65bfc4de44c0d",
"model_id": "68d545568bf04841be62573b15776e10",
"version_major": 2,
"version_minor": 0
},
Expand Down Expand Up @@ -106,7 +106,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"outputs": [
{
Expand All @@ -122,7 +122,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f759e3211ef44d2b81a0f820d2f49e2d",
"model_id": "93adf4bcad844963b3815a8288e2083b",
"version_major": 2,
"version_minor": 0
},
Expand Down Expand Up @@ -178,43 +178,236 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[4mProteinRecord\u001b[0m\n",
"├── \u001b[94mid\u001b[0m = WP_068323110.1\n",
"├── \u001b[94mname\u001b[0m = methionine adenosyltransferase\n",
"├── \u001b[94mid\u001b[0m = Q9YBK2\n",
"├── \u001b[94mname\u001b[0m = S-adenosylmethionine synthase\n",
"├── \u001b[94morganism\u001b[0m\n",
"│ └── \u001b[4mOrganism\u001b[0m\n",
"│ ├── \u001b[94mid\u001b[0m = 95652fe4-fb9a-40e7-b98a-1dfad46d8e56\n",
"│ ├── \u001b[94mtaxonomy_id\u001b[0m = 1609559\n",
"│ ├── \u001b[94mname\u001b[0m = Pyrococcus kukulkanii\n",
"│ ├── \u001b[94mid\u001b[0m = cb2acb39-692e-4424-a4ed-1b97f2351a83\n",
"│ ├── \u001b[94mtaxonomy_id\u001b[0m = 272557\n",
"│ ├── \u001b[94mname\u001b[0m = Aeropyrum pernix K1\n",
"│ ├── \u001b[94mdomain\u001b[0m = Archaea\n",
"│ ├── \u001b[94mphylum\u001b[0m = Euryarchaeota\n",
"│ ├── \u001b[94mtax_class\u001b[0m = Thermococci\n",
"│ ├── \u001b[94morder\u001b[0m = Thermococcales\n",
"│ ├── \u001b[94mfamily\u001b[0m = Thermococcaceae\n",
"│ └── \u001b[94mgenus\u001b[0m = Pyrococcus\n",
"├── \u001b[94msequence\u001b[0m = MARNIVVEEIVRTPVEMQKVELVERKGIGHPDSIADGIAEAVSRALCREYIKRYGVILHHNTDQVEVVGGRAYPKFGGGEVVKPIYILLSGRAVELVDQELFPVHEVAIRAAKEYLKKNIRHLDVENHVVIDSRIGQGSVDLVSVFNKAKENPIPLANDTSFGVGFAPLTETERLVLETERLLNSEKFKKEYPAVGEDIKVMGLRKGDEIDLTIAAAIVDSEVANPKEYMEVKDKIKETVEELAKDITSRKVNIYVNTADDPKKDIYYITVTGTSAEAGDDGSVGRGNRVNGLITPNRHMSMEAAAGKNPVSHVGKIYNILAMFIANDIAKALPVEEVYVRILSQIGKPIDQPLVASIQVIPKQGHTVKEFEKDAYAIADEWLANITKIQKMILEDKITVF\n",
"│ ├── \u001b[94mphylum\u001b[0m = Thermoproteota\n",
"│ ├── \u001b[94mtax_class\u001b[0m = Thermoprotei\n",
"│ ├── \u001b[94morder\u001b[0m = Desulfurococcales\n",
"│ ├── \u001b[94mfamily\u001b[0m = Desulfurococcaceae\n",
"│ └── \u001b[94mgenus\u001b[0m = Aeropyrum\n",
"├── \u001b[94msequence\u001b[0m = MARRIVVESYPYPRVEDLQVELVERKGLGHPDTICDAAAEAVSRELSKYYLERFGKILHHNVDKVLLVGGQAAPRLGGGEVLQPIYILVSGRVTTEVRTGGGVESVPVGPIILRAVKNYIRENFRFLDPEEHVIVDYRVGRGSVDLVGIFEAEDKVPLANDTSIGSGHAPLSTLERLVLETERILNSRETKERLPAVGEDVKVMGVRDGKSITLTVAMAVVSSQVGSVSDYLAVKEEAESLILDLASRIAPDYDVRVNINTGDIPEKKILYLTVTGTSAEHGDDGATGRGNRVNGLITPMRPMSMEAAAGKNPVNHVGKIYNVVANEMAALIHREVKGVEEVYVKLVSQIGKPIDRPRIVDVKVRMEGGREVTADAKREIEAIANSVLDGITGYTEKLVRGDITVY\n",
"├── \u001b[94mregions\u001b[0m\n",
"│ └── 0\n",
"│ ├── 0\n",
"│ │ └── \u001b[4mRegion\u001b[0m\n",
"│ │ ├── \u001b[94mid\u001b[0m = 0f3c9ab0-dde3-4cb1-aa12-c25c5fbb514c\n",
"│ │ ├── \u001b[94mname\u001b[0m = S-adenosylmethionine synthetase, archaea\n",
"│ │ ├── \u001b[94mstart\u001b[0m = 2\n",
"│ │ └── \u001b[94mend\u001b[0m = 406\n",
"│ ├── 1\n",
"│ │ └── \u001b[4mRegion\u001b[0m\n",
"│ │ ├── \u001b[94mid\u001b[0m = 4d0151b8-22bb-4695-ac0b-f70f0e7bb7c9\n",
"│ │ ├── \u001b[94mname\u001b[0m = S-adenosylmethionine synthase\n",
"│ │ ├── \u001b[94mstart\u001b[0m = 3\n",
"│ │ └── \u001b[94mend\u001b[0m = 406\n",
"│ └── 2\n",
"│ └── \u001b[4mRegion\u001b[0m\n",
"│ ├── \u001b[94mid\u001b[0m = MetK2\n",
"│ ├── \u001b[94mstart\u001b[0m = 2\n",
"│ └── \u001b[94mend\u001b[0m = 401\n",
"│ ├── \u001b[94mid\u001b[0m = a9a6dce1-248d-4b51-bb37-69a6e1b1ed52\n",
"│ ├── \u001b[94mname\u001b[0m = S-adenosylmethionine synthetase, domain 3\n",
"│ ├── \u001b[94mstart\u001b[0m = 144\n",
"│ └── \u001b[94mend\u001b[0m = 248\n",
"├── \u001b[94mec_number\u001b[0m = 2.5.1.6\n",
"└── \u001b[94mmol_weight\u001b[0m = 44273.0\n",
"└── \u001b[94mmol_weight\u001b[0m = 44235.0\n",
"\n"
]
}
],
"source": [
"print(blast_results[3])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"from pyeed.core import ProteinRecord\n",
"\n",
"# Build a record by hand from a display name and a sequence string.\n",
"protein = ProteinRecord(\n",
"    name=\"test_protein\",\n",
"    sequence=\"MTEITAAMVKELREDKAVQLLREKGLGK\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"from dotenv import load_dotenv\n",
"from neo4j import GraphDatabase\n",
"\n",
"# Load NEO4J_* settings from a local .env file into the process environment.\n",
"load_dotenv()\n",
"\n",
"# URI examples: \"neo4j://localhost\", \"neo4j+s://xxx.databases.neo4j.io\"\n",
"# NEO4J_URI overrides the default instance so the notebook is not tied to one database.\n",
"URI = os.getenv(\"NEO4J_URI\", \"neo4j+s://ecd986f5.databases.neo4j.io\")\n",
"AUTH = (os.getenv(\"NEO4J_USER\"), os.getenv(\"NEO4J_PASSWORD\"))\n",
"\n",
"# Keep the driver open: later cells reuse it through `run_query`. Wrapping it in a\n",
"# `with` block would close it as soon as this cell finishes.\n",
"# Call `driver.close()` once the notebook is done with the database.\n",
"driver = GraphDatabase.driver(URI, auth=AUTH)\n",
"driver.verify_connectivity()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"def run_query(query, parameters=None):\n",
"    \"\"\"Run a Cypher query on the notebook's `driver` and return all records.\n",
"\n",
"    Args:\n",
"        query: Cypher statement to execute.\n",
"        parameters: Optional dict of query parameters, referenced in the\n",
"            statement as `$name`. Prefer this over formatting values into\n",
"            the query string.\n",
"\n",
"    Returns:\n",
"        A list containing every record produced by the query.\n",
"    \"\"\"\n",
"    with driver.session() as session:\n",
"        result = session.run(query, parameters)\n",
"        # Fetch all records while the session is still open.\n",
"        return list(result)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GO terms and their relationships have been created.\n"
]
}
],
"source": [
"# Sample GO terms data\n",
"go_terms = [\n",
"    {\n",
"        \"id\": \"GO:0008150\",\n",
"        \"name\": \"biological_process\",\n",
"        \"namespace\": \"biological_process\",\n",
"    },\n",
"    {\n",
"        \"id\": \"GO:0003674\",\n",
"        \"name\": \"molecular_function\",\n",
"        \"namespace\": \"molecular_function\",\n",
"    },\n",
"    {\n",
"        \"id\": \"GO:0005575\",\n",
"        \"name\": \"cellular_component\",\n",
"        \"namespace\": \"cellular_component\",\n",
"    },\n",
"    {\"id\": \"GO:0007049\", \"name\": \"cell cycle\", \"namespace\": \"biological_process\"},\n",
"    {\"id\": \"GO:0009987\", \"name\": \"cellular process\", \"namespace\": \"biological_process\"},\n",
"]\n",
"\n",
"# Create GO Term nodes. MERGE (rather than CREATE) keeps this cell idempotent:\n",
"# re-running it updates the existing node instead of adding a duplicate.\n",
"for term in go_terms:\n",
"    query = f\"\"\"\n",
"    MERGE (g:GO_Term {{id: '{term['id']}'}})\n",
"    SET g.name = '{term['name']}', g.namespace = '{term['namespace']}'\n",
"    \"\"\"\n",
"    run_query(query)\n",
"\n",
"# Define relationships between GO terms\n",
"relationships = [\n",
"    {\"parent_id\": \"GO:0008150\", \"child_id\": \"GO:0007049\", \"type\": \"IS_A\"},\n",
"    {\"parent_id\": \"GO:0008150\", \"child_id\": \"GO:0009987\", \"type\": \"IS_A\"},\n",
"]\n",
"\n",
"# Create relationships (child -> parent). MERGE avoids duplicate edges on re-run.\n",
"for rel in relationships:\n",
"    query = f\"\"\"\n",
"    MATCH (p:GO_Term {{id: '{rel['parent_id']}'}}), (c:GO_Term {{id: '{rel['child_id']}'}})\n",
"    MERGE (c)-[:{rel['type']}]->(p)\n",
"    \"\"\"\n",
"    run_query(query)\n",
"\n",
"print(\"GO terms and their relationships have been created.\")"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Proteins and their relationships to GO terms have been created.\n"
]
}
],
"source": [
"# Sample proteins data\n",
"proteins = [\n",
"    {\"name\": \"BRCA1 Protein\", \"sequence\": \"M1...1863\", \"length\": 1863},\n",
"    {\"name\": \"TP53 Protein\", \"sequence\": \"M1...393\", \"length\": 393},\n",
"]\n",
"\n",
"# Create Protein nodes. The loop variable is deliberately not called `protein`,\n",
"# so it does not overwrite the ProteinRecord of that name from an earlier cell.\n",
"# MERGE (rather than CREATE) keeps this cell idempotent on re-run.\n",
"for protein_data in proteins:\n",
"    query = f\"\"\"\n",
"    MERGE (p:Protein {{name: '{protein_data['name']}'}})\n",
"    SET p.sequence = '{protein_data['sequence']}', p.length = {protein_data['length']}\n",
"    \"\"\"\n",
"    run_query(query)\n",
"\n",
"# Define relationships between proteins and GO terms\n",
"protein_go_relationships = [\n",
"    {\n",
"        \"protein_name\": \"BRCA1 Protein\",\n",
"        \"go_id\": \"GO:0007049\",\n",
"        \"relationship\": \"INVOLVED_IN\",\n",
"    },\n",
"    {\n",
"        \"protein_name\": \"TP53 Protein\",\n",
"        \"go_id\": \"GO:0009987\",\n",
"        \"relationship\": \"INVOLVED_IN\",\n",
"    },\n",
"]\n",
"\n",
"# Create relationships (protein -> GO term). MERGE avoids duplicate edges on re-run.\n",
"for rel in protein_go_relationships:\n",
"    query = f\"\"\"\n",
"    MATCH (p:Protein {{name: '{rel['protein_name']}'}}), (g:GO_Term {{id: '{rel['go_id']}'}})\n",
"    MERGE (p)-[:{rel['relationship']}]->(g)\n",
"    \"\"\"\n",
"    run_query(query)\n",
"\n",
"print(\"Proteins and their relationships to GO terms have been created.\")"
]
}
],
"metadata": {
Expand Down
43 changes: 16 additions & 27 deletions docs/quick_start/basics.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,20 @@ A sequence object can be created by passing a sequence string to the constructor
=== "Protein"

``` py
from pyeed.core import ProteinInfo
from pyeed.core import ProteinRecord

protein = ProteinInfo(sequence="MTEITAAMVKELREDKAVQLLREKGLGK")
protein = ProteinRecord(
name="My Protein",
sequence="MTEITAAMVKELREDKAVQLLREKGLGK"
)
```

=== "DNA"

``` py
from pyeed.core import DNAInfo
from pyeed.core import DNARecord

dna = DNAInfo(sequence="ATGCGTACGTCGATCGATCGATCGATCGATCGATCGATCGATCGTAGTC")
dna = DNARecord(sequence="ATGCGTACGTCGATCGATCGATCGATCGATCGATCGATCGATCGTAGTC")
```


Expand All @@ -28,27 +31,13 @@ Besides adding sequence information manually, PyEED also allows searching for se
=== "Protein"

``` py
protein = ProteinInfo.get_id("UCS38941.1")
protein = ProteinRecord.get_id("UCS38941.1")
```

=== "DNA"

``` py
dna = DNAInfo.get_id("NC_000913.3")
```

Alternatively, the sequence can be initiated from a sequence string, triggering a BLAST search in the NCBI database. If the sequence is found, the sequence object is filled with the corresponding information.

=== "Protein"

``` py
# Not yet implemented
```

=== "DNA"

``` py
# Not yet implemented
# Not implemented
```

## ⬇️ Save a sequence
Expand All @@ -58,12 +47,6 @@ Alternatively, the sequence can be initiated from a sequence string, triggering
The sequence can be stored in `FASTA`, `JSON`, `YAML`, or `XML` file format by calling the respective method.
The file path is passed as an argument to the method.

=== "FASTA"

``` py
protein.to_fasta("protein.fasta")
```

=== "JSON"

``` py
Expand All @@ -82,8 +65,14 @@ The file path is passed as an argument to the method.
protein.to_xml("protein.xml")
```

=== "FASTA"

``` py
protein.to_fasta("protein.fasta")
```

### To database
Alternatively, sequence data can be stored in a `PostgreSQL` database. Therefore, the `to_db()` method can be used.
Alternatively, sequence data can be stored in a graph database using the `to_db()` method.

```py
# Feature is currently being implemented
Expand Down
Loading

0 comments on commit a0e9dcc

Please sign in to comment.