Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix organism error #84

Merged
merged 2 commits into from
Jun 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
235 changes: 214 additions & 21 deletions docs/examples/basics.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,13 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5352b6f5fa4c46119cd65bfc4de44c0d",
"model_id": "68d545568bf04841be62573b15776e10",
"version_major": 2,
"version_minor": 0
},
Expand Down Expand Up @@ -106,7 +106,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"outputs": [
{
Expand All @@ -122,7 +122,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f759e3211ef44d2b81a0f820d2f49e2d",
"model_id": "93adf4bcad844963b3815a8288e2083b",
"version_major": 2,
"version_minor": 0
},
Expand Down Expand Up @@ -178,43 +178,236 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[4mProteinRecord\u001b[0m\n",
"├── \u001b[94mid\u001b[0m = WP_068323110.1\n",
"├── \u001b[94mname\u001b[0m = methionine adenosyltransferase\n",
"├── \u001b[94mid\u001b[0m = Q9YBK2\n",
"├── \u001b[94mname\u001b[0m = S-adenosylmethionine synthase\n",
"├── \u001b[94morganism\u001b[0m\n",
"│ └── \u001b[4mOrganism\u001b[0m\n",
"│ ├── \u001b[94mid\u001b[0m = 95652fe4-fb9a-40e7-b98a-1dfad46d8e56\n",
"│ ├── \u001b[94mtaxonomy_id\u001b[0m = 1609559\n",
"│ ├── \u001b[94mname\u001b[0m = Pyrococcus kukulkanii\n",
"│ ├── \u001b[94mid\u001b[0m = cb2acb39-692e-4424-a4ed-1b97f2351a83\n",
"│ ├── \u001b[94mtaxonomy_id\u001b[0m = 272557\n",
"│ ├── \u001b[94mname\u001b[0m = Aeropyrum pernix K1\n",
"│ ├── \u001b[94mdomain\u001b[0m = Archaea\n",
"│ ├── \u001b[94mphylum\u001b[0m = Euryarchaeota\n",
"│ ├── \u001b[94mtax_class\u001b[0m = Thermococci\n",
"│ ├── \u001b[94morder\u001b[0m = Thermococcales\n",
"│ ├── \u001b[94mfamily\u001b[0m = Thermococcaceae\n",
"│ └── \u001b[94mgenus\u001b[0m = Pyrococcus\n",
"├── \u001b[94msequence\u001b[0m = MARNIVVEEIVRTPVEMQKVELVERKGIGHPDSIADGIAEAVSRALCREYIKRYGVILHHNTDQVEVVGGRAYPKFGGGEVVKPIYILLSGRAVELVDQELFPVHEVAIRAAKEYLKKNIRHLDVENHVVIDSRIGQGSVDLVSVFNKAKENPIPLANDTSFGVGFAPLTETERLVLETERLLNSEKFKKEYPAVGEDIKVMGLRKGDEIDLTIAAAIVDSEVANPKEYMEVKDKIKETVEELAKDITSRKVNIYVNTADDPKKDIYYITVTGTSAEAGDDGSVGRGNRVNGLITPNRHMSMEAAAGKNPVSHVGKIYNILAMFIANDIAKALPVEEVYVRILSQIGKPIDQPLVASIQVIPKQGHTVKEFEKDAYAIADEWLANITKIQKMILEDKITVF\n",
"│ ├── \u001b[94mphylum\u001b[0m = Thermoproteota\n",
"│ ├── \u001b[94mtax_class\u001b[0m = Thermoprotei\n",
"│ ├── \u001b[94morder\u001b[0m = Desulfurococcales\n",
"│ ├── \u001b[94mfamily\u001b[0m = Desulfurococcaceae\n",
"│ └── \u001b[94mgenus\u001b[0m = Aeropyrum\n",
"├── \u001b[94msequence\u001b[0m = MARRIVVESYPYPRVEDLQVELVERKGLGHPDTICDAAAEAVSRELSKYYLERFGKILHHNVDKVLLVGGQAAPRLGGGEVLQPIYILVSGRVTTEVRTGGGVESVPVGPIILRAVKNYIRENFRFLDPEEHVIVDYRVGRGSVDLVGIFEAEDKVPLANDTSIGSGHAPLSTLERLVLETERILNSRETKERLPAVGEDVKVMGVRDGKSITLTVAMAVVSSQVGSVSDYLAVKEEAESLILDLASRIAPDYDVRVNINTGDIPEKKILYLTVTGTSAEHGDDGATGRGNRVNGLITPMRPMSMEAAAGKNPVNHVGKIYNVVANEMAALIHREVKGVEEVYVKLVSQIGKPIDRPRIVDVKVRMEGGREVTADAKREIEAIANSVLDGITGYTEKLVRGDITVY\n",
"├── \u001b[94mregions\u001b[0m\n",
"│ └── 0\n",
"│ ├── 0\n",
"│ │ └── \u001b[4mRegion\u001b[0m\n",
"│ │ ├── \u001b[94mid\u001b[0m = 0f3c9ab0-dde3-4cb1-aa12-c25c5fbb514c\n",
"│ │ ├── \u001b[94mname\u001b[0m = S-adenosylmethionine synthetase, archaea\n",
"│ │ ├── \u001b[94mstart\u001b[0m = 2\n",
"│ │ └── \u001b[94mend\u001b[0m = 406\n",
"│ ├── 1\n",
"│ │ └── \u001b[4mRegion\u001b[0m\n",
"│ │ ├── \u001b[94mid\u001b[0m = 4d0151b8-22bb-4695-ac0b-f70f0e7bb7c9\n",
"│ │ ├── \u001b[94mname\u001b[0m = S-adenosylmethionine synthase\n",
"│ │ ├── \u001b[94mstart\u001b[0m = 3\n",
"│ │ └── \u001b[94mend\u001b[0m = 406\n",
"│ └── 2\n",
"│ └── \u001b[4mRegion\u001b[0m\n",
"│ ├── \u001b[94mid\u001b[0m = MetK2\n",
"│ ├── \u001b[94mstart\u001b[0m = 2\n",
"│ └── \u001b[94mend\u001b[0m = 401\n",
"│ ├── \u001b[94mid\u001b[0m = a9a6dce1-248d-4b51-bb37-69a6e1b1ed52\n",
"│ ├── \u001b[94mname\u001b[0m = S-adenosylmethionine synthetase, domain 3\n",
"│ ├── \u001b[94mstart\u001b[0m = 144\n",
"│ └── \u001b[94mend\u001b[0m = 248\n",
"├── \u001b[94mec_number\u001b[0m = 2.5.1.6\n",
"└── \u001b[94mmol_weight\u001b[0m = 44273.0\n",
"└── \u001b[94mmol_weight\u001b[0m = 44235.0\n",
"\n"
]
}
],
"source": [
"print(blast_results[3])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"💾 Sequence saved to protein.fasta\n"
]
}
],
"source": [
"from pyeed.core import ProteinRecord\n",
"\n",
"protein = ProteinRecord(name=\"test_protein\", sequence=\"MTEITAAMVKELREDKAVQLLREKGLGK\")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"from neo4j import GraphDatabase\n",
"\n",
"# import environment variables\n",
"import os\n",
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv()\n",
"\n",
"\n",
"# URI examples: \"neo4j://localhost\", \"neo4j+s://xxx.databases.neo4j.io\"\n",
"URI = \"neo4j+s://ecd986f5.databases.neo4j.io\"\n",
"AUTH = (os.getenv(\"NEO4J_USER\"), os.getenv(\"NEO4J_PASSWORD\"))\n",
"\n",
"with GraphDatabase.driver(URI, auth=AUTH) as driver:\n",
" driver.verify_connectivity()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"def run_query(query):\n",
" with driver.session() as session:\n",
" result = session.run(query)\n",
" records = list(result) # Fetch all records within the session context\n",
" return records"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GO terms and their relationships have been created.\n"
]
}
],
"source": [
"# Sample GO terms data\n",
"go_terms = [\n",
" {\n",
" \"id\": \"GO:0008150\",\n",
" \"name\": \"biological_process\",\n",
" \"namespace\": \"biological_process\",\n",
" },\n",
" {\n",
" \"id\": \"GO:0003674\",\n",
" \"name\": \"molecular_function\",\n",
" \"namespace\": \"molecular_function\",\n",
" },\n",
" {\n",
" \"id\": \"GO:0005575\",\n",
" \"name\": \"cellular_component\",\n",
" \"namespace\": \"cellular_component\",\n",
" },\n",
" {\"id\": \"GO:0007049\", \"name\": \"cell cycle\", \"namespace\": \"biological_process\"},\n",
" {\"id\": \"GO:0009987\", \"name\": \"cellular process\", \"namespace\": \"biological_process\"},\n",
"]\n",
"\n",
"# Create GO Term nodes\n",
"for term in go_terms:\n",
" query = f\"\"\"\n",
" CREATE (g:GO_Term {{id: '{term['id']}', name: '{term['name']}', namespace: '{term['namespace']}'}})\n",
" \"\"\"\n",
" run_query(query)\n",
"\n",
"# Define relationships between GO terms\n",
"relationships = [\n",
" {\"parent_id\": \"GO:0008150\", \"child_id\": \"GO:0007049\", \"type\": \"IS_A\"},\n",
" {\"parent_id\": \"GO:0008150\", \"child_id\": \"GO:0009987\", \"type\": \"IS_A\"},\n",
"]\n",
"\n",
"# Create relationships\n",
"for rel in relationships:\n",
" query = f\"\"\"\n",
" MATCH (p:GO_Term {{id: '{rel['parent_id']}'}}), (c:GO_Term {{id: '{rel['child_id']}'}})\n",
" CREATE (c)-[:{rel['type']}]->(p)\n",
" \"\"\"\n",
" run_query(query)\n",
"\n",
"print(\"GO terms and their relationships have been created.\")"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Proteins and their relationships to GO terms have been created.\n"
]
}
],
"source": [
"# Sample proteins data\n",
"proteins = [\n",
" {\"name\": \"BRCA1 Protein\", \"sequence\": \"M1...1863\", \"length\": 1863},\n",
" {\"name\": \"TP53 Protein\", \"sequence\": \"M1...393\", \"length\": 393},\n",
"]\n",
"\n",
"# Create Protein nodes\n",
"for protein in proteins:\n",
" query = f\"\"\"\n",
" CREATE (p:Protein {{name: '{protein['name']}', sequence: '{protein['sequence']}', length: {protein['length']}}})\n",
" \"\"\"\n",
" run_query(query)\n",
"\n",
"# Define relationships between proteins and GO terms\n",
"protein_go_relationships = [\n",
" {\n",
" \"protein_name\": \"BRCA1 Protein\",\n",
" \"go_id\": \"GO:0007049\",\n",
" \"relationship\": \"INVOLVED_IN\",\n",
" },\n",
" {\n",
" \"protein_name\": \"TP53 Protein\",\n",
" \"go_id\": \"GO:0009987\",\n",
" \"relationship\": \"INVOLVED_IN\",\n",
" },\n",
"]\n",
"\n",
"# Create relationships\n",
"for rel in protein_go_relationships:\n",
" query = f\"\"\"\n",
" MATCH (p:Protein {{name: '{rel['protein_name']}'}}), (g:GO_Term {{id: '{rel['go_id']}'}})\n",
" CREATE (p)-[:{rel['relationship']}]->(g)\n",
" \"\"\"\n",
" run_query(query)\n",
"\n",
"print(\"Proteins and their relationships to GO terms have been created.\")"
]
}
],
"metadata": {
Expand Down
43 changes: 16 additions & 27 deletions docs/quick_start/basics.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,20 @@ A sequence object can be created by passing a sequence string to the constructor
=== "Protein"

``` py
from pyeed.core import ProteinInfo
from pyeed.core import ProteinRecord

protein = ProteinInfo(sequence="MTEITAAMVKELREDKAVQLLREKGLGK")
protein = ProteinRecord(
name="My Protein",
sequence="MTEITAAMVKELREDKAVQLLREKGLGK"
)
```

=== "DNA"

``` py
from pyeed.core import DNAInfo
from pyeed.core import DNARecord

dna = DNAInfo(sequence="ATGCGTACGTCGATCGATCGATCGATCGATCGATCGATCGATCGTAGTC")
dna = DNARecord(sequence="ATGCGTACGTCGATCGATCGATCGATCGATCGATCGATCGATCGTAGTC")
```


Expand All @@ -28,27 +31,13 @@ Besides adding sequence information manually, PyEED also allows searching for se
=== "Protein"

``` py
protein = ProteinInfo.get_id("UCS38941.1")
protein = ProteinRecord.get_id("UCS38941.1")
```

=== "DNA"

``` py
dna = DNAInfo.get_id("NC_000913.3")
```

Alternatively, the sequence can be initiated from a sequence string, triggering a BLAST search in the NCBI database. If the sequence is found, the sequence object is filled with the corresponding information.

=== "Protein"

``` py
# Not yet implemented
```

=== "DNA"

``` py
# Not yet implemented
# Not implemented
```

## ⬇️ Save a sequence
Expand All @@ -58,12 +47,6 @@ Alternatively, the sequence can be initiated from a sequence string, triggering
A sequence can be saved in `FASTA`, `JSON`, `YAML`, or `XML` format by calling the corresponding method.
The file path is passed as an argument to the method.

=== "FASTA"

``` py
protein.to_fasta("protein.fasta")
```

=== "JSON"

``` py
Expand All @@ -82,8 +65,14 @@ The file path is passed as an argument to the method.
protein.to_xml("protein.xml")
```

=== "FASTA"

``` py
protein.to_fasta("protein.fasta")
```

### To database
Alternatively, sequence data can be stored in a `PostgreSQL` database. Therefore, the `to_db()` method can be used.
Alternatively, sequence data can be stored in a graph database using the `to_db()` method.

```py
# Feature is currently implemented
Expand Down
Loading
Loading