Skip to content

Commit

Permalink
Embeddings esm (#93)
Browse files Browse the repository at this point in the history
* added new implementation

* defined data graph model

* renamed pyeed class

* implemented db connector with respect to neomodel

* added working uniprot fetcher with db transaction

* refactored requester

* 😵‍💫

* added implementation to calculate embeddings
  • Loading branch information
haeussma authored Oct 14, 2024
1 parent ee41af2 commit 50ce1f1
Show file tree
Hide file tree
Showing 63 changed files with 3,388 additions and 588 deletions.
12 changes: 5 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
[![Documentation](https://github.com/PyEED/pyeed/actions/workflows/make_docs.yaml/badge.svg)](https://github.com/PyEED/pyeed/actions/workflows/make_docs.yaml)

## About 📖
pyEED is a toolkit enabling object-oriented analysis of protein sequences, instead of working with sequences in a file-oriented fashion. This will enable the user to easily access and manipulate sequence information and to perform analyses on the sequence data.
pyeed is a toolkit enabling object-oriented analysis of protein sequences, instead of working with sequences in a file-oriented fashion. This will enable the user to easily access and manipulate sequence information and to perform analyses on the sequence data.
This library is currently under development and thus the API is subject to change.


Expand All @@ -20,9 +20,7 @@ pip install git+https://github.com/PyEED/pyeed.git

## Quick start 🚀

The library is currently being refactored; the quick start will be updated soon!

## Documentation 📘

Check out the [documentation](https://pyeed.github.io/pyeed/) for in-depth information on how to set up `pyeed`,
use the built-in tools, and store sequence data in databases.
### Launch Neo4j database via Docker and mount to a local directory
```bash
docker run --name pyeed-neo4j -p 7474:7474 -p 7687:7687 -v $PWD/data:/data -v $PWD/logs:/logs -v $PWD/import:/var/lib/neo4j/import -v $PWD/plugins:/plugins -e NEO4J_AUTH=neo4j/test -d neo4j
```
69 changes: 1 addition & 68 deletions docs/examples/ids.json
Original file line number Diff line number Diff line change
@@ -1,68 +1 @@
[
"WP_211530438",
"RUM32465",
"NOZ59931",
"MCI2415466",
"MCD6512403",
"MCL4350655",
"RLF67685",
"RLE93954",
"TMJ11138",
"WP_248897180",
"MBU7022768",
"NPA53530",
"MCL4451275",
"MBP7070042",
"NHV96297",
"NQE45598",
"PLJ76955",
"Q6L123",
"Q8TZW1",
"Q8ZYP7",
"Q46CW6",
"Q18H49",
"C3MJ04",
"Q8TV85",
"A6UU75",
"Q97CT6",
"O59488",
"WCN31494",
"O27429",
"Q2FN14",
"P0DF56",
"A0B742",
"Q2NEB3",
"Q9HM12",
"A5UMW7",
"Q9YBK2",
"A4G0T1",
"Q12WC8",
"A2BIZ8",
"O30186",
"C3NF87",
"A6UQS6",
"A1RSD7",
"A8MD44",
"P0CW62",
"A6VHQ4",
"A9A923",
"B6YUL1",
"Q5V2S5",
"Q4JAL1",
"B1YC36",
"P0CW63",
"Q980S9",
"Q3IQF5",
"Q9V1P7",
"Q8PWS4",
"Q5JF22",
"Q8TU57",
"C5A4B7",
"B0R5A8",
"P26498",
"O67275",
"A7I771",
"Q976F3",
"A3MY01",
"Q58605"
]
["Q9YBK2", "A6UQS6", "A1RSD7", "Q46CW6", "Q4JAL1", "Q8TU57", "P26498", "Q980S9", "B1YC36", "C3MJ04", "Q9HM12", "A3MY01", "A9A923", "Q5V2S5", "A4G0T1", "Q8TZW1", "Q6L123", "Q2FN14", "Q8PWS4", "Q18H49", "Q8ZYP7", "A8MD44", "P0DF56", "B0R5A8", "B6YUL1", "O30186", "A5UMW7", "Q2NEB3", "A6VHQ4", "Q12WC8", "A7I771", "Q5JF22", "Q8TV85", "O67275", "A2BIZ8", "Q3IQF5", "Q976F3", "P0CW63", "P0CW62", "C5A4B7", "O27429", "C3NF87", "Q97CT6", "Q58605", "Q9V1P7", "A6UU75", "O59488", "A0B742", "RUM32465.1", "NPA53530.1", "RLF67685.1", "MCL4350655.1", "NOZ59931.1", "WP_248897180.1", "MCL4451275.1", "RLE93954.1", "MBU7022768.1", "NHV96297.1", "PLJ76955.1", "WCN31494.1", "WP_211530438.1", "MBP7070042.1", "MCD6512403.1", "MCI2415466.1", "TMJ11138.1", "NQE45598.1"]
247 changes: 247 additions & 0 deletions docs/model_diagram.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
{
"style": {
"node-color": "#ffffff",
"border-color": "#000000",
"caption-color": "#000000",
"arrow-color": "#000000",
"label-background-color": "#ffffff",
"directionality": "directed",
"arrow-width": 5
},
"nodes": [
{
"id": "n0",
"position": {
"x": 0,
"y": 0
},
"caption": "",
"style": {},
"labels": [
"StrictStructuredNode"
],
"properties": {}
},
{
"id": "n1",
"position": {
"x": 346.4101615137755,
"y": 199.99999999999997
},
"caption": "",
"style": {},
"labels": [
"Organism"
],
"properties": {
"taxonomy_id": "int - required",
"name": "str",
"domain": "str",
"kingdom": "str",
"phylum": "str",
"tax_class": "str",
"order": "str",
"family": "str",
"genus": "str",
"species": "str"
}
},
{
"id": "n2",
"position": {
"x": 2.4492935982947064e-14,
"y": 400.0
},
"caption": "",
"style": {},
"labels": [
"Site"
],
"properties": {
"site_id": "id - unique",
"name": "str",
"positions": "list[int] - required",
"annotation": "str - required"
}
},
{
"id": "n3",
"position": {
"x": -346.4101615137754,
"y": 200.00000000000014
},
"caption": "",
"style": {},
"labels": [
"Region"
],
"properties": {
"region_id": "id - unique",
"start": "int - required",
"end": "int - required",
"annotation": "str - required"
}
},
{
"id": "n4",
"position": {
"x": -346.4101615137755,
"y": -199.99999999999991
},
"caption": "",
"style": {},
"labels": [
"GOAnnotation"
],
"properties": {
"go_id": "str - required",
"term": "str",
"definition": "str"
}
},
{
"id": "n5",
"position": {
"x": -7.347880794884119e-14,
"y": -400.0
},
"caption": "",
"style": {},
"labels": [
"Protein"
],
"properties": {
"accession_id": "str - required",
"sequence": "str - required",
"name": "str",
"seq_length": "int - required",
"mol_weight": "float",
"ec_number": "str",
"nucleotide_id": "str",
"locus_tag": "str",
"structure_ids": "list[str]",
"go_terms": "list[str]",
"embedding": "list[float]"
}
},
{
"id": "n6",
"position": {
"x": 346.41016151377534,
"y": -200.00000000000017
},
"caption": "",
"style": {},
"labels": [
"DNA"
],
"properties": {
"accession_id": "str - required",
"sequence": "str - required",
"name": "str",
"seq_length": "int - required",
"go_terms": "list[str]",
"embedding": "list[float]",
"gc_content": "float"
}
}
],
"relationships": [
{
"id": "e0",
"type": "ORIGINATES_FROM",
"style": {},
"properties": {},
"fromId": "n5",
"toId": "n1"
},
{
"id": "e1",
"type": "ORIGINATES_FROM",
"style": {},
"properties": {},
"fromId": "n6",
"toId": "n1"
},
{
"id": "e2",
"type": "ASSOCIATED_WITH",
"style": {},
"properties": {},
"fromId": "n4",
"toId": "n5"
},
{
"id": "e3",
"type": "ASSOCIATED_WITH",
"style": {},
"properties": {},
"fromId": "n4",
"toId": "n6"
},
{
"id": "e4",
"type": "ORIGINATES_FROM",
"style": {},
"properties": {},
"fromId": "n5",
"toId": "n1"
},
{
"id": "e5",
"type": "HAS_SITE",
"style": {},
"properties": {},
"fromId": "n5",
"toId": "n2"
},
{
"id": "e6",
"type": "HAS_REGION",
"style": {},
"properties": {},
"fromId": "n5",
"toId": "n3"
},
{
"id": "e7",
"type": "ASSOCIATED_WITH",
"style": {},
"properties": {},
"fromId": "n5",
"toId": "n4"
},
{
"id": "e8",
"type": "ORIGINATES_FROM",
"style": {},
"properties": {},
"fromId": "n6",
"toId": "n1"
},
{
"id": "e9",
"type": "HAS_SITE",
"style": {},
"properties": {},
"fromId": "n6",
"toId": "n2"
},
{
"id": "e10",
"type": "HAS_REGION",
"style": {},
"properties": {},
"fromId": "n6",
"toId": "n3"
},
{
"id": "e11",
"type": "ASSOCIATED_WITH",
"style": {},
"properties": {},
"fromId": "n6",
"toId": "n4"
}
]
}
22 changes: 1 addition & 21 deletions pyeed/__init__.py
Original file line number Diff line number Diff line change
@@ -1,21 +1 @@
import os

from .core.abstractannotation import AbstractAnnotation
from .core.alignmentresult import AlignmentResult
from .core.annotation import Annotation
from .core.blastdata import BlastData
from .core.clustalomegaresult import ClustalOmegaResult
from .core.cluster import Cluster
from .core.dnarecord import DNARecord
from .core.numberedsequence import NumberedSequence
from .core.ontology import Ontology
from .core.organism import Organism
from .core.pairwisealignmentresult import PairwiseAlignmentResult
from .core.proteinrecord import ProteinRecord
from .core.region import Region
from .core.regionset import RegionSet
from .core.sequence import Sequence
from .core.sequencerecord import SequenceRecord
from .core.sequencetype import SequenceType
from .core.site import Site
from .core.standardnumbering import StandardNumbering
from pyeed.pyeed import Pyeed
File renamed without changes.
Loading

0 comments on commit 50ce1f1

Please sign in to comment.