From e6b469dfdb65e628828ac2d84028d6d0caaa1299 Mon Sep 17 00:00:00 2001
From: max
Date: Mon, 14 Oct 2024 16:23:27 +0200
Subject: [PATCH] added implementation to calculate embeddings

---
 pyeed/embedding.py |   75 ++
 pyeed/model.py     |    6 +
 pyeed/pyeed.py     |   31 +
 pyproject.toml     |    6 +-
 test.ipynb         | 1213 +++++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 1317 insertions(+), 14 deletions(-)
 create mode 100644 pyeed/embedding.py

diff --git a/pyeed/embedding.py b/pyeed/embedding.py
new file mode 100644
index 0000000..a633191
--- /dev/null
+++ b/pyeed/embedding.py
@@ -0,0 +1,75 @@
+import gc
+
+import torch
+from transformers import EsmModel, EsmTokenizer
+
+
+def _select_device() -> torch.device:
+    """Return the preferred torch device: MPS, then CUDA, then CPU."""
+    # Use `is_available()` rather than `is_built()`: a PyTorch build can
+    # include MPS support on a machine that has no usable MPS device.
+    if torch.backends.mps.is_available():
+        return torch.device("mps")
+    if torch.cuda.is_available():
+        return torch.device("cuda")
+    return torch.device("cpu")
+
+
+def get_batch_embeddings(sequences: list[str], batch_size: int = 16):
+    """Compute one mean-pooled ESM2 embedding per input sequence.
+
+    Args:
+        sequences: Protein sequences to embed.
+        batch_size: Number of sequences per forward pass.
+
+    Returns:
+        A list with one CPU tensor per sequence, in input order. Each tensor
+        is the mean of the model's last hidden state over the tokens that the
+        attention mask marks as valid, so padding does not contribute.
+    """
+    # Nothing to embed: avoid loading the model weights at all.
+    if not sequences:
+        return []
+
+    # Load the ESM2 model and tokenizer
+    model_name = "facebook/esm2_t33_650M_UR50D"
+    model = EsmModel.from_pretrained(model_name)
+    tokenizer = EsmTokenizer.from_pretrained(model_name)
+
+    device = _select_device()
+    model = model.to(device)
+    model.eval()
+
+    embedding_list = []
+
+    with torch.no_grad():
+        # Process sequences in batches
+        for start in range(0, len(sequences), batch_size):
+            batch = sequences[start : start + batch_size]
+
+            # Tokenize the input sequences (must be a list of strings)
+            inputs = tokenizer(
+                batch, padding=True, truncation=True, return_tensors="pt"
+            ).to(device)
+
+            hidden_states = model(**inputs).last_hidden_state
+            valid_tokens = inputs["attention_mask"].bool()
+
+            # Average each sequence over its non-padding tokens only.
+            for token_states, mask in zip(hidden_states, valid_tokens):
+                embedding_list.append(token_states[mask].mean(dim=0).cpu())
+
+    return embedding_list
+
+
+def free_memory():
+    """Run garbage collection and release cached accelerator memory."""
+    gc.collect()  # Python garbage collection
+    if torch.backends.mps.is_available():
+        torch.mps.empty_cache()
+    elif torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+
+if __name__ == "__main__":
+    free_memory()
diff --git a/pyeed/model.py b/pyeed/model.py
index b3476ca..877283e 100644
--- a/pyeed/model.py
+++ b/pyeed/model.py
@@ -52,7 +52,13 @@ def _class_properties(cls):
 
     def save(self, *args, **kwargs):
         """Validates the properties and then saves the node."""
+        allowed_properties = self.__class__._class_properties()
+
+        # Only validate properties defined in the model schema
         for field, prop in self.__dict__.items():
+            if field not in allowed_properties:
+                continue  # Skip non-class properties (like internal Neo4j fields)
+
             if prop is None or callable(prop):
                 continue
 
diff --git a/pyeed/pyeed.py b/pyeed/pyeed.py
index 5971b61..7b3b402 100644
--- a/pyeed/pyeed.py
+++ b/pyeed/pyeed.py
@@ -1,10 +1,13 @@
 import asyncio
 
 import nest_asyncio
+from loguru import logger
 
 from pyeed.adapter.primary_db_adapter import PrimaryDBAdapter
 from pyeed.adapter.uniprot_mapper import UniprotToPyeed
 from pyeed.dbconnect import DatabaseConnector
+from pyeed.embedding import free_memory, get_batch_embeddings
+from pyeed.model import Protein
 
 
 class Pyeed:
@@ -45,6 +48,34 @@ def fetch_from_primary_db(self, ids: list[str]):
 
         asyncio.run(adapter.make_request())
 
+    def calculate_sequence_embeddings(self):
+        """
+        Calculates embeddings for all sequences in the database that do not have embeddings.
+
+        Each computed embedding is stored as a list on the protein's
+        `embedding` property and the node is saved.
+        """
+
+        # Materialize the query result once so that counting, reading the
+        # sequences and saving all operate on the same nodes in the same order.
+        proteins = list(Protein.nodes.filter(embedding__isnull=True))
+        logger.debug(f"Found {len(proteins)} proteins without embeddings.")
+        if not proteins:
+            return
+
+        sequences = [protein.sequence for protein in proteins]
+
+        logger.debug(f"Calculating embeddings for {len(sequences)} sequences.")
+        try:
+            embeddings = get_batch_embeddings(sequences)
+
+            for protein, embedding in zip(proteins, embeddings):
+                protein.embedding = embedding.tolist()
+                protein.save()
+        finally:
+            # Release cached memory even if embedding or saving fails.
+            free_memory()
+
 
 if __name__ == "__main__":
     eedb = Pyeed("bolt://127.0.0.1:7687")
diff --git a/pyproject.toml b/pyproject.toml
index 7228765..b4ebd88 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pyeed"
-version = "0.4.0"
+version = "0.4.1"
 description = "Toolkit to create, annotate, and analyze sequence data"
 authors = ["haeussma <83341109+haeussma@users.noreply.github.com>"]
 license = "MIT"
@@ -28,6 +28,10 @@ bio = "^1.7.1"
 loguru = "^0.7.2"
 neomodel = "^5.3.3"
 shapely = "^2.0.6"
+torch = "^2.4.1"
+transformers = "^4.45.2"
+scikit-learn = "^1.5.2"
+numpy = "^2.1.2"
 
 [tool.poetry.group.dev.dependencies]
 mkdocs-material = "^9.5.9"
diff --git a/test.ipynb b/test.ipynb
index 103198c..5550747 100644
--- a/test.ipynb
+++ b/test.ipynb
@@ -4,7 +4,16 @@
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/max/Library/Caches/pypoetry/virtualenvs/pyeed-iiMJg_Qc-py3.11/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], "source": [ "from pyeed import Pyeed\n", "from pyeed.model import GOAnnotation, Protein" @@ -56,7 +65,7 @@ "text": [ "📡 Connected to database.\n", "All data has been wiped from the database.\n", - "\n" + "\n" ] } ], @@ -87,19 +96,28 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-10-14 00:25:34.005\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36mmake_request\u001b[0m:\u001b[36m142\u001b[0m - \u001b[34m\u001b[1mSending 5 requests in batches of 5\u001b[0m\n", - "\u001b[32m2024-10-14 00:25:34.009\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'P04182,Q6QDP7,P04182,P29758,A0A851UXD9'}\u001b[0m\n", - "\u001b[32m2024-10-14 00:25:34.109\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'A0A8C6HVU6,A0A8C6GQ10,A0A1U7QEB0,A0A6I9L5L6,G3HVE0'}\u001b[0m\n", - "\u001b[32m2024-10-14 00:25:34.209\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'A0A8J6G992,A0A8C6W4W5,A0A8B9YUY7,L8I4V3,A0A6P3IYQ1'}\u001b[0m\n", - "\u001b[32m2024-10-14 00:25:34.453\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | 
\u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'A0A452EKJ3,A0A6P5B7Q0,F1MYG0,A0A5J5MK22,A0A6J0Y425'}\u001b[0m\n", - "\u001b[32m2024-10-14 00:25:34.553\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'Q3ZCF5'}\u001b[0m\n" + "\u001b[32m2024-10-14 16:18:48.472\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36mmake_request\u001b[0m:\u001b[36m142\u001b[0m - \u001b[34m\u001b[1mSending 9 requests in batches of 5\u001b[0m\n", + "\u001b[32m2024-10-14 16:18:48.476\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'P04182,Q6QDP7,P04182,P29758,A0A851UXD9'}\u001b[0m\n", + "\u001b[32m2024-10-14 16:18:48.577\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'A0A8C6HVU6,A0A8C6GQ10,A0A1U7QEB0,A0A6I9L5L6,G3HVE0'}\u001b[0m\n", + "\u001b[32m2024-10-14 16:18:48.806\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 
'accession': 'A0A8J6G992,A0A8C6W4W5,A0A8B9YUY7,L8I4V3,A0A6P3IYQ1'}\u001b[0m\n", + "\u001b[32m2024-10-14 16:18:48.985\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'A0A452EKJ3,A0A6P5B7Q0,F1MYG0,A0A5J5MK22,A0A6J0Y425'}\u001b[0m\n", + "\u001b[32m2024-10-14 16:18:49.249\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'Q3ZCF5,P00330,J8LIG6,A0AA35J9C9,P00331'}\u001b[0m\n", + "\u001b[32m2024-10-14 16:18:49.470\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'J8Q680,J5PRJ1,A0A1X7R1I9,Q6FQA4,C5DNB7'}\u001b[0m\n", + "\u001b[32m2024-10-14 16:18:49.573\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'Q9P4C2,C5DHM6,Q757I1,A0A7H9HSD9,P20369'}\u001b[0m\n", + "\u001b[32m2024-10-14 16:18:49.838\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'H2AXS6,G0W4V9,A0A1G4M9V8,A0A1G4KF85,A0A1G4JJF2'}\u001b[0m\n", 
+ "\u001b[32m2024-10-14 16:18:49.967\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.adapter.primary_db_adapter\u001b[0m:\u001b[36msend_request\u001b[0m:\u001b[36m123\u001b[0m - \u001b[34m\u001b[1mSending request to https://www.ebi.ac.uk/proteins/api/proteins with parameters: {'format': 'json', 'accession': 'G8ZTZ5,A0A1G4MBD6,A0A7H9HSJ3,J7SA96,G0VK69'}\u001b[0m\n", + "\u001b[32m2024-10-14 16:18:50.503\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.pyeed\u001b[0m:\u001b[36mcalculate_sequence_embeddings\u001b[0m:\u001b[36m57\u001b[0m - \u001b[34m\u001b[1mFound 44 proteins without embeddings.\u001b[0m\n", + "\u001b[32m2024-10-14 16:18:50.509\u001b[0m | \u001b[34m\u001b[1mDEBUG \u001b[0m | \u001b[36mpyeed.pyeed\u001b[0m:\u001b[36mcalculate_sequence_embeddings\u001b[0m:\u001b[36m61\u001b[0m - \u001b[34m\u001b[1mCalculating embeddings for 44 sequences.\u001b[0m\n", + "Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t33_650M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. 
Default to no truncation.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "{'nodes': 69, 'relationships': 179}\n" + "{'nodes': 135, 'relationships': 293}\n" ] } ], @@ -126,10 +144,35 @@ " \"A0A5J5MK22\",\n", " \"A0A6J0Y425\",\n", " \"Q3ZCF5\",\n", + " \"P00330\", # ADH\n", + " \"J8LIG6\",\n", + " \"A0AA35J9C9\",\n", + " \"P00331\",\n", + " \"J8Q680\",\n", + " \"J5PRJ1\",\n", + " \"A0A1X7R1I9\",\n", + " \"Q6FQA4\",\n", + " \"C5DNB7\",\n", + " \"Q9P4C2\",\n", + " \"C5DHM6\",\n", + " \"Q757I1\",\n", + " \"A0A7H9HSD9\",\n", + " \"P20369\",\n", + " \"H2AXS6\",\n", + " \"G0W4V9\",\n", + " \"A0A1G4M9V8\",\n", + " \"A0A1G4KF85\",\n", + " \"A0A1G4JJF2\",\n", + " \"G8ZTZ5\",\n", + " \"A0A1G4MBD6\",\n", + " \"A0A7H9HSJ3\",\n", + " \"J7SA96\",\n", + " \"G0VK69\",\n", "]\n", "\n", "# Fetch proteins from primary database\n", "eedb.fetch_from_primary_db(ids)\n", + "eedb.calculate_sequence_embeddings()\n", "\n", "# number of nodes and edges in db\n", "print(eedb.db.stats())" @@ -158,10 +201,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "Number of proteins in database: 20\n", - "{'accession_id': 'P04182', 'sequence': 'MLSKLASLQTVAALRRGLRTSVASATSVATKKTEQGPPSSEYIFERESKYGAHNYHPLPVALERGKGIYMWDVEGRQYFDFLSAYGAVSQGHCHPKIIEAMKSQVDKLTLTSRAFYNNVLGEYEEYITKLFNYNKVLPMNTGVEAGETACKLARRWGYTVKGIQKYKAKIVFAVGNFWGRTLSAVSSSTDPTSYDGFGPFMPGFETIPYNDLPALERALQDPNVAAFMVEPIQGEAGVIVPDPGYLTGVRELCTRHQVLFIADEIQTGLARTGRWLAVDHENVRPDIVLLGKALSGGLYPVSAVLCDDDIMLTIKPGEHGSTYGGNPLGCRIAIAALEVLEEEHLAENADKMGAILRKELMKLPSDVVTAVRGKGLLNAIVIRETKDCDAWKVCLRLRDNGLLAKPTHGDIIRLAPPLVIKEDEIRESVEIINKTILSF', 'name': 'Ornithine aminotransferase, mitochondrial', 'seq_length': 439, 'mol_weight': 48333.0, 'ec_number': '2.6.1.13', 'nucleotide_id': None, 'locus_tag': None, 'structure_ids': None, 'go_terms': None, 'embedding': None, 'element_id_property': '4:2dbbe7d3-51e1-4903-a514-4dd4aed7696d:203'}\n", + "Number of proteins in database: 44\n", + "{'accession_id': 'P04182', 'sequence': 
'MLSKLASLQTVAALRRGLRTSVASATSVATKKTEQGPPSSEYIFERESKYGAHNYHPLPVALERGKGIYMWDVEGRQYFDFLSAYGAVSQGHCHPKIIEAMKSQVDKLTLTSRAFYNNVLGEYEEYITKLFNYNKVLPMNTGVEAGETACKLARRWGYTVKGIQKYKAKIVFAVGNFWGRTLSAVSSSTDPTSYDGFGPFMPGFETIPYNDLPALERALQDPNVAAFMVEPIQGEAGVIVPDPGYLTGVRELCTRHQVLFIADEIQTGLARTGRWLAVDHENVRPDIVLLGKALSGGLYPVSAVLCDDDIMLTIKPGEHGSTYGGNPLGCRIAIAALEVLEEEHLAENADKMGAILRKELMKLPSDVVTAVRGKGLLNAIVIRETKDCDAWKVCLRLRDNGLLAKPTHGDIIRLAPPLVIKEDEIRESVEIINKTILSF', 'name': 'Ornithine aminotransferase, mitochondrial', 'seq_length': 439, 'mol_weight': 48333.0, 'ec_number': '2.6.1.13', 'nucleotide_id': None, 'locus_tag': None, 'structure_ids': None, 'go_terms': None, 'embedding': [0.02024856023490429, -0.10120689868927002, -0.054875459522008896, 0.05940677598118782, -0.08618494868278503, -0.029551653191447258, 0.09230533987283707, -0.051247984170913696, -0.14654004573822021, 0.02974345162510872, 0.057191673666238785, -0.08889872580766678, 0.14024749398231506, 0.17638203501701355, 0.03450097143650055, 0.08764003217220306, 0.024486850947141647, 0.1010783389210701, -0.03536701574921608, 0.010831189341843128, 0.13994470238685608, -0.008208542130887508, 0.08778387308120728, -0.06917404383420944, -0.14067070186138153, 0.012566761113703251, 0.007692092098295689, -0.0831497460603714, 0.02742845192551613, -0.22349904477596283, 0.02116805873811245, 0.0315057709813118, 0.13410906493663788, 0.023250417783856392, -0.005980873480439186, 0.08939117193222046, -0.11512637883424759, 0.07771635055541992, 0.0008585193427279592, 0.019618116319179535, 0.03975960984826088, -0.034718647599220276, 0.20850276947021484, -0.14582329988479614, -0.016126694157719612, -0.05384471267461777, -0.03726068139076233, 0.12207911163568497, -0.011061558499932289, -0.11828935891389847, -0.20746827125549316, 0.02503921277821064, 0.0532228983938694, 0.144418403506279, 0.029017755761742592, -0.0142306387424469, -0.0313822440803051, -0.06383419781923294, -7.995564374141395e-05, 0.13239522278308868, -0.11705058813095093, 
0.028009044006466866, -0.06976060569286346, 0.09761255234479904, -0.063786581158638, 0.05692310631275177, 0.15504327416419983, -0.018106607720255852, 0.11691496521234512, -0.0655292272567749, 0.02884158119559288, -0.05444662272930145, 0.018265506252646446, 0.02579304203391075, -0.04605093225836754, 0.014390230178833008, -0.012364896014332771, 0.23866473138332367, -0.009956937283277512, 0.0681992843747139, 0.014134470373392105, 0.02794279344379902, -0.10487978160381317, 0.08972960710525513, -0.0912482962012291, -0.09848162531852722, 0.02055089920759201, 0.029058776795864105, 0.14218510687351227, 0.02496551349759102, 0.031931325793266296, 0.16415734589099884, 0.04803087189793587, 0.006804416421800852, -0.008162754587829113, 0.04275595396757126, 0.015986235812306404, -0.03268745541572571, 0.08824224770069122, 0.09635967016220093, 0.1823301464319229, -0.07360901683568954, -0.09456747025251389, -0.07724674046039581, -0.16656287014484406, -0.14528372883796692, -0.08138983696699142, 0.061651408672332764, 0.0021833048667758703, -0.04141315817832947, 0.0076423645950853825, -0.04459812864661217, 0.07975580543279648, 0.02211489900946617, -0.029687168076634407, 0.005693844519555569, -0.17859789729118347, -0.06285973638296127, -0.04598310589790344, 0.17897313833236694, -0.08883719891309738, -0.006319078616797924, 0.06714661419391632, -0.04161020368337631, -0.10419580340385437, -0.07310382276773453, 0.146962970495224, 0.08816196024417877, -0.10586261749267578, -0.031728193163871765, -0.04375051334500313, -0.047378286719322205, 0.030666152015328407, -0.1353800892829895, -0.014605922624468803, 0.1563176065683365, 0.15227675437927246, 0.05652746930718422, 0.02366151660680771, -0.08338523656129837, 0.09168574959039688, -0.15236665308475494, -0.06580774486064911, 0.022683361545205116, 0.18279708921909332, 0.30025309324264526, -0.005162137560546398, 0.04054735228419304, -0.11110367625951767, 0.1477659046649933, 0.14293374121189117, -0.17026466131210327, 0.0051727802492678165, 
0.04558205232024193, 0.010130445472896099, -0.09954708069562912, 0.05359900742769241, 0.15307672321796417, 0.04194203019142151, -0.04382958263158798, -0.1927614063024521, 0.11217568814754486, 0.07116460055112839, -0.07825478911399841, 0.07008392363786697, 0.10073629766702652, 0.16728831827640533, 0.08653615415096283, 0.08844856172800064, 0.06653032451868057, -0.1302960067987442, -0.06850302964448929, 0.16061058640480042, -0.14906901121139526, -0.11815643310546875, -0.0945635512471199, -0.07603438198566437, 0.08288383483886719, 0.04238039627671242, 0.004362911451607943, -0.01701570488512516, -0.09059951454401016, 0.09787600487470627, 0.03224221244454384, -0.011840535327792168, -0.11788751929998398, -0.11228067427873611, 0.01578008197247982, 0.08044200390577316, -0.050935640931129456, 0.006653961725533009, 0.04129105806350708, -0.016114138066768646, -0.1804916113615036, 0.12004414200782776, 0.08699070662260056, 0.05172829329967499, 0.01498870924115181, -0.1566164195537567, -0.12350823730230331, 0.030897095799446106, 0.1268661618232727, -0.14114680886268616, -0.0042837243527174, -0.07116500288248062, -0.05182066559791565, 0.017959758639335632, -0.02647753804922104, -0.2292952835559845, -0.1586570292711258, -0.0003755104844458401, 0.10735882818698883, 0.06963561475276947, -0.013053910806775093, -0.1742338240146637, 0.05144227296113968, 0.042464517056941986, -0.06541845947504044, 0.20838643610477448, -0.060216788202524185, 0.0571635402739048, -0.14732177555561066, 0.24624955654144287, -0.11695057153701782, 0.08509759604930878, -0.051088809967041016, -0.13435448706150055, 0.06139412522315979, -0.26016560196876526, 0.017035098746418953, 0.15031199157238007, 0.04677288606762886, -0.07068074494600296, -0.048508986830711365, 0.29367393255233765, 0.15618152916431427, 0.2820050120353699, 0.043994076550006866, -0.160979762673378, -0.24123862385749817, -0.12286628037691116, 0.15596705675125122, 0.002191054867580533, 0.04166155681014061, 0.01566813886165619, 
0.0059838066808879375, 0.10580437630414963, 0.0005123030277900398, -0.05302106589078903, 0.08531664311885834, -0.3055320382118225, 0.11194870620965958, -0.010545526631176472, -0.02240646444261074, -0.1403033435344696, 0.029825836420059204, 0.09574121981859207, -0.010166076011955738, -0.07026774436235428, -0.26879456639289856, -0.025891803205013275, 0.10971372574567795, 0.03595172241330147, 0.09800836443901062, -0.13005897402763367, -0.09532411396503448, -0.16931308805942535, -0.08578775823116302, 0.05837809666991234, -0.005076675675809383, -0.0056008100509643555, 0.06914830952882767, -0.08532405644655228, 0.09586989134550095, -0.020517682656645775, -0.014993557706475258, 0.03339797630906105, 0.08349625766277313, -0.01848652958869934, 0.045515935868024826, 0.08933272212743759, -0.03875561058521271, 0.17289817333221436, -0.02203812077641487, 0.028600890189409256, -0.3312375247478485, -0.017497489228844643, 0.18198943138122559, 0.13468331098556519, 0.11163417249917984, 0.02053936757147312, 0.0741971880197525, 0.05844476819038391, -0.02745579369366169, -0.050529636442661285, 0.01913490891456604, 0.04597477242350578, 0.13102567195892334, 0.1383875161409378, -0.013494573533535004, 0.21210713684558868, 0.10059105604887009, -0.1482100784778595, 0.14469875395298004, 0.02646399848163128, 0.08720823377370834, -0.003809984540566802, 0.0763804018497467, -0.06920450925827026, -0.06119345501065254, -0.06375177949666977, -0.03699449077248573, 0.011271377094089985, -0.06528067588806152, -0.009433823637664318, 0.0014158852864056826, -0.07885332405567169, -0.050813715904951096, 0.030872460454702377, -0.024577734991908073, -0.010174094699323177, -0.02980596385896206, -0.026072382926940918, -0.04333676025271416, -0.22810889780521393, -0.0939566120505333, 0.10181492567062378, 0.026783794164657593, -0.014762178994715214, -0.013456735759973526, -0.1313604712486267, -0.043617136776447296, -0.02049385942518711, -0.049815379083156586, -0.07851074635982513, -0.022455329075455666, 
0.19660161435604095, -0.10905441641807556, 0.038022320717573166, 0.0014186608605086803, 0.0923498347401619, 0.040489763021469116, -0.06285952031612396, -0.02051447331905365, -0.06787365674972534, -0.15376047790050507, -0.095383420586586, -0.0003280747914686799, -0.03407712280750275, -0.02108774147927761, 0.006160557735711336, 0.019009821116924286, 0.00033684735535643995, -0.1021285280585289, 0.30446287989616394, -0.09730297327041626, 0.07469581812620163, -0.13798987865447998, 0.01917666383087635, 0.04283057525753975, -0.0536748506128788, 0.04248344898223877, -0.1026068925857544, 0.0017098721582442522, -0.013081393204629421, -0.01233616378158331, 0.08622084558010101, -0.08948462456464767, 0.1467512547969818, -0.14721159636974335, 0.1255239099264145, 0.08868531882762909, -0.036404799669981, -0.09460549056529999, 0.0028824806213378906, 0.08929192274808884, 0.033463481813669205, 0.1493336409330368, 0.18243302404880524, 0.005216342397034168, 0.01778189279139042, 0.059227216988801956, -0.009260527789592743, -0.08512048423290253, 0.1051655113697052, 0.020001472905278206, -0.08737093955278397, 0.02480717934668064, -0.07241638749837875, 0.04852200672030449, 0.0024197744205594063, -0.06279566884040833, 0.047167401760816574, 0.018892256543040276, 0.0023336068261414766, -0.10302535444498062, 0.10976292192935944, 0.11765879392623901, 0.11515920609235764, -0.08366940915584564, 0.019063729792833328, -0.002999610500410199, -0.1782609224319458, 0.15108954906463623, -0.013570889830589294, 0.11654751747846603, -0.025835838168859482, -0.07280769944190979, -0.016377035528421402, -0.11234492808580399, 0.010014514438807964, -0.027213219553232193, 0.17620645463466644, -0.15494410693645477, 0.08090037107467651, 0.12706249952316284, 0.1470927596092224, 0.17929023504257202, 0.19450244307518005, -0.02966410294175148, -0.05005091801285744, 0.08243829011917114, -0.09022480249404907, 0.04065537452697754, -0.002519000554457307, -0.025284478440880775, -0.04351834952831268, 0.04616590961813927, 
0.12006843090057373, 0.012185708619654179, -0.009103317745029926, -0.04189823940396309, -0.041814591735601425, 0.08847274631261826, 0.06028532236814499, 0.14623475074768066, 0.01387952733784914, -0.010984296910464764, -0.14735060930252075, -0.015200859867036343, -0.02784755825996399, -0.020713338628411293, 0.12101811170578003, -0.17540663480758667, 0.057228293269872665, -0.1106448695063591, 0.06890348345041275, -0.03264918923377991, 0.10605201125144958, -0.01991543360054493, 0.11312338709831238, 0.029462451115250587, 0.06109760329127312, -0.06062428653240204, -0.041553620249032974, 0.12487564980983734, 0.06535398960113525, 0.14045560359954834, -0.10569396615028381, 0.2600948214530945, 0.0807153657078743, 0.049442924559116364, 0.030626796185970306, -0.029005402699112892, 0.022074809297919273, -0.08597365766763687, 0.00031049398239701986, 0.022770436480641365, -0.013262610882520676, 0.09789551049470901, 0.01812072843313217, 0.05302220955491066, 0.1267828643321991, -0.060110487043857574, 0.10897479206323624, -0.06584092974662781, -0.07712274044752121, 0.025456175208091736, -0.050237443298101425, -0.07959134131669998, -0.1272030770778656, -0.09985269606113434, -0.09860970079898834, 0.07897760719060898, 0.06900321692228317, 0.026292139664292336, 0.01792265474796295, 0.037446245551109314, 0.03919033333659172, -0.0352974459528923, -0.019194146618247032, -0.02049831673502922, -0.08958612382411957, 0.04343271255493164, 0.06868553161621094, 0.08785471320152283, 0.08740116655826569, -0.05674044042825699, 0.05336346849799156, -0.10339788347482681, -0.23855671286582947, -0.15420715510845184, -0.15423451364040375, -0.015500390902161598, -0.021716145798563957, 0.04922948032617569, 0.09897596389055252, 0.26406827569007874, 0.06140316650271416, 0.10921572893857956, 0.01940695196390152, -0.017130140215158463, 0.004089604131877422, -0.03490280732512474, 0.12443830817937851, -0.11853078007698059, 0.05865868553519249, -0.17225712537765503, 0.06290682405233383, -0.2054528445005417, 
0.060116905719041824, -0.1378956139087677, -0.0359942764043808, -0.04071856290102005, -0.11661553382873535, -0.17280980944633484, -0.011577999219298363, 0.020944803953170776, 0.1294027864933014, -0.11107338219881058, 0.06643210351467133, 0.10235550999641418, -0.03726346418261528, -0.14067699015140533, -0.1344561129808426, -0.08124163746833801, 0.035732902586460114, -0.15031319856643677, -0.3293832838535309, -0.004662099294364452, -0.09775374084711075, -0.12812276184558868, 0.03596089035272598, 0.026845093816518784, -0.07316321134567261, -0.0490727499127388, 0.042702775448560715, 0.07023901492357254, 0.05402340739965439, -0.08028438687324524, -0.03499436751008034, 0.06712958961725235, -0.15439504384994507, 0.23653313517570496, -0.002085133222863078, -0.14339089393615723, -0.17206282913684845, -0.016279302537441254, 0.034319330006837845, 0.07099718600511551, -0.1098177507519722, 0.11373815685510635, 0.055597491562366486, 0.0683194100856781, 0.19847805798053741, -0.10339944809675217, 0.10560564696788788, 0.07138531655073166, 0.17929230630397797, 0.003437547944486141, -0.1428922861814499, 0.3423954248428345, -0.0706791952252388, 0.04645683616399765, -0.06315120309591293, 0.14840300381183624, 0.028263572603464127, 0.1167556568980217, -0.03096133843064308, 0.02880556881427765, -0.05438739433884621, -0.08203216642141342, 0.04271448031067848, 0.12751607596874237, 0.043448831886053085, 0.0807381123304367, 0.0013076410396024585, 0.1797882467508316, -0.12761518359184265, 0.10751351714134216, -0.09137621521949768, -0.1855749785900116, 0.1176833063364029, 0.09625124931335449, 0.1433810144662857, 0.1432649791240692, -0.0004332279204390943, -0.06642164289951324, 0.02638934552669525, -0.053056661039590836, -0.04425548017024994, 0.09958036243915558, 0.023525893688201904, -0.08794252574443817, -0.017259882763028145, -0.03994952514767647, -0.006985538173466921, -0.0741734579205513, 0.020498478785157204, 0.028516877442598343, -0.006524977274239063, 0.10634642094373703, 
-0.12504811584949493, 0.07255450636148453, -0.0423094667494297, -0.009143682196736336, -0.03481057658791542, -0.06569904088973999, 0.040476784110069275, -0.05309542268514633, -0.09613602608442307, -0.03770593926310539, -0.03438562899827957, 0.04392457753419876, -0.0481325201690197, 0.1407165229320526, 0.03972296044230461, -0.033222101628780365, 0.17605775594711304, -0.13402539491653442, 0.0006810897029936314, 0.019407112151384354, 0.00377766415476799, 0.09363320469856262, 0.06214385852217674, -0.07423675805330276, -0.05837290361523628, -0.008451418951153755, 0.015009402297437191, -0.02300109900534153, -0.1797635704278946, -0.13766421377658844, 0.14759740233421326, -0.08850917965173721, -0.01113794557750225, -0.11841753125190735, 0.11984574794769287, -0.0011192884994670749, -0.06350966542959213, 0.06972872465848923, 0.06312360614538193, 0.28447049856185913, 0.09141506254673004, -0.09978833794593811, 0.13340063393115997, 0.0003196934994775802, 0.019351491704583168, 0.18355481326580048, -0.22981563210487366, 0.22609420120716095, 0.023419765755534172, -0.09055270254611969, -0.13085849583148956, -0.06363873928785324, 0.18052780628204346, -0.05926336720585823, -0.12221534550189972, 0.1034654751420021, 0.0133915850892663, -0.0024950499646365643, 0.12711454927921295, -0.10337857156991959, 0.028463860973715782, -0.15540336072444916, -0.04307795315980911, 0.04374640807509422, -0.030178451910614967, -0.03271753340959549, 0.009815776720643044, 0.008050522767007351, 0.09385637938976288, 0.025749146938323975, -0.1621808111667633, 0.2813403904438019, 0.04740709438920021, -0.1749933958053589, 0.03495623916387558, 0.16400285065174103, -0.0312114879488945, -0.07321140915155411, -0.1341012716293335, -0.014672468416392803, -0.010986143723130226, -0.06743758916854858, 0.012777184136211872, 0.05768841505050659, 0.004030787386000156, -0.13529808819293976, 0.09876207262277603, -0.0468611866235733, 0.16974088549613953, 0.02501581609249115, 0.1264452338218689, 0.06082111969590187, 
-0.04366388916969299, -0.1348801702260971, 0.11086985468864441, -0.04927532374858856, 0.04213939607143402, 0.0022172981407493353, -0.0407545305788517, -0.1574772745370865, -0.04986068978905678, -0.0014988631010055542, 0.078058160841465, 0.1313028633594513, 0.057681307196617126, 0.1949898898601532, -0.0628645196557045, -0.0016831703251227736, 0.08270734548568726, -0.017322279512882233, 0.07633494585752487, 0.03549951687455177, -0.08475852757692337, -0.010466239415109158, 0.020711777731776237, 0.08277484029531479, 0.19449098408222198, 0.06682382524013519, -0.021322594955563545, -0.06364545226097107, 0.15940390527248383, 0.10968450456857681, 0.006011195480823517, 0.15889807045459747, 0.05654590576887131, -0.030204879119992256, -0.09032057970762253, -0.09764868766069412, -0.047035444527864456, 0.05938642472028732, 0.038441140204668045, -0.07379613816738129, -0.06486465781927109, -0.051326218992471695, -0.007158554159104824, -0.08362764865159988, 0.1434265673160553, 0.065452940762043, 0.022280436009168625, 0.06983362883329391, 0.0655921995639801, -0.12224975228309631, -0.033968400210142136, -0.03560259938240051, 0.15075121819972992, 0.013390030711889267, -0.022397074848413467, -0.030516093596816063, -0.28465282917022705, 0.15296955406665802, 0.16366440057754517, 0.05877356976270676, -0.08421339839696884, 0.045836590230464935, -0.10028616338968277, -0.015635836869478226, -0.017035694792866707, 0.056484222412109375, 0.07420908659696579, 0.025918789207935333, -0.03058515302836895, -0.05792619660496712, -0.022607674822211266, 0.25674837827682495, -0.0014167989138513803, -0.004049188923090696, 0.12579216063022614, -0.07458031177520752, -0.17107318341732025, 0.047931741923093796, -0.18749310076236725, 0.04970903694629669, -0.061829593032598495, 0.027982240542769432, 0.1599223017692566, -0.00226485263556242, 0.001498253783211112, 0.077382892370224, 0.10719826072454453, 0.07760629802942276, -0.014595226384699345, 0.12043289095163345, 0.00364903686568141, -0.018542015925049782, 
0.09836103022098541, -0.0463043749332428, -0.03663811460137367, 0.0602332204580307, 0.09662344306707382, 0.17383702099323273, -0.014050626195967197, 0.04072811082005501, -0.0599776916205883, -0.0003507315705064684, 0.07573600858449936, -0.016957415267825127, -0.06819498538970947, -0.023668017238378525, 0.07168107479810715, -0.10938413441181183, 0.11158768087625504, 0.04963001608848572, 0.06237021088600159, -0.07346033304929733, 0.07000505179166794, -0.08804061263799667, 0.12148215621709824, 0.02390368841588497, -0.013349214568734169, -0.05994154140353203, -0.08506876975297928, 0.07834086567163467, -0.1575133353471756, 0.05813775956630707, 0.005945154000073671, -0.2141299843788147, 0.13088612258434296, 0.01053778175264597, -0.08737921714782715, -0.04595981538295746, -0.09423398971557617, 0.041714493185281754, 0.006146503612399101, 0.06865673512220383, 0.1765589863061905, 0.04925873503088951, -0.1008756086230278, 0.120145782828331, -0.08902676403522491, -0.1649080365896225, 0.005595429800450802, 0.0852993056178093, -0.02678864076733589, 0.154261514544487, -0.12273611128330231, -0.11140090227127075, 0.09432768821716309, -0.08696961402893066, 0.07798225432634354, -0.09729574620723724, 0.0619821697473526, -0.06626961380243301, -0.040821559727191925, -0.00639294134452939, 0.07503759860992432, -0.07308249175548553, 0.03398127108812332, -0.09971911460161209, 0.09805792570114136, -0.06189515069127083, -0.009234662167727947, -0.10652638971805573, -0.10740005970001221, -0.02302333153784275, 0.17406855523586273, -0.1372072547674179, 0.01680213212966919, 0.05663597956299782, -0.11098883301019669, 0.04227234050631523, 0.044499147683382034, -0.001520995399914682, -0.05206606164574623, 0.02838118001818657, 0.048317231237888336, 0.015240950509905815, 0.11754842102527618, -0.0352788046002388, 0.21274851262569427, -0.0329473577439785, -0.028921179473400116, -0.07112666964530945, -0.09715057164430618, -0.06778454035520554, 0.022032231092453003, 0.018970537930727005, 
0.03126441314816475, 0.06691600382328033, -0.10067760944366455, 0.03824927657842636, -0.03312929347157478, 0.11001989990472794, -0.05194339156150818, 0.0930628702044487, -0.05583773925900459, -0.11217235773801804, -0.19513151049613953, 0.07856069505214691, 0.10589566826820374, 0.027995122596621513, 0.02027960494160652, 0.048165787011384964, -0.06600058823823929, 0.10910913348197937, -0.15872815251350403, 0.041132859885692596, 0.06503879278898239, -0.16081373393535614, -0.13546857237815857, 0.06100233271718025, -0.014645378105342388, -0.05948017165064812, -0.030159970745444298, -0.15036892890930176, -0.07771341502666473, 0.05175530165433884, 0.06613273173570633, 0.18414521217346191, 0.09452458471059799, -0.008265281096100807, -0.023393195122480392, 0.14289632439613342, -0.18488292396068573, -0.021262196823954582, 0.09066349267959595, 0.01606258936226368, -0.007634471170604229, 0.00475526787340641, 0.18113818764686584, -0.047970250248909, -0.14326009154319763, -0.17971082031726837, -0.10171552002429962, 0.03649020567536354, 0.15469276905059814, 0.20238709449768066, 0.0016547990962862968, 0.0719163715839386, -0.013998174108564854, -0.21352717280387878, 0.17589367926120758, 0.1794709712266922, -0.10998624563217163, -0.005287290550768375, 0.07963693886995316, -0.07185697555541992, -0.12464912235736847, -0.03830643743276596, 0.12657229602336884, -0.011505347676575184, 0.1710480898618698, 0.009055320173501968, -0.054782915860414505, -0.20316065847873688, -0.04156111553311348, -0.01828072965145111, -0.094750314950943, 0.05611417442560196, 0.06745430082082748, 0.12387123703956604, -0.11713898181915283, -0.11337531358003616, 0.05419988930225372, 0.06234319880604744, -0.04764193296432495, -0.08524003624916077, 0.09431871771812439, -0.023813113570213318, -0.14496110379695892, 0.21064816415309906, 0.12100528925657272, 0.034939274191856384, 0.15224230289459229, 0.08137591928243637, -0.026566853746771812, 0.08891071379184723, 0.08593989163637161, -0.042530518025159836, 
-0.1674778312444687, -0.12656015157699585, -0.014471560716629028, -0.06510739773511887, -0.03426086902618408, 0.07466757297515869, -0.2082493156194687, -0.08307965844869614, -0.10420498996973038, 0.1558913290500641, 0.028072470799088478, -0.09716740250587463, -0.0400727279484272, 0.026112109422683716, -0.09331346303224564, -0.16026845574378967, -0.06261994689702988, -0.06983047723770142, 0.17970888316631317, 0.03440011665225029, -0.2055584192276001, -0.038036469370126724, -0.062006875872612, -0.11172258853912354, -0.01173026580363512, 0.048689160495996475, -0.15363647043704987, 0.01403244212269783, -0.04444508999586105, -0.11052147299051285, 0.17785683274269104, -0.11111737787723541, 0.01503710262477398, -0.07455936074256897, 0.130684033036232, 0.03330593928694725, -0.07816138118505478, 0.025988135486841202, -0.03797400742769241, 0.17918355762958527, -0.20728078484535217, 0.1412011831998825, 0.13217709958553314, -0.014316472224891186, 0.005081279203295708, -0.1542651206254959, -0.1471462845802307, -0.09261956065893173, 0.002631417941302061, -0.18974906206130981, 0.04395926743745804, 0.0046070897951722145, 0.02943548560142517, 0.10677950084209442, -0.04952944070100784, 0.18688569962978363, 0.017092207446694374, 0.09805809706449509, 0.0049245948903262615, 0.03453328087925911, -0.06016426533460617, 0.025776885449886322, -0.013974452391266823, -0.03190900757908821, -0.009159940294921398, -0.0430099219083786, 0.03210628032684326, 0.1152113825082779, 0.012184173800051212, -0.17191091179847717, -0.07570149004459381, 0.012117799371480942, 0.07418178021907806, -0.00985792838037014, -0.0775870755314827, 0.00496249133720994, -0.0061594764702022076, 0.11624157428741455, -0.07058001309633255, -0.15174758434295654, -0.1581566035747528, 0.056423790752887726, -0.05476456135511398, 0.0769076943397522, -0.16390331089496613, -0.01811409741640091, -0.08917003124952316, 0.05810399726033211, -0.009724590927362442, 0.02164386957883835, -0.0851968303322792, -0.15616726875305176, 
0.06831739842891693, -0.0495242178440094, -0.013792694546282291, -0.05393576622009277, 0.017160745337605476, 0.15392270684242249, -0.11543647199869156, 0.033591095358133316, 0.08896441757678986, -0.012205970473587513, -0.18523526191711426, -0.10642052441835403, 0.09048060327768326, 0.0031809716019779444, -0.016715984791517258, 0.12602239847183228, 0.1810091733932495, 0.04803398251533508, -0.02864702232182026, 0.034848134964704514, 0.16351750493049622, -0.08177510648965836, 0.0799950361251831, 0.049287762492895126, -0.21479704976081848, -0.05787177383899689, 0.20870274305343628, 0.19670717418193817, -0.02917899191379547, -0.014673368073999882, 0.1077510267496109, -0.11776024103164673, -0.050609856843948364, -0.12978774309158325, 0.04686860740184784, 0.21039901673793793, -0.08759512007236481, -0.05787206441164017, -0.04313012585043907, 0.19960099458694458, -0.07932645827531815, -0.08118028193712234, 0.013904622755944729, 0.005269734188914299, 0.03049347922205925, 0.13643129169940948, -0.03759344294667244, -0.03999306261539459, 0.08956252038478851, -0.25976264476776123, -0.12035997956991196, -0.1726505160331726, 0.07922554016113281, -0.08997832983732224, -0.08604609221220016, -0.010965113528072834, 0.04433104023337364, -0.020485447719693184, 0.00030001599225215614, -0.0828426256775856, 0.06677427142858505, 0.15722711384296417, 0.039878327399492264, -0.11692765355110168, -0.09322967380285263, -0.0693441852927208, -0.049696072936058044, -0.0005302872741594911, -0.06479810923337936, -0.12493802607059479, -0.0847337618470192, 0.03811774030327797, -0.12163253128528595, 0.07494144886732101, 0.14497092366218567, -0.04469547048211098, -0.07968832552433014, 0.11526643484830856, -0.08222935348749161, -0.12995383143424988, -0.02542913518846035, -0.05531290918588638, -0.0524732731282711, -0.0205096323043108, -0.043371353298425674, 0.06826610118150711, -0.1230267658829689, 0.09968044608831406, -2.8656516075134277, -0.14637483656406403, 0.1458456814289093, -0.07855939120054245, 
0.003973464947193861, 0.04395592212677002, -0.14763695001602173, -0.1054278165102005, 0.25260257720947266, -0.06251044571399689, 0.13749021291732788, -0.04347370192408562, 0.022073261439800262, -0.031100915744900703, -0.1201740950345993, 0.04671158641576767, 0.038935620337724686, 0.11523193120956421, -0.012500127777457237, 0.043766628950834274, 0.07818104326725006, -0.03578841686248779, 0.017005236819386482, 0.3377397656440735, 0.05754644051194191, -0.07806985080242157, 0.06679005175828934, 0.07263384014368057, 0.1570320725440979, 0.15458165109157562, -0.07175267487764359, 0.014109877869486809, 0.020472025498747826, 0.18863946199417114, 0.07830576598644257, 0.1482079178094864, -0.0048288521356880665, 0.03671528398990631, 0.13236811757087708, -0.02578289993107319, 0.3087286651134491, -0.2583500146865845, -0.06534243375062943, -0.01825057528913021, 0.078977070748806, -0.04472985118627548, -0.18951094150543213, -0.15289969742298126, 0.15358944237232208, -0.05776436626911163, 0.09396965056657791, -0.011777237057685852, 0.05286586657166481, -0.10848789662122726, 0.011509508825838566, -0.046027541160583496, -0.06410644203424454, 0.17234528064727783, 0.097799152135849, 0.20909744501113892, 0.03900535777211189, 0.0025442042388021946, -0.058702096343040466, 0.07223135977983475, 0.11101941019296646, -0.11970861256122589, -0.2834143340587616, -0.12498477846384048, -0.03989219665527344, -0.061618901789188385, 0.12398082762956619, -0.12197677791118622, -0.1326797902584076, 0.05142227187752724, 0.008867129683494568, 0.02931891195476055, -0.1950065642595291, -0.24886098504066467, 0.175762340426445, 0.006455547176301479, -0.2528555691242218, -0.014882724732160568, -0.11596345901489258, 0.1824953854084015, 0.19246917963027954, -0.004538584966212511, 0.07001838833093643, 0.03681835159659386, -0.054071009159088135, 0.02595287747681141, 0.13316476345062256, 0.13073943555355072, 0.04011424258351326, -0.018131745979189873, 0.07026994973421097, -0.06842922419309616, -0.06123876944184303, 
-0.03567569702863693, 0.07594582438468933, 0.09212852269411087, 0.11816873401403427, 0.041473619639873505, 0.0419541634619236, 0.10429178178310394, 0.15900875627994537, -0.07231968641281128, -0.000486445554997772, -0.0281683262437582, -0.0066808899864554405, 0.06101728975772858, 0.011451605707406998, -0.06977646797895432, -0.08920690417289734, -0.03161873668432236, -0.021749207749962807, -0.044524289667606354, 0.04088452085852623, -0.30895310640335083, 0.07990328967571259, 0.14182297885417938], 'element_id_property': '4:2dbbe7d3-51e1-4903-a514-4dd4aed7696d:138'}\n", "Number of proteins associated with GO:0005739: 11\n", - "Number of organisms with at least two proteins: 3\n" + "Number of organisms with at least two proteins: 9\n" ] } ], @@ -194,9 +237,1153 @@ "print(\"Number of organisms with at least two proteins: \", len(organisms))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "plotlyServerURL": "https://plot.ly" + }, + "data": [ + { + "marker": { + "color": "red", + "size": 8 + }, + "mode": "markers", + "name": "ADH", + "text": [ + "accession_id: Q6FQA4
name: alcohol dehydrogenase
seq_length: 352
mol_weight: 37545.0
ec_number: 1.1.1.1", + "accession_id: A0A7H9HSD9
name: alcohol dehydrogenase
seq_length: 348
mol_weight: 36974.0
ec_number: 1.1.1.1", + "accession_id: C5DHM6
name: alcohol dehydrogenase
seq_length: 348
mol_weight: 36599.0
ec_number: 1.1.1.1", + "accession_id: P20369
name: Alcohol dehydrogenase 1
seq_length: 350
mol_weight: 37261.0
ec_number: 1.1.1.1", + "accession_id: Q757I1
name: alcohol dehydrogenase
seq_length: 351
mol_weight: 37158.0
ec_number: 1.1.1.1", + "accession_id: Q9P4C2
name: Alcohol dehydrogenase 2
seq_length: 348
mol_weight: 36968.0
ec_number: 1.1.1.1", + "accession_id: A0A1X7R1I9
name: alcohol dehydrogenase
seq_length: 348
mol_weight: 36784.0
ec_number: 1.1.1.1", + "accession_id: A0A1G4JJF2
name: alcohol dehydrogenase
seq_length: 351
mol_weight: 37147.0
ec_number: 1.1.1.1", + "accession_id: A0A1G4KF85
name: alcohol dehydrogenase
seq_length: 351
mol_weight: 37390.0
ec_number: 1.1.1.1", + "accession_id: C5DNB7
name: alcohol dehydrogenase
seq_length: 351
mol_weight: 37069.0
ec_number: 1.1.1.1", + "accession_id: A0A1G4M9V8
name: alcohol dehydrogenase
seq_length: 351
mol_weight: 37261.0
ec_number: 1.1.1.1", + "accession_id: G0W4V9
name: alcohol dehydrogenase
seq_length: 351
mol_weight: 37144.0
ec_number: 1.1.1.1", + "accession_id: H2AXS6
name: alcohol dehydrogenase
seq_length: 351
mol_weight: 37171.0
ec_number: 1.1.1.1", + "accession_id: A0A1G4MBD6
name: alcohol dehydrogenase
seq_length: 351
mol_weight: 37009.0
ec_number: 1.1.1.1", + "accession_id: G0VK69
name: alcohol dehydrogenase
seq_length: 351
mol_weight: 37045.0
ec_number: 1.1.1.1", + "accession_id: P00330
name: Alcohol dehydrogenase 1
seq_length: 348
mol_weight: 36849.0
ec_number: 1.1.1.1", + "accession_id: J8Q680
name: alcohol dehydrogenase
seq_length: 348
mol_weight: 36673.0
ec_number: 1.1.1.1", + "accession_id: P00331
name: Alcohol dehydrogenase 2
seq_length: 348
mol_weight: 36732.0
ec_number: 1.1.1.1" + ], + "type": "scatter", + "x": [ + -186.77227783203125, + 23.778718948364258, + 40.66227722167969, + -193.2668914794922, + -8.318355560302734, + -130.4921417236328, + -151.20591735839844, + 134.99868774414062, + -63.87946319580078, + 103.29740905761719, + -30.65559959411621, + -28.87839698791504, + 131.7727508544922, + 6.913041114807129, + -108.14468383789062, + 208.11013793945312, + 110.35122680664062, + 137.19406127929688 + ], + "y": [ + -341.30322265625, + -499.9996337890625, + -648.2156982421875, + -296.458740234375, + -686.1292114257812, + -260.89154052734375, + -83.85225677490234, + -284.53155517578125, + -467.99359130859375, + -218.72593688964844, + -244.74966430664062, + -324.8351135253906, + -173.20262145996094, + -268.6771545410156, + -98.4156723022461, + -467.0985107421875, + -619.5795288085938, + -525.6121826171875 + ] + }, + { + "marker": { + "color": "blue", + "size": 8 + }, + "mode": "markers", + "name": "proline biosynthesis", + "text": [ + "accession_id: A0A8B9YUY7
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48104.0
ec_number: 2.6.1.13", + "accession_id: A0A8C6W4W5
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48284.0
ec_number: 2.6.1.13", + "accession_id: A0A8J6G992
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48239.0
ec_number: 2.6.1.13", + "accession_id: L8I4V3
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48104.0
ec_number: 2.6.1.13", + "accession_id: A0A1U7QEB0
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48328.0
ec_number: 2.6.1.13", + "accession_id: A0A8C6GQ10
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48314.0
ec_number: 2.6.1.13", + "accession_id: A0A452EKJ3
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48132.0
ec_number: 2.6.1.13", + "accession_id: A0A8C6HVU6
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48370.0
ec_number: 2.6.1.13", + "accession_id: A0A6I9L5L6
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48331.0
ec_number: 2.6.1.13", + "accession_id: A0A5J5MK22
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48074.0
ec_number: 2.6.1.13", + "accession_id: A0A6J0Y425
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48090.0
ec_number: 2.6.1.13", + "accession_id: A0A6P5B7Q0
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48118.0
ec_number: 2.6.1.13", + "accession_id: F1MYG0
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48118.0
ec_number: 2.6.1.13", + "accession_id: A0A851UXD9
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48355.0
ec_number: 2.6.1.13", + "accession_id: P04182
name: Ornithine aminotransferase, mitochondrial
seq_length: 439
mol_weight: 48333.0
ec_number: 2.6.1.13", + "accession_id: G3HVE0
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48431.0
ec_number: 2.6.1.13", + "accession_id: P29758
name: Ornithine aminotransferase, mitochondrial
seq_length: 439
mol_weight: 48355.0
ec_number: 2.6.1.13", + "accession_id: A0A6P3IYQ1
name: Ornithine aminotransferase
seq_length: 439
mol_weight: 48104.0
ec_number: 2.6.1.13", + "accession_id: Q3ZCF5
name: Ornithine aminotransferase, mitochondrial
seq_length: 439
mol_weight: 48075.0
ec_number: 2.6.1.13" + ], + "type": "scatter", + "x": [ + -410.7798767089844, + 204.80616760253906, + 473.3436279296875, + -457.86761474609375, + 396.9206848144531, + 379.79766845703125, + -311.87481689453125, + 345.0708312988281, + 420.676025390625, + -385.58782958984375, + -354.5928039550781, + -404.9271545410156, + -404.9271545410156, + 410.7213134765625, + 271.6178894042969, + 327.4793395996094, + 410.7213134765625, + -468.85711669921875, + -484.5874938964844 + ], + "y": [ + 530.3589477539062, + 669.9793090820312, + 732.4609985351562, + 498.2153625488281, + 757.3943481445312, + 550.4390258789062, + 597.9725952148438, + 619.3108520507812, + 712.5540161132812, + 691.67138671875, + 489.0978088378906, + 598.8034057617188, + 598.8034057617188, + 607.118896484375, + 651.4642333984375, + 773.041259765625, + 607.118896484375, + 557.7855834960938, + 663.695068359375 + ] + }, + { + "marker": { + "color": "green", + "size": 8 + }, + "mode": "markers", + "name": "no annotation", + "text": [ + "accession_id: A0A7H9HSJ3
name: alcohol dehydrogenase
seq_length: 350
mol_weight: 36998.0
ec_number: 1.1.1.1", + "accession_id: J5PRJ1
name: alcohol dehydrogenase
seq_length: 348
mol_weight: 36645.0
ec_number: 1.1.1.1", + "accession_id: G8ZTZ5
name: alcohol dehydrogenase
seq_length: 351
mol_weight: 37303.0
ec_number: 1.1.1.1", + "accession_id: A0AA35J9C9
name: alcohol dehydrogenase
seq_length: 348
mol_weight: 36808.0
ec_number: 1.1.1.1", + "accession_id: J8LIG6
name: alcohol dehydrogenase
seq_length: 348
mol_weight: 36765.0
ec_number: 1.1.1.1", + "accession_id: J7SA96
name: alcohol dehydrogenase
seq_length: 350
mol_weight: 37091.0
ec_number: 1.1.1.1", + "accession_id: Q6QDP7
name: Cyclic AMP-responsive element-binding protein 3-like protein 2
seq_length: 521
mol_weight: 57379.0
ec_number: None" + ], + "type": "scatter", + "x": [ + 46.96210479736328, + 89.03892517089844, + 290.4442138671875, + 183.9209442138672, + 233.75531005859375, + 192.45892333984375, + 293.7549743652344 + ], + "y": [ + -434.9825439453125, + -574.7026977539062, + -258.2119140625, + -415.0147705078125, + -411.8081970214844, + -148.38223266601562, + -182.1365203857422 + ] + } + ], + "layout": { + "height": 600, + "template": { + "data": { + "bar": [ + { + "error_x": { + "color": "#2a3f5f" + }, + "error_y": { + "color": "#2a3f5f" + }, + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + }, + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "bar" + } + ], + "barpolar": [ + { + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + }, + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "barpolar" + } + ], + "carpet": [ + { + "aaxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "baxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "type": "carpet" + } + ], + "choropleth": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "choropleth" + } + ], + "contour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "contour" + } + ], + "contourcarpet": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "contourcarpet" 
+ } + ], + "heatmap": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmap" + } + ], + "heatmapgl": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmapgl" + } + ], + "histogram": [ + { + "marker": { + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "histogram" + } + ], + "histogram2d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2d" + } + ], + "histogram2dcontour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 
0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2dcontour" + } + ], + "mesh3d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "mesh3d" + } + ], + "parcoords": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "parcoords" + } + ], + "pie": [ + { + "automargin": true, + "type": "pie" + } + ], + "scatter": [ + { + "fillpattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + }, + "type": "scatter" + } + ], + "scatter3d": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter3d" + } + ], + "scattercarpet": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattercarpet" + } + ], + "scattergeo": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergeo" + } + ], + "scattergl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergl" + } + ], + "scattermapbox": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermapbox" + } + ], + "scatterpolar": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolar" + } + ], + "scatterpolargl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolargl" + } + ], + "scatterternary": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterternary" + } + ], + "surface": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 
0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "surface" + } + ], + "table": [ + { + "cells": { + "fill": { + "color": "#EBF0F8" + }, + "line": { + "color": "white" + } + }, + "header": { + "fill": { + "color": "#C8D4E3" + }, + "line": { + "color": "white" + } + }, + "type": "table" + } + ] + }, + "layout": { + "annotationdefaults": { + "arrowcolor": "#2a3f5f", + "arrowhead": 0, + "arrowwidth": 1 + }, + "autotypenumbers": "strict", + "coloraxis": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "colorscale": { + "diverging": [ + [ + 0, + "#8e0152" + ], + [ + 0.1, + "#c51b7d" + ], + [ + 0.2, + "#de77ae" + ], + [ + 0.3, + "#f1b6da" + ], + [ + 0.4, + "#fde0ef" + ], + [ + 0.5, + "#f7f7f7" + ], + [ + 0.6, + "#e6f5d0" + ], + [ + 0.7, + "#b8e186" + ], + [ + 0.8, + "#7fbc41" + ], + [ + 0.9, + "#4d9221" + ], + [ + 1, + "#276419" + ] + ], + "sequential": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "sequentialminus": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ] + }, + 
"colorway": [ + "#636efa", + "#EF553B", + "#00cc96", + "#ab63fa", + "#FFA15A", + "#19d3f3", + "#FF6692", + "#B6E880", + "#FF97FF", + "#FECB52" + ], + "font": { + "color": "#2a3f5f" + }, + "geo": { + "bgcolor": "white", + "lakecolor": "white", + "landcolor": "#E5ECF6", + "showlakes": true, + "showland": true, + "subunitcolor": "white" + }, + "hoverlabel": { + "align": "left" + }, + "hovermode": "closest", + "mapbox": { + "style": "light" + }, + "paper_bgcolor": "white", + "plot_bgcolor": "#E5ECF6", + "polar": { + "angularaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "radialaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "scene": { + "xaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "yaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "zaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + } + }, + "shapedefaults": { + "line": { + "color": "#2a3f5f" + } + }, + "ternary": { + "aaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "baxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "caxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "title": { + "x": 0.05 + }, + "xaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + }, + "yaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": 
"white", + "zerolinewidth": 2 + } + } + }, + "title": { + "text": "2D t-SNE Visualization of Protein Embeddings" + }, + "width": 900, + "xaxis": { + "title": { + "text": "t-SNE Dimension 1" + } + }, + "yaxis": { + "title": { + "text": "t-SNE Dimension 2" + } + } + } + } + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.manifold import TSNE\n", + "import plotly.graph_objects as go\n", + "\n", + "# Annotations of interest (Alcohol dehydrogenase annotation and proline biosynthesis annotation --> two different protein families)\n", + "adh_go_id = \"GO:0004022\"\n", + "ploline_biosynthesis_go_id = \"GO:0055129\"\n", + "\n", + "# Query to get all proteins with embeddings and get the label based on the annotations\n", + "query = \"\"\"\n", + "MATCH (p:Protein)\n", + "OPTIONAL MATCH (p)-[:ASSOCIATED_WITH]-(g:GOAnnotation)\n", + "WITH p, collect(g.go_id) AS go_ids\n", + "RETURN p.accession_id AS protein_id, \n", + " p.embedding AS embedding,\n", + " CASE \n", + " WHEN 'GO:0055129' IN go_ids THEN 'proline biosynthesis'\n", + " WHEN 'GO:0004022' IN go_ids THEN 'ADH'\n", + " ELSE 'no annotation'\n", + " END AS label\n", + "\"\"\"\n", + "\n", + "result = eedb.db.execute_read(query)\n", + "\n", + "# Prepare data for visualization\n", + "data = dict(\n", + " protein_id=[],\n", + " embedding=[],\n", + " label=[],\n", + ")\n", + "for record in result:\n", + " data[\"protein_id\"].append(record[\"protein_id\"])\n", + " data[\"embedding\"].append(record[\"embedding\"])\n", + " data[\"label\"].append(record[\"label\"])\n", + "\n", + "protein_ids, embeddings, labels = (\n", + " data[\"protein_id\"],\n", + " np.array(data[\"embedding\"]),\n", + " data[\"label\"],\n", + ")\n", + "\n", + "colors = []\n", + "for label in labels:\n", + " if label == \"ADH\":\n", + " colors.append(\"red\")\n", + " elif label == \"proline biosynthesis\":\n", + " colors.append(\"blue\")\n", + " else:\n", + " colors.append(\"green\")\n", + 
"\n", + "hover_texts = [\n", + " \"<br>\".join(\n", + " [\n", + " f\"{key}: {value}\"\n", + " for key, value in Protein.nodes.get(\n", + " accession_id=protein_id\n", + " ).__dict__.items()\n", + " if key\n", + " in [\n", + " \"accession_id\",\n", + " \"mol_weight\",\n", + " \"ec_number\",\n", + " \"seq_length\",\n", + " \"mol_weight\",\n", + " \"name\",\n", + " ]\n", + " ]\n", + " )\n", + " for protein_id in protein_ids\n", + "]\n", + "\n", + "\n", + "# Apply t-SNE to Reduce Embeddings to 2D\n", + "tsne = TSNE(n_components=2, random_state=42, perplexity=5, max_iter=3000)\n", + "embeddings_2d = tsne.fit_transform(embeddings)\n", + "\n", + "unique_labels = set(labels) # Find the unique labels\n", + "traces = []\n", + "\n", + "for label in unique_labels:\n", + " indices = [i for i, l in enumerate(labels) if l == label]\n", + " trace = go.Scatter(\n", + " x=[embeddings_2d[i, 0] for i in indices],\n", + " y=[embeddings_2d[i, 1] for i in indices],\n", + " mode=\"markers\",\n", + " marker=dict(\n", + " size=8,\n", + " color=colors[indices[0]],\n", + " ),\n", + " name=label,\n", + " text=[hover_texts[i] for i in indices],\n", + " )\n", + " traces.append(trace)\n", + "\n", + "layout = go.Layout(\n", + " title=\"2D t-SNE Visualization of Protein Embeddings\",\n", + " xaxis_title=\"t-SNE Dimension 1\",\n", + " yaxis_title=\"t-SNE Dimension 2\",\n", + " width=900,\n", + " height=600,\n", + ")\n", + "\n", + "fig = go.Figure(data=traces, layout=layout)\n", + "\n", + "fig.show()" + ] + }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ {