From 0f9c4575448f76efcd2448bf58cbf57dce665d5b Mon Sep 17 00:00:00 2001
From: Vincent Emonet <vincent.emonet@gmail.com>
Date: Wed, 4 Dec 2024 17:29:28 +0100
Subject: [PATCH] Comment out indexing of gene and protein entities for UniProt
 (too many)

---
 .gitignore                       |  2 +-
 src/sparql_llm/api.py            |  6 +--
 src/sparql_llm/embed_entities.py | 81 ++++++++++++++++++--------------
 3 files changed, 49 insertions(+), 40 deletions(-)

diff --git a/.gitignore b/.gitignore
index 2e68311..6db8678 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,5 +8,5 @@ __pycache__/
 
 notebooks/*.csv
 notebooks/*.txt
-entities_embeddings.csv
+entities_embeddings*.csv
 uv.lock
diff --git a/src/sparql_llm/api.py b/src/sparql_llm/api.py
index 3f6c98e..d1acdf4 100644
--- a/src/sparql_llm/api.py
+++ b/src/sparql_llm/api.py
@@ -183,7 +183,7 @@ async def chat(request: ChatCompletionRequest):
     # # We also provide the example queries as previous messages to the LLM
     # system_msg: list[Message] = [{"role": "system", "content": settings.system_prompt}]
 
-    # Get the most relevant examples SPARQL queries from the search engine
+    # 1. Get the most relevant example SPARQL queries from the search engine
     query_hits = vectordb.search(
         collection_name=settings.docs_collection_name,
         query_vector=query_embeddings,
@@ -201,7 +201,7 @@ async def chat(request: ChatCompletionRequest):
         prompt_with_context += f"{query_hit.payload['question']}:\n\n```sparql\n# {query_hit.payload['endpoint_url']}\n{query_hit.payload['answer']}\n```\n\n"
         # prompt_with_context += f"{query_hit.payload['question']}\nQuery to run in SPARQL endpoint {query_hit.payload['endpoint_url']}\n\n{query_hit.payload['answer']}\n\n"
 
-    # Get the most relevant documents other than SPARQL query examples from the search engine (ShEx shapes, general infos)
+    # 2. Get the most relevant documents other than SPARQL query examples from the search engine (ShEx shapes, general information)
     docs_hits = vectordb.search(
         collection_name=settings.docs_collection_name,
         query_vector=query_embeddings,
@@ -234,7 +234,7 @@ async def chat(request: ChatCompletionRequest):
         else:
             prompt_with_context += f"Information about: {docs_hit.payload['question']}\nRelated to SPARQL endpoint {docs_hit.payload['endpoint_url']}\n\n{docs_hit.payload['answer']}\n\n"
 
-    # Now extract entities from the user question
+    # 3. Extract potential entities from the user question
     entities_list = extract_entities(question)
     for entity in entities_list:
         prompt_with_context += f'\n\nEntities found in the user question for "{" ".join(entity["term"])}":\n\n'
diff --git a/src/sparql_llm/embed_entities.py b/src/sparql_llm/embed_entities.py
index 328a801..bf2500d 100644
--- a/src/sparql_llm/embed_entities.py
+++ b/src/sparql_llm/embed_entities.py
@@ -10,6 +10,10 @@
 from sparql_llm.config import get_embedding_model, get_vectordb, settings
 from sparql_llm.utils import query_sparql
 
+# Run this script to extract entities from the endpoints and generate embeddings for them (long-running):
+# uv run python src/sparql_llm/embed_entities.py
+
+
 entities_embeddings_dir = os.path.join("data", "embeddings")
 entities_embeddings_filepath = os.path.join(entities_embeddings_dir, "entities_embeddings.csv")
 
@@ -78,8 +82,9 @@ def generate_embeddings_for_entities():
             "query": """PREFIX genex: <http://purl.org/genex#>
     PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
     SELECT DISTINCT ?uri ?label {
-    ?uri a <http://www.ebi.ac.uk/efo/EFO_0000399> .
-    ?uri rdfs:label ?label .}""",
+        ?uri a <http://www.ebi.ac.uk/efo/EFO_0000399> .
+        ?uri rdfs:label ?label .
+    }""",
         },
         "bgee_gene": {
             "uri": "http://purl.org/net/orth#Gene",
@@ -92,10 +97,12 @@ def generate_embeddings_for_entities():
     PREFIX dc: <http://purl.org/dc/terms/>
     SELECT DISTINCT ?uri ?label {
         ?uri a orth:Gene .
-        {?uri rdfs:label ?label .}
-        UNION {
-        ?uri dc:identifier ?label .}
-     }""",
+        {
+            ?uri rdfs:label ?label .
+        } UNION {
+            ?uri dc:identifier ?label .
+        }
+    }""",
         },
         "oma_protein": {
             "uri": "http://purl.org/net/orth#Protein",
@@ -109,9 +116,10 @@ def generate_embeddings_for_entities():
 
     SELECT DISTINCT ?uri ?label {
     ?uri a orth:Protein .
-    {?uri rdfs:label ?label .}
-    UNION {
-    ?uri dc:identifier ?label .}
+    {
+        ?uri rdfs:label ?label .
+    } UNION {
+        ?uri dc:identifier ?label .}
     }""",
         },
         "oma_gene": {
@@ -126,31 +134,32 @@ def generate_embeddings_for_entities():
     ?uri a orth:Protein .
     ?uri rdfs:label ?label .}""",
         },
-        "uniprot_gene": {
-            "uri": "http://purl.uniprot.org/core/Gene",
-            "label": "Gene",
-            "description": "A region (or regions) that includes all of the sequence elements necessary to encode a functional transcript. A gene may include regulatory regions, transcribed regions and/or other functional sequence regions.",
-            "endpoint": "https://sparql.uniprot.org/sparql/",
-            "pagination": True,
-            "query": """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
-    PREFIX up: <http://purl.uniprot.org/core/>
-    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
-    SELECT  ?uri ?label {
-    ?uri a up:Gene .
-    ?uri skos:prefLabel ?label .}""",
-        },
-        "uniprot_protein": {
-            "uri": "http://purl.uniprot.org/core/Protein",
-            "label": "Protein",
-            "description": "A sequence of amino acids linked by peptide bonds which may lack appreciable tertiary structure and may not be liable to irreversible denaturation.",
-            "endpoint": "https://sparql.uniprot.org/sparql/",
-            "pagination": True,
-            "query": """PREFIX up: <http://purl.uniprot.org/core/>
-    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
-    SELECT  ?uri ?label {
-    ?uri a up:Protein .
-    ?uri rdfs:label ?label .}""",
-        },
+        # TODO: there are far too many UniProt genes and proteins to index; should we skip indexing them entirely?
+    #     "uniprot_gene": {
+    #         "uri": "http://purl.uniprot.org/core/Gene",
+    #         "label": "Gene",
+    #         "description": "A region (or regions) that includes all of the sequence elements necessary to encode a functional transcript. A gene may include regulatory regions, transcribed regions and/or other functional sequence regions.",
+    #         "endpoint": "https://sparql.uniprot.org/sparql/",
+    #         "pagination": True,
+    #         "query": """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
+    # PREFIX up: <http://purl.uniprot.org/core/>
+    # PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+    # SELECT  ?uri ?label {
+    # ?uri a up:Gene .
+    # ?uri skos:prefLabel ?label .}""",
+    #     },
+    #     "uniprot_protein": {
+    #         "uri": "http://purl.uniprot.org/core/Protein",
+    #         "label": "Protein",
+    #         "description": "A sequence of amino acids linked by peptide bonds which may lack appreciable tertiary structure and may not be liable to irreversible denaturation.",
+    #         "endpoint": "https://sparql.uniprot.org/sparql/",
+    #         "pagination": True,
+    #         "query": """PREFIX up: <http://purl.uniprot.org/core/>
+    # PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+    # SELECT  ?uri ?label {
+    # ?uri a up:Protein .
+    # ?uri rdfs:label ?label .}""",
+    #     },
         "uniprot_species": {
             "uri": "http://purl.uniprot.org/core/Taxon",
             "label": "species",
@@ -228,8 +237,8 @@ def generate_embeddings_for_entities():
     PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
     PREFIX up: <http://purl.uniprot.org/core/>
     SELECT ?uri ?label ?type WHERE {
-      ?uri a up:Disease ;
-      	   skos:prefLabel ?label .
+        ?uri a up:Disease ;
+            skos:prefLabel ?label .
     }""",
         },
     }