From 0f9c4575448f76efcd2448bf58cbf57dce665d5b Mon Sep 17 00:00:00 2001 From: Vincent Emonet Date: Wed, 4 Dec 2024 17:29:28 +0100 Subject: [PATCH] comment indexing of gene and prots entities for uniprot (too many) --- .gitignore | 2 +- src/sparql_llm/api.py | 6 +-- src/sparql_llm/embed_entities.py | 81 ++++++++++++++++++-------------- 3 files changed, 49 insertions(+), 40 deletions(-) diff --git a/.gitignore b/.gitignore index 2e68311..6db8678 100644 --- a/.gitignore +++ b/.gitignore @@ -8,5 +8,5 @@ __pycache__/ notebooks/*.csv notebooks/*.txt -entities_embeddings.csv +entities_embeddings*.csv uv.lock diff --git a/src/sparql_llm/api.py b/src/sparql_llm/api.py index 3f6c98e..d1acdf4 100644 --- a/src/sparql_llm/api.py +++ b/src/sparql_llm/api.py @@ -183,7 +183,7 @@ async def chat(request: ChatCompletionRequest): # # We also provide the example queries as previous messages to the LLM # system_msg: list[Message] = [{"role": "system", "content": settings.system_prompt}] - # Get the most relevant examples SPARQL queries from the search engine + # 1. Get the most relevant examples SPARQL queries from the search engine query_hits = vectordb.search( collection_name=settings.docs_collection_name, query_vector=query_embeddings, @@ -201,7 +201,7 @@ async def chat(request: ChatCompletionRequest): prompt_with_context += f"{query_hit.payload['question']}:\n\n```sparql\n# {query_hit.payload['endpoint_url']}\n{query_hit.payload['answer']}\n```\n\n" # prompt_with_context += f"{query_hit.payload['question']}\nQuery to run in SPARQL endpoint {query_hit.payload['endpoint_url']}\n\n{query_hit.payload['answer']}\n\n" - # Get the most relevant documents other than SPARQL query examples from the search engine (ShEx shapes, general infos) + # 2. Get the most relevant documents other than SPARQL query examples from the search engine (ShEx shapes, general infos) docs_hits = vectordb.search( collection_name=settings.docs_collection_name, query_vector=query_embeddings, @@ -234,7 +234,7 @@ async def chat(request: ChatCompletionRequest): else: prompt_with_context += f"Information about: {docs_hit.payload['question']}\nRelated to SPARQL endpoint {docs_hit.payload['endpoint_url']}\n\n{docs_hit.payload['answer']}\n\n" - # Now extract entities from the user question + # 3. Extract potential entities from the user question entities_list = extract_entities(question) for entity in entities_list: prompt_with_context += f'\n\nEntities found in the user question for "{" ".join(entity["term"])}":\n\n' diff --git a/src/sparql_llm/embed_entities.py b/src/sparql_llm/embed_entities.py index 328a801..bf2500d 100644 --- a/src/sparql_llm/embed_entities.py +++ b/src/sparql_llm/embed_entities.py @@ -10,6 +10,10 @@ from sparql_llm.config import get_embedding_model, get_vectordb, settings from sparql_llm.utils import query_sparql +# Run the script to extract entities from endpoints and generate embeddings for them (long): +# uv run python src/sparql_llm/embed_entities.py + + entities_embeddings_dir = os.path.join("data", "embeddings") entities_embeddings_filepath = os.path.join(entities_embeddings_dir, "entities_embeddings.csv") @@ -78,8 +82,9 @@ def generate_embeddings_for_entities(): "query": """PREFIX genex: PREFIX rdfs: SELECT DISTINCT ?uri ?label { - ?uri a . - ?uri rdfs:label ?label .}""", + ?uri a . + ?uri rdfs:label ?label . + }""", }, "bgee_gene": { "uri": "http://purl.org/net/orth#Gene", @@ -92,10 +97,12 @@ def generate_embeddings_for_entities(): PREFIX dc: SELECT DISTINCT ?uri ?label { ?uri a orth:Gene . - {?uri rdfs:label ?label .} - UNION { - ?uri dc:identifier ?label .} - }""", + { + ?uri rdfs:label ?label . + } UNION { + ?uri dc:identifier ?label . + } + }""", }, "oma_protein": { "uri": "http://purl.org/net/orth#Protein", @@ -109,9 +116,10 @@ def generate_embeddings_for_entities(): SELECT DISTINCT ?uri ?label { ?uri a orth:Protein . - {?uri rdfs:label ?label .} - UNION { - ?uri dc:identifier ?label .} + { + ?uri rdfs:label ?label . + } UNION { + ?uri dc:identifier ?label .} }""", }, "oma_gene": { @@ -126,31 +134,32 @@ def generate_embeddings_for_entities(): ?uri a orth:Protein . ?uri rdfs:label ?label .}""", }, - "uniprot_gene": { - "uri": "http://purl.uniprot.org/core/Gene", - "label": "Gene", - "description": "A region (or regions) that includes all of the sequence elements necessary to encode a functional transcript. A gene may include regulatory regions, transcribed regions and/or other functional sequence regions.", - "endpoint": "https://sparql.uniprot.org/sparql/", - "pagination": True, - "query": """PREFIX skos: - PREFIX up: - PREFIX rdfs: - SELECT ?uri ?label { - ?uri a up:Gene . - ?uri skos:prefLabel ?label .}""", - }, - "uniprot_protein": { - "uri": "http://purl.uniprot.org/core/Protein", - "label": "Protein", - "description": "A sequence of amino acids linked by peptide bonds which may lack appreciable tertiary structure and may not be liable to irreversible denaturation.", - "endpoint": "https://sparql.uniprot.org/sparql/", - "pagination": True, - "query": """PREFIX up: - PREFIX rdfs: - SELECT ?uri ?label { - ?uri a up:Protein . - ?uri rdfs:label ?label .}""", - }, + # TODO: way too many UniProt genes, should we just ignore indexing genes? + # "uniprot_gene": { + # "uri": "http://purl.uniprot.org/core/Gene", + # "label": "Gene", + # "description": "A region (or regions) that includes all of the sequence elements necessary to encode a functional transcript. A gene may include regulatory regions, transcribed regions and/or other functional sequence regions.", + # "endpoint": "https://sparql.uniprot.org/sparql/", + # "pagination": True, + # "query": """PREFIX skos: + # PREFIX up: + # PREFIX rdfs: + # SELECT ?uri ?label { + # ?uri a up:Gene . + # ?uri skos:prefLabel ?label .}""", + # }, + # "uniprot_protein": { + # "uri": "http://purl.uniprot.org/core/Protein", + # "label": "Protein", + # "description": "A sequence of amino acids linked by peptide bonds which may lack appreciable tertiary structure and may not be liable to irreversible denaturation.", + # "endpoint": "https://sparql.uniprot.org/sparql/", + # "pagination": True, + # "query": """PREFIX up: + # PREFIX rdfs: + # SELECT ?uri ?label { + # ?uri a up:Protein . + # ?uri rdfs:label ?label .}""", + # }, "uniprot_species": { "uri": "http://purl.uniprot.org/core/Taxon", "label": "species", @@ -228,8 +237,8 @@ def generate_embeddings_for_entities(): PREFIX skos: PREFIX up: SELECT ?uri ?label ?type WHERE { - ?uri a up:Disease ; - skos:prefLabel ?label . + ?uri a up:Disease ; + skos:prefLabel ?label . }""", }, }