Skip to content

Commit

Permalink
Comment out indexing of gene and protein entities for UniProt (too many entities)
Browse files Browse the repository at this point in the history
  • Loading branch information
vemonet committed Dec 4, 2024
1 parent 67990b3 commit 0f9c457
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 40 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ __pycache__/

notebooks/*.csv
notebooks/*.txt
entities_embeddings.csv
entities_embeddings*.csv
uv.lock
6 changes: 3 additions & 3 deletions src/sparql_llm/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ async def chat(request: ChatCompletionRequest):
# # We also provide the example queries as previous messages to the LLM
# system_msg: list[Message] = [{"role": "system", "content": settings.system_prompt}]

# Get the most relevant examples SPARQL queries from the search engine
# 1. Get the most relevant example SPARQL queries from the search engine
query_hits = vectordb.search(
collection_name=settings.docs_collection_name,
query_vector=query_embeddings,
Expand All @@ -201,7 +201,7 @@ async def chat(request: ChatCompletionRequest):
prompt_with_context += f"{query_hit.payload['question']}:\n\n```sparql\n# {query_hit.payload['endpoint_url']}\n{query_hit.payload['answer']}\n```\n\n"
# prompt_with_context += f"{query_hit.payload['question']}\nQuery to run in SPARQL endpoint {query_hit.payload['endpoint_url']}\n\n{query_hit.payload['answer']}\n\n"

# Get the most relevant documents other than SPARQL query examples from the search engine (ShEx shapes, general infos)
# 2. Get the most relevant documents other than example SPARQL queries from the search engine (ShEx shapes, general information)
docs_hits = vectordb.search(
collection_name=settings.docs_collection_name,
query_vector=query_embeddings,
Expand Down Expand Up @@ -234,7 +234,7 @@ async def chat(request: ChatCompletionRequest):
else:
prompt_with_context += f"Information about: {docs_hit.payload['question']}\nRelated to SPARQL endpoint {docs_hit.payload['endpoint_url']}\n\n{docs_hit.payload['answer']}\n\n"

# Now extract entities from the user question
# 3. Extract potential entities from the user question
entities_list = extract_entities(question)
for entity in entities_list:
prompt_with_context += f'\n\nEntities found in the user question for "{" ".join(entity["term"])}":\n\n'
Expand Down
81 changes: 45 additions & 36 deletions src/sparql_llm/embed_entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@
from sparql_llm.config import get_embedding_model, get_vectordb, settings
from sparql_llm.utils import query_sparql

# Run the script to extract entities from endpoints and generate embeddings for them (long):
# uv run python src/sparql_llm/embed_entities.py


entities_embeddings_dir = os.path.join("data", "embeddings")
entities_embeddings_filepath = os.path.join(entities_embeddings_dir, "entities_embeddings.csv")

Expand Down Expand Up @@ -78,8 +82,9 @@ def generate_embeddings_for_entities():
"query": """PREFIX genex: <http://purl.org/genex#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?uri ?label {
?uri a <http://www.ebi.ac.uk/efo/EFO_0000399> .
?uri rdfs:label ?label .}""",
?uri a <http://www.ebi.ac.uk/efo/EFO_0000399> .
?uri rdfs:label ?label .
}""",
},
"bgee_gene": {
"uri": "http://purl.org/net/orth#Gene",
Expand All @@ -92,10 +97,12 @@ def generate_embeddings_for_entities():
PREFIX dc: <http://purl.org/dc/terms/>
SELECT DISTINCT ?uri ?label {
?uri a orth:Gene .
{?uri rdfs:label ?label .}
UNION {
?uri dc:identifier ?label .}
}""",
{
?uri rdfs:label ?label .
} UNION {
?uri dc:identifier ?label .
}
}""",
},
"oma_protein": {
"uri": "http://purl.org/net/orth#Protein",
Expand All @@ -109,9 +116,10 @@ def generate_embeddings_for_entities():
SELECT DISTINCT ?uri ?label {
?uri a orth:Protein .
{?uri rdfs:label ?label .}
UNION {
?uri dc:identifier ?label .}
{
?uri rdfs:label ?label .
} UNION {
?uri dc:identifier ?label .}
}""",
},
"oma_gene": {
Expand All @@ -126,31 +134,32 @@ def generate_embeddings_for_entities():
?uri a orth:Protein .
?uri rdfs:label ?label .}""",
},
"uniprot_gene": {
"uri": "http://purl.uniprot.org/core/Gene",
"label": "Gene",
"description": "A region (or regions) that includes all of the sequence elements necessary to encode a functional transcript. A gene may include regulatory regions, transcribed regions and/or other functional sequence regions.",
"endpoint": "https://sparql.uniprot.org/sparql/",
"pagination": True,
"query": """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX up: <http://purl.uniprot.org/core/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?uri ?label {
?uri a up:Gene .
?uri skos:prefLabel ?label .}""",
},
"uniprot_protein": {
"uri": "http://purl.uniprot.org/core/Protein",
"label": "Protein",
"description": "A sequence of amino acids linked by peptide bonds which may lack appreciable tertiary structure and may not be liable to irreversible denaturation.",
"endpoint": "https://sparql.uniprot.org/sparql/",
"pagination": True,
"query": """PREFIX up: <http://purl.uniprot.org/core/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?uri ?label {
?uri a up:Protein .
?uri rdfs:label ?label .}""",
},
# TODO: there are far too many UniProt genes and proteins to index; should we skip indexing them entirely?
# "uniprot_gene": {
# "uri": "http://purl.uniprot.org/core/Gene",
# "label": "Gene",
# "description": "A region (or regions) that includes all of the sequence elements necessary to encode a functional transcript. A gene may include regulatory regions, transcribed regions and/or other functional sequence regions.",
# "endpoint": "https://sparql.uniprot.org/sparql/",
# "pagination": True,
# "query": """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
# PREFIX up: <http://purl.uniprot.org/core/>
# PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
# SELECT ?uri ?label {
# ?uri a up:Gene .
# ?uri skos:prefLabel ?label .}""",
# },
# "uniprot_protein": {
# "uri": "http://purl.uniprot.org/core/Protein",
# "label": "Protein",
# "description": "A sequence of amino acids linked by peptide bonds which may lack appreciable tertiary structure and may not be liable to irreversible denaturation.",
# "endpoint": "https://sparql.uniprot.org/sparql/",
# "pagination": True,
# "query": """PREFIX up: <http://purl.uniprot.org/core/>
# PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
# SELECT ?uri ?label {
# ?uri a up:Protein .
# ?uri rdfs:label ?label .}""",
# },
"uniprot_species": {
"uri": "http://purl.uniprot.org/core/Taxon",
"label": "species",
Expand Down Expand Up @@ -228,8 +237,8 @@ def generate_embeddings_for_entities():
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX up: <http://purl.uniprot.org/core/>
SELECT ?uri ?label ?type WHERE {
?uri a up:Disease ;
skos:prefLabel ?label .
?uri a up:Disease ;
skos:prefLabel ?label .
}""",
},
}
Expand Down

0 comments on commit 0f9c457

Please sign in to comment.