Skip to content

Commit

Permalink
improve indexing workflow
Browse files Browse the repository at this point in the history
  • Loading branch information
vemonet committed Dec 18, 2024
1 parent ea8d32c commit e7f88bb
Show file tree
Hide file tree
Showing 6 changed files with 100 additions and 69 deletions.
69 changes: 44 additions & 25 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -165,39 +165,58 @@ print("\n".join(issues))
>
> It can easily be adapted to use any LLM served through an OpenAI-compatible API. We plan to make configuration and deployment of a complete SPARQL LLM chat system easier in the future — let us know in the GitHub issues if you are interested!
Create a `.env` file at the root of the repository to provide secrets and API keys:
1. Create a `.env` file at the root of the repository to provide secrets and API keys:

```bash
OPENAI_API_KEY=sk-proj-YYY
GLHF_API_KEY=APIKEY_FOR_glhf.chat_USED_FOR_TEST_OPEN_SOURCE_MODELS
EXPASY_API_KEY=NOT_SO_SECRET_API_KEY_USED_BY_FRONTEND_TO_AVOID_SPAM_FROM_CRAWLERS
LOGS_API_KEY=SECRET_PASSWORD_TO_EASILY_ACCESS_LOGS_THROUGH_THE_API
```
```sh
OPENAI_API_KEY=sk-proj-YYY
GLHF_API_KEY=APIKEY_FOR_glhf.chat_USED_FOR_TEST_OPEN_SOURCE_MODELS
EXPASY_API_KEY=NOT_SO_SECRET_API_KEY_USED_BY_FRONTEND_TO_AVOID_SPAM_FROM_CRAWLERS
LOGS_API_KEY=SECRET_PASSWORD_TO_EASILY_ACCESS_LOGS_THROUGH_THE_API
```

Build the chat webpage (will be better integrated in the workflow in the future):
2. Build the chat UI webpage (will be better integrated in the workflow in the future):

```bash
cd chat-with-context
npm i
npm run build:demo
cd ..
```
```sh
cd chat-with-context
npm i
npm run build:demo
cd ..
```

Start the web UI, API, and similarity search engine in production (you might need to make some changes to the `compose.yml` file to adapt it to your server/proxy setup):
3. Start the vector database and web server:

```bash
docker compose up
```
In production (you might need to make some changes to the `compose.yml` file to adapt it to your server/proxy setup):

Start the stack locally for development, with code from `src` folder mounted in the container and automatic API reload on changes to the code:
```bash
docker compose up
```

```bash
docker compose -f compose.dev.yml up
```
Start the stack locally for development, with code from `src` folder mounted in the container and automatic API reload on changes to the code:

```bash
docker compose -f compose.dev.yml up
```

* Chat web UI available at http://localhost:8000
* OpenAPI Swagger UI available at http://localhost:8000/docs
* Vector database dashboard UI available at http://localhost:6333/dashboard
* Chat web UI available at http://localhost:8000
* OpenAPI Swagger UI available at http://localhost:8000/docs
* Vector database dashboard UI available at http://localhost:6333/dashboard

4. Run the script to index the resources (SPARQL endpoints listed in config file):

```sh
docker compose run api python src/sparql_llm/embed.py
```

> [!WARNING]
>
> **Experimental entities indexing**: generating embeddings for entities can take a lot of time, so we recommend running the embedding script on a machine with a GPU (it does not need to be a powerful one, but it should have a GPU; check out the [fastembed GPU docs](https://qdrant.github.io/fastembed/examples/FastEmbed_GPU/) to install the GPU drivers and dependencies)
>
> ```sh
> pip install -e ".[chat,gpu]"
> python src/sparql_llm/embed_entities.py
> ```
>
> Then move the CSV containing the embeddings to `data/embeddings/entities_embeddings.csv` before running the `embed.py` script
## 🧑‍💻 Contributing
Expand Down
1 change: 0 additions & 1 deletion prestart.sh

This file was deleted.

9 changes: 9 additions & 0 deletions src/sparql_llm/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,9 +216,18 @@ async def chat(request: ChatCompletionRequest):
]
),
limit=settings.retrieved_docs_count,
# group_by="iri",
# with_payload=True,
)
# TODO: vectordb.search_groups(
# https://qdrant.tech/documentation/concepts/search/#search-groups

# TODO: hybrid search? https://qdrant.github.io/fastembed/examples/Hybrid_Search/#about-qdrant
# we might want to group by iri for shex docs https://qdrant.tech/documentation/concepts/hybrid-queries/?q=hybrid+se#grouping
# https://qdrant.tech/documentation/concepts/search/#search-groups

prompt_with_context += "Here is some additional information that could be useful to answer the user question:\n\n"
# for docs_hit in docs_hits.groups:
for docs_hit in docs_hits:
if docs_hit.payload["doc_type"] == "shex":
prompt_with_context += f"ShEx shape for {docs_hit.payload['question']} in {docs_hit.payload['endpoint_url']}:\n```\n{docs_hit.payload['answer']}\n```\n\n"
Expand Down
8 changes: 6 additions & 2 deletions src/sparql_llm/embed.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ def load_schemaorg_description(endpoint: dict[str, str]) -> list[Document]:
"question": question,
"answer": "\n".join(descs),
"endpoint_url": endpoint["endpoint_url"],
"iri": endpoint["homepage"],
"doc_type": "schemaorg_description",
},
)
Expand Down Expand Up @@ -119,6 +120,7 @@ def load_ontology(endpoint: dict[str, str]) -> list[Document]:
"question": split.page_content,
"answer": "",
"endpoint_url": endpoint["endpoint_url"],
"iri": endpoint["ontology"],
"doc_type": "ontology",
},
)
Expand Down Expand Up @@ -175,6 +177,7 @@ def init_vectordb(vectordb_host: str = settings.vectordb_host) -> None:
The UniProt consortium is headed by Alex Bateman, Alan Bridge and Cathy Wu, supported by key staff, and receives valuable input from an independent Scientific Advisory Board.
""",
"endpoint_url": "https://sparql.uniprot.org/sparql/",
"iri": "http://www.uniprot.org/help/about",
"doc_type": "schemaorg_description",
},
)
Expand All @@ -201,8 +204,9 @@ def init_vectordb(vectordb_host: str = settings.vectordb_host) -> None:
collection_name=settings.entities_collection_name,
vectors_config=VectorParams(size=settings.embedding_dimensions, distance=Distance.COSINE),
)
if vectordb.get_collection(settings.entities_collection_name).points_count == 0:
load_entities_embeddings_to_vectordb()
# if vectordb.get_collection(settings.entities_collection_name).points_count == 0:
load_entities_embeddings_to_vectordb()


# docs = []
# # TODO: Add entities list to the vectordb
Expand Down
80 changes: 40 additions & 40 deletions src/sparql_llm/embed_entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,32 +134,6 @@ def generate_embeddings_for_entities():
?uri a orth:Protein .
?uri rdfs:label ?label .}""",
},
# TODO: way too many UniProt genes, should we just ignore indexing genes?
# "uniprot_gene": {
# "uri": "http://purl.uniprot.org/core/Gene",
# "label": "Gene",
# "description": "A region (or regions) that includes all of the sequence elements necessary to encode a functional transcript. A gene may include regulatory regions, transcribed regions and/or other functional sequence regions.",
# "endpoint": "https://sparql.uniprot.org/sparql/",
# "pagination": True,
# "query": """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
# PREFIX up: <http://purl.uniprot.org/core/>
# PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
# SELECT ?uri ?label {
# ?uri a up:Gene .
# ?uri skos:prefLabel ?label .}""",
# },
# "uniprot_protein": {
# "uri": "http://purl.uniprot.org/core/Protein",
# "label": "Protein",
# "description": "A sequence of amino acids linked by peptide bonds which may lack appreciable tertiary structure and may not be liable to irreversible denaturation.",
# "endpoint": "https://sparql.uniprot.org/sparql/",
# "pagination": True,
# "query": """PREFIX up: <http://purl.uniprot.org/core/>
# PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
# SELECT ?uri ?label {
# ?uri a up:Protein .
# ?uri rdfs:label ?label .}""",
# },
"uniprot_species": {
"uri": "http://purl.uniprot.org/core/Taxon",
"label": "species",
Expand Down Expand Up @@ -201,19 +175,6 @@ def generate_embeddings_for_entities():
orth:taxRange ?label .
}""",
},
# "uniprot_mnemonics": {
# "uri": "http://purl.uniprot.org/core/Protein",
# "label": "mnemonic",
# "description": "uniprot mnemonic",
# "endpoint": "https://sparql.uniprot.org/sparql/",
# "pagination": True,
# "query": """PREFIX up: <http://purl.uniprot.org/core/>
# SELECT ?uri ?label
# WHERE {
# ?uri a up:Protein ;
# up:mnemonic ?label .
# }""",
# },
"uniprot_taxon": {
"uri": "http://purl.uniprot.org/core/Taxon",
"label": "species",
Expand Down Expand Up @@ -241,6 +202,45 @@ def generate_embeddings_for_entities():
skos:prefLabel ?label .
}""",
},
# TODO: way too many UniProt genes, should we just ignore indexing genes?
# "uniprot_gene": {
# "uri": "http://purl.uniprot.org/core/Gene",
# "label": "Gene",
# "description": "A region (or regions) that includes all of the sequence elements necessary to encode a functional transcript. A gene may include regulatory regions, transcribed regions and/or other functional sequence regions.",
# "endpoint": "https://sparql.uniprot.org/sparql/",
# "pagination": True,
# "query": """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
# PREFIX up: <http://purl.uniprot.org/core/>
# PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
# SELECT ?uri ?label {
# ?uri a up:Gene .
# ?uri skos:prefLabel ?label .}""",
# },
# "uniprot_protein": {
# "uri": "http://purl.uniprot.org/core/Protein",
# "label": "Protein",
# "description": "A sequence of amino acids linked by peptide bonds which may lack appreciable tertiary structure and may not be liable to irreversible denaturation.",
# "endpoint": "https://sparql.uniprot.org/sparql/",
# "pagination": True,
# "query": """PREFIX up: <http://purl.uniprot.org/core/>
# PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
# SELECT ?uri ?label {
# ?uri a up:Protein .
# ?uri rdfs:label ?label .}""",
# },
# "uniprot_mnemonics": {
# "uri": "http://purl.uniprot.org/core/Protein",
# "label": "mnemonic",
# "description": "uniprot mnemonic",
# "endpoint": "https://sparql.uniprot.org/sparql/",
# "pagination": True,
# "query": """PREFIX up: <http://purl.uniprot.org/core/>
# SELECT ?uri ?label
# WHERE {
# ?uri a up:Protein ;
# up:mnemonic ?label .
# }""",
# },
}

docs: list[Document] = []
Expand Down Expand Up @@ -301,7 +301,7 @@ def load_entities_embeddings_to_vectordb():
docs = []
embeddings = []

print("Reading entities embeddings from the .csv file")
print(f"Reading entities embeddings from the .csv file at {entities_embeddings_filepath}")
with open(entities_embeddings_filepath) as file:
reader = csv.DictReader(file)
for row in tqdm(reader, desc="Extracting embeddings from CSV file"):
Expand Down
2 changes: 1 addition & 1 deletion src/sparql_llm/sparql_void_shapes_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def load(self) -> list[Document]:
metadata_dict = {
"answer": shex_shape["shex"],
"endpoint_url": self.endpoint_url,
"class_uri": cls_uri,
"iri": cls_uri,
"doc_type": "shex",
}
if "label" in shex_shape:
Expand Down

0 comments on commit e7f88bb

Please sign in to comment.