Skip to content

Commit

Permalink
improve indexing workflow
Browse files Browse the repository at this point in the history
  • Loading branch information
vemonet committed Dec 18, 2024
1 parent ea8d32c commit e7f88bb
Show file tree
Hide file tree
Showing 6 changed files with 100 additions and 69 deletions.
69 changes: 44 additions & 25 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -165,39 +165,58 @@ print("\n".join(issues))
>
> It can easily be adapted to use any LLM served through an OpenAI-compatible API. We plan to make configuration and deployment of a complete SPARQL LLM chat system easier in the future — let us know in the GitHub issues if you are interested!
Create a `.env` file at the root of the repository to provide secrets and API keys:
1. Create a `.env` file at the root of the repository to provide secrets and API keys:

```bash
OPENAI_API_KEY=sk-proj-YYY
GLHF_API_KEY=APIKEY_FOR_glhf.chat_USED_FOR_TEST_OPEN_SOURCE_MODELS
EXPASY_API_KEY=NOT_SO_SECRET_API_KEY_USED_BY_FRONTEND_TO_AVOID_SPAM_FROM_CRAWLERS
LOGS_API_KEY=SECRET_PASSWORD_TO_EASILY_ACCESS_LOGS_THROUGH_THE_API
```
```sh
OPENAI_API_KEY=sk-proj-YYY
GLHF_API_KEY=APIKEY_FOR_glhf.chat_USED_FOR_TEST_OPEN_SOURCE_MODELS
EXPASY_API_KEY=NOT_SO_SECRET_API_KEY_USED_BY_FRONTEND_TO_AVOID_SPAM_FROM_CRAWLERS
LOGS_API_KEY=SECRET_PASSWORD_TO_EASILY_ACCESS_LOGS_THROUGH_THE_API
```

Build the chat webpage (will be better integrated in the workflow in the future):
2. Build the chat UI webpage (will be better integrated in the workflow in the future):

```bash
cd chat-with-context
npm i
npm run build:demo
cd ..
```
```sh
cd chat-with-context
npm i
npm run build:demo
cd ..
```

Start the web UI, API, and similarity search engine in production (you might need to make some changes to the `compose.yml` file to adapt it to your server/proxy setup):
3. Start the vector database and web server:

```bash
docker compose up
```
In production (you might need to make some changes to the `compose.yml` file to adapt it to your server/proxy setup):

Start the stack locally for development, with code from `src` folder mounted in the container and automatic API reload on changes to the code:
```bash
docker compose up
```

```bash
docker compose -f compose.dev.yml up
```
Start the stack locally for development, with code from `src` folder mounted in the container and automatic API reload on changes to the code:

```bash
docker compose -f compose.dev.yml up
```

* Chat web UI available at http://localhost:8000
* OpenAPI Swagger UI available at http://localhost:8000/docs
* Vector database dashboard UI available at http://localhost:6333/dashboard
* Chat web UI available at http://localhost:8000
* OpenAPI Swagger UI available at http://localhost:8000/docs
* Vector database dashboard UI available at http://localhost:6333/dashboard

4. Run the script to index the resources (SPARQL endpoints listed in config file):

```sh
docker compose run api python src/sparql_llm/embed.py
```

> [!WARNING]
>
> **Experimental entities indexing**: generating embeddings for entities can take a lot of time, so we recommend running the embedding script on a machine with a GPU (it does not need to be a powerful one, but it should have a GPU; check out the [fastembed GPU docs](https://qdrant.github.io/fastembed/examples/FastEmbed_GPU/) to install the GPU drivers and dependencies)
>
> ```sh
> pip install -e ".[chat,gpu]"
> python src/sparql_llm/embed_entities.py
> ```
>
> Then move the CSV containing the embeddings to `data/embeddings/entities_embeddings.csv` before running the `embed.py` script
## 🧑‍💻 Contributing
Expand Down
1 change: 0 additions & 1 deletion prestart.sh

This file was deleted.

9 changes: 9 additions & 0 deletions src/sparql_llm/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,9 +216,18 @@ async def chat(request: ChatCompletionRequest):
]
),
limit=settings.retrieved_docs_count,
# group_by="iri",
# with_payload=True,
)
# TODO: vectordb.search_groups(
# https://qdrant.tech/documentation/concepts/search/#search-groups

# TODO: hybrid search? https://qdrant.github.io/fastembed/examples/Hybrid_Search/#about-qdrant
# we might want to group by iri for shex docs https://qdrant.tech/documentation/concepts/hybrid-queries/?q=hybrid+se#grouping
# https://qdrant.tech/documentation/concepts/search/#search-groups

prompt_with_context += "Here is some additional information that could be useful to answer the user question:\n\n"
# for docs_hit in docs_hits.groups:
for docs_hit in docs_hits:
if docs_hit.payload["doc_type"] == "shex":
prompt_with_context += f"ShEx shape for {docs_hit.payload['question']} in {docs_hit.payload['endpoint_url']}:\n```\n{docs_hit.payload['answer']}\n```\n\n"
Expand Down
8 changes: 6 additions & 2 deletions src/sparql_llm/embed.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ def load_schemaorg_description(endpoint: dict[str, str]) -> list[Document]:
"question": question,
"answer": "\n".join(descs),
"endpoint_url": endpoint["endpoint_url"],
"iri": endpoint["homepage"],
"doc_type": "schemaorg_description",
},
)
Expand Down Expand Up @@ -119,6 +120,7 @@ def load_ontology(endpoint: dict[str, str]) -> list[Document]:
"question": split.page_content,
"answer": "",
"endpoint_url": endpoint["endpoint_url"],
"iri": endpoint["ontology"],
"doc_type": "ontology",
},
)
Expand Down Expand Up @@ -175,6 +177,7 @@ def init_vectordb(vectordb_host: str = settings.vectordb_host) -> None:
The UniProt consortium is headed by Alex Bateman, Alan Bridge and Cathy Wu, supported by key staff, and receives valuable input from an independent Scientific Advisory Board.
""",
"endpoint_url": "https://sparql.uniprot.org/sparql/",
"iri": "http://www.uniprot.org/help/about",
"doc_type": "schemaorg_description",
},
)
Expand All @@ -201,8 +204,9 @@ def init_vectordb(vectordb_host: str = settings.vectordb_host) -> None:
collection_name=settings.entities_collection_name,
vectors_config=VectorParams(size=settings.embedding_dimensions, distance=Distance.COSINE),
)
if vectordb.get_collection(settings.entities_collection_name).points_count == 0:
load_entities_embeddings_to_vectordb()
# if vectordb.get_collection(settings.entities_collection_name).points_count == 0:
load_entities_embeddings_to_vectordb()


# docs = []
# # TODO: Add entities list to the vectordb
Expand Down
80 changes: 40 additions & 40 deletions src/sparql_llm/embed_entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,32 +134,6 @@ def generate_embeddings_for_entities():
?uri a orth:Protein .
?uri rdfs:label ?label .}""",
},
# TODO: way too many UniProt genes, should we just ignore indexing genes?
# "uniprot_gene": {
# "uri": "http://purl.uniprot.org/core/Gene",
# "label": "Gene",
# "description": "A region (or regions) that includes all of the sequence elements necessary to encode a functional transcript. A gene may include regulatory regions, transcribed regions and/or other functional sequence regions.",
# "endpoint": "https://sparql.uniprot.org/sparql/",
# "pagination": True,
# "query": """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
# PREFIX up: <http://purl.uniprot.org/core/>
# PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
# SELECT ?uri ?label {
# ?uri a up:Gene .
# ?uri skos:prefLabel ?label .}""",
# },
# "uniprot_protein": {
# "uri": "http://purl.uniprot.org/core/Protein",
# "label": "Protein",
# "description": "A sequence of amino acids linked by peptide bonds which may lack appreciable tertiary structure and may not be liable to irreversible denaturation.",
# "endpoint": "https://sparql.uniprot.org/sparql/",
# "pagination": True,
# "query": """PREFIX up: <http://purl.uniprot.org/core/>
# PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
# SELECT ?uri ?label {
# ?uri a up:Protein .
# ?uri rdfs:label ?label .}""",
# },
"uniprot_species": {
"uri": "http://purl.uniprot.org/core/Taxon",
"label": "species",
Expand Down Expand Up @@ -201,19 +175,6 @@ def generate_embeddings_for_entities():
orth:taxRange ?label .
}""",
},
# "uniprot_mnemonics": {
# "uri": "http://purl.uniprot.org/core/Protein",
# "label": "mnemonic",
# "description": "uniprot mnemonic",
# "endpoint": "https://sparql.uniprot.org/sparql/",
# "pagination": True,
# "query": """PREFIX up: <http://purl.uniprot.org/core/>
# SELECT ?uri ?label
# WHERE {
# ?uri a up:Protein ;
# up:mnemonic ?label .
# }""",
# },
"uniprot_taxon": {
"uri": "http://purl.uniprot.org/core/Taxon",
"label": "species",
Expand Down Expand Up @@ -241,6 +202,45 @@ def generate_embeddings_for_entities():
skos:prefLabel ?label .
}""",
},
# TODO: way too many UniProt genes, should we just ignore indexing genes?
# "uniprot_gene": {
# "uri": "http://purl.uniprot.org/core/Gene",
# "label": "Gene",
# "description": "A region (or regions) that includes all of the sequence elements necessary to encode a functional transcript. A gene may include regulatory regions, transcribed regions and/or other functional sequence regions.",
# "endpoint": "https://sparql.uniprot.org/sparql/",
# "pagination": True,
# "query": """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
# PREFIX up: <http://purl.uniprot.org/core/>
# PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
# SELECT ?uri ?label {
# ?uri a up:Gene .
# ?uri skos:prefLabel ?label .}""",
# },
# "uniprot_protein": {
# "uri": "http://purl.uniprot.org/core/Protein",
# "label": "Protein",
# "description": "A sequence of amino acids linked by peptide bonds which may lack appreciable tertiary structure and may not be liable to irreversible denaturation.",
# "endpoint": "https://sparql.uniprot.org/sparql/",
# "pagination": True,
# "query": """PREFIX up: <http://purl.uniprot.org/core/>
# PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
# SELECT ?uri ?label {
# ?uri a up:Protein .
# ?uri rdfs:label ?label .}""",
# },
# "uniprot_mnemonics": {
# "uri": "http://purl.uniprot.org/core/Protein",
# "label": "mnemonic",
# "description": "uniprot mnemonic",
# "endpoint": "https://sparql.uniprot.org/sparql/",
# "pagination": True,
# "query": """PREFIX up: <http://purl.uniprot.org/core/>
# SELECT ?uri ?label
# WHERE {
# ?uri a up:Protein ;
# up:mnemonic ?label .
# }""",
# },
}

docs: list[Document] = []
Expand Down Expand Up @@ -301,7 +301,7 @@ def load_entities_embeddings_to_vectordb():
docs = []
embeddings = []

print("Reading entities embeddings from the .csv file")
print(f"Reading entities embeddings from the .csv file at {entities_embeddings_filepath}")
with open(entities_embeddings_filepath) as file:
reader = csv.DictReader(file)
for row in tqdm(reader, desc="Extracting embeddings from CSV file"):
Expand Down
2 changes: 1 addition & 1 deletion src/sparql_llm/sparql_void_shapes_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def load(self) -> list[Document]:
metadata_dict = {
"answer": shex_shape["shex"],
"endpoint_url": self.endpoint_url,
"class_uri": cls_uri,
"iri": cls_uri,
"doc_type": "shex",
}
if "label" in shex_shape:
Expand Down

0 comments on commit e7f88bb

Please sign in to comment.