-
Notifications
You must be signed in to change notification settings - Fork 149
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Pinecone support for dataprep and retrieval microservice (#157)
Signed-off-by: Pallavi Jaini <pallavi.jaini@intel.com>
- Loading branch information
1 parent
4649d68
commit 8b6486b
Showing
18 changed files
with
438 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
# Dataprep Microservice with Pinecone | ||
|
||
# 🚀Start Microservice with Python | ||
|
||
## Install Requirements | ||
|
||
```bash | ||
pip install -r requirements.txt | ||
``` | ||
|
||
## Start Pinecone Server | ||
|
||
Please refer to this [readme](../../../vectorstores/langchain/pinecone/README.md). | ||
|
||
## Setup Environment Variables | ||
|
||
```bash | ||
export http_proxy=${your_http_proxy} | ||
export https_proxy=${your_http_proxy} | ||
export PINECONE_API_KEY=${PINECONE_API_KEY} | ||
export PINECONE_INDEX_NAME=${PINECONE_INDEX_NAME} | ||
``` | ||
|
||
## Start Document Preparation Microservice for Pinecone with Python Script | ||
|
||
Start the document preparation microservice for Pinecone with the command below. | ||
|
||
```bash | ||
python prepare_doc_pinecone.py | ||
``` | ||
|
||
# 🚀Start Microservice with Docker | ||
|
||
## Build Docker Image | ||
|
||
```bash | ||
cd ../../../../ | ||
docker build -t opea/dataprep-pinecone:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/pinecone/docker/Dockerfile . | ||
``` | ||
|
||
## Run Docker with CLI | ||
|
||
```bash | ||
docker run -d --name="dataprep-pinecone-server" -p 6000:6000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e PINECONE_API_KEY=$PINECONE_API_KEY -e PINECONE_INDEX_NAME=$PINECONE_INDEX_NAME opea/dataprep-pinecone:latest | ||
``` | ||
|
||
## Setup Environment Variables | ||
|
||
```bash | ||
export http_proxy=${your_http_proxy} | ||
export https_proxy=${your_http_proxy} | ||
export PINECONE_API_KEY=${PINECONE_API_KEY} | ||
export PINECONE_INDEX_NAME=${PINECONE_INDEX_NAME} | ||
``` | ||
|
||
## Run Docker with Docker Compose | ||
|
||
```bash | ||
cd comps/dataprep/pinecone/docker | ||
docker compose -f docker-compose-dataprep-pinecone.yaml up -d | ||
``` | ||
|
||
# Invoke Microservice | ||
|
||
Once the document preparation microservice for Pinecone is started, you can use the command below to invoke the microservice, which converts the document to embeddings and saves them to the database. | ||
|
||
```bash | ||
curl -X POST -H "Content-Type: application/json" -d '{"path":"/path/to/document"}' http://localhost:6000/v1/dataprep | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
import os | ||
|
||
# Embedding model
EMBED_MODEL = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

# Pinecone configuration
# The default API key is a dummy value; supply a real key via the environment.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY", "xxx_xxx")
# The index name is a string. Wrapping it in int() raised ValueError at import
# time for the default "langchain-test" and for any non-numeric index name.
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME", "langchain-test")

# LLM/Embedding endpoints
TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
TGI_LLM_ENDPOINT_NO_RAG = os.getenv("TGI_LLM_ENDPOINT_NO_RAG", "http://localhost:8081")
# No default: this is None when TEI_ENDPOINT is not set.
TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_ENDPOINT")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
|
||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
FROM python:3.11-slim

# Use the key=value form; the space-separated "ENV key value" form is legacy syntax.
ENV LANG=C.UTF-8

# Install system packages and drop the apt package lists in the same layer so
# they are not kept in the final image.
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
    build-essential \
    libgl1-mesa-glx \
    libjemalloc-dev \
    vim && \
    rm -rf /var/lib/apt/lists/*

# Create the unprivileged account the service runs as.
RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

USER user

COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /home/user/comps/dataprep/pinecone/requirements.txt

# Make the copied "comps" package importable.
ENV PYTHONPATH=$PYTHONPATH:/home/user

WORKDIR /home/user/comps/dataprep/pinecone

ENTRYPOINT ["python", "prepare_doc_pinecone.py"]
|
21 changes: 21 additions & 0 deletions
21
comps/dataprep/pinecone/docker/docker-compose-dataprep-pinecone.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
version: "3"
services:
  dataprep-pinecone:
    # Must match the tag produced by the documented build command
    # (docker build -t opea/dataprep-pinecone:latest ...).
    image: opea/dataprep-pinecone:latest
    container_name: dataprep-pinecone-server
    ports:
      - "6000:6000"
    ipc: host
    environment:
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      PINECONE_API_KEY: ${PINECONE_API_KEY}
      PINECONE_INDEX_NAME: ${PINECONE_INDEX_NAME}
    restart: unless-stopped

networks:
  default:
    driver: bridge
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
import os | ||
|
||
from config import EMBED_MODEL, PINECONE_API_KEY, PINECONE_INDEX_NAME | ||
from langchain.text_splitter import RecursiveCharacterTextSplitter | ||
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings, HuggingFaceHubEmbeddings | ||
from langchain_community.vectorstores import Pinecone | ||
|
||
from comps import DocPath, opea_microservices, opea_telemetry, register_microservice | ||
from comps.dataprep.utils import document_loader | ||
|
||
# Optional TEI endpoint, read once at import time; None when TEI_ENDPOINT is unset.
tei_embedding_endpoint = os.getenv("TEI_ENDPOINT")


@register_microservice(
    name="opea_service@prepare_doc_pinecone",
    endpoint="/v1/dataprep",
    host="0.0.0.0",
    port=6000,
    input_datatype=DocPath,
    output_datatype=None,
)
@opea_telemetry
def ingest_documents(doc_path: DocPath):
    """Load a document, split it into chunks and store the chunks in Pinecone.

    The document at ``doc_path.path`` is read with ``document_loader`` and split
    with a ``RecursiveCharacterTextSplitter`` (chunk_size=1500,
    chunk_overlap=100). The chunks are then embedded and written to the
    ``PINECONE_INDEX_NAME`` index in batches of 32 via ``Pinecone.from_texts``.

    Args:
        doc_path: Request object whose ``path`` attribute names the document.

    Returns:
        None. Progress is reported with ``print``.
    """
    source_path = doc_path.path
    print(f"Parsing document {source_path}.")

    splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True)
    chunks = splitter.split_text(document_loader(source_path))
    total_chunks = len(chunks)

    print("Done preprocessing. Created ", total_chunks, " chunks of the original pdf")

    # Use the TEI endpoint service when one is configured, else a local embedding model.
    embedder = (
        HuggingFaceHubEmbeddings(model=tei_embedding_endpoint)
        if tei_embedding_endpoint
        else HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)
    )

    batch_size = 32
    # Ceiling division; evaluates to 0 when there are no chunks.
    total_batches = (total_chunks - 1) // batch_size + 1
    for batch_no, start in enumerate(range(0, total_chunks, batch_size), start=1):
        Pinecone.from_texts(
            texts=chunks[start : start + batch_size],
            embedding=embedder,
            index_name=PINECONE_INDEX_NAME,
        )
        print(f"Processed batch {batch_no}/{total_batches}")


if __name__ == "__main__":
    opea_microservices["opea_service@prepare_doc_pinecone"].start()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
beautifulsoup4 | ||
docarray[full] | ||
easyocr | ||
fastapi | ||
huggingface_hub | ||
langchain | ||
langchain-community | ||
langchain-pinecone | ||
langsmith | ||
numpy | ||
opentelemetry-api | ||
opentelemetry-exporter-otlp | ||
opentelemetry-sdk | ||
pandas | ||
Pillow | ||
pinecone-client | ||
pymupdf | ||
python-docx | ||
sentence_transformers | ||
shortuuid |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
import os | ||
|
||
# Embedding model
EMBED_MODEL = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

# Pinecone configuration
# The default API key is a dummy value; supply a real key via the environment.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY", "xxx_xxx")
# The index name is a string. Wrapping it in int() raised ValueError at import
# time for the default "langchain-test" and for any non-numeric index name.
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME", "langchain-test")

# LLM/Embedding endpoints
TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
TGI_LLM_ENDPOINT_NO_RAG = os.getenv("TGI_LLM_ENDPOINT_NO_RAG", "http://localhost:8081")
# No default: this is None when TEI_ENDPOINT is not set.
TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_ENDPOINT")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
|
||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
FROM langchain/langchain:latest

# Install system packages and drop the apt package lists in the same layer so
# they are not kept in the final image.
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
    libgl1-mesa-glx \
    libjemalloc-dev \
    vim && \
    rm -rf /var/lib/apt/lists/*

# Create the unprivileged account the service runs as.
RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

COPY comps /home/user/comps

# Mark the entrypoint script executable before switching to the non-root user.
RUN chmod +x /home/user/comps/retrievers/langchain/pinecone/run.sh

USER user

RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /home/user/comps/retrievers/requirements.txt

# Make the copied "comps" package importable.
ENV PYTHONPATH=$PYTHONPATH:/home/user

WORKDIR /home/user/comps/retrievers/langchain/pinecone

ENTRYPOINT ["/home/user/comps/retrievers/langchain/pinecone/run.sh"]
32 changes: 32 additions & 0 deletions
32
comps/retrievers/langchain/pinecone/docker/docker_compose_retriever.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# Copyright (C) 2024 Intel Corporation | ||
# SPDX-License-Identifier: Apache-2.0 | ||
|
||
version: "3.8"

services:
  tei_xeon_service:
    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.2
    container_name: tei-xeon-server
    ports:
      - "6060:80"
    volumes:
      - "./data:/data"
    shm_size: 1g
    command: --model-id ${RETRIEVE_MODEL_ID}
  retriever:
    image: opea/retriever-pinecone:latest
    container_name: retriever-pinecone-server
    ports:
      - "7000:7000"
    ipc: host
    environment:
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      PINECONE_API_KEY: ${PINECONE_API_KEY}
      INDEX_NAME: ${PINECONE_INDEX_NAME}
      # config.py reads PINECONE_INDEX_NAME, so pass it under that name too.
      # INDEX_NAME is kept in case other code in the service reads it.
      PINECONE_INDEX_NAME: ${PINECONE_INDEX_NAME}
      LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY}
    restart: unless-stopped

networks:
  default:
    driver: bridge
Oops, something went wrong.