Skip to content

Commit

Permalink
Pinecone support for dataprep and retrieval microservice (#157)
Browse files Browse the repository at this point in the history
Signed-off-by: Pallavi Jaini <pallavi.jaini@intel.com>
  • Loading branch information
pallavijaini0525 authored Jun 26, 2024
1 parent 4649d68 commit 8b6486b
Show file tree
Hide file tree
Showing 18 changed files with 438 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .github/workflows/docker/docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ services:
pull_policy: always
retriever-redis-server:
build:
dockerfile: comps/retrievers/langchain/docker/Dockerfile
dockerfile: comps/retrievers/langchain/redis/docker/Dockerfile
extends: embedding-tei-server
image: ${REGISTRY}/${REPO}:retriever-redis-server
reranking-tei-server:
Expand Down
4 changes: 4 additions & 0 deletions comps/dataprep/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ For details, please refer to this [readme](milvus/README.md)

For details, please refer to this [readme](qdrant/README.md)

# Dataprep Microservice with Pinecone

For details, please refer to this [readme](pinecone/README.md)

# Dataprep Microservice with PGVector

For details, please refer to this [readme](pgvector/README.md)
69 changes: 69 additions & 0 deletions comps/dataprep/pinecone/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Dataprep Microservice with Pinecone

# 🚀Start Microservice with Python

## Install Requirements

```bash
pip install -r requirements.txt
```

## Start Pinecone Server

Please refer to this [readme](../../../vectorstores/langchain/pinecone/README.md).

## Setup Environment Variables

```bash
export http_proxy=${your_http_proxy}
export https_proxy=${your_http_proxy}
export PINECONE_API_KEY=${PINECONE_API_KEY}
export PINECONE_INDEX_NAME=${PINECONE_INDEX_NAME}
```

## Start Document Preparation Microservice for Pinecone with Python Script

Start the document preparation microservice for Pinecone with the command below.

```bash
python prepare_doc_pinecone.py
```

# 🚀Start Microservice with Docker

## Build Docker Image

```bash
cd ../../../
docker build -t opea/dataprep-pinecone:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/pinecone/docker/Dockerfile .
```

## Run Docker with CLI

```bash
docker run -d --name="dataprep-pinecone-server" -p 6000:6000 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e PINECONE_API_KEY=$PINECONE_API_KEY -e PINECONE_INDEX_NAME=$PINECONE_INDEX_NAME opea/dataprep-pinecone:latest
```

## Setup Environment Variables

```bash
export http_proxy=${your_http_proxy}
export https_proxy=${your_http_proxy}
export PINECONE_API_KEY=${PINECONE_API_KEY}
export PINECONE_INDEX_NAME=${PINECONE_INDEX_NAME}
```

## Run Docker with Docker Compose

```bash
cd comps/dataprep/pinecone/docker
docker compose -f docker-compose-dataprep-pinecone.yaml up -d
```

# Invoke Microservice

Once the document preparation microservice for Pinecone has started, use the command below to invoke it. The microservice converts the document into embeddings and saves them to the database.

```bash
curl -X POST -H "Content-Type: application/json" -d '{"path":"/path/to/document"}' http://localhost:6000/v1/dataprep
```
2 changes: 2 additions & 0 deletions comps/dataprep/pinecone/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
16 changes: 16 additions & 0 deletions comps/dataprep/pinecone/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

# Embedding model (overridable through the EMBED_MODEL environment variable)
EMBED_MODEL = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

# Pinecone configuration
# "xxx_xxx" is only a placeholder default; set PINECONE_API_KEY to a real key.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY", "xxx_xxx")
# The index name is a string such as "langchain-test". It must not be passed
# through int(): doing so raised ValueError as soon as this module was imported.
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME", "langchain-test")

# LLM/Embedding endpoints
TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
TGI_LLM_ENDPOINT_NO_RAG = os.getenv("TGI_LLM_ENDPOINT_NO_RAG", "http://localhost:8081")
# No default: this is None when TEI_ENDPOINT is not set.
TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_ENDPOINT")
31 changes: 31 additions & 0 deletions comps/dataprep/pinecone/docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM python:3.11-slim

# Set the container locale (key=value form; the space-separated ENV form is legacy).
ENV LANG=C.UTF-8

# Install system packages and remove the apt package lists in the same layer
# so they do not end up in the final image.
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
    build-essential \
    libgl1-mesa-glx \
    libjemalloc-dev \
    vim && \
    rm -rf /var/lib/apt/lists/*

# Create the unprivileged user the service runs as.
RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

USER user

# The build context must be the repository root so that `comps` is available.
COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /home/user/comps/dataprep/pinecone/requirements.txt

# Make the `comps` package importable from /home/user.
ENV PYTHONPATH=$PYTHONPATH:/home/user

WORKDIR /home/user/comps/dataprep/pinecone

ENTRYPOINT ["python", "prepare_doc_pinecone.py"]

Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

version: "3"
services:
  dataprep-pinecone:
    # Must match the tag produced by the `docker build` command in
    # comps/dataprep/pinecone/README.md.
    image: opea/dataprep-pinecone:latest
    container_name: dataprep-pinecone-server
    ports:
      - "6000:6000"
    ipc: host
    environment:
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      # Both variables are read by comps/dataprep/pinecone/config.py.
      PINECONE_API_KEY: ${PINECONE_API_KEY}
      PINECONE_INDEX_NAME: ${PINECONE_INDEX_NAME}
    restart: unless-stopped

networks:
  default:
    driver: bridge
60 changes: 60 additions & 0 deletions comps/dataprep/pinecone/prepare_doc_pinecone.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

from config import EMBED_MODEL, PINECONE_API_KEY, PINECONE_INDEX_NAME
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings, HuggingFaceHubEmbeddings
from langchain_community.vectorstores import Pinecone

from comps import DocPath, opea_microservices, opea_telemetry, register_microservice
from comps.dataprep.utils import document_loader

# URL of the TEI embedding service; None when TEI_ENDPOINT is not set.
tei_embedding_endpoint = os.environ.get("TEI_ENDPOINT")


@register_microservice(
    name="opea_service@prepare_doc_pinecone",
    endpoint="/v1/dataprep",
    host="0.0.0.0",
    port=6000,
    input_datatype=DocPath,
    output_datatype=None,
)
@opea_telemetry
def ingest_documents(doc_path: DocPath):
    """Load a document, split it into chunks, and upload the chunks to Pinecone.

    Args:
        doc_path: Request payload; its ``path`` attribute is handed to
            ``document_loader`` to obtain the document text.

    Returns:
        None. Progress is reported with ``print``.
    """
    source_path = doc_path.path
    print(f"Parsing document {source_path}.")

    splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True)
    raw_text = document_loader(source_path)
    chunks = splitter.split_text(raw_text)

    print("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")

    # Pick the embedding backend: a local model unless a TEI endpoint is configured.
    if not tei_embedding_endpoint:
        embedder = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)
    else:
        embedder = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint)

    # Upload the chunks in fixed-size batches.
    batch_size = 32
    total_chunks = len(chunks)
    total_batches = (total_chunks - 1) // batch_size + 1
    for batch_number, start in enumerate(range(0, total_chunks, batch_size), start=1):
        Pinecone.from_texts(
            texts=chunks[start : start + batch_size],
            embedding=embedder,
            index_name=PINECONE_INDEX_NAME,
        )
        print(f"Processed batch {batch_number}/{total_batches}")


if __name__ == "__main__":
    # Look up the service by the name given to register_microservice and start it.
    dataprep_service = opea_microservices["opea_service@prepare_doc_pinecone"]
    dataprep_service.start()
20 changes: 20 additions & 0 deletions comps/dataprep/pinecone/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Python dependencies for comps/dataprep/pinecone; installed by its docker/Dockerfile.
beautifulsoup4
docarray[full]
easyocr
fastapi
huggingface_hub
langchain
langchain-community
langchain-pinecone
langsmith
numpy
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
pandas
Pillow
pinecone-client
pymupdf
python-docx
sentence_transformers
shortuuid
2 changes: 2 additions & 0 deletions comps/retrievers/langchain/pinecone/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
16 changes: 16 additions & 0 deletions comps/retrievers/langchain/pinecone/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

# Embedding model (overridable through the EMBED_MODEL environment variable)
EMBED_MODEL = os.getenv("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")

# Pinecone configuration
# "xxx_xxx" is only a placeholder default; set PINECONE_API_KEY to a real key.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY", "xxx_xxx")
# The index name is a string such as "langchain-test". It must not be passed
# through int(): doing so raised ValueError as soon as this module was imported.
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME", "langchain-test")

# LLM/Embedding endpoints
TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
TGI_LLM_ENDPOINT_NO_RAG = os.getenv("TGI_LLM_ENDPOINT_NO_RAG", "http://localhost:8081")
# No default: this is None when TEI_ENDPOINT is not set.
TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_ENDPOINT")
29 changes: 29 additions & 0 deletions comps/retrievers/langchain/pinecone/docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM langchain/langchain:latest

# Install system packages and remove the apt package lists in the same layer
# so they do not end up in the final image.
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
    libgl1-mesa-glx \
    libjemalloc-dev \
    vim && \
    rm -rf /var/lib/apt/lists/*

# Create the unprivileged user the service runs as.
RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

# The build context must be the repository root so that `comps` is available.
COPY comps /home/user/comps

# Done before switching to the unprivileged user, since the copied files are root-owned.
RUN chmod +x /home/user/comps/retrievers/langchain/pinecone/run.sh

USER user

RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /home/user/comps/retrievers/requirements.txt

# Make the `comps` package importable from /home/user.
ENV PYTHONPATH=$PYTHONPATH:/home/user

WORKDIR /home/user/comps/retrievers/langchain/pinecone

ENTRYPOINT ["/home/user/comps/retrievers/langchain/pinecone/run.sh"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

version: "3.8"

services:
  tei_xeon_service:
    image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.2
    container_name: tei-xeon-server
    ports:
      - "6060:80"
    volumes:
      - "./data:/data"
    shm_size: 1g
    command: --model-id ${RETRIEVE_MODEL_ID}
  retriever:
    image: opea/retriever-pinecone:latest
    container_name: retriever-pinecone-server
    ports:
      - "7000:7000"
    ipc: host
    environment:
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      PINECONE_API_KEY: ${PINECONE_API_KEY}
      # comps/retrievers/langchain/pinecone/config.py reads PINECONE_INDEX_NAME.
      # INDEX_NAME is kept as well in case other code still reads it.
      PINECONE_INDEX_NAME: ${PINECONE_INDEX_NAME}
      INDEX_NAME: ${PINECONE_INDEX_NAME}
      LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY}
    restart: unless-stopped

networks:
  default:
    driver: bridge
Loading

0 comments on commit 8b6486b

Please sign in to comment.