Skip to content

Commit

Permalink
Merge pull request #10 from FacerAin/feat/retrieval
Browse files Browse the repository at this point in the history
Implement Retrieval System using pinecone
  • Loading branch information
FacerAin authored Nov 9, 2023
2 parents d4f0a4c + 0c5e0f6 commit 5f7ac43
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 3 deletions.
3 changes: 3 additions & 0 deletions .env.sample
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
OPENAI_API_KEY=YOUR_OPENAI_API_KEY
PINECONE_API_KEY=YOUR_PINECONE_API_KEY
PINECONE_ENVIRONMENT_REGION=YOUR_PINECONE_ENVIRONMENT_REGION
7 changes: 5 additions & 2 deletions app/agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,19 @@

from app.agent.context import SAMPLE_CONTEXT
from app.agent.prompts import system_prompt_template
from app.agent.retriever import PineconeRetriever
from app.core.config import settings


# Chat agent: formats a system prompt from the user query plus retrieved context
# and sends it to the chat LLM.
# NOTE(review): this is a rendered diff hunk whose +/- markers were lost, so the
# removed and added versions of two lines appear back to back (the hunk header
# reports 5 additions & 2 deletions for this file).
class ChatAgent:
# Presumably the removed (pre-change) signature; it has no body of its own here.
def __init__(self) -> None:
# Replacement signature: index_name is passed to PineconeRetriever, default "khugpt".
def __init__(self, index_name: str = "khugpt") -> None:
self.llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0, openai_api_key=settings.OPENAI_API_KEY)
# NOTE(review): the attribute is spelled "retreiver"; run() reads the same spelling.
self.retreiver = PineconeRetriever(index_name=index_name)

def run(self, query: str):
# Fetch the retrieved documents for this query as one joined string.
context = self.retreiver.get_relevant_doc_string(query)
system_prompt = system_prompt_template.format(
# Presumably the removed line: context came from the static SAMPLE_CONTEXT.
question=query, context=SAMPLE_CONTEXT, current_date=datetime.datetime.now()
# Replacement line: context now comes from the retriever call above.
question=query, context=context, current_date=datetime.datetime.now()
)
# Send the fully formatted prompt to the LLM and return its reply.
answer = self.llm.predict(system_prompt)
return answer
1 change: 1 addition & 0 deletions app/agent/consts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Delimiter placed between documents when they are joined into a single string.
# NOTE(review): the identifier is spelled "SEPERATOR"; importers use this exact spelling.
DOCUMENT_SEPERATOR: str = "\n[sep]\n"
6 changes: 5 additions & 1 deletion app/agent/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,14 @@
Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Users tend to want up-to-date information, so please refer to the current date to answer.
Each context is separated by [SEP]. In most cases, the contents of each context are independent, so choose the most appropriate one and answer.
Attach the page link of the corresponding context at the bottom of the answer.
You must answer in Korean.
Current date: {current_date}
Context: {context}
Contexts: {context}
Question: {question}
Helpful answer:
Expand Down
62 changes: 62 additions & 0 deletions app/agent/retriever.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from typing import Any, Callable, Dict, List, Union

from abc import ABC, abstractmethod

import pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema.embeddings import Embeddings

from app.agent.consts import DOCUMENT_SEPERATOR
from app.core.config import settings


class Retriever(ABC):
    """Abstract interface for components that look up documents for a query.

    Concrete subclasses must override :meth:`similarity_search`; the class
    cannot be instantiated until they do.
    """

    @abstractmethod
    def similarity_search(self, query: str, top_k: int = 5, **kwargs: Any):
        """Look up the ``top_k`` documents that best match ``query``.

        Args:
            query: Text to search for.
            top_k: Maximum number of results requested.
            **kwargs: Extra, implementation-specific options.

        Raises:
            NotImplementedError: Always, when the base implementation is
                invoked directly.
        """
        raise NotImplementedError()


class PineconeRetriever(Retriever):
    """Retriever that embeds a query and searches a Pinecone index with it.

    Matches returned by the index are expected to carry ``metadata`` with a
    ``page_url`` and a ``text`` entry; those two fields are what gets rendered
    into the context string.
    """

    # Executed once, when this class body is evaluated (i.e. when the module is
    # imported), so the Pinecone client is configured before any instance exists.
    pinecone.init(api_key=settings.PINECONE_API_KEY, environment=settings.PINECONE_ENVIRONMENT_REGION)

    def __init__(
        self,
        index_name: str,
        embedding_model: Union[Embeddings, Callable, None] = None,
    ):
        """Bind the retriever to an existing index and an embedding model.

        Args:
            index_name: Name of the Pinecone index to query.
            embedding_model: Object exposing ``embed_query(text)`` or a plain
                callable mapping text to a vector. When omitted, an
                ``OpenAIEmbeddings`` configured from ``settings`` is created.

        Raises:
            ValueError: If the index does not exist in the Pinecone project.
        """
        self._index = self.get_pinecone_index(index_name=index_name)
        # Build the default lazily: a default evaluated in the signature would be
        # constructed at import time and shared by every instance.
        if embedding_model is None:
            embedding_model = OpenAIEmbeddings(openai_api_key=settings.OPENAI_API_KEY)
        self._embedding_model = embedding_model

    def get_pinecone_index(self, index_name: str):
        """Return a handle to ``index_name``.

        Raises:
            ValueError: If the project has no indexes at all, or if it has
                indexes but none is called ``index_name``.
        """
        indexes = pinecone.list_indexes()

        if index_name in indexes:
            return pinecone.Index(index_name)
        # Distinguish "nothing exists" from "wrong name" for a clearer error.
        if not indexes:
            raise ValueError("No active indexes found in your Pinecone project.")
        raise ValueError(f"Index '{index_name}' not found in your Pinecone project.")

    def _convert_response_to_string(self, item: Dict) -> str:
        """Render one match as a ``page_url`` / ``document`` text snippet."""
        metadata = item["metadata"]
        return f"\npage_url: {metadata['page_url']}\ndocument: {metadata['text']}\n"

    def _combine_documents(self, responses: List[Dict]) -> str:
        """Render every match and join them with ``DOCUMENT_SEPERATOR``.

        Returns:
            A single string; empty when ``responses`` is empty.
        """
        return DOCUMENT_SEPERATOR.join(self._convert_response_to_string(response) for response in responses)

    def similarity_search(self, query: str, top_k: int = 10, **kwargs: Any):
        """Embed ``query`` and return the raw Pinecone query response.

        Args:
            query: Text to search for.
            top_k: Number of matches to request from the index.
            **kwargs: Accepted for interface compatibility; currently unused.
        """
        # Support both Embeddings-style objects and bare callables, as the
        # constructor's type annotation promises.
        embed_query = getattr(self._embedding_model, "embed_query", None)
        if embed_query is not None:
            embeddings = embed_query(query)
        else:
            embeddings = self._embedding_model(query)
        return self._index.query([embeddings], top_k=top_k, include_metadata=True)

    def get_relevant_doc_string(self, query: str, top_k: int = 10):
        """Return the top matches for ``query`` joined into one context string."""
        responses = self.similarity_search(query=query, top_k=top_k)
        return self._combine_documents(responses=responses["matches"])
2 changes: 2 additions & 0 deletions app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ class Settings(BaseSettings):

PROJECT_NAME: str = "KHUGPT"
OPENAI_API_KEY: str
PINECONE_API_KEY: str
PINECONE_ENVIRONMENT_REGION: str

class Config:
env_file = ".env"
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
langchain==0.0.310
fastapi==0.103.0
pinecone-client==2.2.4

0 comments on commit 5f7ac43

Please sign in to comment.