Subclassing the vector DB to instead use a PostgreSQL DB for retrieval agents #484
Replies: 5 comments 1 reply
-
Thank you!
-
Thanks! This was really helpful for me to test the support for pgvector. Here is the script I played with that I got working for this example:

```python
import os
from typing import List

from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent
from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent
from openai import OpenAI
from sqlalchemy import create_engine, text

CLIENT = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# FIXME: Swap your db_url here
DB_URL = "postgresql://postgres:postgres@localhost:5432/some_db_here"


def embed_query(query: str):
    response = CLIENT.embeddings.create(model="text-embedding-3-small", input=query)
    return response.data[0].embedding


class PgRetrieveUserProxyAgent(RetrieveUserProxyAgent):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.engine = create_engine(DB_URL)

    def query_vector_db(
        self,
        query_texts: List[str],
        n_results: int = 10,
        search_string: str = "",
        **kwargs,
    ):
        doc_ids = []
        doc_pieces = []
        for query in query_texts:
            embedding = embed_query(query)
            # <=> is pgvector's cosine distance operator
            SQL_QUERY = """SELECT * FROM documents ORDER BY embedding <=> :embedding LIMIT :n_results"""
            final_sql = text(SQL_QUERY).bindparams(
                embedding=str(embedding), n_results=n_results
            )
            # There is no re-ranking step right now, which we probably want in order to better filter results.
            with self.engine.connect() as conn:
                results = conn.execute(final_sql)
                iterated_results = [row for row in results]
                ids = [row[0] for row in iterated_results]
                # FIXME: Change the index to wherever your content is in the tuples that are returned
                pieces = [row[2] for row in iterated_results]
            doc_ids.append(ids)  # id
            doc_pieces.append(pieces)  # content
        return {"ids": doc_ids, "documents": doc_pieces}

    def retrieve_docs(
        self, problem: str, n_results: int = 20, search_string: str = "", **kwargs
    ):
        """Handle retrieval with pgvector."""
        results = self.query_vector_db(
            query_texts=[problem],
            n_results=n_results,
            search_string=search_string,
            **kwargs,
        )
        self._search_string = search_string
        self._results = results


CONFIG_LIST = [{"model": "gpt-4-0125-preview", "api_key": os.environ["OPENAI_API_KEY"]}]

if __name__ == "__main__":
    assessor = RetrieveAssistantAgent(
        name="Assessor",
        system_message="You are a helpful assistant.",
        llm_config={"config_list": CONFIG_LIST},
    )
    rag_proxy = PgRetrieveUserProxyAgent(
        name="pg_proxy",
        retrieve_config={"task": "qa", "client": None, "docs_path": None},
        human_input_mode="NEVER",
    )
    assessor.reset()
    PROBLEM = """Tell me more about water and fire."""
    rag_proxy.initiate_chat(
        assessor,
        message=rag_proxy.message_generator,
        problem=PROBLEM,
    )
```
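For anyone adapting the script: it assumes a `documents` table that already exists and is already populated with embeddings. Below is a minimal setup/ingestion sketch under that assumption; the table name, the `id`/`title`/`content`/`embedding` column layout, and the 1536-dimension vector (matching `text-embedding-3-small`) are my own guesses rather than anything the script enforces, so adjust them to your schema along with the `row[0]` / `row[2]` indices in `query_vector_db`.

```python
# Hypothetical setup/ingestion helper -- table and column names are assumptions,
# not something the retrieval script above requires verbatim.
import os
from typing import List

from openai import OpenAI
from sqlalchemy import create_engine, text

CLIENT = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
DB_URL = "postgresql://postgres:postgres@localhost:5432/some_db_here"

CREATE_TABLE = """
CREATE TABLE IF NOT EXISTS documents (
    id SERIAL PRIMARY KEY,
    title TEXT,
    content TEXT,              -- row[2] in the retrieval query above
    embedding vector(1536)     -- dimension of text-embedding-3-small
)
"""


def ingest(docs: List[str]) -> None:
    engine = create_engine(DB_URL)
    with engine.begin() as conn:
        conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
        conn.execute(text(CREATE_TABLE))
        for doc in docs:
            emb = CLIENT.embeddings.create(
                model="text-embedding-3-small", input=doc
            ).data[0].embedding
            # Pass the embedding as its string form, same as the retrieval script;
            # pgvector parses "[0.1, 0.2, ...]" into a vector.
            conn.execute(
                text(
                    "INSERT INTO documents (title, content, embedding) "
                    "VALUES (:title, :content, :embedding)"
                ),
                {"title": doc[:40], "content": doc, "embedding": str(emb)},
            )


if __name__ == "__main__":
    ingest(
        [
            "Water covers about 71% of the Earth's surface.",
            "Fire needs oxygen, heat, and fuel to burn.",
        ]
    )
```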
-
Hi @AndrewJJacobs, would you like to raise a PR to add a PostgreSQL DB into the vector module? Once added, using PostgreSQL would only require updating a parameter; there would be no need to extend the agent with a subclass.
-
Right. It has been supported in #2439. Closing this one.
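For anyone finding this thread later: with the support from #2439, switching to PostgreSQL should indeed come down to the retrieve_config parameters rather than a subclass. A rough sketch is below; the exact keys (`vector_db`, `db_config`, `connection_string`) and the example paths are my assumptions based on the current docs and may differ between versions, so check the RetrieveUserProxyAgent documentation before copying.

```python
# Hedged sketch of the built-in pgvector path; key names may vary by version.
import os

from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent

rag_proxy = RetrieveUserProxyAgent(
    name="pg_proxy",
    human_input_mode="NEVER",
    retrieve_config={
        "task": "qa",
        "docs_path": ["./docs/water.md", "./docs/fire.md"],  # example paths
        "vector_db": "pgvector",
        "db_config": {
            # Same connection string format as the subclass above (assumed).
            "connection_string": "postgresql://postgres:postgres@localhost:5432/some_db_here",
        },
        "model": "gpt-4-0125-preview",
    },
)
```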
-
Hi, I just wanted to share a subclass I was working on, and I know others would probably like to use it as well. Please let me know if you see any issues and whether you found this helpful. This does not handle the structure of your DB and does not embed new documents or text. Sorry for the formatting of the post; this is my first one.
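If it helps anyone trying the subclass out, a quick way to sanity-check the retrieval path before wiring the agents together is to call `query_vector_db` directly; the query text and result count below are arbitrary examples.

```python
# Quick connectivity/schema check for the PgRetrieveUserProxyAgent subclass
# shared in this thread; the query text and n_results are arbitrary examples.
proxy = PgRetrieveUserProxyAgent(
    name="pg_proxy",
    retrieve_config={"task": "qa", "client": None, "docs_path": None},
    human_input_mode="NEVER",
)
hits = proxy.query_vector_db(["water"], n_results=3)
for doc_id, content in zip(hits["ids"][0], hits["documents"][0]):
    print(doc_id, content[:80])
```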