Integrate document vector indexing (#13)
* Add docs and document loading script

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Add METADATA_KEY

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Update DB environment variable names

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Add QDRANT to the environment for configuration

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Migrate and test load_docs and add the GUIDE.md

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Implement wbdocs to schema converter

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Update document metadata

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Add other fields

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Add discipline field

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Update the document schema

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Apply black format

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Fix typing for qdrant file

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Fix linting

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Fix lint

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Add note on the advantages of using the metadata standard

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Fix wbdocs metadata mapper

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Add context generation script and schema2info

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Add APIPrompt

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Fix static method

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

* Add the indexing guide to the documentation

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>

---------

Signed-off-by: Aivin V. Solatorio <avsolatorio@gmail.com>
avsolatorio authored Jun 15, 2023
1 parent f873907 commit 8f088e4
Showing 24 changed files with 1,697 additions and 76 deletions.
39 changes: 39 additions & 0 deletions GUIDE.md
@@ -0,0 +1,39 @@
# Guide for indexing documents and data

Create the following directory structure:

```
data/sources/<data_type>/<collection>/
- <extension>/
- metadata/
```

For example:

```
data/sources/docs/prwp/
- pdf/
- metadata/
```

## Content

Each `<extension>` directory contains the files to be indexed in the format specified by the extension. The files in this directory will be passed to the appropriate LangChain loader.
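
For illustration, the sketch below shows what this loading step might look like for the `pdf` extension, assuming LangChain's `PyPDFLoader` and `RecursiveCharacterTextSplitter`; the loader and chunking parameters actually used by `load_docs` may differ.

```python
from pathlib import Path

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Hypothetical collection path; mirror the directory layout shown above.
pdf_dir = Path("data/sources/docs/prwp/pdf")

# Each loader call returns a list of LangChain `Document` objects (one per page).
documents = []
for pdf_path in sorted(pdf_dir.glob("*.pdf")):
    documents.extend(PyPDFLoader(str(pdf_path)).load())

# Split the text into chunks before embedding; the sizes are illustrative.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(documents)
```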

## Metadata

The `metadata` directory contains the metadata for the documents to be indexed. The files in this directory are stored in the index alongside the vector for each chunk of the content.

To maximize interoperability and reusability across LLM4Data and the applications built on top of it, we use the [schema guide](https://mah0001.github.io/schema-guide/) to define the metadata for the documents and data.

Using the standardized schema allows you to easily integrate your own data and documents with applications built on top of LLM4Data, such as the Chat4Dev application.
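
For illustration, a hypothetical metadata record for a document might look like the sketch below. The field names and nesting are assumptions loosely based on the document schema in the guide; take the authoritative structure from the schema guide itself, not from this example.

```python
# Hypothetical metadata record stored under data/sources/docs/prwp/metadata/.
# All values and field names here are illustrative only.
metadata = {
    "document_description": {
        "title_statement": {
            "idno": "WPS0001",  # unique document identifier (example value)
            "title": "An Example Policy Research Working Paper",
        },
        "authors": [{"first_name": "Jane", "last_name": "Doe"}],
        "date_published": "2023-01-15",
        "abstract": "A short abstract describing the document.",
    },
}
```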

## Indexing

To index the documents and metadata, run the following command:

```bash
python -m llm4data.scripts.indexing.docs.load_docs --path=data/sources/docs/prwp/pdf --strict
```

This will process the documents and store the generated vectors in the configured vector index.
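
Once indexing completes, the stored vectors can be queried through the Qdrant helpers added in this commit (`llm4data/index/qdrant.py`). A minimal sketch, assuming the `QDRANT_*` environment variables are configured; the query string and `k` are illustrative:

```python
from llm4data.index.qdrant import get_docs_index

# Connect to the configured Qdrant collection for documents
# (requires QDRANT_URL/QDRANT_PORT or QDRANT_PATH in the environment).
docs_index = get_docs_index()

# Retrieve the chunks most similar to the query via the LangChain vector store.
results = docs_index.similarity_search("impact of trade policy on poverty", k=5)
for doc in results:
    print(doc.page_content[:200])
```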
4 changes: 4 additions & 0 deletions docs/_toc.yml
@@ -17,3 +17,7 @@ parts:
- file: notebooks/examples/indicators/README
sections:
- file: notebooks/examples/indicators/wdi/getting-started-with-the-wdi.ipynb

- caption: Indexing documents and data
chapters:
- file: GUIDE.md
26 changes: 16 additions & 10 deletions example.env
@@ -7,18 +7,24 @@ OPENAI_API_KEY=""
WDI_DB_TABLE_NAME="wdi"

## POSTRESQL
WDI_DB_ENGINE="postgresql"
WDI_DB_HOST="localhost"
WDI_DB_USERNAME="postgres"
WDI_DB_PASSWORD=<your password>
WDI_DB_PORT=5432
DB_ENGINE="postgresql"
DB_HOST="localhost"
DB_USERNAME="postgres"
DB_PASSWORD=<your password>
DB_PORT=5432

## SQLITE
# WDI_DB_ENGINE="sqlite"
# WDI_DB_HOST=
# WDI_DB_USERNAME="/data/sqldb/wdi.db"
# WDI_DB_PASSWORD=
# WDI_DB_PORT=
# DB_ENGINE="sqlite"
# DB_HOST=
# DB_USERNAME="/data/sqldb/wdi.db"
# DB_PASSWORD=
# DB_PORT=


# VECTOR INDEX
QDRANT_URL="localhost"
QDRANT_PORT=6333
QDRANT_PATH=


# DIRS
33 changes: 23 additions & 10 deletions llm4data/configs.py
@@ -9,11 +9,11 @@
@dataclass
class WDIDBConfig:
table_name: Optional[str] = os.getenv("WDI_DB_TABLE_NAME")
engine: Optional[str] = os.getenv("WDI_DB_ENGINE")
host: Optional[str] = os.getenv("WDI_DB_HOST")
port: Optional[str] = os.getenv("WDI_DB_PORT")
username: Optional[str] = os.getenv("WDI_DB_USERNAME")
password: Optional[str] = os.getenv("WDI_DB_PASSWORD")
engine: Optional[str] = os.getenv("DB_ENGINE")
host: Optional[str] = os.getenv("DB_HOST")
port: Optional[str] = os.getenv("DB_PORT")
username: Optional[str] = os.getenv("DB_USERNAME")
password: Optional[str] = os.getenv("DB_PASSWORD")

@property
def url(self):
@@ -43,16 +43,24 @@ def __post_init__(self):
self.llm4data_dir = self._process_dir(self.llm4data_dir, "LLM4DATA_DIR")

if not isinstance(self.llm4data_cache_dir, str):
raise ValueError("`LLM4DATA_CACHE_DIR` environment variable must be a string.")
raise ValueError(
"`LLM4DATA_CACHE_DIR` environment variable must be a string."
)

if not isinstance(self.openai_payload_dir, str):
raise ValueError("`OPENAI_PAYLOAD_DIR` environment variable must be a string.")
raise ValueError(
"`OPENAI_PAYLOAD_DIR` environment variable must be a string."
)

self.llm4data_cache_dir = self.llm4data_dir / self.llm4data_cache_dir
self.openai_payload_dir = self.llm4data_dir / self.openai_payload_dir

self.llm4data_cache_dir = self._process_dir(self.llm4data_cache_dir, "LLM4DATA_CACHE_DIR")
self.openai_payload_dir = self._process_dir(self.openai_payload_dir, "OPENAI_PAYLOAD_DIR")
self.llm4data_cache_dir = self._process_dir(
self.llm4data_cache_dir, "LLM4DATA_CACHE_DIR"
)
self.openai_payload_dir = self._process_dir(
self.openai_payload_dir, "OPENAI_PAYLOAD_DIR"
)

def _process_dir(self, dirname: Union[str, Path], dirvar: str) -> Path:
if not dirname:
@@ -61,7 +69,9 @@ def _process_dir(self, dirname: Union[str, Path], dirvar: str) -> Path:
dirname = Path(dirname).expanduser().resolve()

if dirvar != "LLM4DATA_DIR" and self.llm4data_dir == dirname:
raise ValueError(f"{dirvar}={dirname} is the same as LLM4DATA_DIR={self.llm4data_dir}")
raise ValueError(
f"{dirvar}={dirname} is the same as LLM4DATA_DIR={self.llm4data_dir}"
)

if not dirname.exists():
warnings.warn(f"{dirvar}={dirname} does not exist. Creating it now...")
@@ -85,3 +95,6 @@ def __post_init__(self):
wdidb = WDIDBConfig()
dirs = DirsConfig() # NOTE: `dirs` is a reserved keyword in Python
task_labels = TaskLabelsConfig()

# Define the metadata key
METADATA_KEY = "llm4data"
14 changes: 8 additions & 6 deletions llm4data/embeddings/base.py
@@ -8,7 +8,8 @@


# Make the model atomically available
LOADED_MODELS = {}
LOADED_MODELS: dict = {}


@dataclass
class BaseEmbeddingModel:
@@ -43,7 +44,6 @@ def dict(self):

@dataclass
class EmbeddingModel(BaseEmbeddingModel):

def __post_init__(self):
self._common_init()
self._instruct_init()
@@ -80,7 +80,7 @@ def _instruct_init(self):
self.kwargs = {
**self.kwargs,
"embed_instruction": self.embed_instruction,
"query_instruction": self.query_instruction
"query_instruction": self.query_instruction,
}

def _hf_init(self):
@@ -92,11 +92,13 @@ def _hf_init(self):
"model_name": self.model_name,
}

def _create_embeddings(self) -> ModelMetaclass:
def _create_embeddings(self):
if not isinstance(self.kwargs, dict):
raise ValueError("`config.kwargs` must be a dict")

self.embeddings = getattr(langchain_embeddings, self.embedding_cls)(**self.kwargs)
self.embeddings = getattr(langchain_embeddings, self.embedding_cls)(
**self.kwargs
)

if self.max_tokens is None:
if self.max_tokens is None and self.embeddings:
self.max_tokens = self.embeddings.client.max_seq_length
3 changes: 2 additions & 1 deletion llm4data/embeddings/docs.py
@@ -1,7 +1,8 @@
from typing import Optional
from dataclasses import dataclass
from llm4data.embeddings.base import EmbeddingModel

DOCS_EMBEDDINGS: EmbeddingModel = None
DOCS_EMBEDDINGS: Optional[EmbeddingModel] = None


@dataclass
3 changes: 2 additions & 1 deletion llm4data/embeddings/indicators.py
@@ -1,7 +1,8 @@
from typing import Optional
from dataclasses import dataclass
from llm4data.embeddings.base import EmbeddingModel

INDICATORS_EMBEDDINGS: EmbeddingModel = None
INDICATORS_EMBEDDINGS: Optional[EmbeddingModel] = None


@dataclass
3 changes: 2 additions & 1 deletion llm4data/embeddings/microdata.py
@@ -1,7 +1,8 @@
from typing import Optional
from dataclasses import dataclass
from llm4data.embeddings.base import EmbeddingModel

MICRODATA_EMBEDDINGS: EmbeddingModel = None
MICRODATA_EMBEDDINGS: Optional[EmbeddingModel] = None


@dataclass
File renamed without changes.
53 changes: 34 additions & 19 deletions llm4data/index/qdrant.py
@@ -1,4 +1,5 @@
import os
from typing import Optional, Union
from langchain.vectorstores import Qdrant
import qdrant_client
from qdrant_client.http import models
@@ -15,51 +16,65 @@ def collection_exists(collection_name: str) -> bool:
return collection_name in [i.name for i in colls.collections]


def get_index_client(path: str = None):
def get_index_client(path: Optional[str] = None):
global _CLIENT
if _CLIENT is None:
if os.environ.get("QDRANT_URL") is not None:
url = os.environ.get("QDRANT_URL")
if os.environ.get("QDRANT_PORT") is not None:
url += f":{os.environ.get('QDRANT_PORT')}"
_CLIENT = qdrant_client.QdrantClient(url=url, prefer_grpc=False)
elif os.environ.get("QDRANT_PATH") is not None:
path = os.environ.get("QDRANT_PATH")
if path is not None:
_CLIENT = qdrant_client.QdrantClient(path=path, prefer_grpc=True)
else:
raise ValueError("QDRANT_URL or QDRANT_PATH not set in the environment")
url = os.environ.get("QDRANT_URL")
if url is not None:
port = os.environ.get("QDRANT_PORT")
if port is not None:
url += f":{port}"
_CLIENT = qdrant_client.QdrantClient(url=url, prefer_grpc=False)
else:
path = os.environ.get("QDRANT_PATH")
if path is not None:
_CLIENT = qdrant_client.QdrantClient(path=path, prefer_grpc=True)
else:
raise ValueError("QDRANT_URL or QDRANT_PATH not set in the environment")

return _CLIENT


def get_index_collection(embeddings, path: str = None, recreate: bool = False):
def get_index_collection(embeddings, path: Optional[str] = None, recreate: bool = False):
client = get_index_client(path=path)

if recreate:
client.recreate_collection(
collection_name=embeddings.collection_name,
vectors_config=models.VectorParams(size=embeddings.size, distance=embeddings.distance),
vectors_config=models.VectorParams(
size=embeddings.size, distance=embeddings.distance
),
)

if not collection_exists(embeddings.collection_name):
client.create_collection(
collection_name=embeddings.collection_name,
vectors_config=models.VectorParams(size=embeddings.size, distance=embeddings.distance),
vectors_config=models.VectorParams(
size=embeddings.size, distance=embeddings.distance
),
)

return Qdrant(
client=client, collection_name=embeddings.collection_name,
embeddings=embeddings.embeddings
client=client,
collection_name=embeddings.collection_name,
embeddings=embeddings.embeddings,
)


def get_docs_index(path: str = None, recreate: bool = False):
def get_docs_index(path: Optional[str] = None, recreate: bool = False):
return get_index_collection(get_docs_embeddings(), path=path, recreate=recreate)


def get_indicators_index(path: str = None, recreate: bool = False):
return get_index_collection(get_indicators_embeddings(), path=path, recreate=recreate)
def get_indicators_index(path: Optional[str] = None, recreate: bool = False):
return get_index_collection(
get_indicators_embeddings(), path=path, recreate=recreate
)


def get_microdata_index(path: str = None, recreate: bool = False):
return get_index_collection(get_microdata_embeddings(), path=path, recreate=recreate)
def get_microdata_index(path: Optional[str] = None, recreate: bool = False):
return get_index_collection(
get_microdata_embeddings(), path=path, recreate=recreate
)
1 change: 0 additions & 1 deletion llm4data/llm/indicators/wdi_sql.py
@@ -189,7 +189,6 @@ def llm2sql_answer(
drop_na=True,
num_samples=20,
):

if params is None:
params = {}

16 changes: 16 additions & 0 deletions llm4data/prompts/base.py
@@ -48,3 +48,19 @@ def send_prompt(
@abstractmethod
def parse_response(self, response: dict, **kwargs: Any) -> Any:
pass


class APIPrompt(DatedPrompt):
task_label = "APIPrompt"
prompt_type = "zeros"
template = "Current date: {now}\n\n"

@abstractmethod
def parse_response(
self, response: dict, **kwargs: Any
) -> Any:
pass

@abstractmethod
def send_prompt_get_sample(self, prompt: str, n_samples: int = 10, **kwargs: Any) -> dict:
pass
