TileDB-Inc · NikolaosPapailiou · Oct 3, 2024 · Oct 1, 2024 · Oct 1, 2024
diff --git a/_quarto.yml b/_quarto.yml
@@ -33,14 +33,27 @@ quartodoc:
   package: tiledb.vector_search
   dir: "documentation/reference"
   sections:
-    - title: "tiledb.vector_search"
+    - title: "Vector API"
       desc: ""
       contents:
+        - open
+        - ingestion
         - index.Index
+    - subtitle: "Algorithms"
+      desc: ""
+      contents:
         - flat_index
         - ivf_flat_index
         - vamana_index
-        - ingestion
+        - ivf_pq_index
+    - title: "Object API"
+      desc: ""
+      contents:
+        - object_api.create
+        - object_api.ObjectIndex
+        - embeddings.ObjectEmbedding
+        - object_readers.ObjectReader
+        - object_readers.ObjectPartition
 
 website:
   favicon: "documentation/assets/tiledb.ico"

diff --git a/apis/python/src/tiledb/vector_search/index.py b/apis/python/src/tiledb/vector_search/index.py
@@ -21,15 +21,14 @@
 class Index(metaclass=ABCMeta):
     """
     Abstract Vector Index class.
+    Do not use this directly but rather use the `open` factory method.
 
     All Vector Index algorithm implementations are instantiations of this class. Apart
     from the abstract method interfaces, `Index` provides implementations for common
     tasks i.e. supporting updates, time-traveling and metadata management.
 
     Opens an `Index` reading metadata and applying time-traveling options.
 
-    Do not use this directly but rather instantiate the concrete Index classes.
-
     Parameters
     ----------
     uri: str
@@ -883,35 +882,33 @@ def create_metadata(
         group.close()
 
 
-"""
-Factory method that opens a vector index.
-
-Retrieves the `index_type` from the index group metadata and instantiates the appropriate `Index` subclass.
-
-Parameters
-----------
-uri: str
-    URI of the index.
-config: Optional[Mapping[str, Any]]
-    TileDB config dictionary.
-timestamp: int or tuple(int)
-    If int, open the index at a given timestamp.
-    If tuple, open at the given start and end timestamps.
-open_for_remote_query_execution: bool
-    If `True`, do not load any index data in main memory locally, and instead load index data in the TileDB Cloud taskgraph created when a non-`None` `driver_mode` is passed to `query()`.
-    If `False`, load index data in main memory locally. Note that you can still use a taskgraph for query execution, you'll just end up loading the data both on your local machine and in the cloud taskgraph.
-kwargs:
-    Additional arguments to be passed to the `Index` subclass constructor.
-"""
-
-
 def open(
     uri: str,
     open_for_remote_query_execution: bool = False,
     config: Optional[Mapping[str, Any]] = None,
     timestamp=None,
     **kwargs,
 ) -> Index:
+    """
+    Factory method that opens a vector index.
+
+    Retrieves the `index_type` from the index group metadata and instantiates the appropriate `Index` subclass.
+
+    Parameters
+    ----------
+    uri: str
+        URI of the index.
+    config: Optional[Mapping[str, Any]]
+        TileDB config dictionary.
+    timestamp: int or tuple(int)
+        If int, open the index at a given timestamp.
+        If tuple, open at the given start and end timestamps.
+    open_for_remote_query_execution: bool
+        If `True`, do not load any index data in main memory locally, and instead load index data in the TileDB Cloud taskgraph created when a non-`None` `driver_mode` is passed to `query()`.
+        If `False`, load index data in main memory locally. Note that you can still use a taskgraph for query execution; you'll just end up loading the data both on your local machine and in the cloud taskgraph.
+    kwargs:
+        Additional arguments to be passed to the `Index` subclass constructor.
+    """
     from tiledb.vector_search.flat_index import FlatIndex
     from tiledb.vector_search.ivf_flat_index import IVFFlatIndex
     from tiledb.vector_search.ivf_pq_index import IVFPQIndex

diff --git a/apis/python/src/tiledb/vector_search/object_api/__init__.py b/apis/python/src/tiledb/vector_search/object_api/__init__.py
@@ -1,4 +1,5 @@
 from .embeddings_ingestion import ingest_embeddings_with_driver
 from .object_index import ObjectIndex
+from .object_index import create
 
-__all__ = ["ObjectIndex", "ingest_embeddings_with_driver"]
+__all__ = ["ObjectIndex", "create", "ingest_embeddings_with_driver"]
diff --git a/apis/python/src/tiledb/vector_search/object_api/embeddings_ingestion.py b/apis/python/src/tiledb/vector_search/object_api/embeddings_ingestion.py
@@ -29,6 +29,82 @@ def ingest_embeddings_with_driver(
     environment_variables: Dict = {},
     **kwargs,
 ):
+    """
+    Ingest embeddings into a TileDB vector search index using a driver function.
+
+    This function orchestrates the embedding ingestion process by creating and executing
+    a TileDB Cloud DAG (Directed Acyclic Graph). The DAG consists of two main stages:
+
+    1. **Embeddings Generation:** This stage is responsible for computing embeddings
+    for the objects to be indexed.
+
+    2. **Vector Indexing:** This stage is responsible for ingesting the generated
+    embeddings into the TileDB vector search index.
+
+    Both stages can be executed in one of three modes:
+
+    - **LOCAL:** Embeddings are ingested locally within the current process.
+    - **REALTIME:** Embeddings are ingested using a TileDB Cloud REALTIME TaskGraph.
+    - **BATCH:** Embeddings are ingested using a TileDB Cloud BATCH TaskGraph.
+
+    The `ingest_embeddings_with_driver` function provides flexibility in configuring
+    the execution environment for both stages. Users can specify the number of workers,
+    resources, Docker images, and extra modules for both the driver and worker nodes.
+
+    Parameters
+    ----------
+    object_index_uri: str
+        The URI of the TileDB vector search index.
+    use_updates_array: bool
+        Whether to use the updates array for ingesting embeddings.
+    embeddings_array_uri: str, optional
+        The URI of the array to store the generated embeddings. This parameter is
+        required if `use_updates_array` is set to `False`.
+    metadata_array_uri: str, optional
+        The URI of the array to store object metadata.
+    index_timestamp: int, optional
+        The timestamp to use for the ingestion. If not specified, the current time
+        will be used.
+    workers: int, optional
+        The number of workers to use for the ingestion. If not specified, the default
+        number of workers will be used.
+    worker_resources: Dict, optional
+        A dictionary specifying the resources to allocate for each worker node.
+    worker_image: str, optional
+        The name of the Docker image to use for the worker nodes.
+    extra_worker_modules: List[str], optional
+        A list of extra Python modules to install on the worker nodes.
+    driver_resources: Dict, optional
+        A dictionary specifying the resources to allocate for the driver node.
+    driver_image: str, optional
+        The name of the Docker image to use for the driver node.
+    extra_driver_modules: List[str], optional
+        A list of extra Python modules to install on the driver node.
+    worker_access_credentials_name: str, optional
+        The name of the TileDB Cloud access credentials to use for the worker nodes.
+    max_tasks_per_stage: int, optional
+        The maximum number of tasks to run per stage.
+    verbose: bool, optional
+        Whether to enable verbose logging.
+    trace_id: str, optional
+        A unique identifier for tracing the execution of the ingestion process.
+    embeddings_generation_mode: Mode, optional
+        The mode to use for generating embeddings. Defaults to `Mode.LOCAL`.
+    embeddings_generation_driver_mode: Mode, optional
+        The mode to use for running the embeddings generation driver function.
+        Defaults to `Mode.LOCAL`.
+    vector_indexing_mode: Mode, optional
+        The mode to use for indexing the generated vectors. Defaults to `Mode.LOCAL`.
+    config: Mapping[str, Any], optional
+        A dictionary containing TileDB configuration parameters.
+    namespace: str, optional
+        The TileDB Cloud namespace to use for the ingestion.
+    environment_variables: Dict, optional
+        Environment variables to set for the object reader and embedding function.
+    **kwargs
+        Additional keyword arguments to pass to the ingestion function.
+    """
+
     def ingest_embeddings(
         object_index_uri: str,
         use_updates_array: bool,