diff --git a/examples/LLM_Workflows/modular_llm_stack/README.md b/examples/LLM_Workflows/modular_llm_stack/README.md
index b73edcd76..b57dcd29a 100644
--- a/examples/LLM_Workflows/modular_llm_stack/README.md
+++ b/examples/LLM_Workflows/modular_llm_stack/README.md
@@ -5,13 +5,13 @@ This example shows how to pull data from the HuggingFace datasets hub, create em
 ![](./weaviate_dag.png)
 *DAG for OpenAI embeddings and Weaviate vector database*
 
-In addition, you'll see how Hamilton can help you create replaceable components. This flexibility, makes it easier to assess service providers and refactor code to fit your needs. The above and below DAGs were generated simply by changing a string value and a module import. Try to spot the differences! 
+In addition, you'll see how Hamilton can help you create replaceable components. This flexibility makes it easier to assess service providers and refactor code to fit your needs. The above and below DAGs were generated simply by changing a string value and a module import. Try to spot the differences!
 
 ![](./pinecone_dag.png)
 *DAG for SentenceTransformers embeddings and Pinecone vector database*
 
 # Example structure
-- `run.py` contains the code to test the example. It uses `click` to provide a simple command interface. 
+- `run.py` contains the code to test the example. It uses `click` to provide a simple command-line interface.
 - `data_module.py` contains the code to pull data from HuggingFace. The code is in a separate Python module since it doesn't depend on the other functionalities and could include more involved preprocessing.
 - `embedding_module.py` contains the code to embed text using either the Cohere API, the OpenAI API, or the SentenceTransformer library. The use of `@config.when` allows you to have all options in the same Python module, so you can quickly rerun your Hamilton DAG by simply changing your config. You'll see that the functions share similar signatures to enable interchangeability.
 - `lancedb_module.py`, `weaviate_module.py` and `pinecone_module.py` implement the same functionalities for each vector database. Having the same function names allows Hamilton to abstract away the implementation details and reinforces the notion that these modules shouldn't be loaded simultaneously.
@@ -20,7 +20,7 @@ In addition, you'll see how Hamilton can help you create replaceable components.
 # How to run the example
 Prerequisites:
 - Create accounts and get the API keys for the service you plan to use.
-- Create your python environment, and do `pip install -r requirements.txt`. 
+- Create your Python environment and run `pip install -r requirements.txt`.
 - For Weaviate, start your local instance using `docker compose up -d`
 1. Run `python run.py --help` to learn about the options.
 You will have options to:
 - Select a vector database from: weaviate, pinecone
@@ -41,4 +41,4 @@ To change vector database you need to pass a JSON config argument:
 
 # Next step / Exercises
 - Implement the code to read data from the vector database
-- Add the code to send the same generative prompt to multiple providers 
+- Add the code to send the same generative prompt to multiple providers
diff --git a/examples/LLM_Workflows/modular_llm_stack/embedding_module.py b/examples/LLM_Workflows/modular_llm_stack/embedding_module.py
index 580c4cd38..316967c0c 100644
--- a/examples/LLM_Workflows/modular_llm_stack/embedding_module.py
+++ b/examples/LLM_Workflows/modular_llm_stack/embedding_module.py
@@ -19,9 +19,7 @@ def embedding_config__openai(embedding_service: str, model_name: str) -> dict:
     if model_name == "text-embedding-ada-002":
         return dict(embedding_dimension=1536, embedding_metric="cosine")
     # If you support more models, you would add that here
-    raise ValueError(
-        f"Invalid `model_name`[{model_name}] for openai was passed."
-    )
+    raise ValueError(f"Invalid `model_name`[{model_name}] for openai was passed.")
 
 
 @config.when(embedding_service="cohere")
@@ -35,9 +33,7 @@ def embedding_config__cohere(embedding_service: str, model_name: str) -> dict:
     if model_name == "embed-english-light-v2.0":
         return dict(embedding_dimension=1024, embedding_metric="cosine")
     # If you support more models, you would add that here
-    raise ValueError(
-        f"Invalid `model_name`[{model_name}] for Cohere was passed."
-    )
+    raise ValueError(f"Invalid `model_name`[{model_name}] for Cohere was passed.")
 
 
 @config.when(embedding_service="sentence_transformer")
@@ -51,9 +47,7 @@ def embedding_config__sentence_transformer(embedding_service: str, model_name: s
     if model_name == "multi-qa-MiniLM-L6-cos-v1":
         return dict(embedding_dimension=384, embedding_metric="cosine")
     # If you support more models, you would add that here
-    raise ValueError(
-        f"Invalid `model_name`[{model_name}] for SentenceTransformer was passed."
-    )
+    raise ValueError(f"Invalid `model_name`[{model_name}] for SentenceTransformer was passed.")
 
 
 def metadata(embedding_service: str, model_name: str) -> dict:
@@ -61,7 +55,6 @@ def metadata(embedding_service: str, model_name: str) -> dict:
     return dict(embedding_service=embedding_service, model_name=model_name)
 
 
-
 @config.when(embedding_service="openai")
 def embedding_provider__openai(api_key: str) -> ModuleType:
     """Set OpenAI API key"""
diff --git a/examples/LLM_Workflows/modular_llm_stack/lancedb_module.py b/examples/LLM_Workflows/modular_llm_stack/lancedb_module.py
index aca2cce87..09d3913c2 100644
--- a/examples/LLM_Workflows/modular_llm_stack/lancedb_module.py
+++ b/examples/LLM_Workflows/modular_llm_stack/lancedb_module.py
@@ -1,7 +1,7 @@
+import lancedb
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-import lancedb
 
 
 def client_vector_db(vector_db_config: dict) -> lancedb.LanceDBConnection:
@@ -17,21 +17,19 @@ def initialize_vector_db_indices(
     """Initialize the LanceDB table;
     NOTE this pattern currently doesn't work and is due to a bug with lancedb
     """
-    schema = pa.schema([
-        ("squad_id", pa.string()),
-        ("title", pa.string()),
-        ("context", pa.string()),
-        ("embedding_service", pa.string()),
-        ("model_name", pa.string()),
-        pa.field("vector", type=pa.list_(pa.float32(), list_size=embedding_dimension)),
-    ])
-
-    client_vector_db.create_table(
-        name=class_name,
-        schema=schema,
-        mode="create"
-    )
-    
+    schema = pa.schema(
+        [
+            ("squad_id", pa.string()),
+            ("title", pa.string()),
+            ("context", pa.string()),
+            ("embedding_service", pa.string()),
+            ("model_name", pa.string()),
+            pa.field("vector", type=pa.list_(pa.float32(), list_size=embedding_dimension)),
+        ]
+    )
+
+    client_vector_db.create_table(name=class_name, schema=schema, mode="create")
+
     return True
 
 
@@ -43,11 +41,11 @@ def reset_vector_db(client_vector_db: lancedb.LanceDBConnection) -> bool:
 
 
 def data_objects(
-    ids: list[str], 
+    ids: list[str],
     titles: list[str],
     text_contents: list[str],
    embeddings: list[np.ndarray],
-    metadata: dict
+    metadata: dict,
 ) -> list[dict]:
     """Create valid LanceDB objects"""
     assert len(ids) == len(titles) == len(text_contents) == len(embeddings)
@@ -69,4 +67,4 @@ def push_to_vector_db(
 
     df = pd.DataFrame.from_records(data_objects)
     table = client_vector_db.create_table(name=class_name, data=df, mode="overwrite")
-    return table.to_pandas().shape[0]
\ No newline at end of file
+    return table.to_pandas().shape[0]
diff --git a/examples/LLM_Workflows/modular_llm_stack/requirements.txt b/examples/LLM_Workflows/modular_llm_stack/requirements.txt
index 470ba3bb1..62b52b469 100644
--- a/examples/LLM_Workflows/modular_llm_stack/requirements.txt
+++ b/examples/LLM_Workflows/modular_llm_stack/requirements.txt
@@ -1,3 +1,4 @@
+click
 cohere
 datasets
 lancedb
@@ -9,4 +10,3 @@ sentence-transformers
 sf-hamilton
 sf-hamilton[visualization]
 weaviate-client
-click
diff --git a/examples/LLM_Workflows/modular_llm_stack/run.py b/examples/LLM_Workflows/modular_llm_stack/run.py
index 9135ac60d..26f316b6d 100644
--- a/examples/LLM_Workflows/modular_llm_stack/run.py
+++ b/examples/LLM_Workflows/modular_llm_stack/run.py
@@ -1,7 +1,6 @@
 import json
 
 import click
-
 import data_module
 import embedding_module
 import lancedb_module
@@ -14,36 +13,30 @@
 @click.command()
 @click.option(
     "--vector_db",
-    type=click.Choice(['lancedb', 'weaviate', 'pinecone'], case_sensitive=False),
-    default="lancedb", help="Vector database service"
+    type=click.Choice(["lancedb", "weaviate", "pinecone"], case_sensitive=False),
+    default="lancedb",
help="Vector database service", ) @click.option( "--vector_db_config", default='{"uri": "data/lancedb"}', help="Pass a JSON string for vector database config.\ Weaviate needs a dictionary {'url': ''}\ - Pinecone needs dictionary {'environment': '', 'api_key': ''}" + Pinecone needs dictionary {'environment': '', 'api_key': ''}", ) @click.option( "--embedding_service", - type=click.Choice(['openai', 'cohere', 'sentence_transformer'], case_sensitive=False), - default="sentence_transformer", help="Text embedding service." + type=click.Choice(["openai", "cohere", "sentence_transformer"], case_sensitive=False), + default="sentence_transformer", + help="Text embedding service.", ) @click.option( "--embedding_service_api_key", default=None, - help='API Key for embedding service. Needed if using OpenAI or Cohere.' -) -@click.option( - "--model_name", - default=None, - help='Text embedding model name.' -) -@click.option( - '--display_dag', - is_flag=True, - help="Generate a .png of the Hamilton DAG" + help="API Key for embedding service. Needed if using OpenAI or Cohere.", ) +@click.option("--model_name", default=None, help="Text embedding model name.") +@click.option("--display_dag", is_flag=True, help="Generate a .png of the Hamilton DAG") def main( vector_db: str, vector_db_config: str, @@ -66,7 +59,7 @@ def main( model_name = "embed-english-light-v2.0" elif embedding_service == "sentence_transformer": model_name = "multi-qa-MiniLM-L6-cos-v1" - + config = dict( vector_db_config=json.loads(vector_db_config), embedding_service=embedding_service, # this triggers config.when() in embedding_module diff --git a/examples/LLM_Workflows/modular_llm_stack/weaviate_module.py b/examples/LLM_Workflows/modular_llm_stack/weaviate_module.py index af6907b67..6def0ec2c 100644 --- a/examples/LLM_Workflows/modular_llm_stack/weaviate_module.py +++ b/examples/LLM_Workflows/modular_llm_stack/weaviate_module.py @@ -42,9 +42,9 @@ def initialize_vector_db_indices(client_vector_db: weaviate.Client) -> bool: }, { "name": "model_name", - "dataType": ["string"], + "dataType": ["string"], "description": "model used by embedding service to create the vector", - } + }, ], }