Fixing pre-commit (#207)

Examples weren't formatted properly.
DAGWorks-Inc · Jul 3, 2023 · 51d0626 · 51d0626
1 parent 2316fc7
commit 51d0626
Show file tree

Hide file tree

Showing 6 changed files with 37 additions and 53 deletions.
diff --git a/examples/LLM_Workflows/modular_llm_stack/README.md b/examples/LLM_Workflows/modular_llm_stack/README.md
@@ -5,13 +5,13 @@ This example shows how to pull data from the HuggingFace datasets hub, create em
 ![](./weaviate_dag.png)
 *DAG for OpenAI embeddings and Weaviate vector database*
 
-In addition, you'll see how Hamilton can help you create replaceable components. This flexibility, makes it easier to assess service providers and refactor code to fit your needs. The above and below DAGs were generated simply by changing a string value and a module import. Try to spot the differences! 
+In addition, you'll see how Hamilton can help you create replaceable components. This flexibility, makes it easier to assess service providers and refactor code to fit your needs. The above and below DAGs were generated simply by changing a string value and a module import. Try to spot the differences!
 
 ![](./pinecone_dag.png)
 *DAG for SentenceTransformers embeddings and Pinecone vector database*
 
 # Example structure
-- `run.py` contains the code to test the example. It uses `click` to provide a simple command interface. 
+- `run.py` contains the code to test the example. It uses `click` to provide a simple command interface.
 - `data_module.py` contains the code to pull data from HuggingFace. The code is in a separate Python module since it doesn't depend on the other functionalities and could include more involved preprocessing.
 - `embedding_module.py` contains the code to embed text using either Cohere API, OpenAI API or SentenceTransformer library. The use of `@config.when` allows to have all options in the same Python module. This allows to quickly rerun your Hamilton DAG by simply changing your config. You'll see that functions share similar signature to enable interchangeability.
 - `lancedb_module.py`, `weaviate_module.py` and `pinecone_module.py` implement the same functionalities for each vector database. Having the same function names allows Hamilton to abstract away the implementation details and reinforce the notion that both modules shouldn't be loaded simultaneously.
@@ -20,7 +20,7 @@ In addition, you'll see how Hamilton can help you create replaceable components.
 # How-to run the example
 Prerequisite:
 - Create accounts and get the API keys for the service you plan to use.
-- Create your python environment, and do `pip install -r requirements.txt`. 
+- Create your python environment, and do `pip install -r requirements.txt`.
 - For Weaviate start your local instance using `docker compose up -d`
 1. Run `python run.py --help` to learn about the options. You will see options to:
     - Select a vector database from: weaviate, pinecone
@@ -41,4 +41,4 @@ To change vector database you need to pass a JSON config argument:
 
 # Next step / Exercises
 - Implement the code to read data from the vector database
-- Add the code to send the same generative prompt to multiple providers 
+- Add the code to send the same generative prompt to multiple providers
diff --git a/examples/LLM_Workflows/modular_llm_stack/embedding_module.py b/examples/LLM_Workflows/modular_llm_stack/embedding_module.py
@@ -19,9 +19,7 @@ def embedding_config__openai(embedding_service: str, model_name: str) -> dict:
     if model_name == "text-embedding-ada-002":
         return dict(embedding_dimension=1536, embedding_metric="cosine")
     # If you support more models, you would add that here
-    raise ValueError(
-            f"Invalid `model_name`[{model_name}] for openai was passed."
-        )
+    raise ValueError(f"Invalid `model_name`[{model_name}] for openai was passed.")
 
 
 @config.when(embedding_service="cohere")
@@ -35,9 +33,7 @@ def embedding_config__cohere(embedding_service: str, model_name: str) -> dict:
     if model_name == "embed-english-light-v2.0":
         return dict(embedding_dimension=1024, embedding_metric="cosine")
     # If you support more models, you would add that here
-    raise ValueError(
-            f"Invalid `model_name`[{model_name}] for Cohere was passed."
-        )
+    raise ValueError(f"Invalid `model_name`[{model_name}] for Cohere was passed.")
 
 
 @config.when(embedding_service="sentence_transformer")
@@ -51,17 +47,14 @@ def embedding_config__sentence_transformer(embedding_service: str, model_name: s
     if model_name == "multi-qa-MiniLM-L6-cos-v1":
         return dict(embedding_dimension=384, embedding_metric="cosine")
     # If you support more models, you would add that here
-    raise ValueError(
-            f"Invalid `model_name`[{model_name}] for SentenceTransformer was passed."
-        )
+    raise ValueError(f"Invalid `model_name`[{model_name}] for SentenceTransformer was passed.")
 
 
 def metadata(embedding_service: str, model_name: str) -> dict:
     """Create metadata dictionary"""
     return dict(embedding_service=embedding_service, model_name=model_name)
 
 
-
 @config.when(embedding_service="openai")
 def embedding_provider__openai(api_key: str) -> ModuleType:
     """Set OpenAI API key"""

diff --git a/examples/LLM_Workflows/modular_llm_stack/lancedb_module.py b/examples/LLM_Workflows/modular_llm_stack/lancedb_module.py
@@ -1,7 +1,7 @@
+import lancedb
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-import lancedb
 
 
 def client_vector_db(vector_db_config: dict) -> lancedb.LanceDBConnection:
@@ -17,21 +17,19 @@ def initialize_vector_db_indices(
     """Initialize the LanceDB table;
     NOTE this pattern currently doesn't work and is due to a bug with lancedb
     """
-    schema = pa.schema([
-        ("squad_id", pa.string()),
-        ("title", pa.string()),
-        ("context", pa.string()),
-        ("embedding_service", pa.string()),
-        ("model_name", pa.string()),
-        pa.field("vector", type=pa.list_(pa.float32(), list_size=embedding_dimension)),
-    ])
-
-    client_vector_db.create_table(
-        name=class_name,
-        schema=schema,
-        mode="create"
+    schema = pa.schema(
+        [
+            ("squad_id", pa.string()),
+            ("title", pa.string()),
+            ("context", pa.string()),
+            ("embedding_service", pa.string()),
+            ("model_name", pa.string()),
+            pa.field("vector", type=pa.list_(pa.float32(), list_size=embedding_dimension)),
+        ]
     )
-
+
+    client_vector_db.create_table(name=class_name, schema=schema, mode="create")
+
     return True
 
 
@@ -43,11 +41,11 @@ def reset_vector_db(client_vector_db: lancedb.LanceDBConnection) -> bool:
 
 
 def data_objects(
-    ids: list[str], 
+    ids: list[str],
     titles: list[str],
     text_contents: list[str],
     embeddings: list[np.ndarray],
-    metadata: dict
+    metadata: dict,
 ) -> list[dict]:
     """Create valid LanceDB objects"""
     assert len(ids) == len(titles) == len(text_contents) == len(embeddings)
@@ -69,4 +67,4 @@ def push_to_vector_db(
     df = pd.DataFrame.from_records(data_objects)
     table = client_vector_db.create_table(name=class_name, data=df, mode="overwrite")
 
-    return table.to_pandas().shape[0]
+    return table.to_pandas().shape[0]
diff --git a/examples/LLM_Workflows/modular_llm_stack/requirements.txt b/examples/LLM_Workflows/modular_llm_stack/requirements.txt
@@ -1,3 +1,4 @@
+click
 cohere
 datasets
 lancedb
@@ -9,4 +10,3 @@ sentence-transformers
 sf-hamilton
 sf-hamilton[visualization]
 weaviate-client
-click
diff --git a/examples/LLM_Workflows/modular_llm_stack/run.py b/examples/LLM_Workflows/modular_llm_stack/run.py
@@ -1,7 +1,6 @@
 import json
 
 import click
-
 import data_module
 import embedding_module
 import lancedb_module
@@ -14,36 +13,30 @@
 @click.command()
 @click.option(
     "--vector_db",
-    type=click.Choice(['lancedb', 'weaviate', 'pinecone'], case_sensitive=False),
-    default="lancedb", help="Vector database service"
+    type=click.Choice(["lancedb", "weaviate", "pinecone"], case_sensitive=False),
+    default="lancedb",
+    help="Vector database service",
 )
 @click.option(
     "--vector_db_config",
     default='{"uri": "data/lancedb"}',
     help="Pass a JSON string for vector database config.\
         Weaviate needs a dictionary {'url': ''}\
-        Pinecone needs dictionary {'environment': '', 'api_key': ''}"
+        Pinecone needs dictionary {'environment': '', 'api_key': ''}",
 )
 @click.option(
     "--embedding_service",
-    type=click.Choice(['openai', 'cohere', 'sentence_transformer'], case_sensitive=False),
-    default="sentence_transformer", help="Text embedding service."
+    type=click.Choice(["openai", "cohere", "sentence_transformer"], case_sensitive=False),
+    default="sentence_transformer",
+    help="Text embedding service.",
 )
 @click.option(
     "--embedding_service_api_key",
     default=None,
-    help='API Key for embedding service. Needed if using OpenAI or Cohere.'
-)
-@click.option(
-    "--model_name", 
-    default=None,
-    help='Text embedding model name.'
-)
-@click.option(
-    '--display_dag',
-    is_flag=True,
-    help="Generate a .png of the Hamilton DAG"
+    help="API Key for embedding service. Needed if using OpenAI or Cohere.",
 )
+@click.option("--model_name", default=None, help="Text embedding model name.")
+@click.option("--display_dag", is_flag=True, help="Generate a .png of the Hamilton DAG")
 def main(
     vector_db: str,
     vector_db_config: str,
@@ -66,7 +59,7 @@ def main(
             model_name = "embed-english-light-v2.0"
         elif embedding_service == "sentence_transformer":
             model_name = "multi-qa-MiniLM-L6-cos-v1"
-    
+
     config = dict(
         vector_db_config=json.loads(vector_db_config),
         embedding_service=embedding_service,  # this triggers config.when() in embedding_module

diff --git a/examples/LLM_Workflows/modular_llm_stack/weaviate_module.py b/examples/LLM_Workflows/modular_llm_stack/weaviate_module.py
@@ -42,9 +42,9 @@ def initialize_vector_db_indices(client_vector_db: weaviate.Client) -> bool:
             },
             {
                 "name": "model_name",
-                "dataType": ["string"], 
+                "dataType": ["string"],
                 "description": "model used by embedding service to create the vector",
-            }
+            },
         ],
     }