Skip to content

Commit

Permalink
Fixing pre-commit (#207)
Browse files Browse the repository at this point in the history
Examples weren't formatted properly.
  • Loading branch information
skrawcz authored Jul 3, 2023
1 parent 2316fc7 commit 51d0626
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 53 deletions.
8 changes: 4 additions & 4 deletions examples/LLM_Workflows/modular_llm_stack/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@ This example shows how to pull data from the HuggingFace datasets hub, create em
![](./weaviate_dag.png)
*DAG for OpenAI embeddings and Weaviate vector database*

In addition, you'll see how Hamilton can help you create replaceable components. This flexibility makes it easier to assess service providers and refactor code to fit your needs. The above and below DAGs were generated simply by changing a string value and a module import. Try to spot the differences!
In addition, you'll see how Hamilton can help you create replaceable components. This flexibility makes it easier to assess service providers and refactor code to fit your needs. The above and below DAGs were generated simply by changing a string value and a module import. Try to spot the differences!

![](./pinecone_dag.png)
*DAG for SentenceTransformers embeddings and Pinecone vector database*

# Example structure
- `run.py` contains the code to test the example. It uses `click` to provide a simple command interface.
- `run.py` contains the code to test the example. It uses `click` to provide a simple command interface.
- `data_module.py` contains the code to pull data from HuggingFace. The code is in a separate Python module since it doesn't depend on the other functionalities and could include more involved preprocessing.
- `embedding_module.py` contains the code to embed text using either Cohere API, OpenAI API or SentenceTransformer library. The use of `@config.when` allows you to keep all options in the same Python module. This lets you quickly rerun your Hamilton DAG by simply changing your config. You'll see that functions share similar signatures to enable interchangeability.
- `lancedb_module.py`, `weaviate_module.py` and `pinecone_module.py` implement the same functionalities for each vector database. Having the same function names allows Hamilton to abstract away the implementation details and reinforce the notion that both modules shouldn't be loaded simultaneously.
Expand All @@ -20,7 +20,7 @@ In addition, you'll see how Hamilton can help you create replaceable components.
# How-to run the example
Prerequisites:
- Create accounts and get the API keys for the service you plan to use.
- Create your python environment, and do `pip install -r requirements.txt`.
- Create your python environment, and do `pip install -r requirements.txt`.
- For Weaviate start your local instance using `docker compose up -d`
1. Run `python run.py --help` to learn about the options. You will have options to:
- Select a vector database from: weaviate, pinecone
Expand All @@ -41,4 +41,4 @@ To change vector database you need to pass a JSON config argument:

# Next step / Exercises
- Implement the code to read data from the vector database
- Add the code to send the same generative prompt to multiple providers
- Add the code to send the same generative prompt to multiple providers
13 changes: 3 additions & 10 deletions examples/LLM_Workflows/modular_llm_stack/embedding_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,7 @@ def embedding_config__openai(embedding_service: str, model_name: str) -> dict:
if model_name == "text-embedding-ada-002":
return dict(embedding_dimension=1536, embedding_metric="cosine")
# If you support more models, you would add that here
raise ValueError(
f"Invalid `model_name`[{model_name}] for openai was passed."
)
raise ValueError(f"Invalid `model_name`[{model_name}] for openai was passed.")


@config.when(embedding_service="cohere")
Expand All @@ -35,9 +33,7 @@ def embedding_config__cohere(embedding_service: str, model_name: str) -> dict:
if model_name == "embed-english-light-v2.0":
return dict(embedding_dimension=1024, embedding_metric="cosine")
# If you support more models, you would add that here
raise ValueError(
f"Invalid `model_name`[{model_name}] for Cohere was passed."
)
raise ValueError(f"Invalid `model_name`[{model_name}] for Cohere was passed.")


@config.when(embedding_service="sentence_transformer")
Expand All @@ -51,17 +47,14 @@ def embedding_config__sentence_transformer(embedding_service: str, model_name: s
if model_name == "multi-qa-MiniLM-L6-cos-v1":
return dict(embedding_dimension=384, embedding_metric="cosine")
# If you support more models, you would add that here
raise ValueError(
f"Invalid `model_name`[{model_name}] for SentenceTransformer was passed."
)
raise ValueError(f"Invalid `model_name`[{model_name}] for SentenceTransformer was passed.")


def metadata(embedding_service: str, model_name: str) -> dict:
    """Bundle the embedding provider details into a single dictionary.

    :param embedding_service: identifier of the embedding service in use.
    :param model_name: name of the embedding model in use.
    :return: dict with the keys ``embedding_service`` and ``model_name``,
        each mapped to the corresponding argument.
    """
    return {"embedding_service": embedding_service, "model_name": model_name}



@config.when(embedding_service="openai")
def embedding_provider__openai(api_key: str) -> ModuleType:
"""Set OpenAI API key"""
Expand Down
34 changes: 16 additions & 18 deletions examples/LLM_Workflows/modular_llm_stack/lancedb_module.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import lancedb
import numpy as np
import pandas as pd
import pyarrow as pa
import lancedb


def client_vector_db(vector_db_config: dict) -> lancedb.LanceDBConnection:
Expand All @@ -17,21 +17,19 @@ def initialize_vector_db_indices(
"""Initialize the LanceDB table;
NOTE this pattern currently doesn't work and is due to a bug with lancedb
"""
schema = pa.schema([
("squad_id", pa.string()),
("title", pa.string()),
("context", pa.string()),
("embedding_service", pa.string()),
("model_name", pa.string()),
pa.field("vector", type=pa.list_(pa.float32(), list_size=embedding_dimension)),
])

client_vector_db.create_table(
name=class_name,
schema=schema,
mode="create"
schema = pa.schema(
[
("squad_id", pa.string()),
("title", pa.string()),
("context", pa.string()),
("embedding_service", pa.string()),
("model_name", pa.string()),
pa.field("vector", type=pa.list_(pa.float32(), list_size=embedding_dimension)),
]
)


client_vector_db.create_table(name=class_name, schema=schema, mode="create")

return True


Expand All @@ -43,11 +41,11 @@ def reset_vector_db(client_vector_db: lancedb.LanceDBConnection) -> bool:


def data_objects(
ids: list[str],
ids: list[str],
titles: list[str],
text_contents: list[str],
embeddings: list[np.ndarray],
metadata: dict
metadata: dict,
) -> list[dict]:
"""Create valid LanceDB objects"""
assert len(ids) == len(titles) == len(text_contents) == len(embeddings)
Expand All @@ -69,4 +67,4 @@ def push_to_vector_db(
df = pd.DataFrame.from_records(data_objects)
table = client_vector_db.create_table(name=class_name, data=df, mode="overwrite")

return table.to_pandas().shape[0]
return table.to_pandas().shape[0]
2 changes: 1 addition & 1 deletion examples/LLM_Workflows/modular_llm_stack/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
click
cohere
datasets
lancedb
Expand All @@ -9,4 +10,3 @@ sentence-transformers
sf-hamilton
sf-hamilton[visualization]
weaviate-client
click
29 changes: 11 additions & 18 deletions examples/LLM_Workflows/modular_llm_stack/run.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import json

import click

import data_module
import embedding_module
import lancedb_module
Expand All @@ -14,36 +13,30 @@
@click.command()
@click.option(
"--vector_db",
type=click.Choice(['lancedb', 'weaviate', 'pinecone'], case_sensitive=False),
default="lancedb", help="Vector database service"
type=click.Choice(["lancedb", "weaviate", "pinecone"], case_sensitive=False),
default="lancedb",
help="Vector database service",
)
@click.option(
"--vector_db_config",
default='{"uri": "data/lancedb"}',
help="Pass a JSON string for vector database config.\
Weaviate needs a dictionary {'url': ''}\
Pinecone needs dictionary {'environment': '', 'api_key': ''}"
Pinecone needs dictionary {'environment': '', 'api_key': ''}",
)
@click.option(
"--embedding_service",
type=click.Choice(['openai', 'cohere', 'sentence_transformer'], case_sensitive=False),
default="sentence_transformer", help="Text embedding service."
type=click.Choice(["openai", "cohere", "sentence_transformer"], case_sensitive=False),
default="sentence_transformer",
help="Text embedding service.",
)
@click.option(
"--embedding_service_api_key",
default=None,
help='API Key for embedding service. Needed if using OpenAI or Cohere.'
)
@click.option(
"--model_name",
default=None,
help='Text embedding model name.'
)
@click.option(
'--display_dag',
is_flag=True,
help="Generate a .png of the Hamilton DAG"
help="API Key for embedding service. Needed if using OpenAI or Cohere.",
)
@click.option("--model_name", default=None, help="Text embedding model name.")
@click.option("--display_dag", is_flag=True, help="Generate a .png of the Hamilton DAG")
def main(
vector_db: str,
vector_db_config: str,
Expand All @@ -66,7 +59,7 @@ def main(
model_name = "embed-english-light-v2.0"
elif embedding_service == "sentence_transformer":
model_name = "multi-qa-MiniLM-L6-cos-v1"

config = dict(
vector_db_config=json.loads(vector_db_config),
embedding_service=embedding_service, # this triggers config.when() in embedding_module
Expand Down
4 changes: 2 additions & 2 deletions examples/LLM_Workflows/modular_llm_stack/weaviate_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@ def initialize_vector_db_indices(client_vector_db: weaviate.Client) -> bool:
},
{
"name": "model_name",
"dataType": ["string"],
"dataType": ["string"],
"description": "model used by embedding service to create the vector",
}
},
],
}

Expand Down

0 comments on commit 51d0626

Please sign in to comment.