Commit
1. Configuration linkage was not exposed, so this adds a parameter for specifying what configuration is required (see the pairing sketch after run_with_columns.py below).
2. There was a bug in the UDF logic, because `all(empty)` evaluates to True; fixed that (a minimal illustration follows this list).
3. The `select` in append mode appended all possible columns, not only what was in the `select`. This changes that behavior by dropping "outputs" from the subdag that aren't in the select; we do it this way because we don't know what's in the original dataframe (see the standalone sketch after spark_pdf_pipeline.py below).
4. Adds a test to check for the new logic.
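For context on point 2: Python's built-in all() over an empty iterable is vacuously True, so a membership check built with all() silently passes when the collection being checked is empty. The snippet below only illustrates the pitfall with hypothetical names; it is not the actual UDF-selection code from this commit.

# all() over an empty iterable is vacuously True.
assert all([]) is True

def udf_is_applicable(required_columns: list, available_columns: set) -> bool:
    # Buggy shape: when required_columns == [], this returns True,
    # which can make a UDF look applicable when it should not be.
    return all(col in available_columns for col in required_columns)

def udf_is_applicable_fixed(required_columns: list, available_columns: set) -> bool:
    # Hypothetical guarded variant: handle the empty case explicitly.
    # (The correct semantics for the empty case depend on the real code path.)
    if not required_columns:
        return False
    return all(col in available_columns for col in required_columns)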
Showing 6 changed files with 197 additions and 14 deletions.
examples/LLM_Workflows/pdf_summarizer/run_on_spark/run_with_columns.py — 59 additions, 0 deletions
"""Spark driver and Hamilton driver code.""" | ||
|
||
import spark_pdf_pipeline | ||
from pyspark.sql import SparkSession | ||
|
||
from hamilton import base, driver, log_setup | ||
|
||
|
||
def my_spark_job(spark: SparkSession, openai_gpt_model: str, content_type: str, user_query: str): | ||
"""Template for a Spark job that uses Hamilton for their featuring engineering, i.e. any map, operations. | ||
:param spark: the SparkSession | ||
:param openai_gpt_model: the model to use for summarization | ||
:param content_type: the content type of the document to summarize | ||
:param user_query: the user query to use for summarization | ||
""" | ||
dr = ( | ||
driver.Builder() | ||
.with_config({"file_type": "pdf"}) | ||
.with_modules(spark_pdf_pipeline) | ||
.with_adapter(base.DefaultAdapter()) | ||
.build() | ||
) | ||
# create inputs to the UDFs - this needs to be column_name -> spark dataframe. | ||
execute_inputs = { | ||
"spark_session": spark, | ||
"save_path": "summarized_pdf_df.parquet", | ||
"openai_gpt_model": openai_gpt_model, | ||
"content_type": content_type, | ||
"user_query": user_query, | ||
} | ||
output = ["saved_summarized_pdf_df"] | ||
# visualize execution of what is going to be appended | ||
dr.visualize_execution( | ||
output, | ||
"./spark_with_columns_summarization.png", | ||
inputs=execute_inputs, | ||
deduplicate_inputs=True, | ||
) | ||
# tell Hamilton to tell Spark what to do | ||
dict_result = dr.execute(output, inputs=execute_inputs) | ||
return dict_result["saved_summarized_pdf_df"] | ||
|
||
|
||
if __name__ == "__main__": | ||
import os | ||
|
||
openai_api_key = os.environ.get("OPENAI_API_KEY") | ||
log_setup.setup_logging(log_level=log_setup.LOG_LEVELS["INFO"]) | ||
# create the SparkSession -- note in real life, you'd adjust the number of executors to control parallelism. | ||
spark = SparkSession.builder.config( | ||
"spark.executorEnv.OPENAI_API_KEY", openai_api_key | ||
).getOrCreate() | ||
spark.sparkContext.setLogLevel("info") | ||
# run the job | ||
_df = my_spark_job(spark, "gpt-3.5-turbo-0613", "Scientific article", "Can you ELI5 the paper?") | ||
# show the dataframe & thus make spark compute something | ||
_df.show() | ||
spark.stop() |
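Commit point 1 (exposing the configuration linkage) shows up as a pairing between this driver and the pipeline module: the driver supplies {"file_type": "pdf"} via .with_config(...), and the with_columns decorator in spark_pdf_pipeline.py (next file) declares config_required=["file_type"]. A minimal sketch of that pairing follows; the comment about what happens when the key is missing is an assumption, not documented behavior.

# Driver side (this file): supply the configuration key the subdag needs.
dr = (
    driver.Builder()
    .with_config({"file_type": "pdf"})  # satisfies config_required=["file_type"]
    .with_modules(spark_pdf_pipeline)
    .build()
)

# Pipeline side (spark_pdf_pipeline.py, next file): declare the requirement.
# @with_columns(
#     summarization,
#     select=["summarized_chunks", "summarized_text"],
#     columns_to_pass=["pdf_source"],
#     config_required=["file_type"],  # if "file_type" were absent from the driver
# )                                   # config, building the driver presumably fails validation.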
examples/LLM_Workflows/pdf_summarizer/run_on_spark/spark_pdf_pipeline.py — 34 additions, 0 deletions
import pandas as pd
import pyspark.sql as ps
import summarization

from hamilton.plugins.h_spark import with_columns


def pdf_df(spark_session: ps.SparkSession) -> ps.DataFrame:
    pandas_df = pd.DataFrame(
        # TODO: update this to point to a PDF or two.
        {"pdf_source": ["CDMS_HAMILTON_PAPER.pdf"]}
    )
    df = spark_session.createDataFrame(pandas_df)
    return df


@with_columns(
    summarization,
    select=["summarized_chunks", "summarized_text"],
    columns_to_pass=["pdf_source"],
    config_required=["file_type"],
)
def summarized_pdf_df(pdf_df: ps.DataFrame) -> ps.DataFrame:
    return pdf_df


def saved_summarized_pdf_df(
    summarized_pdf_df: ps.DataFrame, save_path: str, persist_before_save: bool = True
) -> ps.DataFrame:
    """Save the summarized PDF dataframe to a parquet file."""
    if persist_before_save:
        summarized_pdf_df.persist()
    summarized_pdf_df.write.parquet(save_path, mode="overwrite")
    return summarized_pdf_df
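To make commit point 3 concrete: with the decorator above, only summarized_chunks and summarized_text are appended to pdf_df; any other outputs the summarization subdag computes are dropped before the result reaches summarized_pdf_df. Below is a standalone pyspark-only sketch of that "append only what's in select" behavior; the intermediate column name raw_text and the literal values are hypothetical, and no Hamilton code is involved.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("doc.pdf",)], ["pdf_source"])

# Pretend the subdag computed one intermediate and two selected outputs.
computed = {
    "raw_text": F.lit("...extracted text..."),            # intermediate, NOT selected
    "summarized_chunks": F.lit("...chunk summaries..."),  # selected
    "summarized_text": F.lit("...final summary..."),      # selected
}
select = ["summarized_chunks", "summarized_text"]

result = df
for name, col in computed.items():
    result = result.withColumn(name, col)
# New behavior: drop computed outputs that are not in `select`,
# leaving the original columns plus only the selected outputs.
result = result.drop(*[name for name in computed if name not in select])

print(result.columns)  # ['pdf_source', 'summarized_chunks', 'summarized_text']
spark.stop()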
Binary file added (+118 KB): .../LLM_Workflows/pdf_summarizer/run_on_spark/spark_with_columns_summarization.png