Created a PDF summarizer example #250

Merged 5 commits on Aug 4, 2023
26 changes: 26 additions & 0 deletions examples/LLM_Workflows/pdf_summarizer/README.md
@@ -0,0 +1,26 @@
# (Yet another) LLM PDF Summarizer 📝
Here's an extensible and production-ready PDF summarizer that you can run anywhere! The frontend uses Streamlit, which communicates with a FastAPI backend powered by Hamilton. You give it a PDF file via the browser app and it returns a text summary using the OpenAI API. If you want, you can skip the browser interface and directly access the `/summarize` endpoint with your document! Everything is containerized using Docker, so you should be able to run it where you please 🏃.

## Why build this project?
This project shows how easy it is to productionize Hamilton. Its function-centric, declarative approach makes the code easy to read and extend. We invite you to clone the repo and customize it to your needs! We are happy to help you via [Slack](https://hamilton-opensource.slack.com/join/shared_invite/zt-1bjs72asx-wcUTgH7q7QX1igiQ5bbdcg) and are excited to see what you build 😁

Here are a few ideas:
- Modify the Streamlit `file_uploader` to allow sending batches of files through the UI
- Add PDF parsing and preprocessing to reduce the number of tokens sent to OpenAI
- Add Hamilton functions to gather metadata (file length, number of tokens, language, etc.) and return it via `SummarizeResponse`
- Support other file formats; use the `@config.when()` decorator to add alternatives to the `raw_text()` function for PDFs (see the sketch after this list)
- Extract structured data from PDFs using open source models from the HuggingFace Hub
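
For the file-format idea above, here's a minimal sketch of what an alternative to `raw_text()` could look like. It's illustrative only: the `file_type="txt"` config value and the plain-text reading logic are assumptions, not part of this example's code.

```python
# Hypothetical sketch: a plain-text alternative to raw_text(), selected when the
# Hamilton driver config contains file_type="txt". Hamilton strips the "__txt"
# suffix, so this function still defines the raw_text node in the DAG.
import tempfile

from hamilton.function_modifiers import config


@config.when(file_type="txt")
def raw_text__txt(pdf_source: str | bytes | tempfile.SpooledTemporaryFile) -> str:
    """Reads a plain-text source and returns its contents as a string."""
    if isinstance(pdf_source, str):  # treat a string as a filepath
        with open(pdf_source, encoding="utf-8") as f:
            return f.read()
    if isinstance(pdf_source, bytes):
        return pdf_source.decode("utf-8")
    return pdf_source.read().decode("utf-8")  # file-like object, e.g. an upload
```

Keeping the `pdf_source` parameter name means `server.py` wouldn't need to change; swapping the `file_type` value in `driver_config` selects which implementation runs.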


![](./backend/summarization_module.png)
*The Hamilton execution DAG powering the backend*


# Setup
1. Clone this repository: `git clone https://github.com/dagworks-inc/hamilton.git`
2. Move to the directory: `cd hamilton/examples/LLM_Workflows/pdf_summarizer`
3. Create a `.env` file (next to `README.md` and `docker-compose.yaml`) and add your OpenAI API key to it, such that it contains `OPENAI_API_KEY=YOUR_API_KEY`
4. Build the Docker images: `docker compose build`
5. Start the Docker containers: `docker compose up -d`
6. Go to [http://localhost:8080/docs](http://localhost:8080/docs) to check that the FastAPI server is running
7. Go to [http://localhost:8081](http://localhost:8081) to view the Streamlit app
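
Once the containers are running, you can also call the backend without the Streamlit UI. Here's a minimal sketch using the `requests` library; `paper.pdf` is a placeholder, and the query parameters simply mirror the endpoint's defaults:

```python
# Minimal sketch: POST a PDF to the FastAPI backend's /summarize endpoint.
import requests

with open("paper.pdf", "rb") as f:
    response = requests.post(
        "http://localhost:8080/summarize",
        files={"pdf_file": f},  # multipart upload, matching fastapi.UploadFile
        params={
            "openai_gpt_model": "gpt-3.5-turbo-0613",
            "content_type": "Scientific article",
            "user_query": "Can you ELI5 the paper?",
        },
    )
response.raise_for_status()
print(response.json()["summary"])
```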
20 changes: 20 additions & 0 deletions examples/LLM_Workflows/pdf_summarizer/backend/Dockerfile
@@ -0,0 +1,20 @@
FROM python:3.10-slim-bullseye

WORKDIR /app

# install graphviz backend
RUN apt-get update \
&& apt-get install -y --no-install-recommends graphviz \
&& apt-get autoremove -yqq --purge \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

EXPOSE 8080

# loads server.py module and the FastAPI `app` object
CMD uvicorn server:app --host 0.0.0.0 --port 8080
10 changes: 10 additions & 0 deletions examples/LLM_Workflows/pdf_summarizer/backend/requirements.txt
@@ -0,0 +1,10 @@
fastapi
openai
pydantic
PyPDF2
python-multipart
sf-hamilton[visualization]
tenacity
tiktoken
tqdm
uvicorn
94 changes: 94 additions & 0 deletions examples/LLM_Workflows/pdf_summarizer/backend/server.py
@@ -0,0 +1,94 @@
import base64

import fastapi
import pydantic
import summarization

from hamilton import base, driver
from hamilton.experimental import h_async

# instantiate FastAPI app
app = fastapi.FastAPI()


# define constants for Hamilton driver
driver_config = dict(
file_type="pdf",
)

# instantiate the Hamilton driver; it will power all API endpoints
# async driver for use with async functions
async_dr = h_async.AsyncDriver(
driver_config,
summarization, # python module containing function logic
result_builder=base.DictResult(),
)
# sync driver for use with regular functions
sync_dr = driver.Driver(
driver_config,
summarization, # python module containing function logic
adapter=base.SimplePythonGraphAdapter(base.DictResult()),
)


class SummarizeResponse(pydantic.BaseModel):
"""Response to the /summarize endpoint"""

summary: str


@app.post("/summarize")
async def summarize_pdf(
pdf_file: fastapi.UploadFile,
openai_gpt_model: str = "gpt-3.5-turbo-0613",
content_type: str = "Scientific article",
user_query: str = "Can you ELI5 the paper?",
) -> SummarizeResponse:
"""Request `summarized_text` from Hamilton driver with `pdf_file` and `user_query`"""
results = await async_dr.execute(
["summarized_text"],
inputs=dict(
pdf_source=pdf_file.file,
openai_gpt_model=openai_gpt_model,
content_type=content_type,
user_query=user_query,
),
)

return SummarizeResponse(summary=results["summarized_text"])


@app.post("/summarize_sync")
def summarize_pdf_sync(
pdf_file: fastapi.UploadFile,
openai_gpt_model: str = "gpt-3.5-turbo-0613",
content_type: str = "Scientific article",
user_query: str = "Can you ELI5 the paper?",
) -> SummarizeResponse:
"""Request `summarized_text` from Hamilton driver with `pdf_file` and `user_query`"""
results = sync_dr.execute(
["summarized_text"],
inputs=dict(
pdf_source=pdf_file.file,
openai_gpt_model=openai_gpt_model,
content_type=content_type,
user_query=user_query,
),
)

return SummarizeResponse(summary=results["summarized_text"])


# embed the execution DAG PNG into the SwaggerUI description of the most recently
# added route (here, /summarize_sync); see http://localhost:8080/docs
with open("summarize_route.png", "rb") as f:
    base64_viz = base64.b64encode(f.read()).decode("utf-8")
app.routes[-1].description = (
    f"""<h1>Execution DAG</h1><img alt="" src="data:image/png;base64,{base64_viz}"/>"""
)


if __name__ == "__main__":
# run as a script to test server locally
import uvicorn

uvicorn.run(app, host="0.0.0.0", port=8080)
166 changes: 166 additions & 0 deletions examples/LLM_Workflows/pdf_summarizer/backend/summarization.py
@@ -0,0 +1,166 @@
import concurrent.futures
import tempfile
from typing import Generator

import openai
import tiktoken
from PyPDF2 import PdfReader
from tenacity import retry, stop_after_attempt, wait_random_exponential
from tqdm import tqdm

from hamilton.function_modifiers import config


def summarize_chunk_of_text_prompt(content_type: str = "an academic paper") -> str:
"""Base prompt for summarizing chunks of text."""
return f"Summarize this text from {content_type}. Extract any key points with reasoning.\n\nContent:"


def summarize_text_from_summaries_prompt(content_type: str = "an academic paper") -> str:
"""Prompt for summarizing a paper from a list of summaries."""
return f"""Write a summary collated from this collection of key points extracted from {content_type}.
The summary should highlight the core argument, conclusions and evidence, and answer the user's query.
User query: {{query}}
The summary should be structured in bulleted lists following the headings Core Argument, Evidence, and Conclusions.
Key points:\n{{results}}\nSummary:\n"""


@config.when(file_type="pdf")
def raw_text(pdf_source: str | bytes | tempfile.SpooledTemporaryFile) -> str:
"""Takes a filepath to a PDF and returns a string of the PDF's contents
:param pdf_source: Series of filepaths to PDFs
:return: Series of strings of the PDFs' contents
"""
reader = PdfReader(pdf_source)
_pdf_text = ""
page_number = 0
for page in reader.pages:
page_number += 1
_pdf_text += page.extract_text() + f"\nPage Number: {page_number}"
return _pdf_text


def _create_chunks(
    text: str, n: int, tokenizer: tiktoken.Encoding
) -> Generator[list[int], None, None]:
    """Helper function. Yields successive chunks of roughly n tokens from the provided text,
    preferring to end each chunk at the end of a sentence.
    :param text: the text to split into chunks.
    :param n: the target chunk size, in tokens.
    :param tokenizer: the tiktoken encoding used to tokenize the text.
    :return: generator over lists of token ids, one list per chunk.
    """
tokens = tokenizer.encode(text)
i = 0
while i < len(tokens):
# Find the nearest end of sentence within a range of 0.5 * n and 1.5 * n tokens
j = min(i + int(1.5 * n), len(tokens))
while j > i + int(0.5 * n):
# Decode the tokens and check for full stop or newline
chunk = tokenizer.decode(tokens[i:j])
if chunk.endswith(".") or chunk.endswith("\n"):
break
j -= 1
# If no end of sentence found, use n tokens as the chunk size
if j == i + int(0.5 * n):
j = min(i + n, len(tokens))
yield tokens[i:j]
i = j


def chunked_text(
raw_text: str, max_token_length: int = 1500, tokenizer_encoding: str = "cl100k_base"
) -> list[str]:
"""Chunks the pdf text into smaller chunks of size max_token_length.
:param pdf_text: the Series of individual pdf texts to chunk.
:param max_token_length: the maximum length of tokens in each chunk.
:param tokenizer_encoding: the encoding to use for the tokenizer.
:return: Series of chunked pdf text. Each element is a list of chunks.
"""
tokenizer = tiktoken.get_encoding(tokenizer_encoding)
_encoded_chunks = _create_chunks(raw_text, max_token_length, tokenizer)
_decoded_chunks = [tokenizer.decode(chunk) for chunk in _encoded_chunks]
return _decoded_chunks


@retry(wait=wait_random_exponential(min=1, max=40), stop=stop_after_attempt(3))
def _summarize_chunk(content: str, template_prompt: str, openai_gpt_model: str) -> str:
"""This function applies a prompt to some input content. In this case it returns a summarized chunk of text.
:param content: the content to summarize.
:param template_prompt: the prompt template to use to put the content into.
:param openai_gpt_model: the openai gpt model to use.
:return: the response from the openai API.
"""
prompt = template_prompt + content
response = openai.ChatCompletion.create(
model=openai_gpt_model, messages=[{"role": "user", "content": prompt}], temperature=0
)
return response["choices"][0]["message"]["content"]


def summarized_chunks(
chunked_text: list[str], summarize_chunk_of_text_prompt: str, openai_gpt_model: str
) -> str:
"""Summarizes a series of chunks of text.
Note: this takes the first result from the top_n_related_articles series and summarizes it. This is because
the top_n_related_articles series is sorted by relatedness, so the first result is the most related.
:param top_n_related_articles: series with each entry being a list of chunks of text for an article.
:param summarize_chunk_of_text_prompt: the prompt to use to summarize each chunk of text.
:param openai_gpt_model: the openai gpt model to use.
:return: a single string of each chunk of text summarized, concatenated together.
"""
_summarized_text = ""
    # max_workers must be >= 1, so guard against an empty chunk list
    with concurrent.futures.ThreadPoolExecutor(max_workers=max(len(chunked_text), 1)) as executor:
futures = [
executor.submit(
_summarize_chunk, chunk, summarize_chunk_of_text_prompt, openai_gpt_model
)
for chunk in chunked_text
]
with tqdm(total=len(chunked_text)) as pbar:
for _ in concurrent.futures.as_completed(futures):
pbar.update(1)
for future in futures:
data = future.result()
_summarized_text += data
return _summarized_text


def summarized_text(
user_query: str,
summarized_chunks: str,
summarize_text_from_summaries_prompt: str,
openai_gpt_model: str,
) -> str:
"""Summarizes the text from the summarized chunks of the pdf.
:param user_query: the original user query.
:param summarized_chunks: a long string of chunked summaries of a file.
:param summarize_text_from_summaries_prompt: the template to use to summarize the chunks.
:param openai_gpt_model: which openai gpt model to use.
:return: the string response from the openai API.
"""
response = openai.ChatCompletion.create(
model=openai_gpt_model,
messages=[
{
"role": "user",
"content": summarize_text_from_summaries_prompt.format(
query=user_query, results=summarized_chunks
),
}
],
temperature=0,
)
return response["choices"][0]["message"]["content"]


if __name__ == "__main__":
# run as a script to test Hamilton's execution
import summarization

from hamilton import base, driver

dr = driver.Driver(
{},
summarization,
adapter=base.SimplePythonGraphAdapter(base.DictResult()),
)
dr.display_all_functions("summary", {"format": "png"})
(Two binary PNG files added, the DAG visualizations `backend/summarization_module.png` and `backend/summarize_route.png`; GitHub cannot render them in the diff view.)
24 changes: 24 additions & 0 deletions examples/LLM_Workflows/pdf_summarizer/docker-compose.yaml
@@ -0,0 +1,24 @@
version: "3"
services:
api:
container_name: fastapi_server
build: backend/.
command: "uvicorn server:app --host 0.0.0.0 --port 8080"
ports:
- "8080:8080"
environment:
- OPENAI_API_KEY=${OPENAI_API_KEY}
networks:
- pdf-summarizer

app:
container_name: streamlit_app
build: frontend/.
command: "streamlit run --server.port 8081 --server.enableCORS false app.py"
ports:
- "8081:8081"
networks:
- pdf-summarizer

networks:
pdf-summarizer:
12 changes: 12 additions & 0 deletions examples/LLM_Workflows/pdf_summarizer/frontend/Dockerfile
@@ -0,0 +1,12 @@
FROM python:3.10-slim-bullseye

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

EXPOSE 8081

CMD streamlit run --server.port 8081 --server.enableCORS false app.py