From 92c9af085a8ab4a0398b31ad6d8a4a547980c576 Mon Sep 17 00:00:00 2001 From: Victor Miti Date: Sat, 24 Jun 2023 11:22:25 +0100 Subject: [PATCH] feat: separate modules for summarization backends --- README.md | 2 +- app/core/podcast/content.py | 52 ++++----------------- app/core/run.py | 3 +- app/core/summarization/__init__.py | 0 app/core/summarization/backends/__init__.py | 0 app/core/summarization/backends/cohere.py | 26 +++++++++++ app/core/summarization/backends/openai.py | 26 +++++++++++ 7 files changed, 65 insertions(+), 44 deletions(-) create mode 100644 app/core/summarization/__init__.py create mode 100644 app/core/summarization/backends/__init__.py create mode 100644 app/core/summarization/backends/cohere.py create mode 100644 app/core/summarization/backends/openai.py diff --git a/README.md b/README.md index 4f478ade..6cc472c5 100644 --- a/README.md +++ b/README.md @@ -249,7 +249,7 @@ See `pre-commit-config.yaml` for more details. In addition, please note the foll ### Core -- [ ] Add a separate module for summarization backends so we can choose which one to work with +- [x] Add a separate module for summarization backends so we can choose which one to work with - [ ] Add appropriate error handling on `requests` and `feedparser` jobs as well as all other operations, such as connecting to AWS Polly, etc. - [ ] Add task to perform substitution so that, for instance, K400 is written as 400 Kwacha. The AWS Polly voices fail to read Zambian money correctly. 
diff --git a/app/core/podcast/content.py b/app/core/podcast/content.py index 94a6a5d5..a035423a 100644 --- a/app/core/podcast/content.py +++ b/app/core/podcast/content.py @@ -1,17 +1,13 @@ import datetime import logging import random +from typing import Callable -import cohere -from langchain import OpenAI, PromptTemplate from num2words import num2words from pydantic import HttpUrl from app.core.db.models import Article, Episode -from app.core.utilities import COHERE_API_KEY, OPENAI_API_KEY, podcast_host, today, today_human_readable - -llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY) -co = cohere.Client(COHERE_API_KEY) +from app.core.utilities import podcast_host, today, today_human_readable async def get_episode_number() -> int: @@ -37,7 +33,7 @@ async def random_opening(): f"Today is {today_human_readable}. Welcome to the {episode_number} installment of the Zed News Podcast! I'm your host, {podcast_host}, and I'm excited to have you accompany me as we embark on a voyage through the latest news and stories from across Zambia.", f"Greetings and a warm welcome to the {episode_number} edition of the Zed News Podcast! It's {today_human_readable}, and I'm your host, {podcast_host}. Join me as we dive into the dynamic world of news and uncover the intriguing narratives shaping Zambia's landscape.", f"It's {today_human_readable}. I am thrilled to have you here for the {episode_number} edition of the Zed News Podcast! This is your host, {podcast_host}. Together, let's embark on an enriching journey through the vibrant tapestry of news and stories that define Zambia.", - f"Welcome! It's a pleasure to have you join me today for the {episode_number} installment of the Zed News Podcast! I'm {podcast_host}, your friendly guide through the ever-evolving news landscape of Zambia. Get ready to immerse yourself in the latest headlines and captivating narratives that await us.", + f"Welcome! 
It's a pleasure to have you join me today, {today_human_readable}, for the {episode_number} installment of the Zed News Podcast! I'm {podcast_host}, your friendly guide through the ever-evolving news landscape of Zambia. Get ready to immerse yourself in the latest headlines and captivating narratives that await us.", f"Here we are, on {today_human_readable}, marking the {episode_number} edition of the Zed News Podcast! I'm {podcast_host}, your enthusiastic host, and I'm delighted to have you with me as we traverse the vast expanse of news and stories that illuminate the heart of Zambia.", ] @@ -104,8 +100,9 @@ def random_outro(): ] -async def create_transcript(news: list[dict[str, str]], dest: str): - """Create a podcast transcript from the news, and write it to a file +async def create_transcript(news: list[dict[str, str]], dest: str, summarizer: Callable): + """Create a podcast transcript from the news, using the provided summarization function + and write it to a file Args: news (list[dict[str, str]]): A list of news articles represented as @@ -116,6 +113,9 @@ async def create_transcript(news: list[dict[str, str]], dest: str): - 'content': The content of the article. This is passed to the OpenAI API for summarization. - 'category': The category of the article. dest (str): The destination file path where the transcript will be written. + summarizer (Callable): The function to use for summarization. This function must accept two arguments: + - content (str): The content of the article. + - title (str): The title of the article. Raises: - OpenAIException: If there is an issue with the OpenAI API. @@ -128,10 +128,8 @@ async def create_transcript(news: list[dict[str, str]], dest: str): None: The function writes the transcript to the specified file but does not return any value. 
""" - # Create a dictionary to store the articles by source articles_by_source = {} - # Iterate over each article in the news list for article in news: source = article["source"].replace("Zambia National Broadcasting Corporation (ZNBC)", "ZNBC") @@ -170,38 +168,8 @@ async def create_transcript(news: list[dict[str, str]], dest: str): ] title = article["title"] - content = article["content"] - - # =============== Summarize using OpenAI =============== - template = """ - Please provide a very short, sweet, informative and engaging summary of the following news entry, in not more than two sentences. - Please provide your output in a manner suitable for reading as part of a podcast. - - {entry} - """ - - prompt = PromptTemplate(input_variables=["entry"], template=template) - summary_prompt = prompt.format(entry=content) - - num_tokens = llm.get_num_tokens(summary_prompt) - logging.info(f"'{title}' and its prompt has {num_tokens} tokens") - - summary = llm(summary_prompt) - - # =============== Summarize using Cohere =============== - # logging.info(f"Summarizing '{title}' via Cohere ...") - # # https://docs.cohere.com/reference/summarize-2 - # response = co.summarize( - # text=content, - # model="summarize-xlarge", - # temperature=0, - # length="auto", - # format="paragraph", - # extractiveness="auto", - # additional_command="in a manner suitable for reading as part of a podcast", - # ) - # summary = response.summary + summary = summarizer(content, title) await update_article_with_summary(title, article["url"], today, summary) diff --git a/app/core/run.py b/app/core/run.py index c91a376f..c4f8de73 100644 --- a/app/core/run.py +++ b/app/core/run.py @@ -17,6 +17,7 @@ from app.core.podcast.episode import add_articles_to_episode, add_episode_to_db from app.core.podcast.mix import add_to_db, mix_audio, upload_to_s3 from app.core.podcast.voice import create_audio, delete_source_mp3 +from app.core.summarization.backends import openai from app.core.utilities import DATA_DIR, 
configure_logging, count_words, today_iso_fmt raw_news = f"{DATA_DIR}/{today_iso_fmt}_news.json" @@ -56,7 +57,7 @@ async def main(): # Create podcast transcript from app.core.podcast.content import create_transcript - await create_transcript(news, transcript) + await create_transcript(news, transcript, openai.summarize) # Create podcast audio output_key = create_audio(transcript) diff --git a/app/core/summarization/__init__.py b/app/core/summarization/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/app/core/summarization/backends/__init__.py b/app/core/summarization/backends/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/app/core/summarization/backends/cohere.py b/app/core/summarization/backends/cohere.py new file mode 100644 index 00000000..45b07abc --- /dev/null +++ b/app/core/summarization/backends/cohere.py @@ -0,0 +1,26 @@ +import logging + +import cohere + +from app.core.utilities import COHERE_API_KEY + +co = cohere.Client(COHERE_API_KEY) + + +def summarize(content: str, title: str) -> str: + """Summarize the content using Cohere's summarization API. 
+ + https://docs.cohere.com/reference/summarize-2 + """ + + logging.info(f"Summarizing '{title}' via Cohere ...") + response = co.summarize( + text=content, + model="summarize-xlarge", + temperature=0, + length="auto", + format="paragraph", + extractiveness="auto", + additional_command="in a manner suitable for reading as part of a podcast", + ) + return response.summary diff --git a/app/core/summarization/backends/openai.py b/app/core/summarization/backends/openai.py new file mode 100644 index 00000000..34394141 --- /dev/null +++ b/app/core/summarization/backends/openai.py @@ -0,0 +1,26 @@ +import logging + +from langchain import OpenAI, PromptTemplate + +from app.core.utilities import OPENAI_API_KEY + +llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY) + + +def summarize(content: str, title: str) -> str: + """Summarize the content using OpenAI's language model.""" + + template = """ + Please provide a very short, sweet, informative and engaging summary of the following news entry, in not more than two sentences. + Please provide your output in a manner suitable for reading as part of a podcast. + + {entry} + """ + + prompt = PromptTemplate(input_variables=["entry"], template=template) + summary_prompt = prompt.format(entry=content) + + num_tokens = llm.get_num_tokens(summary_prompt) + logging.info(f"'{title}' and its prompt has {num_tokens} tokens") + + return llm(summary_prompt)