From 92c9af085a8ab4a0398b31ad6d8a4a547980c576 Mon Sep 17 00:00:00 2001 From: Victor Miti Date: Sat, 24 Jun 2023 11:22:25 +0100 Subject: [PATCH] feat: separate modules for summarization backends --- README.md | 2 +- app/core/podcast/content.py | 52 ++++----------------- app/core/run.py | 3 +- app/core/summarization/__init__.py | 0 app/core/summarization/backends/__init__.py | 0 app/core/summarization/backends/cohere.py | 26 +++++++++++ app/core/summarization/backends/openai.py | 26 +++++++++++ 7 files changed, 65 insertions(+), 44 deletions(-) create mode 100644 app/core/summarization/__init__.py create mode 100644 app/core/summarization/backends/__init__.py create mode 100644 app/core/summarization/backends/cohere.py create mode 100644 app/core/summarization/backends/openai.py diff --git a/README.md b/README.md index 4f478ade..6cc472c5 100644 --- a/README.md +++ b/README.md @@ -249,7 +249,7 @@ See `pre-commit-config.yaml` for more details. In addition, please note the foll ### Core -- [ ] Add a separate module for summarization backends so we can choose which one to work with +- [x] Add a separate module for summarization backends so we can choose which one to work with - [ ] Add appropriate error handling on `requests` and `feedparser` jobs as well as all other operations, such as connecting to AWS Polly, etc. - [ ] Add task to perform substitution so that, for instance, K400 is written as 400 Kwacha. The AWS Polly voices fail to read Zambian money correctly. 
diff --git a/app/core/podcast/content.py b/app/core/podcast/content.py index 94a6a5d5..a035423a 100644 --- a/app/core/podcast/content.py +++ b/app/core/podcast/content.py @@ -1,17 +1,13 @@ import datetime import logging import random +from typing import Callable -import cohere -from langchain import OpenAI, PromptTemplate from num2words import num2words from pydantic import HttpUrl from app.core.db.models import Article, Episode -from app.core.utilities import COHERE_API_KEY, OPENAI_API_KEY, podcast_host, today, today_human_readable - -llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY) -co = cohere.Client(COHERE_API_KEY) +from app.core.utilities import podcast_host, today, today_human_readable async def get_episode_number() -> int: @@ -37,7 +33,7 @@ async def random_opening(): f"Today is {today_human_readable}. Welcome to the {episode_number} installment of the Zed News Podcast! I'm your host, {podcast_host}, and I'm excited to have you accompany me as we embark on a voyage through the latest news and stories from across Zambia.", f"Greetings and a warm welcome to the {episode_number} edition of the Zed News Podcast! It's {today_human_readable}, and I'm your host, {podcast_host}. Join me as we dive into the dynamic world of news and uncover the intriguing narratives shaping Zambia's landscape.", f"It's {today_human_readable}. I am thrilled to have you here for the {episode_number} edition of the Zed News Podcast! This is your host, {podcast_host}. Together, let's embark on an enriching journey through the vibrant tapestry of news and stories that define Zambia.", - f"Welcome! It's a pleasure to have you join me today for the {episode_number} installment of the Zed News Podcast! I'm {podcast_host}, your friendly guide through the ever-evolving news landscape of Zambia. Get ready to immerse yourself in the latest headlines and captivating narratives that await us.", + f"Welcome! 
It's a pleasure to have you join me today, {today_human_readable}, for the {episode_number} installment of the Zed News Podcast! I'm {podcast_host}, your friendly guide through the ever-evolving news landscape of Zambia. Get ready to immerse yourself in the latest headlines and captivating narratives that await us.", f"Here we are, on {today_human_readable}, marking the {episode_number} edition of the Zed News Podcast! I'm {podcast_host}, your enthusiastic host, and I'm delighted to have you with me as we traverse the vast expanse of news and stories that illuminate the heart of Zambia.", ] @@ -104,8 +100,9 @@ def random_outro(): ] -async def create_transcript(news: list[dict[str, str]], dest: str): - """Create a podcast transcript from the news, and write it to a file +async def create_transcript(news: list[dict[str, str]], dest: str, summarizer: Callable): + """Create a podcast transcript from the news, using the provided summarization function + and write it to a file Args: news (list[dict[str, str]]): A list of news articles represented as @@ -116,6 +113,9 @@ async def create_transcript(news: list[dict[str, str]], dest: str): - 'content': The content of the article. This is passed to the OpenAI API for summarization. - 'category': The category of the article. dest (str): The destination file path where the transcript will be written. + summarizer (Callable): The function to use for summarization. This function must accept two arguments: + - content (str): The content of the article. + - title (str): The title of the article. Raises: - OpenAIException: If there is an issue with the OpenAI API. @@ -128,10 +128,8 @@ async def create_transcript(news: list[dict[str, str]], dest: str): None: The function writes the transcript to the specified file but does not return any value. 
""" - # Create a dictionary to store the articles by source articles_by_source = {} - # Iterate over each article in the news list for article in news: source = article["source"].replace("Zambia National Broadcasting Corporation (ZNBC)", "ZNBC") @@ -170,38 +168,8 @@ async def create_transcript(news: list[dict[str, str]], dest: str): ] title = article["title"] - content = article["content"] - - # =============== Summarize using OpenAI =============== - template = """ - Please provide a very short, sweet, informative and engaging summary of the following news entry, in not more than two sentences. - Please provide your output in a manner suitable for reading as part of a podcast. - - {entry} - """ - - prompt = PromptTemplate(input_variables=["entry"], template=template) - summary_prompt = prompt.format(entry=content) - - num_tokens = llm.get_num_tokens(summary_prompt) - logging.info(f"'{title}' and its prompt has {num_tokens} tokens") - - summary = llm(summary_prompt) - - # =============== Summarize using Cohere =============== - # logging.info(f"Summarizing '{title}' via Cohere ...") - # # https://docs.cohere.com/reference/summarize-2 - # response = co.summarize( - # text=content, - # model="summarize-xlarge", - # temperature=0, - # length="auto", - # format="paragraph", - # extractiveness="auto", - # additional_command="in a manner suitable for reading as part of a podcast", - # ) - # summary = response.summary + summary = summarizer(content, title) await update_article_with_summary(title, article["url"], today, summary) diff --git a/app/core/run.py b/app/core/run.py index c91a376f..c4f8de73 100644 --- a/app/core/run.py +++ b/app/core/run.py @@ -17,6 +17,7 @@ from app.core.podcast.episode import add_articles_to_episode, add_episode_to_db from app.core.podcast.mix import add_to_db, mix_audio, upload_to_s3 from app.core.podcast.voice import create_audio, delete_source_mp3 +from app.core.summarization.backends import openai from app.core.utilities import DATA_DIR, 
configure_logging, count_words, today_iso_fmt raw_news = f"{DATA_DIR}/{today_iso_fmt}_news.json" @@ -56,7 +57,7 @@ async def main(): # Create podcast transcript from app.core.podcast.content import create_transcript - await create_transcript(news, transcript) + await create_transcript(news, transcript, openai.summarize) # Create podcast audio output_key = create_audio(transcript) diff --git a/app/core/summarization/__init__.py b/app/core/summarization/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/app/core/summarization/backends/__init__.py b/app/core/summarization/backends/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/app/core/summarization/backends/cohere.py b/app/core/summarization/backends/cohere.py new file mode 100644 index 00000000..45b07abc --- /dev/null +++ b/app/core/summarization/backends/cohere.py @@ -0,0 +1,26 @@ +import logging + +import cohere + +from app.core.utilities import COHERE_API_KEY + +co = cohere.Client(COHERE_API_KEY) + + +def summarize(content: str, title: str) -> str: + """Summarize the content using Cohere's summarization API. 
+ + https://docs.cohere.com/reference/summarize-2 + """ + + logging.info(f"Summarizing '{title}' via Cohere ...") + response = co.summarize( + text=content, + model="summarize-xlarge", + temperature=0, + length="auto", + format="paragraph", + extractiveness="auto", + additional_command="in a manner suitable for reading as part of a podcast", + ) + return response.summary diff --git a/app/core/summarization/backends/openai.py b/app/core/summarization/backends/openai.py new file mode 100644 index 00000000..34394141 --- /dev/null +++ b/app/core/summarization/backends/openai.py @@ -0,0 +1,26 @@ +import logging + +from langchain import OpenAI, PromptTemplate + +from app.core.utilities import OPENAI_API_KEY + +llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY) + + +def summarize(content: str, title: str) -> str: + """Summarize the content using OpenAI's language model.""" + + template = """ + Please provide a very short, sweet, informative and engaging summary of the following news entry, in not more than two sentences. + Please provide your output in a manner suitable for reading as part of a podcast. + + {entry} + """ + + prompt = PromptTemplate(input_variables=["entry"], template=template) + summary_prompt = prompt.format(entry=content) + + num_tokens = llm.get_num_tokens(summary_prompt) + logging.info(f"'{title}' and its prompt has {num_tokens} tokens") + + return llm(summary_prompt)