Skip to content

Commit

Permalink
feat: separate modules for summarization backends
Browse files Browse the repository at this point in the history
  • Loading branch information
engineervix committed Jun 24, 2023
1 parent ee21e4c commit 92c9af0
Show file tree
Hide file tree
Showing 7 changed files with 65 additions and 44 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ See `pre-commit-config.yaml` for more details. In addition, please note the foll
### Core
- [ ] Add a separate module for summarization backends so we can choose which one to work with
- [x] Add a separate module for summarization backends so we can choose which one to work with
- [ ] Add appropriate error handling on `requests` and `feedparser` jobs as well as all other operations, such as connecting to AWS Polly, etc.
- [ ] Add task to perform substitution so that, for instance, K400 is written as 400 Kwacha. The AWS Polly voices fail to read Zambian money correctly.
Expand Down
52 changes: 10 additions & 42 deletions app/core/podcast/content.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
import datetime
import logging
import random
from typing import Callable

import cohere
from langchain import OpenAI, PromptTemplate
from num2words import num2words
from pydantic import HttpUrl

from app.core.db.models import Article, Episode
from app.core.utilities import COHERE_API_KEY, OPENAI_API_KEY, podcast_host, today, today_human_readable

llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
co = cohere.Client(COHERE_API_KEY)
from app.core.utilities import podcast_host, today, today_human_readable


async def get_episode_number() -> int:
Expand All @@ -37,7 +33,7 @@ async def random_opening():
f"Today is {today_human_readable}. Welcome to the {episode_number} installment of the Zed News Podcast! I'm your host, {podcast_host}, and I'm excited to have you accompany me as we embark on a voyage through the latest news and stories from across Zambia.",
f"Greetings and a warm welcome to the {episode_number} edition of the Zed News Podcast! It's {today_human_readable}, and I'm your host, {podcast_host}. Join me as we dive into the dynamic world of news and uncover the intriguing narratives shaping Zambia's landscape.",
f"It's {today_human_readable}. I am thrilled to have you here for the {episode_number} edition of the Zed News Podcast! This is your host, {podcast_host}. Together, let's embark on an enriching journey through the vibrant tapestry of news and stories that define Zambia.",
f"Welcome! It's a pleasure to have you join me today for the {episode_number} installment of the Zed News Podcast! I'm {podcast_host}, your friendly guide through the ever-evolving news landscape of Zambia. Get ready to immerse yourself in the latest headlines and captivating narratives that await us.",
f"Welcome! It's a pleasure to have you join me today, {today_human_readable} for the {episode_number} installment of the Zed News Podcast! I'm {podcast_host}, your friendly guide through the ever-evolving news landscape of Zambia. Get ready to immerse yourself in the latest headlines and captivating narratives that await us.",
f"Here we are, on {today_human_readable}, marking the {episode_number} edition of the Zed News Podcast! I'm {podcast_host}, your enthusiastic host, and I'm delighted to have you with me as we traverse the vast expanse of news and stories that illuminate the heart of Zambia.",
]

Expand Down Expand Up @@ -104,8 +100,9 @@ def random_outro():
]


async def create_transcript(news: list[dict[str, str]], dest: str):
"""Create a podcast transcript from the news, and write it to a file
async def create_transcript(news: list[dict[str, str]], dest: str, summarizer: Callable):
"""Create a podcast transcript from the news, using the provided summarization function
and write it to a file
Args:
news (list[dict[str, str]]): A list of news articles represented as
Expand All @@ -116,6 +113,9 @@ async def create_transcript(news: list[dict[str, str]], dest: str):
- 'content': The content of the article. This is passed to the `summarizer` function for summarization.
- 'category': The category of the article.
dest (str): The destination file path where the transcript will be written.
summarizer (Callable): The function to use for summarization. This function must accept two arguments:
- content (str): The content of the article.
- title (str): The title of the article.
Raises:
- OpenAIException: If there is an issue with the OpenAI API.
Expand All @@ -128,10 +128,8 @@ async def create_transcript(news: list[dict[str, str]], dest: str):
None: The function writes the transcript to the specified file but does not return any value.
"""

# Create a dictionary to store the articles by source
articles_by_source = {}

# Iterate over each article in the news list
for article in news:
source = article["source"].replace("Zambia National Broadcasting Corporation (ZNBC)", "ZNBC")

Expand Down Expand Up @@ -170,38 +168,8 @@ async def create_transcript(news: list[dict[str, str]], dest: str):
]

title = article["title"]

content = article["content"]

# =============== Summarize using OpenAI ===============
template = """
Please provide a very short, sweet, informative and engaging summary of the following news entry, in not more than two sentences.
Please provide your output in a manner suitable for reading as part of a podcast.
{entry}
"""

prompt = PromptTemplate(input_variables=["entry"], template=template)
summary_prompt = prompt.format(entry=content)

num_tokens = llm.get_num_tokens(summary_prompt)
logging.info(f"'{title}' and its prompt has {num_tokens} tokens")

summary = llm(summary_prompt)

# =============== Summarize using Cohere ===============
# logging.info(f"Summarizing '{title}' via Cohere ...")
# # https://docs.cohere.com/reference/summarize-2
# response = co.summarize(
# text=content,
# model="summarize-xlarge",
# temperature=0,
# length="auto",
# format="paragraph",
# extractiveness="auto",
# additional_command="in a manner suitable for reading as part of a podcast",
# )
# summary = response.summary
summary = summarizer(content, title)

await update_article_with_summary(title, article["url"], today, summary)

Expand Down
3 changes: 2 additions & 1 deletion app/core/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from app.core.podcast.episode import add_articles_to_episode, add_episode_to_db
from app.core.podcast.mix import add_to_db, mix_audio, upload_to_s3
from app.core.podcast.voice import create_audio, delete_source_mp3
from app.core.summarization.backends import openai
from app.core.utilities import DATA_DIR, configure_logging, count_words, today_iso_fmt

raw_news = f"{DATA_DIR}/{today_iso_fmt}_news.json"
Expand Down Expand Up @@ -56,7 +57,7 @@ async def main():
# Create podcast transcript
from app.core.podcast.content import create_transcript

await create_transcript(news, transcript)
await create_transcript(news, transcript, openai.summarize)

# Create podcast audio
output_key = create_audio(transcript)
Expand Down
Empty file.
Empty file.
26 changes: 26 additions & 0 deletions app/core/summarization/backends/cohere.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import logging

import cohere

from app.core.utilities import COHERE_API_KEY

# Shared Cohere client: built once when this module is imported and reused by
# every summarize() call below.
co = cohere.Client(COHERE_API_KEY)


def summarize(content: str, title: str) -> str:
    """Return a podcast-ready summary of ``content`` produced by Cohere.

    The article text is sent to Cohere's summarize endpoint
    (https://docs.cohere.com/reference/summarize-2) through the module-level
    ``co`` client, and the ``summary`` attribute of the response is returned.

    Args:
        content: The article text to summarize.
        title: The article title; it is only used in the log message.

    Returns:
        The summary text carried by the Cohere response.
    """
    logging.info(f"Summarizing '{title}' via Cohere ...")

    # All request settings are gathered in one mapping so they can be read
    # at a glance; they are forwarded unchanged as keyword arguments.
    request_options = {
        "text": content,
        "model": "summarize-xlarge",
        "temperature": 0,
        "length": "auto",
        "format": "paragraph",
        "extractiveness": "auto",
        "additional_command": "in a manner suitable for reading as part of a podcast",
    }
    return co.summarize(**request_options).summary
26 changes: 26 additions & 0 deletions app/core/summarization/backends/openai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import logging

from langchain import OpenAI, PromptTemplate

from app.core.utilities import OPENAI_API_KEY

# Shared LangChain OpenAI wrapper: built once when this module is imported and
# reused by every summarize() call below.
llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)


def summarize(content: str, title: str) -> str:
    """Return a short, podcast-ready summary of ``content`` from the OpenAI LLM.

    The article text is substituted into a fixed prompt, the prompt's token
    count is logged together with the article title, and the prompt is then
    passed to the module-level ``llm``.

    Args:
        content: The article text to summarize.
        title: The article title; it is only used in the log message.

    Returns:
        Whatever ``llm`` returns for the formatted prompt.
    """
    # NOTE(review): the prompt literal is kept flush-left so that its text is
    # exactly what the source shows; confirm the intended leading whitespace.
    summary_prompt = PromptTemplate(
        input_variables=["entry"],
        template="""
Please provide a very short, sweet, informative and engaging summary of the following news entry, in not more than two sentences.
Please provide your output in a manner suitable for reading as part of a podcast.
{entry}
""",
    ).format(entry=content)

    # Log the prompt size before the request is made.
    num_tokens = llm.get_num_tokens(summary_prompt)
    logging.info(f"'{title}' and its prompt has {num_tokens} tokens")

    return llm(summary_prompt)

0 comments on commit 92c9af0

Please sign in to comment.