From 16e28772a8e82e6e5fcd25d5faad476f738873a4 Mon Sep 17 00:00:00 2001 From: Victor Miti Date: Wed, 13 Dec 2023 18:46:45 +0000 Subject: [PATCH] feat: use vicuna-13b-v1.5-16k and garage-bAInd/Platypus2-70B-instruct --- app/core/news/other.py | 7 +++ app/core/podcast/content.py | 47 ++++++--------------- app/core/summarization/backends/together.py | 8 ++-- social.py | 14 +++--- 4 files changed, 32 insertions(+), 44 deletions(-) diff --git a/app/core/news/other.py b/app/core/news/other.py index 96e49a95..3288246b 100644 --- a/app/core/news/other.py +++ b/app/core/news/other.py @@ -47,6 +47,9 @@ def get_daily_mail_article_detail(url): content = content.replace("CLICK TO READ MORE", "...") content = content.replace("https://enews.daily-mail.co.zm/welcome/home", "") + # remove Read more: eNews Daily Mail | Without Fear Or Favour (daily-mail.co.zm) + content = content.replace("Read more: eNews Daily Mail | Without Fear Or Favour (daily-mail.co.zm)", "") + return content elif article := soup.find("main"): content_elements = article.select("div.e-con-inner") @@ -57,6 +60,10 @@ def get_daily_mail_article_detail(url): # remove Read more: eNews Daily Mail | Without Fear Or Favour (daily-mail.co.zm) content = content.replace("Read more: eNews Daily Mail | Without Fear Or Favour (daily-mail.co.zm)", "") + # Remove "CLICK TO READ MORE" from the content + content = content.replace("CLICK TO READ MORE", "...") + content = content.replace("https://enews.daily-mail.co.zm/welcome/home", "") + return content return None diff --git a/app/core/podcast/content.py b/app/core/podcast/content.py index 20815c2d..17538b23 100644 --- a/app/core/podcast/content.py +++ b/app/core/podcast/content.py @@ -1,12 +1,13 @@ import datetime import logging +import time from typing import Callable import together from pydantic import HttpUrl from app.core.db.models import Article, Episode -from app.core.utilities import TOGETHER_API_KEY, podcast_host, today, today_human_readable +from app.core.utilities import DATA_DIR, TOGETHER_API_KEY, podcast_host, today, today_human_readable, today_iso_fmt async def get_episode_number() -> int: @@ -65,33 +66,10 @@ async def create_transcript(news: list[dict[str, str]], dest: str, summarizer: C # Add the article to the list for the corresponding source articles_by_source[source].append(article) - prompt = f": You are {podcast_host}, an accomplished, fun and witty scriptwriter, content creator and podcast host. You have a news and current affairs podcast which runs Monday to Friday. Your secretary has gathered the news from various sources, and has given you the notes as shown below. To ensure accuracy, please read the content carefully and pay attention to any nuances or complexities in the language, then go ahead and present today's episode. It is important that you cover EVERYTHING, do not leave out anything. Feel free to consolidate any similar news items from different sources, and present the news in a logical sequence, based on common themes. At the end, add a fun and witty remark informing your audience that you are actually an AI, and not a human.\n\n" + prompt = f"You are {podcast_host}, an accomplished, fun and witty scriptwriter, content creator and podcast host. You have a news and current affairs podcast which runs Monday to Friday. Your secretary has gathered the news from various sources as indicated below. Study the content, consolidate any similar news items from different sources, and organize the news in a logical, coherent manner so it's easy to follow. You can then go ahead and present today's episode, ensuring that you cover all the news your secretary has curated. At the end, add a fun and witty remark informing your audience that you are actually an AI, and not a human.\n\n" metadata = f"Title: Zed News Podcast episode {await get_episode_number()}\nDate: {today_human_readable}\nHost: {podcast_host}\n\n" - unwanted_text = [ - # "Sure, here's a summary of the news entry in two sentences:", - # "Sure, here is a summary of the news entry in two sentences:", - # "Sure, here's a summary of the news entry in not more than two sentences:", - # "Sure, here is a summary of the news entry in not more than two sentences:", - "Sure! Here's the summary:", - "Sure! Here is the summary:", - "Sure, I can help you with that!", - "Sure, I can do that!", - # "Here's a summary of the news entry in two sentences:", - # "Here is a summary of the news entry in two sentences:", - # "Here's a summary of the news entry in not more than two sentences:", - # "Here is a summary of the news entry in not more than two sentences:", - # "Here's a two-sentence summary of the news entry:", - # "Here is a two-sentence summary of the news entry:", - "Sure! Here's a possible summary of the news entry:", - "Sure! Here is a possible summary of the news entry:", - "Sure, here's a possible summary:", - "Sure, here is a possible summary:", - # "Sure! Here's a two-sentence summary of the news entry you provided:", - # "Sure! Here is a two-sentence summary of the news entry you provided:", - ] - content = "" counter = 0 for source in articles_by_source: @@ -101,9 +79,6 @@ async def create_transcript(news: list[dict[str, str]], dest: str, summarizer: C text = article["content"] summary = summarizer(text, title) - for text in unwanted_text: - summary = summary.replace(text, "") - await update_article_with_summary(title, article["url"], today, summary) counter += 1 @@ -111,25 +86,27 @@ async def create_transcript(news: list[dict[str, str]], dest: str, summarizer: C content += f"{counter}. '{title}' (source: {source})" content += f"\n{summary.strip()}\n\n" - notes = prompt + "```" + metadata + "News Items:\n\n" + content + "```:" + notes = prompt + "```" + metadata + "News Items:\n\n" + content + "```" - model = "mistralai/Mixtral-8x7B-Instruct-v0.1" - temperature = 0.4 - top_p = 0.5 + # Write the content to a file + with open(f"{DATA_DIR}/{today_iso_fmt}_news_headlines.txt", "w") as f: + f.write(metadata + "News Items:\n\n" + content) + + model = "lmsys/vicuna-13b-v1.5-16k" + temperature = 0.7 max_tokens = 4096 together.api_key = TOGETHER_API_KEY output = together.Complete.create( prompt=notes, model=model, temperature=temperature, - top_p=top_p, max_tokens=max_tokens, - repetition_penalty=1.1, ) + time.sleep(30) logging.info(output) transcript = output["output"]["choices"][0]["text"] - # Write the content to a file + # Write the transcript to a file with open(dest, "w") as f: f.write(transcript) diff --git a/app/core/summarization/backends/together.py b/app/core/summarization/backends/together.py index 33277263..1cdc3c6a 100644 --- a/app/core/summarization/backends/together.py +++ b/app/core/summarization/backends/together.py @@ -1,4 +1,5 @@ import logging +import time import together @@ -14,10 +15,10 @@ def summarize(content: str, title: str) -> str: https://docs.together.ai/reference/complete """ - prompt = f": You are a distinguished news editor and content publisher, your task is to summarize the following news entry. The summary should accurately reflect the main message and arguments presented in the original text, while also being concise and easy to understand. Just summarize straight away, without responding to me.\n\n ```{content}```\n:" - model = "togethercomputer/llama-2-70b-chat" + prompt = f"You are a distinguished news editor and content publisher, your task is to summarize the following news entry. The summary should accurately reflect the main message and arguments presented in the original news entry, while also being concise and easy to understand. Your summary should not exceed two sentences.\n\n ```{content}```:" + model = "garage-bAInd/Platypus2-70B-instruct" temperature = 0.7 - max_tokens = 512 + max_tokens = 128 output = together.Complete.create( prompt=prompt, @@ -25,6 +26,7 @@ def summarize(content: str, title: str) -> str: temperature=temperature, max_tokens=max_tokens, ) + time.sleep(1.5) logging.info(output) return output["output"]["choices"][0]["text"] diff --git a/social.py b/social.py index 8ead83c8..c3a70978 100644 --- a/social.py +++ b/social.py @@ -12,6 +12,7 @@ import os import pathlib import sys +import time from http import HTTPStatus import facebook @@ -40,7 +41,7 @@ TOGETHER_API_KEY = os.getenv("TOGETHER_API_KEY") together.api_key = TOGETHER_API_KEY -podcast_transcript = f"{DATA_DIR}/{today_iso_fmt}/{today_iso_fmt}_podcast-content.txt" +news_headlines = f"{DATA_DIR}/{today_iso_fmt}_news_headlines.txt" podcast_url = f"https://zednews.pages.dev/episode/{today_iso_fmt}/" @@ -61,8 +62,8 @@ def podcast_is_live(url): def get_content() -> str: - """Get the content of the podcast transcript""" - with open(podcast_transcript, "r") as f: + """Get the headlines""" + with open(news_headlines, "r") as f: return f.read() @@ -73,8 +74,9 @@ def create_facebook_post(content: str) -> str: https://docs.together.ai/reference/complete """ - prompt = f": You are a social media marketing guru. You have been hired by a podcaster, {podcast_host} to create a nice, short and catchy facebook post (max 130 words) inviting people to listen to today's podcast whose transcript is below. Highlight some interesting news headlines, appropriately paraphrasing them to grab the attention of your audience. Also, appropriately utilize bullet points, emojis, whitespace and hashtags where necessary. Do not add the link to the podcast as it will be added automatically.\n\n```{content}\n```\n:" - model = "togethercomputer/llama-2-70b-chat" + prompt = f"You are a social media marketing guru. You have been hired by a podcaster, {podcast_host}, who hosts a news and current affairs podcast which runs Monday to Friday. Your task is to create a nice, short and catchy facebook post inviting people to listen to today's podcast whose details are below. Appropriately utilize bullet points, emojis, whitespace and hashtags where necessary. Do not add the link to the podcast as it will be added automatically.\n\n```{content}\n```" + + model = "lmsys/vicuna-13b-v1.5-16k" temperature = 0.7 max_tokens = 768 @@ -83,9 +85,9 @@ def create_facebook_post(content: str) -> str: model=model, temperature=temperature, max_tokens=max_tokens, - repetition_penalty=1.1, ) logger.info(output) + time.sleep(30) return output["output"]["choices"][0]["text"]