feat: use vicuna-13b-v1.5-16k and garage-bAInd/Platypus2-70B-instruct

engineervix · Dec 13, 2023 · 16e2877 · 16e2877
1 parent dd6c918
commit 16e2877
Show file tree

Hide file tree

Showing 4 changed files with 32 additions and 44 deletions.
diff --git a/app/core/news/other.py b/app/core/news/other.py
@@ -47,6 +47,9 @@ def get_daily_mail_article_detail(url):
             content = content.replace("CLICK TO READ MORE", "...")
             content = content.replace("https://enews.daily-mail.co.zm/welcome/home", "")
 
+            # remove Read more: eNews Daily Mail | Without Fear Or Favour (daily-mail.co.zm)
+            content = content.replace("Read more: eNews Daily Mail | Without Fear Or Favour (daily-mail.co.zm)", "")
+
             return content
         elif article := soup.find("main"):
             content_elements = article.select("div.e-con-inner")
@@ -57,6 +60,10 @@ def get_daily_mail_article_detail(url):
             # remove Read more: eNews Daily Mail | Without Fear Or Favour (daily-mail.co.zm)
             content = content.replace("Read more: eNews Daily Mail | Without Fear Or Favour (daily-mail.co.zm)", "")
 
+            # Remove "CLICK TO READ MORE" from the content
+            content = content.replace("CLICK TO READ MORE", "...")
+            content = content.replace("https://enews.daily-mail.co.zm/welcome/home", "")
+
             return content
         return None
 

diff --git a/app/core/podcast/content.py b/app/core/podcast/content.py
@@ -1,12 +1,13 @@
 import datetime
 import logging
+import time
 from typing import Callable
 
 import together
 from pydantic import HttpUrl
 
 from app.core.db.models import Article, Episode
-from app.core.utilities import TOGETHER_API_KEY, podcast_host, today, today_human_readable
+from app.core.utilities import DATA_DIR, TOGETHER_API_KEY, podcast_host, today, today_human_readable, today_iso_fmt
 
 
 async def get_episode_number() -> int:
@@ -65,33 +66,10 @@ async def create_transcript(news: list[dict[str, str]], dest: str, summarizer: C
         # Add the article to the list for the corresponding source
         articles_by_source[source].append(article)
 
-    prompt = f"<human>: You are {podcast_host}, an accomplished, fun and witty scriptwriter, content creator and podcast host. You have a news and current affairs podcast which runs Monday to Friday. Your secretary has gathered the news from various sources, and has given you the notes as shown below. To ensure accuracy, please read the content carefully and pay attention to any nuances or complexities in the language, then go ahead and present today's episode. It is important that you cover EVERYTHING, do not leave out anything. Feel free to consolidate any similar news items from different sources, and present the news in a logical sequence, based on common themes. At the end, add a fun and witty remark informing your audience that you are actually an AI, and not a human.\n\n"
+        prompt = f"You are {podcast_host}, an accomplished, fun and witty scriptwriter, content creator and podcast host. You have a news and current affairs podcast which runs Monday to Friday. Your secretary has gathered the news from various sources as indicated below. Study the content, consolidate any similar news items from different sources, and organize the news in a logical, coherent manner so it's easy to follow. You can then go ahead and present today's episode, ensuring that you cover all the news your secretary has curated. At the end, add a fun and witty remark informing your audience that you are actually an AI, and not a human.\n\n"
 
     metadata = f"Title: Zed News Podcast episode {await get_episode_number()}\nDate: {today_human_readable}\nHost: {podcast_host}\n\n"
 
-    unwanted_text = [
-        # "Sure, here's a summary of the news entry in two sentences:",
-        # "Sure, here is a summary of the news entry in two sentences:",
-        # "Sure, here's a summary of the news entry in not more than two sentences:",
-        # "Sure, here is a summary of the news entry in not more than two sentences:",
-        "Sure! Here's the summary:",
-        "Sure! Here is the summary:",
-        "Sure, I can help you with that!",
-        "Sure, I can do that!",
-        # "Here's a summary of the news entry in two sentences:",
-        # "Here is a summary of the news entry in two sentences:",
-        # "Here's a summary of the news entry in not more than two sentences:",
-        # "Here is a summary of the news entry in not more than two sentences:",
-        # "Here's a two-sentence summary of the news entry:",
-        # "Here is a two-sentence summary of the news entry:",
-        "Sure! Here's a possible summary of the news entry:",
-        "Sure! Here is a possible summary of the news entry:",
-        "Sure, here's a possible summary:",
-        "Sure, here is a possible summary:",
-        # "Sure! Here's a two-sentence summary of the news entry you provided:",
-        # "Sure! Here is a two-sentence summary of the news entry you provided:",
-    ]
-
     content = ""
     counter = 0
     for source in articles_by_source:
@@ -101,35 +79,34 @@ async def create_transcript(news: list[dict[str, str]], dest: str, summarizer: C
             text = article["content"]
             summary = summarizer(text, title)
 
-            for text in unwanted_text:
-                summary = summary.replace(text, "")
-
             await update_article_with_summary(title, article["url"], today, summary)
 
             counter += 1
 
             content += f"{counter}. '{title}' (source: {source})"
             content += f"\n{summary.strip()}\n\n"
 
-    notes = prompt + "```" + metadata + "News Items:\n\n" + content + "```<bot>:"
+    notes = prompt + "```" + metadata + "News Items:\n\n" + content + "```"
 
-    model = "mistralai/Mixtral-8x7B-Instruct-v0.1"
-    temperature = 0.4
-    top_p = 0.5
+    # Write the content to a file
+    with open(f"{DATA_DIR}/{today_iso_fmt}_news_headlines.txt", "w") as f:
+        f.write(metadata + "News Items:\n\n" + content)
+
+    model = "lmsys/vicuna-13b-v1.5-16k"
+    temperature = 0.7
     max_tokens = 4096
     together.api_key = TOGETHER_API_KEY
     output = together.Complete.create(
         prompt=notes,
         model=model,
         temperature=temperature,
-        top_p=top_p,
         max_tokens=max_tokens,
-        repetition_penalty=1.1,
     )
+    time.sleep(30)
     logging.info(output)
 
     transcript = output["output"]["choices"][0]["text"]
 
-    # Write the content to a file
+    # Write the transcript to a file
     with open(dest, "w") as f:
         f.write(transcript)
diff --git a/app/core/summarization/backends/together.py b/app/core/summarization/backends/together.py
@@ -1,4 +1,5 @@
 import logging
+import time
 
 import together
 
@@ -14,17 +15,18 @@ def summarize(content: str, title: str) -> str:
     https://docs.together.ai/reference/complete
     """
 
-    prompt = f"<human>: You are a distinguished news editor and content publisher, your task is to summarize the following news entry. The summary should accurately reflect the main message and arguments presented in the original text, while also being concise and easy to understand. Just summarize straight away, without responding to me.\n\n ```{content}```\n<bot>:"
-    model = "togethercomputer/llama-2-70b-chat"
+    prompt = f"You are a distinguished news editor and content publisher, your task is to summarize the following news entry. The summary should accurately reflect the main message and arguments presented in the original news entry, while also being concise and easy to understand. Your summary should not exceed two sentences.\n\n ```{content}```:"
+    model = "garage-bAInd/Platypus2-70B-instruct"
     temperature = 0.7
-    max_tokens = 512
+    max_tokens = 128
 
     output = together.Complete.create(
         prompt=prompt,
         model=model,
         temperature=temperature,
         max_tokens=max_tokens,
     )
+    time.sleep(1.5)
     logging.info(output)
 
     return output["output"]["choices"][0]["text"]
diff --git a/social.py b/social.py
@@ -12,6 +12,7 @@
 import os
 import pathlib
 import sys
+import time
 from http import HTTPStatus
 
 import facebook
@@ -40,7 +41,7 @@
 TOGETHER_API_KEY = os.getenv("TOGETHER_API_KEY")
 together.api_key = TOGETHER_API_KEY
 
-podcast_transcript = f"{DATA_DIR}/{today_iso_fmt}/{today_iso_fmt}_podcast-content.txt"
+news_headlines = f"{DATA_DIR}/{today_iso_fmt}_news_headlines.txt"
 podcast_url = f"https://zednews.pages.dev/episode/{today_iso_fmt}/"
 
 
@@ -61,8 +62,8 @@ def podcast_is_live(url):
 
 
 def get_content() -> str:
-    """Get the content of the podcast transcript"""
-    with open(podcast_transcript, "r") as f:
+    """Get the headlines"""
+    with open(news_headlines, "r") as f:
         return f.read()
 
 
@@ -73,8 +74,9 @@ def create_facebook_post(content: str) -> str:
     https://docs.together.ai/reference/complete
     """
 
-    prompt = f"<human>: You are a social media marketing guru. You have been hired by a podcaster, {podcast_host} to create a nice, short and catchy facebook post (max 130 words) inviting people to listen to today's podcast whose transcript is below. Highlight some interesting news headlines, appropriately paraphrasing them to grab the attention of your audience. Also, appropriately utilize bullet points, emojis, whitespace and hashtags where necessary. Do not add the link to the podcast as it will be added automatically.\n\n```{content}\n```\n<bot>:"
-    model = "togethercomputer/llama-2-70b-chat"
+    prompt = f"You are a social media marketing guru. You have been hired by a podcaster, {podcast_host}, who hosts a news and current affairs podcast which runs Monday to Friday. Your task is to create a nice, short and catchy facebook post inviting people to listen to today's podcast whose details are below. Appropriately utilize bullet points, emojis, whitespace and hashtags where necessary. Do not add the link to the podcast as it will be added automatically.\n\n```{content}\n```"
+
+    model = "lmsys/vicuna-13b-v1.5-16k"
     temperature = 0.7
     max_tokens = 768
 
@@ -83,9 +85,9 @@ def create_facebook_post(content: str) -> str:
         model=model,
         temperature=temperature,
         max_tokens=max_tokens,
-        repetition_penalty=1.1,
     )
     logger.info(output)
+    time.sleep(30)
 
     return output["output"]["choices"][0]["text"]