# Reconstructed from a newline-collapsed `git format-patch` of commit
# fd28dc090af625ef6d84d86793d9228c0d6239f3, "Adds script to pull blog updates
# for updating docs (#1208)" (Hisham Akmal, 2024-10-31). That commit adds
# scripts/update_blogs_in_learning_resources.py (this script) and
# scripts/update_blogs_requirements.txt, which lists: beautifulsoup4,
# prompt_toolkit, selenium, webdriver-manager.
"""
📜 DAGWorks Blog Archive Updater Script

This script fetches articles from the DAGWorks blog archive and updates the README with the latest posts.
It supports filtering articles based on a cutoff date and offers an option to print articles to the console for review.

Before running this script, make sure to install the required packages by running:
pip install -r update_blogs_requirements.txt

Usage:
    1. Specify a cutoff date and update the README:
       python update_blogs_in_learning_resources.py --date 2024-10-01

    2. Print articles to the console (without updating README):
       python update_blogs_in_learning_resources.py --date 2024-10-01 --print

    3. Run interactively (no arguments required):
       python update_blogs_in_learning_resources.py

Arguments:
    --date YYYY-MM-DD : Optional. Fetches articles published on or after the given date.
    --print           : Optional. Prints fetched articles to the console without modifying the README.

Dependencies:
    - selenium: For web scraping.
    - webdriver-manager: Manages ChromeDriver installation.
    - beautifulsoup4: Parses HTML content.
    - prompt_toolkit: Enables interactive user input from the command line.

Notes:
    - Ensure Google Chrome is installed since the script uses it for scraping.
    - The file "readme.md" is opened relative to the current working directory,
      so run the script from the directory that contains it.
"""

import argparse
import sys
import time
from datetime import date, datetime
from pathlib import Path
from typing import Iterable, List, Optional, Tuple

# (link, title, publication date); the date is None for hand-picked entries.
Article = Tuple[str, str, Optional[date]]

ARCHIVE_URL = "https://blog.dagworks.io/archive"
README_PATH = "readme.md"

# Seconds to wait after each scroll so lazily-loaded posts can render.
SCROLL_PAUSE_SECONDS = 2

# CSS classes used by the archive page for post links and post dates.
# NOTE(review): these look like build-generated class names and will probably
# change when the blog is redeployed -- confirm against the live page.
ARTICLE_LINK_CLASS = "_color-pub-primary-text_q8zsn_187"
ARTICLE_DATE_CLASS = "_date_1v6nm_1"

EXTERNAL_BLOGS_HEADER = "## 📰 External Blogs\n"
EXTERNAL_BLOGS_INTRO = (
    "For the latest blog posts, see the [DAGWorks's Blog](https://blog.dagworks.io/).\n\n"
)
# Space, two non-breaking spaces, space: keeps a visible gap between the date
# and the link in rendered Markdown (plain spaces would be collapsed).
# NOTE(review): written as escapes so the characters are visible in source;
# assumed to be U+00A0 -- confirm against the upstream script.
DATE_SEPARATOR = " \u00a0\u00a0 "

# Articles hosted outside the DAGWorks blog; always included, never filtered.
HARDCODED_ARTICLES: Tuple[Article, ...] = (
    (
        "https://pyright.blogspot.com/2024/07/dag-hamilton-workflow-for-toy-text.html",
        "DAG Hamilton Workflow for Toy Text Processing Script",
        None,
    ),
    (
        "https://vyom-modi.medium.com/building-a-high-performance-rag-based-ai-with-qdrant-groq-langchain-and-dagworks-hamilton-fb1baa7415bc",
        "Building a High-Performance RAG-Based AI with Qdrant, Groq, LangChain, and DAGWorks Hamilton",
        None,
    ),
    (
        "https://blog.getwren.ai/how-do-we-rewrite-wren-ai-llm-service-to-support-1500-concurrent-users-online-9ba5c121afc3",
        "How do we rewrite Wren AI LLM Service to support 1500 concurrent users online",
        None,
    ),
)

_EPILOG = """
✨ Tip:
  - Use --date to specify a cutoff date to filter articles (default: prompt input).
  - Use --print to display fetched articles directly in the console.

Examples:
  1. Fetch articles after a specific date and update README:
     python update_blogs_in_learning_resources.py --date 2024-10-01

  2. Print articles to the console without modifying the README:
     python update_blogs_in_learning_resources.py --date 2024-10-01 --print

  3. Run interactively (without --date) and choose a date via prompt:
     python update_blogs_in_learning_resources.py
"""


def parse_cutoff_date(value: str) -> date:
    """Parse a ``YYYY-MM-DD`` string into a ``date``.

    Used as the argparse ``type=`` for ``--date`` so that a malformed value
    produces a normal usage error instead of a traceback.

    Raises:
        argparse.ArgumentTypeError: if *value* is not a valid YYYY-MM-DD date.
    """
    try:
        return datetime.strptime(value.strip(), "%Y-%m-%d").date()
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"Invalid date {value!r}. Please use YYYY-MM-DD."
        ) from None


def _scroll_to_bottom(driver, pause_seconds: float = SCROLL_PAUSE_SECONDS) -> None:
    """Scroll until the page height stops growing, so every post is loaded."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_seconds)

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            return
        last_height = new_height


def fetch_articles(url, cutoff_date) -> List[Article]:
    """Scrape the blog archive at *url* with headless Chrome.

    Args:
        url: Archive page to load.
        cutoff_date: Only posts dated on or after this ``date`` are kept.

    Returns:
        The hard-coded external articles (date ``None``) followed by the
        scraped ``(link, title, date)`` tuples in page order.
    """
    # Imported here rather than at module level so that --help, argument
    # validation and the README rewriting do not need the browser stack.
    from bs4 import BeautifulSoup
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=chrome_options
    )
    try:
        driver.get(url)
        _scroll_to_bottom(driver)
        page_source = driver.page_source
    finally:
        # Always shut the browser down, even if loading or scrolling failed;
        # otherwise a headless Chrome process is left running.
        driver.quit()

    soup = BeautifulSoup(page_source, "html.parser")
    anchors = soup.find_all("a", class_=ARTICLE_LINK_CLASS)
    time_els = soup.find_all("time", class_=ARTICLE_DATE_CLASS)
    if len(anchors) != len(time_els):
        # zip() below pairs by position and stops at the shorter list, so a
        # mismatch means titles and dates may not belong together.
        print(
            f"Warning: found {len(anchors)} article links but {len(time_els)} dates; "
            "results may be incomplete or mis-paired.",
            file=sys.stderr,
        )

    articles: List[Article] = list(HARDCODED_ARTICLES)
    for anchor, time_el in zip(anchors, time_els):
        # fromisoformat() on older Pythons rejects a trailing "Z".
        published = datetime.fromisoformat(
            time_el["datetime"].replace("Z", "+00:00")
        ).date()
        if published >= cutoff_date:
            articles.append((anchor["href"], anchor.get_text(), published))

    return articles


def get_cutoff_date() -> date:
    """Interactively ask for a cutoff date until a valid YYYY-MM-DD is given.

    The prompt is pre-filled with January 1st of the current year.
    """
    from prompt_toolkit import HTML, prompt
    from prompt_toolkit.styles import Style

    style = Style.from_dict({"prompt": "bold", "faded": "ansiblack italic"})
    # Tag names map onto the style classes defined above.
    message = HTML("<prompt>Enter cutoff date</prompt> <faded>(YYYY-MM-DD)</faded>: ")
    default = f"{datetime.now().year}-01-01"

    # Loop instead of recursing so repeated bad input cannot grow the stack.
    while True:
        date_str = prompt(message, default=default, style=style)
        try:
            return datetime.strptime(date_str.strip(), "%Y-%m-%d").date()
        except ValueError:
            print("Invalid date format. Please use YYYY-MM-DD.")


def print_articles(articles: Iterable[Article]) -> None:
    """Prints articles to the console, one Markdown-style bullet per article."""
    print("\n📰 Articles:\n")
    for link, text, published in articles:
        if published:
            print(f"* {published} - {text}: {link}")
        else:
            print(f"* {text}: {link}")


def _render_external_blogs_section(articles: Iterable[Article]) -> str:
    """Build the full Markdown text of the "External Blogs" section."""
    entries = [
        f"* {published}{DATE_SEPARATOR}[{text}]({link})"
        if published
        else f"* [{text}]({link})"
        for link, text, published in articles
    ]
    return EXTERNAL_BLOGS_HEADER + EXTERNAL_BLOGS_INTRO + "\n".join(entries) + "\n"


def update_readme(articles: Iterable[Article], readme_path=README_PATH) -> bool:
    """Replace the "External Blogs" section of the README with *articles*.

    The section runs from the line starting with the section header up to
    (not including) the next line that starts with ``##``, or to end of file.

    Args:
        articles: ``(link, title, date)`` tuples; ``date`` may be ``None``.
        readme_path: File to rewrite; defaults to ``readme.md`` in the
            current working directory.

    Returns:
        ``True`` if the section was found and the file rewritten, ``False``
        if the header was not found (the file is then left untouched).

    Raises:
        FileNotFoundError: if *readme_path* does not exist.
    """
    section = _render_external_blogs_section(articles)
    header_prefix = EXTERNAL_BLOGS_HEADER.rstrip("\n")
    path = Path(readme_path)

    # The header contains an emoji, so never rely on the platform encoding.
    with path.open("r", encoding="utf-8") as file:
        content = file.readlines()

    new_content = []
    in_external_blogs_section = False
    replaced = False

    for line in content:
        if line.startswith(header_prefix):
            in_external_blogs_section = True
            replaced = True
            new_content.append(section)
            continue
        if in_external_blogs_section and line.startswith("##"):
            in_external_blogs_section = False
            # Keep a blank line between the last entry and the next heading.
            new_content.append("\n")
        if not in_external_blogs_section:
            new_content.append(line)

    if not replaced:
        print(
            f"Warning: no line starting with {header_prefix!r} found in {path}; "
            "file left unchanged.",
            file=sys.stderr,
        )
        return False

    with path.open("w", encoding="utf-8") as file:
        file.writelines(new_content)
    return True


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description="📜 Fetch and update articles from the DAGWorks blog archive.",
        epilog=_EPILOG,
        formatter_class=argparse.RawTextHelpFormatter,  # Preserve newlines and formatting
    )

    parser.add_argument(
        "--date",
        type=parse_cutoff_date,
        metavar="YYYY-MM-DD",
        help=(
            "Cutoff date to filter articles (e.g., 2024-10-01). "
            "If not provided, you will be prompted to enter a date during execution."
        ),
    )

    parser.add_argument(
        "--print",
        action="store_true",
        help=(
            "Print the fetched articles to the console instead of updating the README. "
            "Useful for reviewing articles before making changes."
        ),
    )

    args = parser.parse_args(argv)

    # Fail before the slow browser scrape if there is nothing to update.
    if not args.print and not Path(README_PATH).is_file():
        parser.error(f"{README_PATH} not found in the current directory.")

    cutoff_date = args.date if args.date is not None else get_cutoff_date()

    print(f"\n🔍 Fetching articles published on or after {cutoff_date}...\n")
    articles = fetch_articles(ARCHIVE_URL, cutoff_date)

    if args.print:
        print_articles(articles)
    else:
        update_readme(articles)

    print(f"\n✅ Total Articles Fetched: {len(articles)}")


if __name__ == "__main__":
    main()