Skip to content

Commit

Permalink
Adds script to pull blog updates for updating docs (#1208)
Browse files Browse the repository at this point in the history
Adds a script to help pull blogs from dagworks for inclusion into
documentation.

Usage:
    1. Specify a cutoff date and update the README:
        python update_blogs_in_learning_resources.py --date 2024-10-01
    2. Print articles to the console (without updating README):
        python update_blogs_in_learning_resources.py --date 2024-10-01 --print
    3. Run interactively (no arguments required):
        python update_blogs_in_learning_resources.py
  • Loading branch information
sikehish authored Oct 31, 2024
1 parent afc062c commit fd28dc0
Show file tree
Hide file tree
Showing 2 changed files with 229 additions and 0 deletions.
225 changes: 225 additions & 0 deletions scripts/update_blogs_in_learning_resources.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
"""
📜 DAGWorks Blog Archive Updater Script
This script fetches articles from the DAGWorks blog archive and updates the README with the latest posts.
It supports filtering articles based on a cutoff date and offers an option to print articles to the console for review.
Before running this script, make sure to install the required packages by running:
pip install -r update_blogs_requirements.txt
Usage:
1. Specify a cutoff date and update the README:
python update_blogs_in_learning_resources.py --date 2024-10-01
2. Print articles to the console (without updating README):
python update_blogs_in_learning_resources.py --date 2024-10-01 --print
3. Run interactively (no arguments required):
python update_blogs_in_learning_resources.py
Arguments:
--date YYYY-MM-DD : Optional. Fetches articles published on or after the given date.
--print : Optional. Prints fetched articles to the console without modifying the README.
Dependencies:
- selenium: For web scraping.
- webdriver-manager: Manages ChromeDriver installation.
- beautifulsoup4: Parses HTML content.
- prompt_toolkit: Enables interactive user input from the command line.
Notes:
- Ensure Google Chrome is installed since the script uses it for scraping.
- The README file (`readme.md`) is opened by relative path, so it must be present in the directory the script is run from (the current working directory).
"""

import argparse
import time
from datetime import datetime

from bs4 import BeautifulSoup
from prompt_toolkit import HTML, prompt
from prompt_toolkit.styles import Style
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


def fetch_articles(url, cutoff_date):
    """Scrape the blog archive at ``url`` and return the articles to list.

    The page is loaded in headless Chrome and scrolled to the bottom repeatedly
    until the document height stops changing; the resulting HTML is then parsed
    with BeautifulSoup.

    Args:
        url: Address of the archive page to load.
        cutoff_date: A ``datetime.date``. Scraped articles dated before it are
            dropped; articles dated on or after it are kept.

    Returns:
        A list of ``(link, title, date)`` tuples. It always starts with a fixed
        set of external posts whose date is ``None`` (these are not filtered by
        ``cutoff_date``), followed by the scraped articles that pass the filter.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=chrome_options
    )

    # Always shut the browser down, even if loading or scrolling fails;
    # otherwise the headless Chrome process would be left running.
    try:
        driver.get(url)

        # Keep scrolling to the bottom until the page stops growing
        # (presumably the archive loads more entries on scroll).
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # give the page time to append more content

            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        page_source = driver.page_source
    finally:
        driver.quit()

    soup = BeautifulSoup(page_source, "html.parser")

    # NOTE(review): these class names look auto-generated and may change when
    # the site is rebuilt; if no articles are found, check them first.
    anchors = soup.find_all("a", class_="_color-pub-primary-text_q8zsn_187")
    time_elements = soup.find_all("time", class_="_date_1v6nm_1")

    # Hardcoded external blog links; their date is None so they are always listed.
    articles = [
        (
            "https://pyright.blogspot.com/2024/07/dag-hamilton-workflow-for-toy-text.html",
            "DAG Hamilton Workflow for Toy Text Processing Script",
            None,
        ),
        (
            "https://vyom-modi.medium.com/building-a-high-performance-rag-based-ai-with-qdrant-groq-langchain-and-dagworks-hamilton-fb1baa7415bc",
            "Building a High-Performance RAG-Based AI with Qdrant, Groq, LangChain, and DAGWorks Hamilton",
            None,
        ),
        (
            "https://blog.getwren.ai/how-do-we-rewrite-wren-ai-llm-service-to-support-1500-concurrent-users-online-9ba5c121afc3",
            "How do we rewrite Wren AI LLM Service to support 1500 concurrent users online",
            None,
        ),
    ]

    # Anchors and <time> elements are paired purely by position; zip() stops at
    # the shorter of the two lists.
    for anchor, time_el in zip(anchors, time_elements):
        link = anchor["href"]
        text = anchor.get_text()

        date_str = time_el["datetime"]
        # Rewrite a trailing "Z" as an explicit UTC offset before parsing.
        article_date = datetime.fromisoformat(date_str.replace("Z", "+00:00")).date()
        if article_date >= cutoff_date:
            articles.append((link, text, article_date))

    return articles


def get_cutoff_date():
    """Ask the user for a cutoff date until a valid one is entered.

    The prompt is pre-filled with January 1st of the current year. Input that
    does not parse as ``YYYY-MM-DD`` prints an error message and the prompt is
    shown again.

    Returns:
        The entered date as a ``datetime.date``.
    """
    prompt_style = Style.from_dict({"prompt": "bold", "faded": "ansiblack italic"})

    while True:
        entered = prompt(
            HTML("<prompt>Enter cutoff date (YYYY-MM-DD): </prompt>"),
            default=f"{datetime.now().year}-01-01",
            style=prompt_style,
        )

        try:
            parsed = datetime.strptime(entered, "%Y-%m-%d")
        except ValueError:
            print("Invalid date format. Please use YYYY-MM-DD.")
            continue

        return parsed.date()


def print_articles(articles):
    """Write the articles to standard output as a bulleted list.

    Args:
        articles: Iterable of ``(link, text, date)`` tuples. Entries whose
            ``date`` is falsy are printed without the date prefix.
    """
    print("\n📰 Articles:\n")
    for url, title, published in articles:
        bullet = (
            f"* {published} - {title}: {url}" if published else f"* {title}: {url}"
        )
        print(bullet)


def update_readme(articles):
    """Rewrite the "External Blogs" section of ``readme.md`` with ``articles``.

    ``readme.md`` is opened relative to the current working directory. The
    existing section (from its ``## 📰 External Blogs`` header up to the next
    line starting with ``##``) is replaced by a freshly generated one. If the
    file has no such header, the new section is appended to the end of the file
    instead of being silently dropped.

    Args:
        articles: Iterable of ``(link, text, date)`` tuples. Entries with a
            falsy ``date`` are rendered without the date prefix.

    Raises:
        FileNotFoundError: If ``readme.md`` does not exist in the working directory.
    """
    section_marker = "## 📰 External Blogs"
    external_blogs_header = section_marker + "\n"
    external_blogs_link = (
        "For the latest blog posts, see the [DAGWorks's Blog](https://blog.dagworks.io/).\n\n"
    )

    blog_entries = [
        f"* {date} &nbsp;&nbsp; [{text}]({link})" if date else f"* [{text}]({link})"
        for link, text, date in articles
    ]

    new_external_blogs_section = (
        external_blogs_header + external_blogs_link + "\n".join(blog_entries) + "\n"
    )

    # The section contains an emoji, so read and write explicitly as UTF-8
    # rather than relying on the platform's default encoding.
    with open("readme.md", "r", encoding="utf-8") as file:
        content = file.readlines()

    new_content = []
    in_external_blogs_section = False
    section_replaced = False

    for line in content:
        if line.startswith(section_marker):
            # Swap the old header for the complete regenerated section.
            in_external_blogs_section = True
            section_replaced = True
            new_content.append(new_external_blogs_section)
        elif in_external_blogs_section and line.startswith("##"):
            # The next heading ends the old section.
            in_external_blogs_section = False

        # Lines of the old section are skipped; everything else is kept.
        if not in_external_blogs_section:
            new_content.append(line)

    if not section_replaced:
        # No existing section to replace: append one, separated from any
        # preceding text by a blank line.
        if new_content:
            if not new_content[-1].endswith("\n"):
                new_content.append("\n")
            new_content.append("\n")
        new_content.append(new_external_blogs_section)

    with open("readme.md", "w", encoding="utf-8") as file:
        file.writelines(new_content)


def _parse_cutoff_date(value):
    """Convert a ``YYYY-MM-DD`` command-line value into a ``datetime.date``.

    Raises ``argparse.ArgumentTypeError`` on malformed input so argparse reports
    a usage error instead of the script dying with a ``ValueError`` traceback.
    """
    try:
        return datetime.strptime(value, "%Y-%m-%d").date()
    except ValueError as err:
        raise argparse.ArgumentTypeError(
            f"invalid date {value!r}; expected format YYYY-MM-DD"
        ) from err


def main(argv=None):
    """Command-line entry point: fetch blog articles and print them or update the README.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). When ``None``, argparse reads ``sys.argv[1:]``.

    The cutoff date comes from ``--date`` when given, otherwise from an
    interactive prompt. With ``--print`` the fetched articles are written to the
    console; without it the README is updated. A malformed ``--date`` value
    makes argparse exit with a usage error (exit status 2).
    """
    url = "https://blog.dagworks.io/archive"

    parser = argparse.ArgumentParser(
        description="📜 Fetch and update articles from the DAGWorks blog archive.",
        epilog="""
✨ Tip:
- Use --date to specify a cutoff date to filter articles (default: prompt input).
- Use --print to display fetched articles directly in the console.
Examples:
1. Fetch articles after a specific date and update README:
python update_blogs_in_learning_resources.py --date 2024-10-01
2. Print articles to the console without modifying the README:
python update_blogs_in_learning_resources.py --date 2024-10-01 --print
3. Run interactively (without --date) and choose a date via prompt:
python update_blogs_in_learning_resources.py
""",
        formatter_class=argparse.RawTextHelpFormatter,  # Preserve newlines and formatting
    )

    parser.add_argument(
        "--date",
        type=_parse_cutoff_date,  # validated and converted during argument parsing
        metavar="YYYY-MM-DD",
        help=(
            "Cutoff date to filter articles (e.g., 2024-10-01). "
            "If not provided, you will be prompted to enter a date during execution."
        ),
    )

    parser.add_argument(
        "--print",
        action="store_true",
        help=(
            "Print the fetched articles to the console instead of updating the README. "
            "Useful for reviewing articles before making changes."
        ),
    )

    args = parser.parse_args(argv)

    # --date is already a date object here; prompt only when it was omitted.
    cutoff_date = args.date if args.date is not None else get_cutoff_date()

    # The filter keeps articles dated on or after the cutoff, so say so.
    print(f"\n🔍 Fetching articles published on or after {cutoff_date}...\n")
    articles = fetch_articles(url, cutoff_date)

    if args.print:
        print_articles(articles)
    else:
        update_readme(articles)

    print(f"\n✅ Total Articles Fetched: {len(articles)}")


# Run the command-line entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
4 changes: 4 additions & 0 deletions scripts/update_blogs_requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
beautifulsoup4
prompt_toolkit
selenium
webdriver-manager

0 comments on commit fd28dc0

Please sign in to comment.