Adds script to pull blog updates for updating docs (#1208)
Adds a script to help pull blogs from DAGWorks for inclusion into the documentation.

Usage:
1. Specify a cutoff date and update the README:
   python update_blogs_in_learning_resources.py --date 2024-10-01
2. Print articles to the console (without updating the README):
   python update_blogs_in_learning_resources.py --date 2024-10-01 --print
3. Run interactively (no arguments required):
   python update_blogs_in_learning_resources.py
Showing 2 changed files with 229 additions and 0 deletions.
update_blogs_in_learning_resources.py
@@ -0,0 +1,225 @@
""" | ||
📜 DAGWorks Blog Archive Updater Script | ||
This script fetches articles from the DAGWorks blog archive and updates the README with the latest posts. | ||
It supports filtering articles based on a cutoff date and offers an option to print articles to the console for review. | ||
Before running this script, make sure to install the required packages by running: | ||
pip install -r update_blogs_requirements.txt | ||
Usage: | ||
1. Specify a cutoff date and update the README: | ||
python update_blogs_in_learning_resources.py --date 2024-10-01 | ||
2. Print articles to the console (without updating README): | ||
python update_blogs_in_learning_resources.py --date 2024-10-01 --print | ||
3. Run interactively (no arguments required): | ||
python update_blogs_in_learning_resources.py | ||
Arguments: | ||
--date YYYY-MM-DD : Optional. Fetches articles published on or after the given date. | ||
--print : Optional. Prints fetched articles to the console without modifying the README. | ||
Dependencies: | ||
- selenium: For web scraping. | ||
- webdriver-manager: Manages ChromeDriver installation. | ||
- beautifulsoup4: Parses HTML content. | ||
- prompt_toolkit: Enables interactive user input from the command line. | ||
Notes: | ||
- Ensure Google Chrome is installed since the script uses it for scraping. | ||
- The README file must be present in the same directory as this script. | ||
""" | ||
|
||
import argparse
import time
from datetime import datetime

from bs4 import BeautifulSoup
from prompt_toolkit import HTML, prompt
from prompt_toolkit.styles import Style
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


def fetch_articles(url, cutoff_date):
    """Scrapes the blog archive at `url` and returns (link, title, date) tuples.

    Scrolls the page until no new posts load, then keeps articles published on or
    after `cutoff_date`. A few externally hosted posts are always included via the
    hardcoded list below.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=chrome_options
    )

    driver.get(url)

    # Scroll until the page height stops growing, so the lazy-loaded archive is fully rendered.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    soup = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()

    anchors = soup.find_all("a", class_="_color-pub-primary-text_q8zsn_187")
    timeEls = soup.find_all("time", class_="_date_1v6nm_1")

    # hardcoded blog links
    articles = [
        (
            "https://pyright.blogspot.com/2024/07/dag-hamilton-workflow-for-toy-text.html",
            "DAG Hamilton Workflow for Toy Text Processing Script",
            None,
        ),
        (
            "https://vyom-modi.medium.com/building-a-high-performance-rag-based-ai-with-qdrant-groq-langchain-and-dagworks-hamilton-fb1baa7415bc",
            "Building a High-Performance RAG-Based AI with Qdrant, Groq, LangChain, and DAGWorks Hamilton",
            None,
        ),
        (
            "https://blog.getwren.ai/how-do-we-rewrite-wren-ai-llm-service-to-support-1500-concurrent-users-online-9ba5c121afc3",
            "How do we rewrite Wren AI LLM Service to support 1500 concurrent users online",
            None,
        ),
    ]
    for _i, (anchor, time_el) in enumerate(zip(anchors, timeEls)):
        link = anchor["href"]
        text = anchor.get_text()

        date_str = time_el["datetime"]
        article_date = datetime.fromisoformat(date_str.replace("Z", "+00:00")).date()
        if article_date >= cutoff_date:
            articles.append((link, text, article_date))

    return articles


def get_cutoff_date():
    """Prompts for a cutoff date until a valid YYYY-MM-DD value is entered."""
    style = Style.from_dict({"prompt": "bold", "faded": "ansiblack italic"})

    date_str = prompt(
        HTML("<prompt>Enter cutoff date (YYYY-MM-DD): </prompt>"),
        default=f"{datetime.now().year}-01-01",
        style=style,
    )

    try:
        return datetime.strptime(date_str, "%Y-%m-%d").date()
    except ValueError:
        print("Invalid date format. Please use YYYY-MM-DD.")
        return get_cutoff_date()


def print_articles(articles):
    """Prints articles to the console."""
    print("\n📰 Articles:\n")
    for link, text, date in articles:
        if date:
            print(f"* {date} - {text}: {link}")
        else:
            print(f"* {text}: {link}")


def update_readme(articles):
    """Rewrites the "External Blogs" section of readme.md with the given articles."""
    external_blogs_header = "## 📰 External Blogs\n"
    external_blogs_link = (
        "For the latest blog posts, see the [DAGWorks's Blog](https://blog.dagworks.io/).\n\n"
    )
    blog_entries = []

    for link, text, date in articles:
        if date:
            blog_entries.append(f"* {date} [{text}]({link})")
        else:
            blog_entries.append(f"* [{text}]({link})")

    new_external_blogs_section = (
        external_blogs_header + external_blogs_link + "\n".join(blog_entries) + "\n"
    )

    with open("readme.md", "r") as file:
        content = file.readlines()

    new_content = []
    in_external_blogs_section = False

    # Replace everything between the "External Blogs" header and the next "##" header.
    for line in content:
        if line.startswith("## 📰 External Blogs"):
            in_external_blogs_section = True
            new_content.append(new_external_blogs_section)
        elif in_external_blogs_section and line.startswith("##"):
            in_external_blogs_section = False

        if not in_external_blogs_section:
            new_content.append(line)
    with open("readme.md", "w") as file:
        file.writelines(new_content)


def main():
    """Parses CLI arguments, fetches articles, and prints them or updates the README."""
    url = "https://blog.dagworks.io/archive"

    parser = argparse.ArgumentParser(
        description="📜 Fetch and update articles from the DAGWorks blog archive.",
        epilog="""
✨ Tip:
  - Use --date to specify a cutoff date to filter articles (default: prompt input).
  - Use --print to display fetched articles directly in the console.
Examples:
  1. Fetch articles after a specific date and update README:
     python update_blogs_in_learning_resources.py --date 2024-10-01
  2. Print articles to the console without modifying the README:
     python update_blogs_in_learning_resources.py --date 2024-10-01 --print
  3. Run interactively (without --date) and choose a date via prompt:
     python update_blogs_in_learning_resources.py
""",
        formatter_class=argparse.RawTextHelpFormatter,  # Preserve newlines and formatting
    )

    parser.add_argument(
        "--date",
        type=str,
        metavar="YYYY-MM-DD",
        help=(
            "Cutoff date to filter articles (e.g., 2024-10-01). "
            "If not provided, you will be prompted to enter a date during execution."
        ),
    )

    parser.add_argument(
        "--print",
        action="store_true",
        help=(
            "Print the fetched articles to the console instead of updating the README. "
            "Useful for reviewing articles before making changes."
        ),
    )

    args = parser.parse_args()

    cutoff_date = (
        datetime.strptime(args.date, "%Y-%m-%d").date() if args.date else get_cutoff_date()
    )

    print(f"\n🔍 Fetching articles published on or after {cutoff_date}...\n")
    articles = fetch_articles(url, cutoff_date)

    if args.print:
        print_articles(articles)
    else:
        update_readme(articles)

    print(f"\n✅ Total Articles Fetched: {len(articles)}")


if __name__ == "__main__":
    main()
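For context, here is a minimal sketch (not part of the commit) of the "External Blogs" README section that update_readme maintains, rebuilt with the same formatting strings used above; the article tuples and URLs are made up for illustration:

# Illustrative sketch only: what the generated "External Blogs" section looks like,
# using the same formatting as update_readme(). The article data below is made up.
from datetime import date

articles = [
    ("https://blog.dagworks.io/p/example-post", "Example post", date(2024, 10, 15)),
    ("https://example.com/external-post", "External post without a date", None),
]

header = "## 📰 External Blogs\n"
intro = "For the latest blog posts, see the [DAGWorks's Blog](https://blog.dagworks.io/).\n\n"
entries = [
    f"* {d} [{text}]({link})" if d else f"* [{text}]({link})"
    for link, text, d in articles
]
print(header + intro + "\n".join(entries))
# ## 📰 External Blogs
# For the latest blog posts, see the [DAGWorks's Blog](https://blog.dagworks.io/).
#
# * 2024-10-15 [Example post](https://blog.dagworks.io/p/example-post)
# * [External post without a date](https://example.com/external-post)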
update_blogs_requirements.txt
@@ -0,0 +1,4 @@
beautifulsoup4
prompt_toolkit
selenium
webdriver-manager
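And a minimal sketch (also not part of the commit) of exercising the script's helpers directly from Python, assuming Google Chrome is available, the packages above are installed, and the script sits in the current directory:

# Illustrative sketch only: call the script's helpers directly instead of via the CLI.
from datetime import date

from update_blogs_in_learning_resources import fetch_articles, print_articles

articles = fetch_articles("https://blog.dagworks.io/archive", date(2024, 10, 1))
print_articles(articles)  # prints one "* ..." line per fetched article for review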