From 63429cca877c5a2cc55110de447670b96eb99fcc Mon Sep 17 00:00:00 2001 From: Dimitry Foures Date: Thu, 17 Oct 2024 11:29:12 +0100 Subject: [PATCH] parent 7b9f52af3e661aac6665b43c48799b9aa0777cd0 author Dimitry Foures 1729160952 +0100 committer sikehish 1730182901 +0530 parent 7b9f52af3e661aac6665b43c48799b9aa0777cd0 author Dimitry Foures 1729160952 +0100 committer sikehish 1730182885 +0530 parent 7b9f52af3e661aac6665b43c48799b9aa0777cd0 author Dimitry Foures 1729160952 +0100 committer sikehish 1730182883 +0530 parent 7b9f52af3e661aac6665b43c48799b9aa0777cd0 author Dimitry Foures 1729160952 +0100 committer sikehish 1730182877 +0530 parent 7b9f52af3e661aac6665b43c48799b9aa0777cd0 author Dimitry Foures 1729160952 +0100 committer sikehish 1730182862 +0530 parent 7b9f52af3e661aac6665b43c48799b9aa0777cd0 author Dimitry Foures 1729160952 +0100 committer sikehish 1730182842 +0530 parent 7b9f52af3e661aac6665b43c48799b9aa0777cd0 author Dimitry Foures 1729160952 +0100 committer sikehish 1730182748 +0530 Update materialization.rst Fix mutate docstring Docstring raised Pytest escape error due to /* so we wrap it in quotes. Improve pipe_output first node naming Current name convention is very prone to name clashes with user naming. We assign the same naming convention using namespace.raw to indicate the first node. Provide dockerx setup for docker builds (#1194) This change adds a script for multi-platform docker builds as well as a github workflow to automatically build them when a new sf-hamilton-ui version has been published. 
Squashed commits: * Created buildx_and_push.sh script in ui directory to create multi-platform docker builds * Fixed content related issue in buildx_and_push.sh * buildx_and_push.sh: Added functionality to fetch the latest version from PyPI * buildx_and_push.sh: Added check_buildx_installed * buildx_and_push.sh: Added check_buildx_installed * buildx_and_push.sh: Enhanced error handling (checking if jq exists, curl response handling and docker buildx error handling) * Adding build args to buildx_and_push.sh * buildx_and_push.sh: Changes made to test a new workflow * Created a new workflow: hamilton-ui-build-and-push * buildx_and_push.sh: echo statement added to debug * buildx_and_push.sh: cd'ing to the directory (ui) where the shell script is located to prevent context related errors in workflow. * hamilton-ui-build-and-push.yml: Added Docker Hub login step * buildx_and_push.sh: added dagworks dockerhub username; workflow worked on the fork. * hamilton-ui-build-and-push.yml: Replaced previous version from cache with version tag from Dockerhub image. 
* buildx_and_push.sh: Changed dockerhub username for testing * hamilton-ui-build-and-push.yml: Minor change in the docker registry URL (version) * hamilton-ui-build-and-push.yml: Minor change in the "Fetch highest version from Docker Hub" step's shell script * hamilton-ui-build-and-push.yml: Replacing deprecated set-output with GITHUB_OUTPUT * hamilton-ui-build-and-push.yml: Conditional execution of steps implemented * Undid change in dockerhub username * Update ui/buildx_and_push.sh * Update ui/buildx_and_push.sh * chore: fix pre-commit whitespace issues --------- Co-authored-by: Stefan Krawczyk Fix `keep_dot` propagation in `Driver` display functions Bumps hamilton version from 1.81.0 to 1.81.1 fix: caching `SQLiteMetadataStore.get_run_ids()` (#1205) * fixed .get_run_ids() and standardized .get_run() + tests * fixed docstrings formatting errors --------- Co-authored-by: zilto Bumps hamilton version from 1.81.1 to 1.81.2 buildx_and_push.sh: Changes made to test a new workflow buildx_and_push.sh: echo statement added to debug buildx_and_push.sh: added dagworks dockerhub username; workflow worked on the fork. buildx_and_push.sh: Changed dockerhub username for testing Undid change in dockerhub username Created a new update external blogs script which updates External Blogs section in learning_resources.md with the latest blogs (with a date cutoff) Added docstring in update_blogs_in_learning_resources.py chore: fix pre-commit whitespace issues update_blogs_in_learning_resources.py: Added a "print to standard out" option (using --print flag) update_blogs_in_learning_resources.py: Argument parser with expanded help text update_blogs_in_learning_resources.py: Expanded docstring Fixes whitespace and small docs typo buildx_and_push.sh: Changes made to test a new workflow buildx_and_push.sh: echo statement added to debug hamilton-ui-build-and-push.yml: Added Docker Hub login step buildx_and_push.sh: added dagworks dockerhub username; workflow worked on the fork. 
buildx_and_push.sh: Changed dockerhub username for testing
hamilton-ui-build-and-push.yml: Minor change in the "Fetch highest version from Docker Hub" step's shell script
hamilton-ui-build-and-push.yml: Replacing deprecated set-output with GITHUB_OUTPUT
chore: fix pre-commit whitespace issues
Update ui/buildx_and_push.sh
Update ui/buildx_and_push.sh
Fixes whitespace and small docs typo
---
 scripts/update_blogs_in_learning_resources.py | 276 ++++++++++++++++++
 scripts/update_blogs_requirements.txt         |   4 +
 2 files changed, 280 insertions(+)
 create mode 100644 scripts/update_blogs_in_learning_resources.py
 create mode 100644 scripts/update_blogs_requirements.txt

diff --git a/scripts/update_blogs_in_learning_resources.py b/scripts/update_blogs_in_learning_resources.py
new file mode 100644
index 000000000..4fb7a1185
--- /dev/null
+++ b/scripts/update_blogs_in_learning_resources.py
@@ -0,0 +1,276 @@
+"""
+📜 DAGWorks Blog Archive Updater Script
+
+This script fetches articles from the DAGWorks blog archive and updates the README with the latest posts.
+It supports filtering articles based on a cutoff date and offers an option to print articles to the console for review.
+
+Before running this script, make sure to install the required packages by running:
+pip install -r update_blogs_requirements.txt
+
+Usage:
+    1. Specify a cutoff date and update the README:
+       python update_blogs_in_learning_resources.py --date 2024-10-01
+
+    2. Print articles to the console (without updating README):
+       python update_blogs_in_learning_resources.py --date 2024-10-01 --print
+
+    3. Run interactively (no arguments required):
+       python update_blogs_in_learning_resources.py
+
+Arguments:
+    --date YYYY-MM-DD : Optional. Fetches articles published on or after the given date.
+    --print           : Optional. Prints fetched articles to the console without modifying the README.
+
+Dependencies:
+    - selenium: For web scraping.
+    - webdriver-manager: Manages ChromeDriver installation.
+    - beautifulsoup4: Parses HTML content.
+    - prompt_toolkit: Enables interactive user input from the command line.
+
+Notes:
+    - Ensure Google Chrome is installed since the script uses it for scraping.
+    - The README is opened as ``readme.md`` relative to the current working directory,
+      so run the script from the directory that contains it.
+"""
+
+import argparse
+import time
+from datetime import datetime
+
+from bs4 import BeautifulSoup
+from prompt_toolkit import HTML, prompt
+from prompt_toolkit.styles import Style
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service
+from webdriver_manager.chrome import ChromeDriverManager
+
+DATE_FORMAT = "%Y-%m-%d"
+EXTERNAL_BLOGS_HEADER = "## 📰 External Blogs"
+
+
+def fetch_articles(url, cutoff_date):
+    """Scrape the blog archive at ``url`` and return the articles to list.
+
+    The page is loaded in headless Chrome and scrolled to the bottom repeatedly until its
+    height stops changing; the resulting HTML is then parsed for article links and dates.
+
+    :param url: Address of the archive page to scrape.
+    :param cutoff_date: ``datetime.date``; scraped articles dated before it are dropped.
+    :return: List of ``(link, title, date)`` tuples. The hardcoded external posts come first
+        with ``date`` set to ``None`` and are always included, whatever the cutoff.
+    """
+    chrome_options = Options()
+    chrome_options.add_argument("--headless")
+    driver = webdriver.Chrome(
+        service=Service(ChromeDriverManager().install()), options=chrome_options
+    )
+
+    # Always shut the browser down, even if loading or scrolling raises, so that no
+    # headless Chrome process is left behind.
+    try:
+        driver.get(url)
+
+        last_height = driver.execute_script("return document.body.scrollHeight")
+        while True:
+            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+            time.sleep(2)  # wait before re-measuring the page height
+
+            new_height = driver.execute_script("return document.body.scrollHeight")
+            if new_height == last_height:
+                break
+            last_height = new_height
+
+        soup = BeautifulSoup(driver.page_source, "html.parser")
+    finally:
+        driver.quit()
+
+    # NOTE(review): these class names look build-generated; confirm they still match the
+    # archive page markup if no articles are found.
+    anchors = soup.find_all("a", class_="_color-pub-primary-text_q8zsn_187")
+    time_els = soup.find_all("time", class_="_date_1v6nm_1")
+
+    # hardcoded blog links
+    articles = [
+        (
+            "https://pyright.blogspot.com/2024/07/dag-hamilton-workflow-for-toy-text.html",
+            "DAG Hamilton Workflow for Toy Text Processing Script",
+            None,
+        ),
+        (
+            "https://vyom-modi.medium.com/building-a-high-performance-rag-based-ai-with-qdrant-groq-langchain-and-dagworks-hamilton-fb1baa7415bc",
+            "Building a High-Performance RAG-Based AI with Qdrant, Groq, LangChain, and DAGWorks Hamilton",
+            None,
+        ),
+        (
+            "https://blog.getwren.ai/how-do-we-rewrite-wren-ai-llm-service-to-support-1500-concurrent-users-online-9ba5c121afc3",
+            "How do we rewrite Wren AI LLM Service to support 1500 concurrent users online",
+            None,
+        ),
+    ]
+    # Title anchors and <time> elements are paired by position.
+    for anchor, time_el in zip(anchors, time_els):
+        date_str = time_el["datetime"]
+        article_date = datetime.fromisoformat(date_str.replace("Z", "+00:00")).date()
+        if article_date >= cutoff_date:
+            articles.append((anchor["href"], anchor.get_text(), article_date))
+
+    return articles
+
+
+def get_cutoff_date():
+    """Prompt for a cutoff date until a valid ``YYYY-MM-DD`` value is entered.
+
+    :return: The entered date as a ``datetime.date``.
+    """
+    style = Style.from_dict({"prompt": "bold", "faded": "ansiblack italic"})
+
+    # Loop rather than recurse so repeated invalid input cannot exhaust the call stack.
+    while True:
+        date_str = prompt(
+            HTML("Enter cutoff date (YYYY-MM-DD): "),
+            default=f"{datetime.now().year}-01-01",
+            style=style,
+        )
+        try:
+            return datetime.strptime(date_str, DATE_FORMAT).date()
+        except ValueError:
+            print("Invalid date format. Please use YYYY-MM-DD.")
+
+
+def print_articles(articles):
+    """Prints articles to the console."""
+    print("\n📰 Articles:\n")
+    for link, text, date in articles:
+        if date:
+            print(f"* {date} - {text}: {link}")
+        else:
+            print(f"* {text}: {link}")
+
+
+def update_readme(articles, readme_path="readme.md"):
+    """Replace the External Blogs section of the README with ``articles``.
+
+    Everything from the section heading up to (but not including) the next line that starts
+    with ``##`` is replaced; all other lines are kept unchanged.
+
+    :param articles: Iterable of ``(link, title, date)`` tuples; ``date`` may be ``None``.
+    :param readme_path: File to rewrite; defaults to ``readme.md`` in the working directory.
+    :return: ``True`` if the section was found and rewritten, ``False`` if the file was left
+        untouched because it has no External Blogs heading.
+    """
+    external_blogs_link = (
+        "For the latest blog posts, see the [DAGWorks's Blog](https://blog.dagworks.io/).\n\n"
+    )
+    blog_entries = []
+
+    for link, text, date in articles:
+        if date:
+            blog_entries.append(f"* {date}    [{text}]({link})")
+        else:
+            blog_entries.append(f"* [{text}]({link})")
+
+    new_external_blogs_section = (
+        EXTERNAL_BLOGS_HEADER + "\n" + external_blogs_link + "\n".join(blog_entries) + "\n"
+    )
+
+    # The section contains an emoji, so read and write UTF-8 explicitly instead of relying
+    # on the platform's default encoding.
+    with open(readme_path, "r", encoding="utf-8") as file:
+        content = file.readlines()
+
+    new_content = []
+    section_found = False
+    in_external_blogs_section = False
+
+    for line in content:
+        if line.startswith(EXTERNAL_BLOGS_HEADER):
+            section_found = True
+            in_external_blogs_section = True
+            new_content.append(new_external_blogs_section)
+        elif in_external_blogs_section and line.startswith("##"):
+            in_external_blogs_section = False
+            # Keep a blank line between the regenerated section and the next heading.
+            new_content.append("\n")
+
+        if not in_external_blogs_section:
+            new_content.append(line)
+
+    if not section_found:
+        print(f"Warning: no '{EXTERNAL_BLOGS_HEADER}' section in {readme_path}; nothing updated.")
+        return False
+
+    with open(readme_path, "w", encoding="utf-8") as file:
+        file.writelines(new_content)
+    return True
+
+
+def _parse_date(value):
+    """Parse a ``YYYY-MM-DD`` command-line value into a ``datetime.date`` for argparse."""
+    try:
+        return datetime.strptime(value, DATE_FORMAT).date()
+    except ValueError:
+        raise argparse.ArgumentTypeError(f"invalid date {value!r}; expected YYYY-MM-DD") from None
+
+
+def main():
+    """Parse the command line, fetch the articles, then print them or update the README."""
+    url = "https://blog.dagworks.io/archive"
+
+    parser = argparse.ArgumentParser(
+        description="📜 Fetch and update articles from the DAGWorks blog archive.",
+        epilog="""
+        ✨ Tip:
+        - Use --date to specify a cutoff date to filter articles (default: prompt input).
+        - Use --print to display fetched articles directly in the console.
+
+        Examples:
+        1. Fetch articles after a specific date and update README:
+           python update_blogs_in_learning_resources.py --date 2024-10-01
+
+        2. Print articles to the console without modifying the README:
+           python update_blogs_in_learning_resources.py --date 2024-10-01 --print
+
+        3. Run interactively (without --date) and choose a date via prompt:
+           python update_blogs_in_learning_resources.py
+        """,
+        formatter_class=argparse.RawTextHelpFormatter,  # Preserve newlines and formatting
+    )
+
+    parser.add_argument(
+        "--date",
+        type=_parse_date,  # a malformed date becomes a usage error, not a traceback
+        metavar="YYYY-MM-DD",
+        help=(
+            "Cutoff date to filter articles (e.g., 2024-10-01). "
+            "If not provided, you will be prompted to enter a date during execution."
+        ),
+    )
+
+    parser.add_argument(
+        "--print",
+        action="store_true",
+        help=(
+            "Print the fetched articles to the console instead of updating the README. "
+            "Useful for reviewing articles before making changes."
+        ),
+    )
+
+    args = parser.parse_args()
+
+    cutoff_date = args.date if args.date is not None else get_cutoff_date()
+
+    # The filter in fetch_articles is inclusive, so say "on or after".
+    print(f"\n🔍 Fetching articles published on or after {cutoff_date}...\n")
+    articles = fetch_articles(url, cutoff_date)
+
+    if args.print:
+        print_articles(articles)
+    else:
+        update_readme(articles)
+
+    print(f"\n✅ Total Articles Fetched: {len(articles)}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/update_blogs_requirements.txt b/scripts/update_blogs_requirements.txt
new file mode 100644
index 000000000..9d2132ef1
--- /dev/null
+++ b/scripts/update_blogs_requirements.txt
@@ -0,0 +1,4 @@
+beautifulsoup4
+prompt_toolkit
+selenium
+webdriver-manager