From 2cd1e5f44992d6ece9703aa69eda4ba6ed913930 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Sat, 2 Apr 2022 12:03:53 +0200 Subject: [PATCH 1/2] Add scripts/cron_watcher.py --- scripts/cron_watcher.py | 53 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 scripts/cron_watcher.py diff --git a/scripts/cron_watcher.py b/scripts/cron_watcher.py new file mode 100644 index 00000000000..eccb990ace7 --- /dev/null +++ b/scripts/cron_watcher.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 + +""" +Daily Cron-audit task (Python) sentry (who watches the watchers) +If not dump and cdump uploaded for last YYYY-MM on archive.org +If not sitemaps updated for this YYYY-MM on www +If not partner dumps uploaded for this YYYY-MM on archive.org +If no imports in last 48 hours (i.e. 2 days) +If DD>17 for YYYY-MM and bwb `batchname` doesn’t exist in import psql table +Send daily email with failures only or slack failures +""" + +from datetime import date, timedelta + +import bs4 +import httpx + +DATA_DUMPS_URL = "https://archive.org/details/ol_exports?sort=-publicdate" +# Last day of last month is the first day of this month minus one day. +last_day_of_last_month = date.today().replace(day=1) - timedelta(days=1) +yyyy_mm = f"{last_day_of_last_month:%Y-%m}" + + +async def find_last_months_dumps_on_ia(yyyy_mm: str = yyyy_mm) -> bool: + """ + Return True if both ol_dump_yyyy and ol_cdump_yyyy files have been saved on the + Internet Archive. + """ + prefixes = (f"ol_dump_{yyyy_mm}", f"ol_cdump_{yyyy_mm}") + # print(prefixes) + async with httpx.AsyncClient() as client: + response = await client.get(DATA_DUMPS_URL) + response.raise_for_status() + soup = bs4.BeautifulSoup(response.content, features="html.parser") + found = 0 + #
+ for item_ia in soup.find_all("div", class_="item-ia"): + if item_ia["data-id"].startswith(prefixes): + # print(item_ia["data-id"]) + found += 1 + if found >= 2: + break + return found >= 2 + + +if __name__ == "__main__": + import asyncio + import sys + + both_files_found = asyncio.run(find_last_months_dumps_on_ia()) + print(f"{both_files_found = }") + if not both_files_found: + sys.exit(1) From 933f1a763a54b2c4e531b9204553c2c31c1dec70 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Sun, 17 Apr 2022 17:46:51 +0200 Subject: [PATCH 2/2] Use internetarchive --- scripts/cron_watcher.py | 44 +++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/scripts/cron_watcher.py b/scripts/cron_watcher.py index eccb990ace7..040893d1dca 100644 --- a/scripts/cron_watcher.py +++ b/scripts/cron_watcher.py @@ -12,42 +12,38 @@ from datetime import date, timedelta -import bs4 -import httpx +from internetarchive import search_items -DATA_DUMPS_URL = "https://archive.org/details/ol_exports?sort=-publicdate" # Last day of last month is the first day of this month minus one day. last_day_of_last_month = date.today().replace(day=1) - timedelta(days=1) yyyy_mm = f"{last_day_of_last_month:%Y-%m}" -async def find_last_months_dumps_on_ia(yyyy_mm: str = yyyy_mm) -> bool: +def find_last_months_dumps_on_ia(yyyy_mm: str = yyyy_mm) -> bool: """ - Return True if both ol_dump_yyyy and ol_cdump_yyyy files have been saved on the - Internet Archive. + Return True if both ol_dump_yyyy_mm and ol_cdump_yyyy_mm files + have been saved on Internet Archive collection:ol_exports. + + >>> next_month = date.today().replace(day=1) + timedelta(days=31) + >>> find_last_months_dumps_on_ia(f"{next_month:%Y-%m}") + False """ - prefixes = (f"ol_dump_{yyyy_mm}", f"ol_cdump_{yyyy_mm}") + prefixes = {f"ol_dump_{yyyy_mm}": 0, f"ol_cdump_{yyyy_mm}": 0} # print(prefixes) - async with httpx.AsyncClient() as client: - response = await client.get(DATA_DUMPS_URL) - response.raise_for_status() - soup = bs4.BeautifulSoup(response.content, features="html.parser") - found = 0 - #
- for item_ia in soup.find_all("div", class_="item-ia"): - if item_ia["data-id"].startswith(prefixes): - # print(item_ia["data-id"]) - found += 1 - if found >= 2: - break - return found >= 2 + for item in search_items("collection:ol_exports"): + for prefix in prefixes: + if item["identifier"].startswith(prefix): + prefixes[prefix] += 1 + # Is there at least one item id starting with each prefix? + if files_with_both_prefixes_found := all(prefixes.values()): + return files_with_both_prefixes_found + return all(prefixes.values()) if __name__ == "__main__": - import asyncio import sys - both_files_found = asyncio.run(find_last_months_dumps_on_ia()) - print(f"{both_files_found = }") - if not both_files_found: + files_with_both_prefixes_found = find_last_months_dumps_on_ia() + print(f"{files_with_both_prefixes_found = }") + if not files_with_both_prefixes_found: sys.exit(1)