Skip to content

Commit

Permalink
refactor: keep cache hot
Browse files Browse the repository at this point in the history
  • Loading branch information
JesperDramsch committed Sep 6, 2024
1 parent 2777612 commit ee97744
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 26 deletions.
7 changes: 4 additions & 3 deletions utils/sort_yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from tidy_conf.date import clean_dates
from tidy_conf.latlon import add_latlon
from tidy_conf.links import check_link_availability
from tidy_conf.links import get_cache
from tidy_conf.schema import Conference
from tidy_conf.schema import get_schema
from tidy_conf.titles import tidy_titles
Expand Down Expand Up @@ -140,13 +141,13 @@ def split_data(data):

def check_links(data):
    """Check and update the link fields of every conference entry in place.

    Loads the link caches once up front ("keep cache hot") and passes them
    to every availability check, instead of re-reading the cache files for
    each URL.

    Args:
        data: List of conference dicts; each has at least "year" and "start",
            and optionally "link", "cfp_link", "sponsor", "finaid" URL fields.

    Returns:
        The same list, with link fields replaced by validated (possibly
        web.archive.org) URLs.
    """
    # Load both caches once so every URL check in the loop reuses them.
    cache, cache_archived = get_cache()
    # Newest conferences first, so recent entries are validated earliest.
    for i, q in tqdm(enumerate(sorted(data, key=operator.itemgetter("year"), reverse=True)), total=len(data)):
        for key in ("link", "cfp_link", "sponsor", "finaid"):
            if key in q:
                new_link = check_link_availability(q[key], q["start"], cache=cache, cache_archived=cache_archived)
                if q[key] != new_link and "archive.org" in new_link:
                    # Rate-limit consecutive requests to the Wayback Machine.
                    time.sleep(0.5)

                q[key] = new_link
        data[i] = q
    return data
Expand Down
59 changes: 36 additions & 23 deletions utils/tidy_conf/links.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,28 @@
from tqdm import tqdm


def check_link_availability(url, start):
def get_cache_location():
    """Return the paths of the two link-cache files.

    Returns:
        tuple[Path, Path]: ``(no_archive cache file, archived-links cache file)``,
        both inside ``utils/tidy_conf/data/.tmp``.
    """
    cache_dir = Path("utils", "tidy_conf", "data", ".tmp")
    cache_file = cache_dir / "no_archive.txt"
    cache_file_archived = cache_dir / "archived_links.txt"
    return cache_file, cache_file_archived


def get_cache():
    """Load the link caches from disk, creating them if missing.

    Returns:
        tuple[set, set]: ``(cache, cache_archived)`` — URLs known to have no
        archived version, and URLs that have already been archived.
    """
    cache_file, cache_file_archived = get_cache_location()

    # Ensure the cache directory exists before touching the files;
    # touch() alone raises FileNotFoundError when .tmp is missing.
    cache_file.parent.mkdir(parents=True, exist_ok=True)
    cache_file.touch()
    cache_file_archived.touch()

    # URLs are stored one per line, each terminated by "\n", so split("\n")
    # yields a trailing empty element that [:-1] drops.
    cache = set(cache_file.read_text(encoding="utf-8").split("\n")[:-1])
    cache_archived = set(cache_file_archived.read_text(encoding="utf-8").split("\n")[:-1])

    return cache, cache_archived


def check_link_availability(url, start, cache=None, cache_archived=None):
"""Checks if a URL is available.
If not, tries to retrieve an archived version from the Wayback Machine.
Expand All @@ -24,19 +45,11 @@ def check_link_availability(url, start):
if url.startswith(("https://web.archive.org", "http://web.archive.org")):
return url

# Check if the URL is cached
cache_file = Path("utils", "tidy_conf", "data", ".tmp", "no_archive.txt")
cache_file_archived = Path("utils", "tidy_conf", "data", ".tmp", "archived_links.txt")

# Create the cache file if it doesn't exist
cache_file.touch()
cache_file_archived.touch()
# Get the cache
if cache is None or cache_archived is None:
cache, cache_archived = get_cache()

# Read the cache file
with cache_file.open(encoding="utf-8") as f:
cache = f.read().split("\n")[:-1]
with cache_file_archived.open(encoding="utf-8") as f:
cache_archived = f.read().split("\n")[:-1]
cache_file, _ = get_cache_location()

# Check if the URL is in the cache
if url in cache and url not in cache_archived:
Expand Down Expand Up @@ -65,7 +78,7 @@ def check_link_availability(url, start):
)
else:
if start > datetime.now(tz=timezone.utc).date():
attempt_archive_url(url, cache_file_archived)
attempt_archive_url(url, cache_archived)
return url
except requests.RequestException as e:
tqdm.write(f"An error occurred: {e}. Trying to find an archived version...")
Expand All @@ -89,7 +102,7 @@ def check_link_availability(url, start):
tqdm.write(f"Found archived version: {archived_url}")
return archived_url
tqdm.write("No archived version found.")
attempt_archive_url(url, cache_file_archived)
attempt_archive_url(url, cache_archived)
with cache_file.open("a") as f:
f.write(url + "\n")
return url
Expand All @@ -101,25 +114,25 @@ def check_link_availability(url, start):
return url


def attempt_archive_url(url, cache=None):
    """Attempt to archive a URL using the Wayback Machine.

    Args:
        url: The URL to submit to https://web.archive.org/save/.
        cache: Optional set of already-archived URLs; when None, the
            archived-links cache is loaded from disk.
    """
    # Fall back to the on-disk cache when the caller did not pass one.
    if cache is None:
        _, cache = get_cache()

    # Check if the URL is in the cache
    if url in cache:
        tqdm.write(f"URL {url} was already archived.")
        return

    try:
        tqdm.write(f"Attempting archive of {url}.")
        headers = {"User-Agent": "Pythondeadlin.es Archival Attempt/0.1 (https://pythondeadlin.es)"}
        archive_response = requests.get("https://web.archive.org/save/" + url, timeout=30, headers=headers)
        if archive_response.status_code == 200:
            # Record the success both in memory and on disk so repeated
            # calls within the same run do not re-archive this URL.
            cache.add(url)
            _, cache_file = get_cache_location()
            with cache_file.open("a") as f:
                f.write(url + "\n")
            tqdm.write(f"Successfully archived {url}.")
    except requests.RequestException as e:
        tqdm.write(f"An error occurred while attempting to archive: {e}")

0 comments on commit ee97744

Please sign in to comment.