Skip to content

Commit

Permalink
refactor: keep cache hot
Browse files Browse the repository at this point in the history
  • Loading branch information
JesperDramsch committed Sep 6, 2024
1 parent 2777612 commit ee97744
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 26 deletions.
7 changes: 4 additions & 3 deletions utils/sort_yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from tidy_conf.date import clean_dates
from tidy_conf.latlon import add_latlon
from tidy_conf.links import check_link_availability
from tidy_conf.links import get_cache
from tidy_conf.schema import Conference
from tidy_conf.schema import get_schema
from tidy_conf.titles import tidy_titles
Expand Down Expand Up @@ -140,13 +141,13 @@ def split_data(data):

def check_links(data):
    """Check and update the link fields of every conference entry in place.

    Loads the link caches once up front ("keep cache hot") and passes them
    to every availability check, instead of re-reading the cache files for
    each URL.

    Args:
        data: List of conference dicts; each has at least "year" and "start",
            and optionally "link", "cfp_link", "sponsor", "finaid" URL fields.

    Returns:
        The same list, with link fields replaced by validated (possibly
        web.archive.org) URLs.
    """
    # Load both caches once so every URL check in the loop reuses them.
    cache, cache_archived = get_cache()
    # Newest conferences first, so recent entries are validated earliest.
    for i, q in tqdm(enumerate(sorted(data, key=operator.itemgetter("year"), reverse=True)), total=len(data)):
        for key in ("link", "cfp_link", "sponsor", "finaid"):
            if key in q:
                new_link = check_link_availability(q[key], q["start"], cache=cache, cache_archived=cache_archived)
                if q[key] != new_link and "archive.org" in new_link:
                    # Rate-limit consecutive requests to the Wayback Machine.
                    time.sleep(0.5)

                q[key] = new_link
        data[i] = q
    return data
Expand Down
59 changes: 36 additions & 23 deletions utils/tidy_conf/links.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,28 @@
from tqdm import tqdm


def check_link_availability(url, start):
def get_cache_location():
    """Return the paths of the two link-cache files.

    Returns:
        tuple[Path, Path]: ``(no_archive cache file, archived-links cache file)``,
        both inside ``utils/tidy_conf/data/.tmp``.
    """
    cache_dir = Path("utils", "tidy_conf", "data", ".tmp")
    cache_file = cache_dir / "no_archive.txt"
    cache_file_archived = cache_dir / "archived_links.txt"
    return cache_file, cache_file_archived


def get_cache():
    """Load the link caches from disk, creating them if missing.

    Returns:
        tuple[set, set]: ``(cache, cache_archived)`` — URLs known to have no
        archived version, and URLs that have already been archived.
    """
    cache_file, cache_file_archived = get_cache_location()

    # Ensure the cache directory exists before touching the files;
    # touch() alone raises FileNotFoundError when .tmp is missing.
    cache_file.parent.mkdir(parents=True, exist_ok=True)
    cache_file.touch()
    cache_file_archived.touch()

    # URLs are stored one per line, each terminated by "\n", so split("\n")
    # yields a trailing empty element that [:-1] drops.
    cache = set(cache_file.read_text(encoding="utf-8").split("\n")[:-1])
    cache_archived = set(cache_file_archived.read_text(encoding="utf-8").split("\n")[:-1])

    return cache, cache_archived


def check_link_availability(url, start, cache=None, cache_archived=None):
"""Checks if a URL is available.
If not, tries to retrieve an archived version from the Wayback Machine.
Expand All @@ -24,19 +45,11 @@ def check_link_availability(url, start):
if url.startswith(("https://web.archive.org", "http://web.archive.org")):
return url

# Check if the URL is cached
cache_file = Path("utils", "tidy_conf", "data", ".tmp", "no_archive.txt")
cache_file_archived = Path("utils", "tidy_conf", "data", ".tmp", "archived_links.txt")

# Create the cache file if it doesn't exist
cache_file.touch()
cache_file_archived.touch()
# Get the cache
if cache is None or cache_archived is None:
cache, cache_archived = get_cache()

# Read the cache file
with cache_file.open(encoding="utf-8") as f:
cache = f.read().split("\n")[:-1]
with cache_file_archived.open(encoding="utf-8") as f:
cache_archived = f.read().split("\n")[:-1]
cache_file, _ = get_cache_location()

# Check if the URL is in the cache
if url in cache and url not in cache_archived:
Expand Down Expand Up @@ -65,7 +78,7 @@ def check_link_availability(url, start):
)
else:
if start > datetime.now(tz=timezone.utc).date():
attempt_archive_url(url, cache_file_archived)
attempt_archive_url(url, cache_archived)
return url
except requests.RequestException as e:
tqdm.write(f"An error occurred: {e}. Trying to find an archived version...")
Expand All @@ -89,7 +102,7 @@ def check_link_availability(url, start):
tqdm.write(f"Found archived version: {archived_url}")
return archived_url
tqdm.write("No archived version found.")
attempt_archive_url(url, cache_file_archived)
attempt_archive_url(url, cache_archived)
with cache_file.open("a") as f:
f.write(url + "\n")
return url
Expand All @@ -101,25 +114,25 @@ def check_link_availability(url, start):
return url


def attempt_archive_url(url, cache=None):
    """Attempt to archive a URL using the Wayback Machine.

    Args:
        url: The URL to submit to https://web.archive.org/save/.
        cache: Optional set of already-archived URLs; when None, the
            archived-links cache is loaded from disk.
    """
    # Fall back to the on-disk cache when the caller did not pass one.
    if cache is None:
        _, cache = get_cache()

    # Check if the URL is in the cache
    if url in cache:
        tqdm.write(f"URL {url} was already archived.")
        return

    try:
        tqdm.write(f"Attempting archive of {url}.")
        headers = {"User-Agent": "Pythondeadlin.es Archival Attempt/0.1 (https://pythondeadlin.es)"}
        archive_response = requests.get("https://web.archive.org/save/" + url, timeout=30, headers=headers)
        if archive_response.status_code == 200:
            # Record the success both in memory and on disk so repeated
            # calls within the same run do not re-archive this URL.
            cache.add(url)
            _, cache_file = get_cache_location()
            with cache_file.open("a") as f:
                f.write(url + "\n")
            tqdm.write(f"Successfully archived {url}.")
    except requests.RequestException as e:
        tqdm.write(f"An error occurred while attempting to archive: {e}")

0 comments on commit ee97744

Please sign in to comment.