Skip to content

Commit

Permalink
handle relative redirections from archive (#372)
Browse files (browse the repository at this point in the history)
  • Loading branch information
boogheta committed Jun 1, 2021
1 parent 70e816d commit bcb784f
Showing 1 changed file with 8 additions and 3 deletions.
11 changes: 8 additions & 3 deletions hyphe_backend/crawler/hcicrawler/spiders/pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
# Callback that unconditionally raises SeleniumTimeout.
# Any positional arguments are accepted and ignored (presumably the signal
# number and stack frame supplied when used as a signal handler -- confirm
# against the caller, which is not visible in this excerpt).
def timeout_alarm(*args):
raise SeleniumTimeout

# Regex template for detecting an archive "redirect" page. The single %s
# placeholder is substituted by the caller before compiling (see the
# re.compile(RE_ARCHIVE_REDIRECT % ...) call in the spider's __init__).
#   group 1: the URL assigned to document.location.href, starting with the
#            substituted prefix
#   group 2: the digits of the HTTP status reported in the page body
# In this diff view the first assignment is the removed line and the second
# is its replacement; the replacement additionally requires a "/" right after
# the substituted prefix.
# NOTE(review): the "." characters in "document.location.href" and
# "Redirecting to..." are unescaped and therefore match any character, and
# the %s value is interpolated without re.escape -- confirm this is intended.
RE_ARCHIVE_REDIRECT = r'function go\(\) \{.*document.location.href = "(%s[^"]*)".*<p class="code shift red">Got an HTTP (\d+) response at crawl time</p>.*<p class="code">Redirecting to...</p>'
RE_ARCHIVE_REDIRECT = r'function go\(\) \{.*document.location.href = "(%s/[^"]*)".*<p class="code shift red">Got an HTTP (\d+) response at crawl time</p>.*<p class="code">Redirecting to...</p>'

def normalize(url):
return normalize_url(
Expand Down Expand Up @@ -87,7 +87,9 @@ def __init__(self, **kwargs):
archiveprefix = ARCHIVES["URL_PREFIX"].rstrip('/')
self.archiveprefix = "%s/%s/" % (archiveprefix, self.archivedate)
self.archiveregexp = re.compile(r"^%s/(\d{14})/" % archiveprefix, re.I)
self.archiveredirect = re.compile(RE_ARCHIVE_REDIRECT % ARCHIVES["URL_PREFIX"], re.I|re.S)
self.archivedomain = "/".join(archiveprefix.split('/')[:3])
archivedomain_regexp = "(?:%s|%s)" % (archiveprefix, archiveprefix.replace(self.archivedomain, ""))
self.archiveredirect = re.compile(RE_ARCHIVE_REDIRECT % archivedomain_regexp, re.I|re.S)


self.cookies = None
Expand Down Expand Up @@ -254,8 +256,11 @@ def parse_html(self, response):
redir_url = self.archiveredirect.search(response.body)
if redir_url:
# TODO: check date obtained fits into a user defined timerange and return 404 otherwise
response.headers['Location'] = redir_url.group(1)
response.status = int(redir_url.group(2))
redir_location = redir_url.group(1)
if redir_location.startswith("/"):
redir_location = "%s%s" % (self.archivedomain, redir_location)
response.headers['Location'] = redir_location

if 300 <= response.status < 400:
redir_url = response.headers['Location']
Expand Down

0 comments on commit bcb784f

Please sign in to comment.