
Commit

apply same filters to urls from linkextractor after rewriting archives urls (#372)
boogheta committed May 26, 2021
1 parent 88dff9e commit 27d48a3
Showing 1 changed file with 16 additions and 4 deletions.
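In short: when crawling through a web archive, extracted links are stripped of the archive's snapshot prefix before being queued, which can expose URLs whose scheme (mailto:, javascript:, ...) the link extractor would otherwise have filtered out before the rewrite. The link loop in parse_html now re-applies those scheme filters after stripping. A minimal sketch of the resulting filter, assuming a hypothetical archive prefix and assuming SCHEME_FILTERS is a set of lowercase scheme names exported by hcicrawler.linkextractor:

import re

# Hypothetical values, for illustration only
ARCHIVE_PREFIX = "https://archive.example.org/wayback"
ARCHIVE_RE = re.compile(r"^%s/(\d{14})/" % ARCHIVE_PREFIX, re.I)
SCHEME_FILTERS = {"javascript", "mailto", "tel", "ftp"}  # assumed contents

def keep_link(url):
    """Strip the archive prefix, then drop archive-internal links and
    links whose scheme would have been filtered by the link extractor."""
    url = ARCHIVE_RE.sub("", url)
    if url.startswith(ARCHIVE_PREFIX):
        return None
    if url.split(":")[0].lower() in SCHEME_FILTERS:
        return None
    return url

# A rewritten mailto: link is now dropped instead of being queued for crawling
assert keep_link("https://archive.example.org/wayback/20210526120000/mailto:contact@site.example") is None
assert keep_link("https://archive.example.org/wayback/20210526120000/https://site.example/page") == "https://site.example/page"
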
20 changes: 16 additions & 4 deletions hyphe_backend/crawler/hcicrawler/spiders/pages.py
@@ -24,7 +24,7 @@
 from ural import normalize_url
 from ural.lru import LRUTrie

-from hcicrawler.linkextractor import RegexpLinkExtractor
+from hcicrawler.linkextractor import RegexpLinkExtractor, SCHEME_FILTERS
 from hcicrawler.urllru import url_to_lru_clean, lru_get_host_url, lru_get_path_url, has_prefix, lru_to_url
 from hcicrawler.tlds_tree import TLDS_TREE
 from hcicrawler.items import Page
@@ -34,7 +34,7 @@
 def timeout_alarm(*args):
     raise SeleniumTimeout

-RE_ARCHIVE_REDIRECT = re.compile(r'function go\(\) \{.*document.location.href = "(%s[^"]*)".*<p class="code shift red">Got an HTTP (\d+) response at crawl time</p>.*<p class="code">Redirecting to...</p>' % ARCHIVES["URL_PREFIX"], re.I|re.S)
+RE_ARCHIVE_REDIRECT = r'function go\(\) \{.*document.location.href = "(%s[^"]*)".*<p class="code shift red">Got an HTTP (\d+) response at crawl time</p>.*<p class="code">Redirecting to...</p>'

 def normalize(url):
     return normalize_url(
@@ -81,11 +81,14 @@ def __init__(self, **kwargs):
         self.ph_ajax_timeout = int(args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
         self.errors = 0

+        # TODO: handle bypassing ARCHIVES default config from job's arguments
         if ARCHIVES["ENABLED"]:
             self.archivedate = re.sub(r"\D", "", str(ARCHIVES["DATE"])) + "120000"
             archiveprefix = ARCHIVES["URL_PREFIX"].rstrip('/')
             self.archiveprefix = "%s/%s/" % (archiveprefix, self.archivedate)
             self.archiveregexp = re.compile(r"^%s/(\d{14})/" % archiveprefix, re.I)
+            self.archiveredirect = re.compile(RE_ARCHIVE_REDIRECT % ARCHIVES["URL_PREFIX"], re.I|re.S)
+

         self.cookies = None
         if 'cookies' in args and args["cookies"]:
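For context, a sketch of what these settings resolve to, using hypothetical config values (the real URL_PREFIX and DATE come from the ARCHIVES configuration): the date is normalized to a 14-digit Wayback-style timestamp, archiveregexp strips any snapshot prefix from rewritten URLs, and the module-level RE_ARCHIVE_REDIRECT template is now compiled here, once the prefix is known, instead of at import time:

import re

# Hypothetical config, for illustration only
ARCHIVES = {"ENABLED": True, "URL_PREFIX": "https://archive.example.org/wayback", "DATE": "2021-05-26"}

archivedate = re.sub(r"\D", "", str(ARCHIVES["DATE"])) + "120000"   # -> "20210526120000"
archiveprefix = ARCHIVES["URL_PREFIX"].rstrip('/')
archiveregexp = re.compile(r"^%s/(\d{14})/" % archiveprefix, re.I)

snapshot = "https://archive.example.org/wayback/20210526120000/https://site.example/page"
print(archiveregexp.sub("", snapshot))   # -> "https://site.example/page"

# The redirect-detection template defined at the top of the file is compiled the same way:
# archiveredirect = re.compile(RE_ARCHIVE_REDIRECT % ARCHIVES["URL_PREFIX"], re.I | re.S)
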
@@ -209,6 +212,7 @@ def handle_response(self, response):
             redir_url = response.headers['Location']
             real_url = self.archiveregexp.sub("", redir_url)
             orig_url = self.archiveregexp.sub("", response.url)
+            # TODO: check date obtained fits into a user defined timerange and return 404 otherwise
             if self.archiveregexp.match(redir_url) and normalize(real_url) == normalize(orig_url):
                 if "depth" in response.meta:
                     response.meta['depth'] -= 1
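The surrounding logic, unchanged here apart from the new TODO, treats an archive redirect as transparent only when it merely points to another snapshot of the same page: both the Location target and the original URL are stripped of their snapshot prefixes and compared after normalization, and in that case the extra hop does not consume crawl depth. A small sketch with the same hypothetical prefix as above:

import re
from ural import normalize_url

ARCHIVE_RE = re.compile(r"^https://archive\.example\.org/wayback/(\d{14})/", re.I)  # hypothetical

redir_url = "https://archive.example.org/wayback/20200101000000/http://site.example/page"   # Location header
orig_url = "https://archive.example.org/wayback/20210526120000/http://site.example/page"    # response.url

real = ARCHIVE_RE.sub("", redir_url)
orig = ARCHIVE_RE.sub("", orig_url)
# Same page, different snapshot date -> follow without counting an extra depth level
print(ARCHIVE_RE.match(redir_url) is not None and normalize_url(real) == normalize_url(orig))  # True
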
@@ -247,8 +251,9 @@ def parse_html(self, response):
         # handle redirects
         realdepth = response.meta['depth']
         if ARCHIVES["ENABLED"]:
-            redir_url = RE_ARCHIVE_REDIRECT.search(response.body)
+            redir_url = self.archiveredirect.search(response.body)
             if redir_url:
+                # TODO: check date obtained fits into a user defined timerange and return 404 otherwise
                 response.headers['Location'] = redir_url.group(1)
                 response.status = int(redir_url.group(2))
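When the archive serves an HTML interstitial with a JavaScript redirect instead of an HTTP Location header, the newly compiled self.archiveredirect pattern recovers both the target snapshot URL (group 1) and the original HTTP status embedded in the page (group 2), which are then written back onto the response. A rough sketch of a matching body; the interstitial markup and the prefix are assumptions built to fit the pattern:

import re

RE_ARCHIVE_REDIRECT = r'function go\(\) \{.*document.location.href = "(%s[^"]*)".*<p class="code shift red">Got an HTTP (\d+) response at crawl time</p>.*<p class="code">Redirecting to...</p>'
archiveredirect = re.compile(RE_ARCHIVE_REDIRECT % "https://archive.example.org/wayback", re.I | re.S)  # hypothetical prefix

body = """<script>function go() {
  document.location.href = "https://archive.example.org/wayback/20210526120000/https://site.example/new";
}</script>
<p class="code shift red">Got an HTTP 301 response at crawl time</p>
<p class="code">Redirecting to...</p>"""

m = archiveredirect.search(body)
print(m.group(1))       # the archived target URL
print(int(m.group(2)))  # 301
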

@@ -279,24 +284,31 @@ def parse_html(self, response):
             self.log("ERROR: links extractor crashed on %s: %s %s" % (response, type(e), e), logging.ERROR)
             links = []
             self.errors += 1
+
         for link in links:
             try:
                 url = link.url
             except AttributeError:
                 url = link['url']
+
             if ARCHIVES["ENABLED"]:
                 url = self.archiveregexp.sub("", url)
-                if url.startswith(ARCHIVES["URL_PREFIX"]):
+                if url.startswith(ARCHIVES["URL_PREFIX"]) or \
+                  url.split(":")[0].lower() in SCHEME_FILTERS:
                     continue
+
             try:
                 lrulink = url_to_lru_clean(url, TLDS_TREE)
             except (ValueError, IndexError) as e:
                 self.log("Error converting URL %s to LRU: %s" % (url, e), logging.ERROR)
                 continue
+
             lrulinks.append((url, lrulink))
+
             if self._should_follow(response.meta['depth'], lrulink) and \
               not url_has_any_extension(url, self.ignored_exts):
                 yield self._request(url)
+
         response.meta['depth'] = realdepth
         yield self._make_html_page(response, lrulinks)

