pass webarchives options to crawler and prefix crawled urls with archive url (#372)
boogheta committed May 20, 2021
1 parent 71c7a8e commit 21e86ee
Showing 3 changed files with 31 additions and 4 deletions.
2 changes: 2 additions & 0 deletions hyphe_backend/crawler/deploy.py
@@ -80,6 +80,8 @@
config["mongo-scrapy"]["host"] = os.environ.get('HYPHE_MONGODB_HOST', config["mongo-scrapy"]["host"])
for _to in ["", "idle_", "ajax_"]:
config['mongo-scrapy']['phantom_%stimeout' % _to] = config['phantom']['%stimeout' % _to]
for opt in ["enabled", "url_prefix", "date"]:
config['mongo-scrapy']['webarchives_%s' % opt] = config['webarchives'][opt]
with nested(open("hcicrawler/settings-template.py", "r"), open("hcicrawler/settings.py", "w")) as (template, generated):
generated.write(pystache.render(template.read(), config['mongo-scrapy']))
except IOError as e:
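As a note on how these options reach the crawler: deploy.py flattens the webarchives section of the configuration into webarchives_* keys and renders them into hcicrawler/settings.py through pystache. Below is a minimal sketch of that flow, using hypothetical configuration values (the real ones come from Hyphe's config file):

import pystache  # same templating library deploy.py uses

# Hypothetical excerpt of the webarchives section of the config
config = {
    "webarchives": {
        "enabled": True,
        "url_prefix": "https://web.archive.org/web",
        "date": "2021-05-20",
    }
}

# deploy.py copies each option under a flattened webarchives_* key ...
mongo_scrapy = {}
for opt in ["enabled", "url_prefix", "date"]:
    mongo_scrapy["webarchives_%s" % opt] = config["webarchives"][opt]

# ... which pystache then substitutes into the settings-template.py placeholders
template = """ARCHIVES = {
    "ENABLED": {{webarchives_enabled}},
    "URL_PREFIX": '{{webarchives_url_prefix}}',
    "DATE": '{{webarchives_date}}'
}"""
print(pystache.render(template, mongo_scrapy))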
6 changes: 6 additions & 0 deletions hyphe_backend/crawler/hcicrawler/settings-template.py
@@ -48,6 +48,12 @@
"AJAX_TIMEOUT": {{phantom_ajax_timeout}}
}

ARCHIVES = {
"ENABLED": {{webarchives_enabled}},
"URL_PREFIX": '{{webarchives_url_prefix}}',
"DATE": '{{webarchives_date}}'
}

STORE_HTML = {{store_crawled_html_content}}

if 'SCRAPY_JOB' in os.environ:
27 changes: 23 additions & 4 deletions hyphe_backend/crawler/hcicrawler/spiders/pages.py
@@ -27,7 +27,7 @@
from hcicrawler.urllru import url_to_lru_clean, lru_get_host_url, lru_get_path_url, has_prefix, lru_to_url
from hcicrawler.tlds_tree import TLDS_TREE
from hcicrawler.items import Page
from hcicrawler.settings import PROXY, HYPHE_PROJECT, PHANTOM, STORE_HTML, MONGO_HOST, MONGO_PORT, MONGO_DB, MONGO_JOBS_COL
from hcicrawler.settings import PROXY, HYPHE_PROJECT, PHANTOM, STORE_HTML, MONGO_HOST, MONGO_PORT, MONGO_DB, MONGO_JOBS_COL, ARCHIVES
from hcicrawler.errors import error_name

def timeout_alarm(*args):
@@ -44,28 +44,41 @@ def __init__(self, **kwargs):
job = mongo.find_one({"_id": kwargs["job_id"]})
args = job["crawl_arguments"]
self.args = args

self.start_urls = to_list(args['start_urls'])

self.maxdepth = int(args['max_depth'])

self.follow_prefixes = to_list(args['follow_prefixes'])
self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
self.prefixes_trie = LRUTrie()
for p in self.follow_prefixes:
self.prefixes_trie.set_lru(p, True)
for p in self.nofollow_prefixes:
self.prefixes_trie.set_lru(p, False)

self.discover_prefixes = [url_to_lru_clean("http%s://%s" % (https, u.replace('http://', '').replace('https://', '')), TLDS_TREE) for u in to_list(args['discover_prefixes']) for https in ['', 's']]

# Init this dictionary to be filled by resolver from within pipelines.py
self.resolved_links = {}

self.user_agent = args['user_agent']

self.phantom = 'phantom' in args and args['phantom'] and args['phantom'].lower() != "false"
self.cookies = None
if 'cookies' in args and args["cookies"]:
self.cookies = dict(cookie.split('=', 1) for cookie in re.split(r'\s*;\s*', args['cookies']) if '=' in cookie)
if self.phantom:
self.ph_timeout = int(args.get('phantom_timeout', PHANTOM['TIMEOUT']))
self.ph_idle_timeout = int(args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
self.ph_ajax_timeout = int(args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
self.errors = 0

if ARCHIVES["ENABLED"]:
self.archivedate = re.sub(r"\D", "", str(ARCHIVES["DATE"])) + "120000"
self.archiveprefix = "%s/%s/" % (ARCHIVES["URL_PREFIX"].rstrip('/'), self.archivedate)

self.cookies = None
if 'cookies' in args and args["cookies"]:
self.cookies = dict(cookie.split('=', 1) for cookie in re.split(r'\s*;\s*', args['cookies']) if '=' in cookie)
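The two archive attributes set above deserve a quick illustration: the configured date is reduced to its digits and pinned to noon, then appended to the archive URL prefix, Wayback Machine style. A small sketch with hypothetical values:

import re

# Hypothetical settings as rendered into hcicrawler/settings.py
ARCHIVES = {
    "ENABLED": True,
    "URL_PREFIX": "https://web.archive.org/web",
    "DATE": "2021-05-20",
}

# Keep only the digits of the date and fix the time of day to 12:00:00
archivedate = re.sub(r"\D", "", str(ARCHIVES["DATE"])) + "120000"
archiveprefix = "%s/%s/" % (ARCHIVES["URL_PREFIX"].rstrip('/'), archivedate)

print(archivedate)    # 20210520120000
print(archiveprefix)  # https://web.archive.org/web/20210520120000/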

@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super(PagesCrawler, cls).from_crawler(crawler, *args, **kwargs)
@@ -76,8 +89,12 @@ def from_crawler(cls, crawler, *args, **kwargs):
def start_requests(self):
self.log("Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'], logging.INFO)
self.log("ARGUMENTS : "+str(self.args), logging.INFO)
if ARCHIVES["ENABLED"]:
self.log("Crawling on Web Archive using for prefix %s" % self.archiveprefix)

if self.phantom:
self.init_phantom()

for url in self.start_urls:
yield self._request(url)

@@ -281,6 +298,8 @@ def _request(self, url, noproxy=False, **kw):
kw['cookies'] = self.cookies
if self.phantom:
kw['method'] = 'HEAD'
if ARCHIVES["ENABLED"]:
return Request(self.archiveprefix + url, **kw)
return Request(url, **kw)
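With archives enabled, every outgoing request is thus built against the archived snapshot rather than the live page: the archive prefix computed in __init__ is simply prepended to the target URL. A minimal sketch, assuming the hypothetical Wayback-style prefix from above:

from scrapy import Request

archiveprefix = "https://web.archive.org/web/20210520120000/"  # hypothetical

def build_request(url, archives_enabled=True, **kw):
    # Mirror of the branch above: prefix the URL when crawling through an archive
    if archives_enabled:
        return Request(archiveprefix + url, **kw)
    return Request(url, **kw)

print(build_request("http://example.com/page").url)
# https://web.archive.org/web/20210520120000/http://example.com/page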

