
Commit 88dff9e
adjust crawl rate and concurrency since all queries are calling the same domain (WIP #372)
boogheta committed May 26, 2021
1 parent 0c0a538 commit 88dff9e
Showing 2 changed files with 10 additions and 8 deletions.
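When a corpus is crawled through web archives, every request is rewritten to go through the archive's URL prefix, so the entire crawl targets a single domain and the corpus-wide concurrency settings would hammer one host. This commit therefore caps concurrency for archive crawls at 3 requests (1 per domain) and adds a 2-second download delay. Below is a minimal sketch of the resulting throttling math, using Scrapy's actual setting semantics; the helper function itself is illustrative, not part of Hyphe:

# Minimal sketch of the throttling behind this commit (illustrative, not Hyphe code).
# Scrapy enforces DOWNLOAD_DELAY between requests to the same domain, per
# concurrency slot, so the single-domain request rate is roughly bounded by
# CONCURRENT_REQUESTS_PER_DOMAIN / DOWNLOAD_DELAY.

def max_requests_per_minute(concurrent_per_domain, download_delay):
    """Rough upper bound on the request rate against one domain."""
    if download_delay <= 0:
        return float("inf")  # no delay: limited only by concurrency and latency
    return concurrent_per_domain * 60.0 / download_delay

# With the archive values introduced here (1 slot, 2 s delay): ~30 requests/min.
print(max_requests_per_minute(1, 2))  # 30.0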
2 changes: 2 additions & 0 deletions hyphe_backend/core.tac
@@ -1183,6 +1183,8 @@ class Crawler(customJSONRPC):
         if not starts:
             returnD(format_error('No startpage defined for crawling WebEntity %s.' % webentity_id))
         # preparation of the request to scrapyd
+        if self.corpora[corpus]["options"]["webarchives"]["enabled"]:
+            download_delay = 2
         args = {
           'project': corpus_project(corpus),
           'spider': 'pages',
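The download_delay set above presumably ends up in the args dict handed to scrapyd (the rest of that dict is truncated in this diff). As an illustration only, assuming the standard scrapyd API: its schedule.json endpoint accepts per-job Scrapy settings through repeated "setting" form parameters, which is the usual way to forward such a delay.

# Hypothetical sketch (not the truncated Hyphe code): forwarding a per-job
# DOWNLOAD_DELAY to scrapyd's schedule.json endpoint, which accepts Scrapy
# settings as "setting" form parameters.
import requests

def schedule_crawl(scrapyd_url, project, spider, download_delay=None):
    data = [("project", project), ("spider", spider)]
    if download_delay is not None:
        # Per-job override of the value rendered into settings-template.py
        data.append(("setting", "DOWNLOAD_DELAY=%s" % download_delay))
    return requests.post(scrapyd_url + "/schedule.json", data=data).json()

# e.g. schedule_crawl("http://localhost:6800", "my_corpus", "pages", download_delay=2)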
16 changes: 8 additions & 8 deletions hyphe_backend/crawler/hcicrawler/settings-template.py
@@ -14,8 +14,14 @@
     'hcicrawler.pipelines.OutputQueue': 500,
 }
 
-CONCURRENT_REQUESTS = {{max_simul_requests}}
-CONCURRENT_REQUESTS_PER_DOMAIN = {{max_simul_requests_per_host}}
+ARCHIVES = {
+    "ENABLED": {{webarchives_enabled}},
+    "URL_PREFIX": '{{webarchives_url_prefix}}',
+    "DATE": '{{webarchives_date}}'
+}
+
+CONCURRENT_REQUESTS = {{max_simul_requests}} if not ARCHIVES["ENABLED"] else 3
+CONCURRENT_REQUESTS_PER_DOMAIN = {{max_simul_requests_per_host}} if not ARCHIVES["ENABLED"] else 1
 
 DOWNLOADER_HTTPCLIENTFACTORY = 'hcicrawler.webclient.LimitSizeHTTPClientFactory'
 
@@ -48,12 +54,6 @@
"AJAX_TIMEOUT": {{phantom_ajax_timeout}}
}

ARCHIVES = {
"ENABLED": {{webarchives_enabled}},
"URL_PREFIX": '{{webarchives_url_prefix}}',
"DATE": '{{webarchives_date}}'
}

STORE_HTML = {{store_crawled_html_content}}

if 'SCRAPY_JOB' in os.environ:
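Note that the ARCHIVES block was moved above the concurrency settings because the new conditional expressions reference ARCHIVES["ENABLED"], which must be defined first. For reference, here is a sketch of what the template renders to once the {{...}} placeholders are substituted, using illustrative values (max_simul_requests = 12, max_simul_requests_per_host = 2, and a made-up archive prefix):

# Illustrative rendering of the template above, with assumed placeholder values.
ARCHIVES = {
    "ENABLED": True,
    "URL_PREFIX": 'https://archive.example.org/web',  # made-up prefix
    "DATE": '20210526'
}

CONCURRENT_REQUESTS = 12 if not ARCHIVES["ENABLED"] else 3             # -> 3
CONCURRENT_REQUESTS_PER_DOMAIN = 2 if not ARCHIVES["ENABLED"] else 1   # -> 1

# With "ENABLED": False, the corpus-wide values (12 and 2) would be kept.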
