
Commit 88dff9e
adjust crawl rate and concurrency since all queries are calling the same domain (WIP #372)
boogheta committed May 26, 2021
1 parent 0c0a538 commit 88dff9e
Showing 2 changed files with 10 additions and 8 deletions.
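When a corpus is crawled through web archives, every request is rewritten to go through the archive's URL prefix, so the entire crawl targets a single domain and the corpus-wide concurrency settings would hammer one host. This commit therefore caps concurrency for archive crawls at 3 requests (1 per domain) and adds a 2-second download delay. Below is a minimal sketch of the resulting throttling math, using Scrapy's actual setting semantics; the helper function itself is illustrative, not part of Hyphe:

# Minimal sketch of the throttling behind this commit (illustrative, not Hyphe code).
# Scrapy enforces DOWNLOAD_DELAY between requests to the same domain, per
# concurrency slot, so the single-domain request rate is roughly bounded by
# CONCURRENT_REQUESTS_PER_DOMAIN / DOWNLOAD_DELAY.

def max_requests_per_minute(concurrent_per_domain, download_delay):
    """Rough upper bound on the request rate against one domain."""
    if download_delay <= 0:
        return float("inf")  # no delay: limited only by concurrency and latency
    return concurrent_per_domain * 60.0 / download_delay

# With the archive values introduced here (1 slot, 2 s delay): ~30 requests/min.
print(max_requests_per_minute(1, 2))  # 30.0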
2 changes: 2 additions & 0 deletions hyphe_backend/core.tac
@@ -1183,6 +1183,8 @@ class Crawler(customJSONRPC):
         if not starts:
             returnD(format_error('No startpage defined for crawling WebEntity %s.' % webentity_id))
         # preparation of the request to scrapyd
+        if self.corpora[corpus]["options"]["webarchives"]["enabled"]:
+            download_delay = 2
         args = {
           'project': corpus_project(corpus),
           'spider': 'pages',
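The download_delay set above presumably ends up in the args dict handed to scrapyd (the rest of that dict is truncated in this diff). As an illustration only, assuming the standard scrapyd API: its schedule.json endpoint accepts per-job Scrapy settings through repeated "setting" form parameters, which is the usual way to forward such a delay.

# Hypothetical sketch (not the truncated Hyphe code): forwarding a per-job
# DOWNLOAD_DELAY to scrapyd's schedule.json endpoint, which accepts Scrapy
# settings as "setting" form parameters.
import requests

def schedule_crawl(scrapyd_url, project, spider, download_delay=None):
    data = [("project", project), ("spider", spider)]
    if download_delay is not None:
        # Per-job override of the value rendered into settings-template.py
        data.append(("setting", "DOWNLOAD_DELAY=%s" % download_delay))
    return requests.post(scrapyd_url + "/schedule.json", data=data).json()

# e.g. schedule_crawl("http://localhost:6800", "my_corpus", "pages", download_delay=2)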
16 changes: 8 additions & 8 deletions hyphe_backend/crawler/hcicrawler/settings-template.py
@@ -14,8 +14,14 @@
     'hcicrawler.pipelines.OutputQueue': 500,
 }
 
-CONCURRENT_REQUESTS = {{max_simul_requests}}
-CONCURRENT_REQUESTS_PER_DOMAIN = {{max_simul_requests_per_host}}
+ARCHIVES = {
+    "ENABLED": {{webarchives_enabled}},
+    "URL_PREFIX": '{{webarchives_url_prefix}}',
+    "DATE": '{{webarchives_date}}'
+}
+
+CONCURRENT_REQUESTS = {{max_simul_requests}} if not ARCHIVES["ENABLED"] else 3
+CONCURRENT_REQUESTS_PER_DOMAIN = {{max_simul_requests_per_host}} if not ARCHIVES["ENABLED"] else 1
 
 DOWNLOADER_HTTPCLIENTFACTORY = 'hcicrawler.webclient.LimitSizeHTTPClientFactory'
 
@@ -48,12 +54,6 @@
"AJAX_TIMEOUT": {{phantom_ajax_timeout}}
}

ARCHIVES = {
"ENABLED": {{webarchives_enabled}},
"URL_PREFIX": '{{webarchives_url_prefix}}',
"DATE": '{{webarchives_date}}'
}

STORE_HTML = {{store_crawled_html_content}}

if 'SCRAPY_JOB' in os.environ:
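Note that the ARCHIVES block was moved above the concurrency settings because the new conditional expressions reference ARCHIVES["ENABLED"], which must be defined first. For reference, here is a sketch of what the template renders to once the {{...}} placeholders are substituted, using illustrative values (max_simul_requests = 12, max_simul_requests_per_host = 2, and a made-up archive prefix):

# Illustrative rendering of the template above, with assumed placeholder values.
ARCHIVES = {
    "ENABLED": True,
    "URL_PREFIX": 'https://archive.example.org/web',  # made-up prefix
    "DATE": '20210526'
}

CONCURRENT_REQUESTS = 12 if not ARCHIVES["ENABLED"] else 3             # -> 3
CONCURRENT_REQUESTS_PER_DOMAIN = 2 if not ARCHIVES["ENABLED"] else 1   # -> 1

# With "ENABLED": False, the corpus-wide values (12 and 2) would be kept.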
