replicate frontend's cookies mechanic to user agent (closes #461)
boogheta committed Aug 21, 2023
1 parent ff9af58 commit 72f1b4d
Showing 8 changed files with 49 additions and 15 deletions.
10 changes: 7 additions & 3 deletions doc/api.md
@@ -282,6 +282,7 @@ The API will always answer as such:
+ _`status`_ (optional, default: `"IN"`)
+ _`proxy`_ (optional, default: `null`)
+ _`cookies_string`_ (optional, default: `null`)
+ + _`user_agent`_ (optional, default: `null`)
+ _`phantom_timeouts`_ (optional, default: `{}`)
+ _`webarchives`_ (optional, default: `{}`)
+ _`corpus`_ (optional, default: `"--hyphe--"`)
@@ -290,7 +291,7 @@ The API will always answer as such:
Optionally use PhantomJS by setting `phantom_crawl` to "true" and adjust specific `phantom_timeouts` as a json object with possible keys `timeout`/`ajax_timeout`/`idle_timeout`.
Sets simultaneously the WebEntity's status to "IN" or optionally to another valid `status` ("undecided"/"out"/"discovered").
Optionally add an HTTP `proxy` specified as "domain_or_IP:port".
- Also optionally add a known `cookies_string` with auth rights to a protected website.
+ Also optionally add a known `cookies_string` with auth rights to a protected website and/or a specific `user_agent`.
Optionally use some `webarchives` by defining a json object with keys `date`/`days_range`/`option`, the latter being one of ""/"web.archive.org"/"archivesinternet.bnf.fr".
Will use the WebEntity's startpages if it has any or use otherwise the `corpus`' "default" `startmode` heuristic as defined in `propose_webentity_startpages` (use `crawl_webentity_with_startmode` to apply a different heuristic, see details in `propose_webentity_startpages`).
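
As an illustration (not part of this commit), a call using the new option could look like this from a Python client — the endpoint URL, webentity id, cookie and User-Agent values are all assumptions, as is the plain `{"method": …, "params": […]}` POST convention:

```python
import requests

# Hypothetical endpoint of a local Hyphe instance; adjust to your deployment.
API_URL = "http://localhost/api/"

payload = {
    "method": "crawl_webentity",
    "params": [
        1,                                  # webentity_id
        1,                                  # depth
        False,                              # phantom_crawl
        "IN",                               # status
        None,                               # proxy
        "sessionid=abc123; csrftoken=xyz",  # cookies_string (example value)
        "Mozilla/5.0 MyLab-Crawler/1.0",    # user_agent (example value)
        {},                                 # phantom_timeouts
        {},                                 # webarchives
        "--hyphe--",                        # corpus
    ],
}

print(requests.post(API_URL, json=payload).json())
```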

@@ -303,6 +304,7 @@ The API will always answer as such:
+ _`startmode`_ (optional, default: `"default"`)
+ _`proxy`_ (optional, default: `null`)
+ _`cookies_string`_ (optional, default: `null`)
+ + _`user_agent`_ (optional, default: `null`)
+ _`phantom_timeouts`_ (optional, default: `{}`)
+ _`webarchives`_ (optional, default: `{}`)
+ _`save_startpages`_ (optional, default: `false`)
@@ -312,7 +314,7 @@ The API will always answer as such:
Optionally use PhantomJS by setting `phantom_crawl` to "true" and adjust specific `phantom_timeouts` as a json object with possible keys `timeout`/`ajax_timeout`/`idle_timeout`.
Sets simultaneously the WebEntity's status to "IN" or optionally to another valid `status` ("undecided"/"out"/"discovered").
Optionally add an HTTP `proxy` specified as "domain_or_IP:port".
- Also optionally add a known `cookies_string` with auth rights to a protected website.
+ Also optionally add a known `cookies_string` with auth rights to a protected website and/or a specific `user_agent`.
Optionally define the `startmode` strategy differently to the `corpus`' "default" one (see details in `propose_webentity_startpages`).
Optionally use some `webarchives` by defining a json object with keys `date`/`days_range`/`option`, the latter being one of ""/"web.archive.org"/"archivesinternet.bnf.fr".
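
The `startmode` variant takes the same options, with `startmode` inserted after `status` and `save_startpages` appended before `corpus`. A minimal sketch under the same assumptions as the previous example:

```python
import requests

# Same hypothetical endpoint as above.
print(requests.post("http://localhost/api/", json={
    "method": "crawl_webentity_with_startmode",
    "params": [
        1,            # webentity_id
        1,            # depth
        False,        # phantom_crawl
        "IN",         # status
        "default",    # startmode
        None,         # proxy
        None,         # cookies_string
        "Mozilla/5.0 MyLab-Crawler/1.0",  # user_agent (example value)
        {},           # phantom_timeouts
        {},           # webarchives
        True,         # save_startpages
        "--hyphe--",  # corpus
    ],
}).json())
```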

@@ -390,6 +392,7 @@ The API will always answer as such:
+ _`download_delay`_ (optional, default: `1`)
+ _`proxy`_ (optional, default: `null`)
+ _`cookies_string`_ (optional, default: `null`)
+ + _`user_agent`_ (optional, default: `null`)
+ _`webarchives`_ (optional, default: `{}`)
+ _`corpus`_ (optional, default: `"--hyphe--"`)

@@ -401,7 +404,8 @@ The API will always answer as such:
* `phantom_crawl` set to "true" to use PhantomJS for this crawl and optional `phantom_timeouts` as an object with keys among `timeout`/`ajax_timeout`/`idle_timeout`
* a `download_delay` corresponding to the time in seconds spent between two requests by the crawler.
* an HTTP `proxy` specified as "domain_or_IP:port"
- * a known `cookies_string` with auth rights to a protected website.
+ * a known `cookies_string` with auth rights to a protected website
+ * a specific `user_agent`.
Optionally use some `webarchives` by defining a json object with keys `date`/`days_range`/`option`, the latter being one of ""/"web.archive.org"/"archivesinternet.bnf.fr".


18 changes: 9 additions & 9 deletions hyphe_backend/core.tac
@@ -928,8 +928,8 @@ class Core(customJSONRPC):
returnD(handle_standard_results(startpages))

@inlineCallbacks
- def jsonrpc_crawl_webentity(self, webentity_id, depth=0, phantom_crawl=False, status="IN", proxy=None, cookies_string=None, phantom_timeouts={}, webarchives={}, corpus=DEFAULT_CORPUS):
- """Schedules a crawl for a `corpus` for an existing WebEntity defined by its `webentity_id` with a specific crawl `depth [int]`.\nOptionally use PhantomJS by setting `phantom_crawl` to "true" and adjust specific `phantom_timeouts` as a json object with possible keys `timeout`/`ajax_timeout`/`idle_timeout`.\nSets simultaneously the WebEntity's status to "IN" or optionally to another valid `status` ("undecided"/"out"/"discovered").\nOptionally add an HTTP `proxy` specified as "domain_or_IP:port".\nAlso optionally add a known `cookies_string` with auth rights to a protected website.\nOptionally use some `webarchives` by defining a json object with keys `date`/`days_range`/`option`\, the latter being one of ""/"web.archive.org"/"archivesinternet.bnf.fr".\nWill use the WebEntity's startpages if it has any or use otherwise the `corpus`' "default" `startmode` heuristic as defined in `propose_webentity_startpages` (use `crawl_webentity_with_startmode` to apply a different heuristic\, see details in `propose_webentity_startpages`)."""
+ def jsonrpc_crawl_webentity(self, webentity_id, depth=0, phantom_crawl=False, status="IN", proxy=None, cookies_string=None, user_agent=None, phantom_timeouts={}, webarchives={}, corpus=DEFAULT_CORPUS):
+ """Schedules a crawl for a `corpus` for an existing WebEntity defined by its `webentity_id` with a specific crawl `depth [int]`.\nOptionally use PhantomJS by setting `phantom_crawl` to "true" and adjust specific `phantom_timeouts` as a json object with possible keys `timeout`/`ajax_timeout`/`idle_timeout`.\nSets simultaneously the WebEntity's status to "IN" or optionally to another valid `status` ("undecided"/"out"/"discovered").\nOptionally add an HTTP `proxy` specified as "domain_or_IP:port".\nAlso optionally add a known `cookies_string` with auth rights to a protected website and/or a specific `user_agent`.\nOptionally use some `webarchives` by defining a json object with keys `date`/`days_range`/`option`\, the latter being one of ""/"web.archive.org"/"archivesinternet.bnf.fr".\nWill use the WebEntity's startpages if it has any or use otherwise the `corpus`' "default" `startmode` heuristic as defined in `propose_webentity_startpages` (use `crawl_webentity_with_startmode` to apply a different heuristic\, see details in `propose_webentity_startpages`)."""
if not self.corpus_ready(corpus):
returnD(self.corpus_error(corpus))
try:
@@ -944,12 +944,12 @@ class Core(customJSONRPC):
if not WE["startpages"]:
startmode = "default"
save_startpages = True
- res = yield self.jsonrpc_crawl_webentity_with_startmode(WE, depth=depth, phantom_crawl=phantom_crawl, status=status, startmode=startmode, proxy=proxy, cookies_string=cookies_string, phantom_timeouts=phantom_timeouts, webarchives=webarchives, save_startpages=save_startpages, corpus=corpus)
+ res = yield self.jsonrpc_crawl_webentity_with_startmode(WE, depth=depth, phantom_crawl=phantom_crawl, status=status, startmode=startmode, proxy=proxy, cookies_string=cookies_string, user_agent=user_agent, phantom_timeouts=phantom_timeouts, webarchives=webarchives, save_startpages=save_startpages, corpus=corpus)
returnD(res)

@inlineCallbacks
- def jsonrpc_crawl_webentity_with_startmode(self, webentity_id, depth=0, phantom_crawl=False, status="IN", startmode="default", proxy=None, cookies_string=None, phantom_timeouts={}, webarchives={}, save_startpages=False, corpus=DEFAULT_CORPUS):
- """Schedules a crawl for a `corpus` for an existing WebEntity defined by its `webentity_id` with a specific crawl `depth [int]`.\nOptionally use PhantomJS by setting `phantom_crawl` to "true" and adjust specific `phantom_timeouts` as a json object with possible keys `timeout`/`ajax_timeout`/`idle_timeout`.\nSets simultaneously the WebEntity's status to "IN" or optionally to another valid `status` ("undecided"/"out"/"discovered").\nOptionally add an HTTP `proxy` specified as "domain_or_IP:port".\nAlso optionally add a known `cookies_string` with auth rights to a protected website.\nOptionally define the `startmode` strategy differently to the `corpus`' "default" one (see details in `propose_webentity_startpages`).\nOptionally use some `webarchives` by defining a json object with keys `date`/`days_range`/`option`\, the latter being one of ""/"web.archive.org"/"archivesinternet.bnf.fr"."""
+ def jsonrpc_crawl_webentity_with_startmode(self, webentity_id, depth=0, phantom_crawl=False, status="IN", startmode="default", proxy=None, cookies_string=None, user_agent=None, phantom_timeouts={}, webarchives={}, save_startpages=False, corpus=DEFAULT_CORPUS):
+ """Schedules a crawl for a `corpus` for an existing WebEntity defined by its `webentity_id` with a specific crawl `depth [int]`.\nOptionally use PhantomJS by setting `phantom_crawl` to "true" and adjust specific `phantom_timeouts` as a json object with possible keys `timeout`/`ajax_timeout`/`idle_timeout`.\nSets simultaneously the WebEntity's status to "IN" or optionally to another valid `status` ("undecided"/"out"/"discovered").\nOptionally add an HTTP `proxy` specified as "domain_or_IP:port".\nAlso optionally add a known `cookies_string` with auth rights to a protected website and/or a specific `user_agent`.\nOptionally define the `startmode` strategy differently to the `corpus`' "default" one (see details in `propose_webentity_startpages`).\nOptionally use some `webarchives` by defining a json object with keys `date`/`days_range`/`option`\, the latter being one of ""/"web.archive.org"/"archivesinternet.bnf.fr"."""
if not self.corpus_ready(corpus):
returnD(self.corpus_error(corpus))

@@ -1015,7 +1015,7 @@ class Core(customJSONRPC):
elif webarchives.get("option"):
yield self.store.jsonrpc_add_webentity_tag_value(webentity_id, "USER", "Crawl Source", webarchives["option"], corpus=corpus, _automatic=True)

- res = yield self.crawler.jsonrpc_start(webentity_id, starts, WE["prefixes"], nofollow, self.corpora[corpus]["options"]["follow_redirects"], depth, phantom_crawl, phantom_timeouts, proxy=proxy, cookies_string=cookies_string, webarchives=webarchives, corpus=corpus, _autostarts=autostarts)
+ res = yield self.crawler.jsonrpc_start(webentity_id, starts, WE["prefixes"], nofollow, self.corpora[corpus]["options"]["follow_redirects"], depth, phantom_crawl, phantom_timeouts, proxy=proxy, cookies_string=cookies_string, user_agent=user_agent, webarchives=webarchives, corpus=corpus, _autostarts=autostarts)
returnD(res)

@inlineCallbacks
@@ -1218,8 +1218,8 @@ class Crawler(customJSONRPC):
returnD(format_result('Crawling database reset.'))

@inlineCallbacks
- def jsonrpc_start(self, webentity_id, starts, follow_prefixes, nofollow_prefixes, follow_redirects=None, depth=0, phantom_crawl=False, phantom_timeouts={}, download_delay=config['mongo-scrapy']['download_delay'], proxy=None, cookies_string=None, webarchives={}, corpus=DEFAULT_CORPUS, _autostarts=[]):
- """Starts a crawl for a `corpus` defining finely the crawl options (mainly for debug purposes):\n- a `webentity_id` associated with the crawl\n- a list of `starts` urls to start from\n- a list of `follow_prefixes` to know which links to follow\n- a list of `nofollow_prefixes` to know which links to avoid\n- a `depth` corresponding to the maximum number of clicks done from the start pages\n- `phantom_crawl` set to "true" to use PhantomJS for this crawl and optional `phantom_timeouts` as an object with keys among `timeout`/`ajax_timeout`/`idle_timeout`\n- a `download_delay` corresponding to the time in seconds spent between two requests by the crawler.\n- an HTTP `proxy` specified as "domain_or_IP:port"\n- a known `cookies_string` with auth rights to a protected website.\nOptionally use some `webarchives` by defining a json object with keys `date`/`days_range`/`option`\, the latter being one of ""/"web.archive.org"/"archivesinternet.bnf.fr"."""
+ def jsonrpc_start(self, webentity_id, starts, follow_prefixes, nofollow_prefixes, follow_redirects=None, depth=0, phantom_crawl=False, phantom_timeouts={}, download_delay=config['mongo-scrapy']['download_delay'], proxy=None, cookies_string=None, user_agent=None, webarchives={}, corpus=DEFAULT_CORPUS, _autostarts=[]):
+ """Starts a crawl for a `corpus` defining finely the crawl options (mainly for debug purposes):\n- a `webentity_id` associated with the crawl\n- a list of `starts` urls to start from\n- a list of `follow_prefixes` to know which links to follow\n- a list of `nofollow_prefixes` to know which links to avoid\n- a `depth` corresponding to the maximum number of clicks done from the start pages\n- `phantom_crawl` set to "true" to use PhantomJS for this crawl and optional `phantom_timeouts` as an object with keys among `timeout`/`ajax_timeout`/`idle_timeout`\n- a `download_delay` corresponding to the time in seconds spent between two requests by the crawler.\n- an HTTP `proxy` specified as "domain_or_IP:port"\n- a known `cookies_string` with auth rights to a protected website\n- a specific `user_agent`.\nOptionally use some `webarchives` by defining a json object with keys `date`/`days_range`/`option`\, the latter being one of ""/"web.archive.org"/"archivesinternet.bnf.fr"."""
if not self.parent.corpus_ready(corpus):
returnD(self.parent.corpus_error(corpus))
if not phantom_crawl and urls_match_domainlist(starts, self.corpora[corpus]["options"]['phantom']['whitelist_domains']):
Expand Down Expand Up @@ -1261,7 +1261,7 @@ class Crawler(customJSONRPC):
'discover_prefixes': list(follow_redirects),
'ignore_internal_links': self.corpora[corpus]["options"]["ignore_internal_links"],
'proxy': proxy,
- 'user_agent': self.parent.user_agents_list.get_random(),
+ 'user_agent': user_agent or self.parent.user_agents_list.get_random(),
'cookies': cookies_string,
'webarchives': webarchives
}
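
The one-line change above keeps the random User-Agent only as a fallback: a caller-supplied `user_agent` always takes precedence. A standalone sketch of that fallback logic (not part of the commit), with illustrative names standing in for Hyphe's user-agents helper:

```python
import random

# Illustrative stand-in for the user_agents_list helper used above.
COMMON_USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
]

def pick_user_agent(user_agent=None):
    # Mirrors `user_agent or self.parent.user_agents_list.get_random()`.
    return user_agent or random.choice(COMMON_USER_AGENTS)

assert pick_user_agent("MyLab-Crawler/1.0") == "MyLab-Crawler/1.0"
assert pick_user_agent() in COMMON_USER_AGENTS
```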
4 changes: 3 additions & 1 deletion hyphe_frontend/app/js/service_hyphe_api.js
@@ -441,6 +441,7 @@ angular.module('hyphe.service_hyphe_api', [])
,settings.status || 'IN'
,settings.proxy || null
,settings.cookies_string || null
+ ,settings.user_agent || null
,{} // phantom timeouts
,settings.webarchives || {}
,corpus.getId()
@@ -478,7 +479,8 @@ angular.module('hyphe.service_hyphe_api', [])
,settings.status || 'IN'
,settings.startmode || 'default'
,settings.proxy || null
- ,settings.cookies || null
+ ,settings.cookies_string || null
+ ,settings.user_agent || null
,{} // phantom timeouts
,settings.webarchives || {}
,settings.saveStartpages || false
1 change: 1 addition & 0 deletions hyphe_frontend/app/js/service_utils.js
@@ -693,6 +693,7 @@ angular.module('hyphe.service_utils', [])
var richJob = ns.consolidateJob(job)

richJob.max_depth = job.crawl_arguments.max_depth
+ richJob.user_agent = job.crawl_arguments.user_agent
richJob.cookies = job.crawl_arguments.cookies
richJob.phantom = job.crawl_arguments.phantom
richJob.webarchives_used = job.crawl_arguments.webarchives.option
14 changes: 14 additions & 0 deletions hyphe_frontend/app/partials/webentitystartpagesmodal.html
@@ -250,6 +250,20 @@ <h3 class="md-title" md-colors="{'color':'default-warn'}">
</div>
</div>

<span style="margin: 8px"><span style="color:grey; filter: grayscale(100%);">🕵</span> User-Agent</span>
<div layout="column" md-colors="{background: 'default-primary-50'}" style="padding: 8px">
<label><small md-colors="{'color': 'default-primary'}">Paste a specific User-Agent you would like this webentity to be crawled with instead of a random commonly used one</small></label>
<md-input-container flex class="md-block">
<textarea
id="add-useragent-{{webentity.id}}"
ng-model="webentity.userAgent"
rows="2"
max-rows="3"
aria-label="user-agent"
></textarea>
</md-input-container>
</div>

<span style="margin: 8px"><span style="color:grey; filter: grayscale(100%);">🍪</span> Cookies</span>
<div layout="column" md-colors="{background: 'default-primary-50'}" style="padding: 8px">
<label><small md-colors="{'color': 'default-primary'}">Paste your browser's cookies for this web entity to grant the crawler your authenticated access</small></label>
4 changes: 2 additions & 2 deletions hyphe_frontend/app/views/monitorCrawls.html
@@ -508,10 +508,10 @@ <h3 style="margin-bottom:0px"><span hyphe-glossary="START PAGES"/></h3>
<h3 style="margin-bottom:0px">DETAILED SETTINGS</h3>
<dl class="key-value-pairs">

- <dt>User Agent</dt>
+ <dt>🕵 User-Agent</dt>
<dd>{{crawljobsIndex[focusedJobId].crawl_arguments.user_agent}}</dd>

- <dt>Cookie used</dt>
+ <dt>🍪 Cookie used</dt>
<dd>{{crawljobsIndex[focusedJobId].crawl_arguments.cookies || "None"}}</dd>

<dt>Additional Settings</dt>
4 changes: 4 additions & 0 deletions hyphe_frontend/app/views/prepareCrawls.html
@@ -134,6 +134,10 @@ <h3 md-truncate>{{obj.id+1}}. {{obj.webentity.name}}</h3>
<md-tooltip md-direction="bottom">The crawl will browse the past using {{ obj.webentity.webarchives.option }} <span ng-if="obj.webentity.webarchives.days_range === 'infinity'">around</span><span ng-if="obj.webentity.webarchives.days_range !== 'infinity'">{{ obj.webentity.webarchives.days_range / 2 }} days before &amp; after</span> {{ obj.webentity.webarchives.date }}</md-tooltip>
<md-icon class="material-icons" role="img" aria-label="history">history</md-icon>
</div>
<div ng-show="obj.webentity.userAgent">
<md-tooltip md-direction="bottom">A specific User-Agent will be used for this crawl</md-tooltip>
<span style="color:grey; filter: grayscale(100%);">🕵</span>
</div>
<div ng-show="obj.webentity.cookiesString">
<md-tooltip md-direction="bottom">A specific cookie will be used for this crawl</md-tooltip>
<span style="color:grey; filter: grayscale(100%);">🍪</span>
