more explicit archives option keys (#372)
boogheta committed Jun 28, 2021
1 parent 242b85e commit c97da99
Showing 3 changed files with 6 additions and 6 deletions.
config/config.json.example (2 changes: 1 addition & 1 deletion)
@@ -65,7 +65,7 @@
     ]
   },
   "webarchives": {
-    "options": ["archive.org"],
+    "options": ["web.archive.org"],
    "date": "2010-07-02",
    "days_range": 365
  },
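The `date` and `days_range` defaults above carry over unchanged; only the archive identifier was renamed. Note that the config file takes an `options` list, while the crawl calls patched in core.tac below take a single `option` string. A minimal sketch (not part of this commit, helper name hypothetical) of building the `webarchives` settings object those calls accept:

    # Hypothetical helper, not in the Hyphe codebase: builds the `webarchives`
    # json object described in the core.tac docstrings below.
    # Valid `option` values after this commit: "", "web.archive.org",
    # "archivesinternet.bnf.fr" (the empty string means crawl the live web).
    def build_webarchives(option="web.archive.org", date="2010-07-02", days_range=365):
        """Defaults mirror config/config.json.example above."""
        return {"option": option, "date": date, "days_range": days_range}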
hyphe_backend/core.tac (6 changes: 3 additions & 3 deletions)
@@ -902,7 +902,7 @@ class Core(customJSONRPC):
 
     @inlineCallbacks
     def jsonrpc_crawl_webentity(self, webentity_id, depth=0, phantom_crawl=False, status="IN", proxy=None, cookies_string=None, phantom_timeouts={}, webarchives={}, corpus=DEFAULT_CORPUS):
-        """Schedules a crawl for a `corpus` for an existing WebEntity defined by its `webentity_id` with a specific crawl `depth [int]`.\nOptionally use PhantomJS by setting `phantom_crawl` to "true" and adjust specific `phantom_timeouts` as a json object with possible keys `timeout`/`ajax_timeout`/`idle_timeout`.\nSets simultaneously the WebEntity's status to "IN" or optionally to another valid `status` ("undecided"/"out"/"discovered").\nOptionally add a HTTP `proxy` specified as "domain_or_IP:port".\nAlso optionally add known `cookies_string` with auth rights to a protected website.\nOptionally use some `webarchives` by defining a json object with keys `date`/`days_range`/`option`\, the latter being one of ""/"archive.org"/"bnf.fr".\nWill use the WebEntity's startpages if it has any or use otherwise the `corpus`' "default" `startmode` heuristic as defined in `propose_webentity_startpages` (use `crawl_webentity_with_startmode` to apply a different heuristic\, see details in `propose_webentity_startpages`)."""
+        """Schedules a crawl for a `corpus` for an existing WebEntity defined by its `webentity_id` with a specific crawl `depth [int]`.\nOptionally use PhantomJS by setting `phantom_crawl` to "true" and adjust specific `phantom_timeouts` as a json object with possible keys `timeout`/`ajax_timeout`/`idle_timeout`.\nSets simultaneously the WebEntity's status to "IN" or optionally to another valid `status` ("undecided"/"out"/"discovered").\nOptionally add a HTTP `proxy` specified as "domain_or_IP:port".\nAlso optionally add known `cookies_string` with auth rights to a protected website.\nOptionally use some `webarchives` by defining a json object with keys `date`/`days_range`/`option`\, the latter being one of ""/"web.archive.org"/"archivesinternet.bnf.fr".\nWill use the WebEntity's startpages if it has any or use otherwise the `corpus`' "default" `startmode` heuristic as defined in `propose_webentity_startpages` (use `crawl_webentity_with_startmode` to apply a different heuristic\, see details in `propose_webentity_startpages`)."""
         if not self.corpus_ready(corpus):
             returnD(self.corpus_error(corpus))
         try:
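A hedged usage sketch (not part of this commit): calling `crawl_webentity` over Hyphe's JSON-RPC API with the renamed option, passing the params positionally in the order of the signature above. The endpoint URL, method name, webentity id, and corpus name are assumptions; `crawl_webentity_with_startmode` below accepts the same `webarchives` object.

    import json
    import requests  # any HTTP client works; requests is assumed available

    payload = {
        "id": 0,
        # Method name as presumably registered from jsonrpc_crawl_webentity.
        "method": "crawl_webentity",
        # Positional params: webentity_id, depth, phantom_crawl, status, proxy,
        # cookies_string, phantom_timeouts, webarchives, corpus
        "params": [42, 1, False, "IN", None, None, {},
                   {"option": "web.archive.org",  # was "archive.org" before this commit
                    "date": "2010-07-02",
                    "days_range": 365},
                   "my-corpus"],
    }
    # Endpoint URL is a placeholder for a local Hyphe core's JSON-RPC port.
    print(requests.post("http://localhost:90/", data=json.dumps(payload)).json())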
@@ -920,7 +920,7 @@ class Core(customJSONRPC):
 
     @inlineCallbacks
     def jsonrpc_crawl_webentity_with_startmode(self, webentity_id, depth=0, phantom_crawl=False, status="IN", startmode="default", proxy=None, cookies_string=None, phantom_timeouts={}, webarchives={}, corpus=DEFAULT_CORPUS):
-        """Schedules a crawl for a `corpus` for an existing WebEntity defined by its `webentity_id` with a specific crawl `depth [int]`.\nOptionally use PhantomJS by setting `phantom_crawl` to "true" and adjust specific `phantom_timeouts` as a json object with possible keys `timeout`/`ajax_timeout`/`idle_timeout`.\nSets simultaneously the WebEntity's status to "IN" or optionally to another valid `status` ("undecided"/"out"/"discovered").\nOptionally add a HTTP `proxy` specified as "domain_or_IP:port".\nAlso optionally add known `cookies_string` with auth rights to a protected website.\nOptionally define the `startmode` strategy differently to the `corpus` "default one (see details in `propose_webentity_startpages`).\nOptionally use some `webarchives` by defining a json object with keys `date`/`days_range`/`option`\, the latter being one of ""/"archive.org"/"bnf.fr"."""
+        """Schedules a crawl for a `corpus` for an existing WebEntity defined by its `webentity_id` with a specific crawl `depth [int]`.\nOptionally use PhantomJS by setting `phantom_crawl` to "true" and adjust specific `phantom_timeouts` as a json object with possible keys `timeout`/`ajax_timeout`/`idle_timeout`.\nSets simultaneously the WebEntity's status to "IN" or optionally to another valid `status` ("undecided"/"out"/"discovered").\nOptionally add a HTTP `proxy` specified as "domain_or_IP:port".\nAlso optionally add known `cookies_string` with auth rights to a protected website.\nOptionally define the `startmode` strategy differently to the `corpus` "default one (see details in `propose_webentity_startpages`).\nOptionally use some `webarchives` by defining a json object with keys `date`/`days_range`/`option`\, the latter being one of ""/"web.archive.org"/"archivesinternet.bnf.fr"."""
         if not self.corpus_ready(corpus):
             returnD(self.corpus_error(corpus))
 
@@ -1175,7 +1175,7 @@ class Crawler(customJSONRPC):
 
     @inlineCallbacks
     def jsonrpc_start(self, webentity_id, starts, follow_prefixes, nofollow_prefixes, follow_redirects=None, depth=0, phantom_crawl=False, phantom_timeouts={}, download_delay=config['mongo-scrapy']['download_delay'], proxy=None, cookies_string=None, webarchives={}, corpus=DEFAULT_CORPUS, _autostarts=[]):
-        """Starts a crawl for a `corpus` defining finely the crawl options (mainly for debug purposes):\n- a `webentity_id` associated with the crawl a list of `starts` urls to start from\n- a list of `follow_prefixes` to know which links to follow\n- a list of `nofollow_prefixes` to know which links to avoid\n- a `depth` corresponding to the maximum number of clicks done from the start pages\n- `phantom_crawl` set to "true" to use PhantomJS for this crawl and optional `phantom_timeouts` as an object with keys among `timeout`/`ajax_timeout`/`idle_timeout`\n- a `download_delay` corresponding to the time in seconds spent between two requests by the crawler.\n- an HTTP `proxy` specified as "domain_or_IP:port"\n- a known `cookies_string` with auth rights to a protected website.\nOptionally use some `webarchives` by defining a json object with keys `date`/`days_range`/`option`\, the latter being one of ""/"archive.org"/"bnf.fr"."""
+        """Starts a crawl for a `corpus` defining finely the crawl options (mainly for debug purposes):\n- a `webentity_id` associated with the crawl a list of `starts` urls to start from\n- a list of `follow_prefixes` to know which links to follow\n- a list of `nofollow_prefixes` to know which links to avoid\n- a `depth` corresponding to the maximum number of clicks done from the start pages\n- `phantom_crawl` set to "true" to use PhantomJS for this crawl and optional `phantom_timeouts` as an object with keys among `timeout`/`ajax_timeout`/`idle_timeout`\n- a `download_delay` corresponding to the time in seconds spent between two requests by the crawler.\n- an HTTP `proxy` specified as "domain_or_IP:port"\n- a known `cookies_string` with auth rights to a protected website.\nOptionally use some `webarchives` by defining a json object with keys `date`/`days_range`/`option`\, the latter being one of ""/"web.archive.org"/"archivesinternet.bnf.fr"."""
         if not self.parent.corpus_ready(corpus):
             returnD(self.parent.corpus_error(corpus))
         if not phantom_crawl and urls_match_domainlist(starts, self.corpora[corpus]["options"]['phantom']['whitelist_domains']):
hyphe_backend/lib/webarchives.py (4 changes: 2 additions & 2 deletions)
@@ -5,12 +5,12 @@
         "label": "Disabled",
         "description": "crawl the live web, not any kind of web archive"
     },
-    "archive.org": {
+    "web.archive.org": {
         "label": "Web.Archive.org",
         "description": "crawl worldwide web archives maintained by Archive.org",
         "url_prefix": "https://web.archive.org/web/"
     },
-    "bnf.fr": {
+    "archivesinternet.bnf.fr": {
         "label": "ArchivesInternet.BNF.fr",
         "description": "crawl France's official web archives maintained by BNF",
         "url_prefix": "http://pfcarchivesinternet.bnf.fr",
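The `url_prefix` values above hint at how archive crawling works: start URLs get rewritten to point into the archive. A hedged sketch of the idea, assuming the Wayback Machine's `/<timestamp>/<original-url>` route; the actual rewriting logic lives elsewhere in hyphe_backend and may differ:

    # Sketch only: build an archived URL from the option renamed above.
    WAYBACK_PREFIX = "https://web.archive.org/web/"

    def archive_url(url, date):
        """Point `url` at the Wayback Machine snapshot closest to `date` (YYYY-MM-DD)."""
        timestamp = date.replace("-", "")  # Wayback timestamps are YYYYMMDD[HHMMSS]
        return "%s%s/%s" % (WAYBACK_PREFIX, timestamp, url)

    print(archive_url("http://example.com/", "2010-07-02"))
    # https://web.archive.org/web/20100702/http://example.com/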
