Skip to content

Commit

Permalink
automatically tag entities when they were crawled using webarchives (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
boogheta committed Sep 10, 2021
1 parent 959e0ea commit 507138e
Showing 1 changed file with 12 additions and 0 deletions.
12 changes: 12 additions & 0 deletions hyphe_backend/core.tac
Original file line number Diff line number Diff line change
Expand Up @@ -975,6 +975,18 @@ class Core(customJSONRPC):

if "CORE" in WE["tags"] and "recrawlNeeded" in WE["tags"]["CORE"]:
yield self.store.jsonrpc_rm_webentity_tag_value(webentity_id, "CORE", "recrawlNeeded", "true", corpus=corpus)
if WE["crawled"]:
oldsources = WE.get("tags", {}).get("USER", {}).get("Crawl Source", [""])[0]
sources = set((oldsources or "Live Web").split(" + "))
sources.add(webarchives.get("option", "Live Web") or "Live Web")
sources = " + ".join(sources)
if not oldsources and sources != "Live Web":
yield self.store.jsonrpc_add_webentity_tag_value(webentity_id, "USER", "Crawl Source", sources, corpus=corpus, _automatic=True)
elif sources != oldsources:
yield self.store.jsonrpc_edit_webentity_tag_value(webentity_id, "USER", "Crawl Source", oldsources, sources, corpus=corpus, _automatic=True)
elif webarchives.get("option"):
yield self.store.jsonrpc_add_webentity_tag_value(webentity_id, "USER", "Crawl Source", webarchives["option"], corpus=corpus, _automatic=True)

res = yield self.crawler.jsonrpc_start(webentity_id, starts, WE["prefixes"], nofollow, self.corpora[corpus]["options"]["follow_redirects"], depth, phantom_crawl, phantom_timeouts, proxy=proxy, cookies_string=cookies_string, webarchives=webarchives, corpus=corpus, _autostarts=autostarts)
returnD(res)

Expand Down

0 comments on commit 507138e

Please sign in to comment.