Skip to content

Commit

Permalink
Merge branch 'master' into staging
Browse files Browse the repository at this point in the history
  • Loading branch information
boogheta committed Jan 13, 2021
2 parents d170ad6 + 849354a commit 42e8dd2
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 33 deletions.
2 changes: 1 addition & 1 deletion bin/build_apidoc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ echo "
defcorpus=$(grep DEFAULT_CORPUS hyphe_backend/lib/config_hci.py | head -n 1 | sed 's/^.*= //' | sed s'/"//g')
downloaddelay=$(grep download_delay config/config.json.example | head -n 1 | sed 's/^.*: //' | sed 's/[ ,]\+//')

grep 'def jsonrpc_\|accessible jsonrpc\|"""\|^ #' hyphe_backend/core.tac |
grep 'def jsonrpc_\|accessible jsonrpc\|^ """\|^ #' hyphe_backend/core.tac |
sed 's/# accessible jsonrpc.*\(".*"\)/\n\n## Commands for namespace: \1/' |
sed 's/^ #\s*/### /' |
sed 's/^ \+"""/ /' |
Expand Down
14 changes: 4 additions & 10 deletions doc/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,7 @@ The API will always answer as such:
+ _`status`_ (optional, default: `null`)
+ _`startpages`_ (optional, default: `[]`)
+ _`lruVariations`_ (optional, default: `true`)
+ _`tags`_ (optional, default: `{}`)
+ _`corpus`_ (optional, default: `"--hyphe--"`)

Creates for a `corpus` a WebEntity defined for the LRU prefix given as a `url` and optionally for the corresponding http/https and www/no-www variations if `lruVariations` is true. Optionally set the newly created WebEntity's `name`, `status` ("in"/"out"/"undecided"/"discovered") and list of `startpages`. Returns the newly created WebEntity.
Expand All @@ -425,6 +426,7 @@ The API will always answer as such:
+ _`status`_ (optional, default: `null`)
+ _`startpages`_ (optional, default: `[]`)
+ _`lruVariations`_ (optional, default: `true`)
+ _`tags`_ (optional, default: `{}`)
+ _`corpus`_ (optional, default: `"--hyphe--"`)

Creates for a `corpus` a WebEntity defined for a `lru_prefix` and optionally for the corresponding http/https and www/no-www variations if `lruVariations` is true. Optionally set the newly created WebEntity's `name`, `status` ("in"/"out"/"undecided"/"discovered") and list of `startpages`. Returns the newly created WebEntity.
Expand All @@ -436,6 +438,7 @@ The API will always answer as such:
+ _`status`_ (optional, default: `null`)
+ _`startpages`_ (optional, default: `[]`)
+ _`lruVariations`_ (optional, default: `true`)
+ _`tags`_ (optional, default: `{}`)
+ _`corpus`_ (optional, default: `"--hyphe--"`)

Creates for a `corpus` a WebEntity defined for a set of LRU prefixes given as URLs under `list_urls` and optionally for the corresponding http/https and www/no-www variations if `lruVariations` is true. Optionally set the newly created WebEntity's `name`, `status` ("in"/"out"/"undecided"/"discovered") and list of `startpages`. Returns the newly created WebEntity.
Expand All @@ -447,6 +450,7 @@ The API will always answer as such:
+ _`status`_ (optional, default: `""`)
+ _`startpages`_ (optional, default: `[]`)
+ _`lruVariations`_ (optional, default: `true`)
+ _`tags`_ (optional, default: `{}`)
+ _`corpus`_ (optional, default: `"--hyphe--"`)

Creates for a `corpus` a WebEntity defined for a set of LRU prefixes given as `list_lrus` and optionally for the corresponding http/https and www/no-www variations if `lruVariations` is true. Optionally set the newly created WebEntity's `name`, `status` ("in"/"out"/"undecided"/"discovered") and list of `startpages`. Returns the newly created WebEntity.
Expand Down Expand Up @@ -691,16 +695,6 @@ The API will always answer as such:
Returns for a `corpus` all WebEntities having at least one tag in any namespace/category equal to `value`.
Results are paginated and will include a `token` to be reused to collect the other pages via `get_webentities_page`: see `search_webentities` for explanations on `sort` `count` and `page`.

function() {
}""" % (namespace
+ _`namespace`_ (mandatory)
+ _`value)
function() {
}""" % (category
+ _`category`_ (mandatory)
+ _`value)
function() {
}""" % value

- __`get_webentities_by_tag_category`:__
+ _`namespace`_ (mandatory)
Expand Down
43 changes: 30 additions & 13 deletions hyphe_backend/core.tac
Original file line number Diff line number Diff line change
Expand Up @@ -1375,13 +1375,22 @@ class Memory_Structure(customJSONRPC):
break
returnD(homepages)

@inlineCallbacks
def clear_traph(self, corpus=DEFAULT_CORPUS):
    # Ask the traph of `corpus` to run its "clear" command, handing it the
    # corpus' WebEntity creation rules: the regexp of the default rule and a
    # {prefix: regexp} mapping of every other rule.
    # Fires with the corpus error when the corpus is not ready, otherwise
    # with whatever the traph call returns.
    if not self.parent.corpus_ready(corpus):
        returnD(self.parent.corpus_error(corpus))
    default_prefix = "DEFAULT_WEBENTITY_CREATION_RULE"
    creation_rules = self.corpora[corpus]["creation_rules"]
    # First pass: collect the regexps of the default rule; the first one is
    # used (an IndexError is raised if the corpus has no default rule).
    default_regexps = []
    for rule in creation_rules:
        if rule["prefix"] == default_prefix:
            default_regexps.append(rule["regexp"])
    default_regexp = default_regexps[0]
    # Second pass: index every non-default rule by its prefix.
    prefixed_regexps = {}
    for rule in creation_rules:
        if rule["prefix"] != default_prefix:
            prefixed_regexps[rule["prefix"]] = rule["regexp"]
    outcome = yield self.traphs.call(corpus, "clear", default_regexp, prefixed_regexps)
    returnD(outcome)

@inlineCallbacks
def reinitialize(self, corpus=DEFAULT_CORPUS, _noloop=False, _quiet=False, _restart=True):
if not self.parent.corpus_ready(corpus):
returnD(self.parent.corpus_error(corpus))
if not _quiet:
logger.msg("Empty Traph content", system="INFO - %s" % corpus)
res = yield self.traphs.call(corpus, "clear")
res = yield self.clear_traph(corpus)
if is_error(res):
returnD(res)
if not _quiet:
Expand Down Expand Up @@ -1457,21 +1466,21 @@ class Memory_Structure(customJSONRPC):
WEs[-1]["homepage"] = WE["homepage"]
returnD(format_result(WEs))

def jsonrpc_declare_webentity_by_lruprefix_as_url(self, url, name=None, status=None, startpages=[], lruVariations=True, corpus=DEFAULT_CORPUS):
def jsonrpc_declare_webentity_by_lruprefix_as_url(self, url, name=None, status=None, startpages=[], lruVariations=True, tags={}, corpus=DEFAULT_CORPUS):
"""Creates for a `corpus` a WebEntity defined for the LRU prefix given as a `url` and optionnally for the corresponding http/https and www/no-www variations if `lruVariations` is true. Optionally set the newly created WebEntity's `name` `status` ("in"/"out"/"undecided"/"discovered") and list of `startpages`. Returns the newly created WebEntity."""
if not self.parent.corpus_ready(corpus):
return self.parent.corpus_error(corpus)
try:
url, lru_prefix = urllru.url_clean_and_convert(url, self.corpora[corpus]["tlds"], False)
except ValueError as e:
return format_error(e)
return self.jsonrpc_declare_webentity_by_lrus([lru_prefix], name, status, startpages, lruVariations, corpus=corpus)
return self.jsonrpc_declare_webentity_by_lrus([lru_prefix], name, status, startpages, lruVariations, tags=tags, corpus=corpus)

def jsonrpc_declare_webentity_by_lru(self, lru_prefix, name=None, status=None, startpages=[], lruVariations=True, corpus=DEFAULT_CORPUS):
def jsonrpc_declare_webentity_by_lru(self, lru_prefix, name=None, status=None, startpages=[], lruVariations=True, tags={}, corpus=DEFAULT_CORPUS):
"""Creates for a `corpus` a WebEntity defined for a `lru_prefix` and optionnally for the corresponding http/https and www/no-www variations if `lruVariations` is true. Optionally set the newly created WebEntity's `name` `status` ("in"/"out"/"undecided"/"discovered") and list of `startpages`. Returns the newly created WebEntity."""
return self.jsonrpc_declare_webentity_by_lrus([lru_prefix], name, status, startpages, lruVariations, corpus=corpus)
return self.jsonrpc_declare_webentity_by_lrus([lru_prefix], name, status, startpages, lruVariations, tags=tags, corpus=corpus)

def jsonrpc_declare_webentity_by_lrus_as_urls(self, list_urls, name=None, status=None, startpages=[], lruVariations=True, corpus=DEFAULT_CORPUS):
def jsonrpc_declare_webentity_by_lrus_as_urls(self, list_urls, name=None, status=None, startpages=[], lruVariations=True, tags={}, corpus=DEFAULT_CORPUS):
"""Creates for a `corpus` a WebEntity defined for a set of LRU prefixes given as URLs under `list_urls` and optionnally for the corresponding http/https and www/no-www variations if `lruVariations` is true. Optionally set the newly created WebEntity's `name` `status` ("in"/"out"/"undecided"/"discovered") and list of `startpages`. Returns the newly created WebEntity."""
if not self.parent.corpus_ready(corpus):
returnD(self.parent.corpus_error(corpus))
Expand All @@ -1484,10 +1493,10 @@ class Memory_Structure(customJSONRPC):
list_lrus.append(lru)
except ValueError as e:
return format_error(e)
return self.jsonrpc_declare_webentity_by_lrus(list_lrus, name, status, startpages, lruVariations, corpus)
return self.jsonrpc_declare_webentity_by_lrus(list_lrus, name, status, startpages, lruVariations, tags=tags, corpus=corpus)

@inlineCallbacks
def jsonrpc_declare_webentity_by_lrus(self, list_lrus, name=None, status="", startpages=[], lruVariations=True, corpus=DEFAULT_CORPUS):
def jsonrpc_declare_webentity_by_lrus(self, list_lrus, name=None, status="", startpages=[], lruVariations=True, tags={}, corpus=DEFAULT_CORPUS):
"""Creates for a `corpus` a WebEntity defined for a set of LRU prefixes given as `list_lrus` and optionnally for the corresponding http/https and www/no-www variations if `lruVariations` is true. Optionally set the newly created WebEntity's `name` `status` ("in"/"out"/"undecided"/"discovered") and list of `startpages`. Returns the newly created WebEntity."""
if not self.parent.corpus_ready(corpus):
returnD(self.parent.corpus_error(corpus))
Expand All @@ -1507,11 +1516,19 @@ class Memory_Structure(customJSONRPC):
if is_error(weid):
returnD(weid)
weid = weid["result"]["created_webentities"].keys()[0]
tags = {}
if tags:
for ns in tags:
for cat in tags[ns]:
yield self.add_tags_to_dictionary(ns, cat, tags[ns][cat], corpus=corpus)
if startpages:
if not isinstance(startpages, list):
startpages = [startpages]
tags["CORE-STARTPAGES"] = {"user": startpages}
if "CORE-STARTPAGES" not in tags:
tags["CORE-STARTPAGES"] = {"user": startpages}
elif "user" not in tags["CORE-STARTPAGES"]:
tags["CORE-STARTPAGES"]["user"] = startpages
else:
tags["CORE-STARTPAGES"]["user"] = list(set(tags["CORE-STARTPAGES"]["user"] + startpages))
yield self.add_tags_to_dictionary("CORE-STARTPAGES", "user", startpages, corpus=corpus)
WEstatus = "DISCOVERED"
if status:
Expand Down Expand Up @@ -2077,7 +2094,7 @@ class Memory_Structure(customJSONRPC):
returnD(False)
if self.corpora[corpus]['reset']:
yield self.db.queue(corpus).drop()
yield self.traphs.call(corpus, "clear")
yield self.clear_traph(corpus)
returnD(None)
self.corpora[corpus]['loop_running'] = "Diagnosing"
yield self.count_webentities(corpus)
Expand All @@ -2099,7 +2116,7 @@ class Memory_Structure(customJSONRPC):
if not jobs:
self.corpora[corpus]['reset'] = True
yield self.db.queue(corpus).drop()
yield self.traphs.call(corpus, "clear")
yield self.clear_traph(corpus)
self.corpora[corpus]['reset'] = False
returnD(None)
logger.msg("Indexing job with pages in queue but not found in jobs: %s" % oldest_page_in_queue['_job'], system="WARNING - %s" % corpus)
Expand Down Expand Up @@ -2156,7 +2173,7 @@ class Memory_Structure(customJSONRPC):
yield self.parent.jsonrpc_set_corpus_options(corpus, {"keepalive": int(self.corpora[corpus]['links_duration'] * 2)})
logger.msg("...got WebEntity links in %ss." % s, system="INFO - %s" % corpus)
if self.corpora[corpus]['reset']:
yield self.traphs.call(corpus, "clear")
yield self.clear_traph(corpus)
self.corpora[corpus]['loop_running'] = None

@inlineCallbacks
Expand Down
17 changes: 9 additions & 8 deletions hyphe_frontend/app/js/service_hyphe_api.js
Original file line number Diff line number Diff line change
Expand Up @@ -209,15 +209,16 @@ angular.module('hyphe.service_hyphe_api', [])
ns.declareWebentity = buildApiCall(
API.WEBENTITY_LIST_CREATE_BY_LRU_LIST
,function(settings){
return [
settings.prefixes // LRU list
,settings.name || '' // Name
,'IN' // Status
,settings.startPages || [] // Start pages
return {
list_lrus: settings.prefixes // LRU list
,name: settings.name || '' // Name
,status: 'IN' // Status
,startpages: settings.startPages || [] // Start pages
// Automatically include LRU variations (http/https www/nowww)
,settings.lruVariations || false
,corpus.getId()
]}
,lruVariations: settings.lruVariations || false
,tags: settings.tags || {}
,corpus: corpus.getId()
}}
)

ns.urlLookup = buildApiCall(
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ service_identity
virtualenvwrapper
urllib3[secure]
fake-useragent==0.1.10
hyphe-traph>=1.2.0
hyphe-traph>=1.3.1
msgpack-python>=0.3
Scrapy==1.6.0
scrapyd-client==1.2.0a1
Expand Down

0 comments on commit 42e8dd2

Please sign in to comment.