Skip to content

Commit

Permalink
make visible a distinction between successfully crawled pages and err…
Browse files Browse the repository at this point in the history
…or ones and add a SUSPICIOUS level of globalStatus (closes #425)
  • Loading branch information
boogheta committed Dec 20, 2021
1 parent df0782a commit 2a01999
Show file tree
Hide file tree
Showing 8 changed files with 62 additions and 13 deletions.
5 changes: 4 additions & 1 deletion hyphe_backend/core.tac
Original file line number Diff line number Diff line change
Expand Up @@ -778,6 +778,7 @@ class Core(customJSONRPC):
kwargs["projection"] = [
"webentity_id",
"nb_crawled_pages",
"nb_crawled_pages_200",
"nb_unindexed_pages",
"nb_pages",
"nb_links",
Expand Down Expand Up @@ -854,7 +855,7 @@ class Core(customJSONRPC):
yield self.db.add_log(corpus, update_ids, "INDEX_"+indexing_statuses.FINISHED)
if corpus in self.corpora and self.corpora[corpus]['options']['phantom'].get('autoretry', False):
# Try to restart in phantom mode all regular crawls that seem to have failed (less than 3 pages found for a depth of at least 1)
res = yield self.db.list_jobs(corpus, {'_id': {'$in': update_ids}, 'nb_crawled_pages': {'$lt': 3}, 'crawl_arguments.phantom': False, 'crawl_arguments.max_depth': {'$gt': 0}})
res = yield self.db.list_jobs(corpus, {'_id': {'$in': update_ids}, 'nb_crawled_pages_200': {'$lt': 3}, 'crawl_arguments.phantom': False, 'crawl_arguments.max_depth': {'$gt': 0}})
for job in res:
logger.msg("Crawl job %s seems to have failed, trying to restart it in phantom mode" % job['_id'], system="INFO - %s" % corpus)
yield self.jsonrpc_crawl_webentity(job['webentity_id'], min(job['crawl_arguments']['max_depth'], 2), True, corpus=corpus)
Expand Down Expand Up @@ -2141,9 +2142,11 @@ class Memory_Structure(customJSONRPC):

crawled_pages_left = yield self.db.count_queue(corpus, job['crawljob_id'])
tot_crawled_pages = yield self.db.count_pages(corpus, job['crawljob_id'])
success_crawled_pages = yield self.db.count_pages_by_code(corpus, job['crawljob_id'], 200)
if job['_id'] != 'unknown':
update = {
'nb_crawled_pages': tot_crawled_pages,
'nb_crawled_pages_200': success_crawled_pages,
'nb_unindexed_pages': crawled_pages_left,
'indexing_status': indexing_statuses.BATCH_FINISHED
}
Expand Down
10 changes: 9 additions & 1 deletion hyphe_backend/lib/mongo.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ def init_corpus_indexes(self, corpus, retry=True):
yield self.pages(corpus).create_index(sortasc('timestamp'), background=True)
yield self.pages(corpus).create_index(sortasc('_job'), background=True)
yield self.pages(corpus).create_index(sortasc('_job') + sortasc('forgotten'), background=True)
yield self.pages(corpus).create_index(sortasc('_job') + sortasc('forgotten') + sortasc('status'), background=True)
yield self.pages(corpus).create_index(sortasc('url'), background=True)
yield self.queue(corpus).create_index(sortasc('timestamp'), background=True)
yield self.queue(corpus).create_index(sortasc('_job'), background=True)
Expand Down Expand Up @@ -317,6 +318,7 @@ def add_job(self, corpus, webentity_id, args, timestamp=None):
"crawljob_id": None,
"webentity_id": webentity_id,
"nb_crawled_pages": 0,
"nb_crawled_pages_200": 0,
"nb_unindexed_pages": 0,
"nb_pages": 0,
"nb_links": 0,
Expand Down Expand Up @@ -367,6 +369,11 @@ def count_pages(self, corpus, job, **kwargs):
tot = yield self.pages(corpus).count({"_job": job, "forgotten": False}, **kwargs)
returnD(tot)

@inlineCallbacks
def count_pages_by_code(self, corpus, job, code, **kwargs):
    """Count the non-forgotten pages of a crawl job that have a given status.

    The filter matches pages whose ``_job`` equals *job*, whose
    ``forgotten`` flag is False and whose ``status`` equals *code*.
    Extra keyword arguments are forwarded untouched to the collection's
    ``count`` call. The resulting count is handed back through ``returnD``.
    """
    query = {"_job": job, "forgotten": False, "status": code}
    matching = yield self.pages(corpus).count(query, **kwargs)
    returnD(matching)

@inlineCallbacks
def get_pages(self, corpus, urls_or_lrus, include_metas=False, include_body=False, include_links=False):
projection = {}
Expand Down Expand Up @@ -398,8 +405,9 @@ def get_pages(self, corpus, urls_or_lrus, include_metas=False, include_body=Fals
@inlineCallbacks
def update_job_pages(self, corpus, job_id):
# Recompute the page counters of the job(s) whose "crawljob_id" is job_id
# and store them through update_jobs.
# Total of this job's pages.
crawled_pages = yield self.count_pages(corpus, job_id)
# Subset of those pages whose status is 200.
success_pages = yield self.count_pages_by_code(corpus, job_id, 200)
# Presumably the number of entries still waiting in the queue for this job
# (count_queue is not visible here) - TODO confirm.
unindexed_pages = yield self.count_queue(corpus, job_id)
# NOTE(review): the two update_jobs calls below look like the removed and
# the added side of a single diff line whose +/- markers were lost; only the
# second one (which also writes 'nb_crawled_pages_200') is presumably meant
# to remain - confirm against the actual file.
yield self.update_jobs(corpus, {"crawljob_id": job_id}, {'nb_crawled_pages': crawled_pages, 'nb_unindexed_pages': unindexed_pages})
yield self.update_jobs(corpus, {"crawljob_id": job_id}, {'nb_crawled_pages': crawled_pages, 'nb_crawled_pages_200': success_pages, 'nb_unindexed_pages': unindexed_pages})

@inlineCallbacks
def get_queue(self, corpus, specs={}, **kwargs):
Expand Down
1 change: 1 addition & 0 deletions hyphe_frontend/app/app.css
Original file line number Diff line number Diff line change
Expand Up @@ -694,6 +694,7 @@ textarea.over-error {

.crawljobs-col-inde,
.crawljobs-col-craw,
.crawljobs-col-crok,
.crawljobs-col-disp,
.crawljobs-col-disl {
padding: 0px 8px 0px 8px;
Expand Down
4 changes: 3 additions & 1 deletion hyphe_frontend/app/js/service_utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -660,8 +660,10 @@ angular.module('hyphe.service_utils', [])
} else if(job.crawling_status != 'FINISHED'){
job.globalStatus = job.crawling_status
} else if(job.indexing_status == 'FINISHED'){
if(job.nb_crawled_pages > 0){
if(job.nb_crawled_pages_200 > 2){
job.globalStatus = 'ACHIEVED'
} else if(job.nb_crawled_pages_200 > 0){
job.globalStatus = 'SUSPICIOUS'
} else {
job.globalStatus = 'UNSUCCESSFUL'
}
Expand Down
35 changes: 30 additions & 5 deletions hyphe_frontend/app/views/monitorCrawls.html
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,11 @@ <h3>No crawl job</h3>
: (
(job.globalStatus == 'CANCELED')
? ('default-background-400')
: ('default-background-100')
: (
(job.globalStatus == 'SUSPICIOUS')
? ('default-warn-100')
: ('default-background-100')
)
)
)
)
Expand Down Expand Up @@ -124,7 +128,10 @@ <h3>No crawl job</h3>
</div>
<div layout="column" layout-align="center center" class="stats">
<div>
{{job.nb_crawled_pages}} page{{job.nb_crawled_pages | plural}} crawled
{{job.nb_crawled_pages_200}} page{{job.nb_crawled_pages_200 | plural}} crawled
<small ng-if="job.nb_crawled_pages - job.nb_crawled_pages_200">
(+{{job.nb_crawled_pages - job.nb_crawled_pages_200}} error{{job.nb_crawled_pages - job.nb_crawled_pages_200 | plural}})
</small>
</div>
<div ng-show="job.nb_unindexed_pages">
({{job.nb_unindexed_pages}} not indexed yet)
Expand Down Expand Up @@ -178,14 +185,20 @@ <h3>No crawl job</h3>
md-colors="{'color': (sort == 'nb_pages_indexed') ? ('default-accent') : ('default-primary')}"
ng-click="toggleSort('nb_pages_indexed')"
>
<span><md-tooltip md-direction="top">Pages indexed</md-tooltip>PI</span>
<span><md-tooltip md-direction="top">Pages Indexed</md-tooltip>PI</span>
</div>
<div class="md-secondary crawljobs-col-craw"
md-colors="{'color': (sort == 'nb_crawled_pages') ? ('default-accent') : ('default-primary')}"
ng-click="toggleSort('nb_crawled_pages')"
>
<span><md-tooltip md-direction="top">Pages Crawled</md-tooltip>PC</span>
</div>
<div class="md-secondary crawljobs-col-crok"
md-colors="{'color': (sort == 'nb_crawled_pages_200') ? ('default-accent') : ('default-primary')}"
ng-click="toggleSort('nb_crawled_pages_200')"
>
<span><md-tooltip md-direction="top">Pages Crawled Successfully</md-tooltip>PC ✓</span>
</div>
<div class="md-secondary crawljobs-col-disp"
md-colors="{'color': (sort == 'nb_pages') ? ('default-accent') : ('default-primary')}"
ng-click="toggleSort('nb_pages')"
Expand Down Expand Up @@ -233,7 +246,11 @@ <h3>No crawl job</h3>
: (
(job.globalStatus == 'CANCELED')
? ('default-background-400')
: ('default-background-100')
: (
(job.globalStatus == 'SUSPICIOUS')
? ('default-warn-100')
: ('default-background-100')
)
)
)
)
Expand All @@ -252,6 +269,7 @@ <h3>No crawl job</h3>
<div class="md-secondary crawljobs-col-stat">{{job.globalStatus}}</div>
<div class="md-secondary crawljobs-col-inde">{{job.nb_pages_indexed}}</div>
<div class="md-secondary crawljobs-col-craw">{{job.nb_crawled_pages}}</div>
<div class="md-secondary crawljobs-col-crok">{{job.nb_crawled_pages_200}}</div>
<div class="md-secondary crawljobs-col-disp">{{job.nb_pages}}</div>
<div class="md-secondary crawljobs-col-disl">{{job.nb_links}}</div>
<div class="md-secondary crawljobs-col-sche"><span><md-tooltip md-direction="left">{{job.created_at|date}}</md-tooltip>{{job.created_at|prettyDate}}</span></div>
Expand Down Expand Up @@ -291,7 +309,11 @@ <h3>No crawl job</h3>
: (
(crawljobsIndex[focusedJobId].globalStatus == 'CANCELED')
? ('default-background-400')
: ('default-background-100')
: (
(crawljobsIndex[focusedJobId].globalStatus == 'SUSPICIOUS')
? ('default-warn-100')
: ('default-background-100')
)
)
)
)
Expand Down Expand Up @@ -415,6 +437,9 @@ <h3 style="margin-bottom:0px">CONTENT</h3>
<dt>Crawled pages</dt>
<dd>{{crawljobsIndex[focusedJobId].nb_crawled_pages}}</dd>

<dt>Crawled pages successfully</dt>
<dd>{{crawljobsIndex[focusedJobId].nb_crawled_pages_200}}</dd>

<div ng-if="crawljobsIndex[focusedJobId].nb_unindexed_pages">
<dt>not indexed yet</dt>
<dd>{{crawljobsIndex[focusedJobId].nb_unindexed_pages}}</dd>
Expand Down
3 changes: 3 additions & 0 deletions hyphe_frontend/app/views/monitorCrawls.js
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ angular.module('hyphe.monitorcrawlsController', [])
,nb_crawled_pages: {
type: 'number'
}
,nb_crawled_pages_200: {
type: 'number'
}
,nb_pages_indexed: {
type: 'number'
}
Expand Down
15 changes: 11 additions & 4 deletions hyphe_frontend/app/views/webentity.html
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,11 @@ <h1 class="word-break">
: (
(job.globalStatus == 'CANCELED')
? ('default-background-400')
: ('default-background-100')
: (
(job.globalStatus == 'SUSPICIOUS')
? ('default-warn-100')
: ('default-background-100')
)
)
)
)
Expand Down Expand Up @@ -329,7 +333,7 @@ <h1 class="word-break">

<md-button
class="md-raised md-mini"
ng-show="job.globalStatus == 'ACHIEVED' || job.globalStatus == 'UNSUCCESSFUL' || job.globalStatus == 'CANCELED'"
ng-show="job.globalStatus == 'ACHIEVED' || job.globalStatus == 'UNSUCCESSFUL' || job.globalStatus == 'CANCELED' || job.globalStatus == 'SUSPICIOUS'"
ng-click="reCrawl(job)"
>
<md-icon>autorenew</md-icon>
Expand All @@ -338,7 +342,7 @@ <h1 class="word-break">

<!-- Abort button: only shown while the job is still running, i.e. when its
globalStatus is none of ACHIEVED, UNSUCCESSFUL, CANCELED or SUSPICIOUS.
Every test must be joined with && - a trailing || would make the button
visible for all jobs that are not SUSPICIOUS, including finished ones. -->
<md-button
class="md-raised md-mini"
ng-show="job.globalStatus != 'ACHIEVED' && job.globalStatus != 'UNSUCCESSFUL' && job.globalStatus != 'CANCELED'"
ng-show="job.globalStatus != 'ACHIEVED' && job.globalStatus != 'UNSUCCESSFUL' && job.globalStatus != 'CANCELED' && job.globalStatus != 'SUSPICIOUS'"
ng-click="abortCrawl(job)"
>
<md-icon>clear</md-icon>
Expand All @@ -348,7 +352,10 @@ <h1 class="word-break">
</div>
<div layout="column" layout-align="center center" class="stats">
<div>
{{job.nb_crawled_pages}} page{{job.nb_crawled_pages | plural}} crawled
{{job.nb_crawled_pages_200}} page{{job.nb_crawled_pages_200 | plural}} crawled
<small ng-if="job.nb_crawled_pages - job.nb_crawled_pages_200">
(+{{job.nb_crawled_pages - job.nb_crawled_pages_200}} error{{job.nb_crawled_pages - job.nb_crawled_pages_200 | plural}})
</small>
</div>
<div ng-show="job.nb_unindexed_pages">
({{job.nb_unindexed_pages}} not indexed yet)
Expand Down
2 changes: 1 addition & 1 deletion hyphe_frontend/app/views/webentity.js
Original file line number Diff line number Diff line change
Expand Up @@ -482,7 +482,7 @@ angular.module('hyphe.webentityController', [])
return b.created_at - a.created_at
})
if ($scope.crawls.some(function(job){
return job.globalStatus != 'ACHIEVED' && job.globalStatus != 'UNSUCCESSFUL' && job.globalStatus != 'CANCELED'
return job.globalStatus != 'ACHIEVED' && job.globalStatus != 'UNSUCCESSFUL' && job.globalStatus != 'CANCELED' && job.globalStatus != 'SUSPICIOUS'
})) $timeout(fetchCrawls, 3000)
}
,function(){
Expand Down

0 comments on commit 2a01999

Please sign in to comment.