Skip to content

Commit

Permalink
keep pages not found in timerange as error in db and display them as …
Browse files Browse the repository at this point in the history
…such in front (#372)
  • Loading branch information
boogheta committed Jun 28, 2021
1 parent cd2d128 commit 6b1e8b7
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 11 deletions.
2 changes: 1 addition & 1 deletion hyphe_backend/crawler/hcicrawler/middlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,5 +57,5 @@ def close(self, reason):

def log(self, request, spider):
if self.logdupes:
spider.log("Filtered duplicate request: %ss - no more duplicates will be shown (see DUPEFILTER_CLASS)" % request, logging.DEBUG)
spider.log("Filtered duplicate request: %s - no more duplicates will be shown (see DUPEFILTER_CLASS)" % request, logging.DEBUG)
self.logdupes = False
11 changes: 7 additions & 4 deletions hyphe_backend/crawler/hcicrawler/spiders/pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ def handle_response(self, response):
# Check date obtained fits into a user defined timerange and return 404 otherwise
if not (self.archivemindate <= match.group(1) <= self.archivemaxdate):
self.log("Skipping archive page (%s) with date (%s) outside desired range (%s/%s)" % (redir_url, match.group(1), self.archivemindate, self.archivemaxdate), logging.DEBUG)
return
return self._make_raw_page(response, archive_fail_url=redir_url)
if normalize(real_url) == normalize(orig_url):
if "depth" in response.meta:
response.meta['depth'] -= 1
Expand Down Expand Up @@ -307,6 +307,7 @@ def parse_html(self, response):
archive_timestamp = archive_timestamp.group(1)
if not (self.archivemindate <= archive_timestamp <= self.archivemaxdate):
self.log("Skipping archive page (%s) with date (%s) outside desired range (%s/%s)" % (response.url, archive_timestamp, self.archivemindate, self.archivemaxdate), logging.DEBUG)
yield self._make_raw_page(response, archive_fail_url=archive_url)
return
# Remove BNF banner
clean_body = RE_BNF_ARCHIVES_BANNER.sub("", response.body)
Expand All @@ -321,6 +322,7 @@ def parse_html(self, response):
match = self.archiveregexp.search(redir_location)
if match and not (self.archivemindate <= match.group(1) <= self.archivemaxdate):
self.log("Skipping archive page (%s) with date (%s) outside desired range (%s/%s)" % (redir_location, match.group(1), self.archivemindate, self.archivemaxdate), logging.DEBUG)
yield self._make_raw_page(response, archive_fail_url=redir_location)
return
response.headers['Location'] = redir_location

Expand Down Expand Up @@ -381,6 +383,7 @@ def parse_html(self, response):
if self._should_follow(response.meta['depth'], lrulink) and \
not url_has_any_extension(url, self.ignored_exts):
yield self._request(url)
return

response.meta['depth'] = realdepth
yield self._make_html_page(response, lrulinks, archive_url=archive_url, archive_timestamp=archive_timestamp, modified_body=clean_body)
Expand All @@ -395,14 +398,14 @@ def _make_html_page(self, response, lrulinks, archive_url=None, archive_timestam
p['archive_date_obtained'] = archive_timestamp
return p

def _make_raw_page(self, response, modified_body=None):
def _make_raw_page(self, response, modified_body=None, archive_fail_url=None):
p = Page()
p['url'] = response.url
if self.webarchives:
p['url'] = self.archiveregexp.sub("", response.url)
p['archive_url'] = response.url
p['archive_url'] = archive_fail_url or response.url
p['archive_date_requested'] = self.archivedate
if 'archive_timestamp' in response.meta:
if 'archive_timestamp' in response.meta and not archive_fail_url:
p['archive_date_obtained'] = response.meta['archive_timestamp']
p['lru'] = url_to_lru_clean(p['url'], TLDS_TREE)
p['depth'] = 0
Expand Down
14 changes: 9 additions & 5 deletions hyphe_frontend/app/views/webentity.html
Original file line number Diff line number Diff line change
Expand Up @@ -487,23 +487,27 @@ <h3 style="padding: 8px; margin: 0px" ng-if="webentity.pages_total>=1">
md-virtual-repeat="page in pages | filter:urlSearchQuery:false:url"
>
<div class="md-primary">
<a href="{{page.url}}" target="_blank">
<a href="{{ page.url }}" target="_blank">
<md-tooltip md-direction="left">Open in a new tab</md-tooltip>
<md-icon>link</md-icon>
</a>
<a ng-if="page.archive_url && page.archive_date_obtained" href="{{page.archive_url}}" target="_blank">
<a ng-if="page.archive_url && page.archive_date_obtained" href="{{ page.archive_url }}" target="_blank">
<md-tooltip md-direction="left">Open in a new tab archived page from {{ page.archive_date_obtained }}</md-tooltip>
<md-icon>history</md-icon>
</a>
<span ng-if="page.archive_date_requested && !page.archive_date_obtained">
<md-tooltip md-direction="left">This page could not be found in web archives around {{ page.archive_date_requested }}</md-tooltip>
<md-icon style="opacity: 0.2">history</md-icon>
</span>
</div>
<div ng-if="page.url===webentity.homepage">
<div ng-if="page.url === webentity.homepage">
<md-tooltip md-direction="left">Homepage</md-tooltip>
&nbsp;<md-icon>home</md-icon>
</div>
<div md-truncate flex>
<span>
<md-tooltip md-direction="bottom">{{page.url}}</md-tooltip>
&nbsp;{{page.url}}
<md-tooltip md-direction="bottom">{{ page.url }}</md-tooltip>
&nbsp;{{ page.url }}
</span>
</div>
<div class="md-primary">
Expand Down
5 changes: 4 additions & 1 deletion hyphe_frontend/app/views/webentity.js
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,10 @@ angular.module('hyphe.webentityController', [])
var pagesBatch = []
var required_fields = ["crawled", "archive_url", "archive_date_obtained", "archive_date_requested"]
result.pages.forEach(function(page){
if (page.archive_url && page.archive_date_obtained) {
if (page.archive_date_requested) {
page.archive_date_requested = page.archive_date_requested.replace(/^(....)(..)(..).*$/, "$1-$2-$3")
}
if (page.archive_date_obtained) {
page.archive_date_obtained = page.archive_date_obtained.replace(/^(....)(..)(..).*$/, "$1-$2-$3")
}
if (!$scope.webentity.startpages_lrus.includes(page.lru)) {
Expand Down

0 comments on commit 6b1e8b7

Please sign in to comment.