Skip to content

Commit

Permalink
fix modifying body to remove bnf banner (#372)
Browse files Browse the repository at this point in the history
  • Loading branch information
Benjamin Ooghe committed Jun 25, 2021
1 parent 04713d6 commit c8dcb70
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 8 deletions.
17 changes: 10 additions & 7 deletions hyphe_backend/crawler/hcicrawler/spiders/pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ def handle_response(self, response):
with open(os.path.join(PHANTOM["JS_PATH"], "get_iframes_content.js")) as js:
get_bod_w_iframes = js.read()
bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
# TODO use modified_body instead of _set_body
response._set_body(bod_w_iframes.encode('utf-8'))

# Try to scroll and unfold page
Expand Down Expand Up @@ -212,6 +213,7 @@ def handle_response(self, response):
self.errors += 1
return self._make_raw_page(response)
bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
# TODO use modified_body instead of _set_body
response._set_body(bod_w_iframes.encode('utf-8'))

# Cleanup pages with base64 images embedded that make scrapy consider them not htmlresponses
Expand Down Expand Up @@ -277,6 +279,7 @@ def handle_error(self, failure, response=None):
def parse_html(self, response):
archive_url = None
archive_timestamp = None
clean_body = None
orig_url = response.url
if self.webarchives:
orig_url = self.archiveregexp.sub("", orig_url)
Expand Down Expand Up @@ -305,7 +308,7 @@ def parse_html(self, response):
self.log("Skipping archive page (%s) with date (%s) outside desired range (%s/%s)" % (response.url, archive_timestamp, self.archivemindate, self.archivemaxdate), logging.DEBUG)
return
# Remove BNF banner
response.body = RE_BNF_ARCHIVES_BANNER.sub("", response.body)
clean_body = RE_BNF_ARCHIVES_BANNER.sub("", response.body)

# Specific case of redirections from website returned by archives as JS redirections with code 200
elif redir_url:
Expand Down Expand Up @@ -379,19 +382,19 @@ def parse_html(self, response):
yield self._request(url)

response.meta['depth'] = realdepth
yield self._make_html_page(response, lrulinks, archive_url=archive_url, archive_timestamp=archive_timestamp)
yield self._make_html_page(response, lrulinks, archive_url=archive_url, archive_timestamp=archive_timestamp, modified_body=clean_body)

def _make_html_page(self, response, lrulinks, archive_url=None, archive_timestamp=None):
p = self._make_raw_page(response)
def _make_html_page(self, response, lrulinks, archive_url=None, archive_timestamp=None, modified_body=None):
p = self._make_raw_page(response, modified_body=modified_body)
if STORE_HTML:
p['body'] = Binary(response.body.encode('zip'))
p['body'] = Binary((modified_body or response.body).encode('zip'))
p['lrulinks'] = lrulinks
if self.webarchives and archive_url:
p['archive_url'] = archive_url
p['archive_timestamp'] = archive_timestamp
return p

def _make_raw_page(self, response):
def _make_raw_page(self, response, modified_body=None):
p = Page()
p['url'] = response.url
if self.webarchives:
Expand All @@ -404,7 +407,7 @@ def _make_raw_page(self, response):
p['depth'] = 0
p['timestamp'] = int(time.time()*1000)
p['status'] = response.status
p['size'] = len(response.body)
p['size'] = len(modified_body or response.body)
if isinstance(response, HtmlResponse):
p['encoding'] = response.encoding
if response.meta.get('depth'):
Expand Down
2 changes: 1 addition & 1 deletion hyphe_backend/lib/webarchives.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,5 +48,5 @@ def validateArchiveDate(dt):

RE_ARCHIVE_REDIRECT = r'function go\(\) \{.*document.location.href = "(%s/[^"]*)".*<p class="code shift red">Got an HTTP (\d+) response at crawl time</p>.*<p class="code">Redirecting to...</p>'
RE_BNF_ARCHIVES_PERMALINK = re.compile(r'<input id="permalink" class="BANNER_PERMALIEN_LINK_CUSTOMED" value="([^"]+)"')
RE_BNF_ARCHIVES_BANNER = re.compile(r'<!--\n\s+FILE ARCHIVED ON.*<!--\n\s+END.*?-->', re.DOTALL)
RE_BNF_ARCHIVES_BANNER = re.compile(r'<!--[\r\n]+\s+FILE ARCHIVED ON .*<!--[\r\n]+\s+END.*?-->', re.DOTALL)

0 comments on commit c8dcb70

Please sign in to comment.