Skip to content

Commit

Permalink
don't filter duplicates based on empty body instead of status + limit to max 10 redirs (#426)
Browse files (browse the repository at this point in the history)
  • Loading branch information
boogheta committed Nov 10, 2021
1 parent 378a6c2 commit 10e3b5f
Showing 1 changed file with 6 additions and 3 deletions.
9 changes: 6 additions & 3 deletions hyphe_backend/crawler/hcicrawler/spiders/pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,10 @@ def handle_response(self, response):
response.meta['depth'] -= 1
else:
response.meta['depth'] = -1
return self._request(redir_url, redirection=True, dont_filter=(not response.status))
redir = response.meta.get('redirections', 0) + 1
if not response.body and redir > 10:
return self.parse_html(response)
return self._request(redir_url, redirection=redir, dont_filter=(not response.body))
real_url = self.archiveregexp.sub("", redir_url)
orig_url = self.archiveregexp.sub("", response.url)
match = self.archiveregexp.search(redir_url)
Expand Down Expand Up @@ -433,8 +436,8 @@ def _should_follow(self, depth, tolru):
c2 = self.prefixes_trie.match_lru(tolru)
return c1 and c2

def _request(self, url, noproxy=False, redirection=False, **kw):
kw['meta'] = {'handle_httpstatus_all': True, 'noproxy': noproxy}
def _request(self, url, noproxy=False, redirection=0, **kw):
kw['meta'] = {'handle_httpstatus_all': True, 'noproxy': noproxy, 'redirections': redirection}
kw['callback'] = self.handle_response
kw['errback'] = self.handle_error
if self.cookies:
Expand Down

0 comments on commit 10e3b5f

Please sign in to comment.