Skip to content

Commit

Permalink
use user-agent in resolving routine (cf #284)
Browse files (browse the repository at this point in the history)
  • Loading branch information
boogheta committed Oct 22, 2021
1 parent a763a97 commit 9ede2be
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 7 deletions.
12 changes: 6 additions & 6 deletions hyphe_backend/crawler/hcicrawler/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ def process_item(self, item, spider):

class ResolveLinks(object):

def __init__(self, proxy=None):
def __init__(self, proxy=None, user_agent=None):
self.user_agent = user_agent
self.proxy = None
if proxy:
proxy_host, proxy_port = proxy.split(":", 1)
Expand All @@ -71,10 +72,9 @@ def __init__(self, proxy=None):

@classmethod
def from_crawler(cls, crawler):
proxy = crawler.spider.proxy
if proxy:
return cls(proxy)
return cls()
user_agent = crawler.spider.user_agent or None
proxy = crawler.spider.proxy or None
return cls(proxy=proxy, user_agent=user_agent)

@inlineCallbacks
def process_item(self, item, spider):
Expand All @@ -85,7 +85,7 @@ def process_item(self, item, spider):
lru = spider.resolved_links[url]
else:
try:
agent = ResolverAgent(proxy=self.proxy)
agent = ResolverAgent(proxy=self.proxy, user_agent=self.user_agent)
rurl = yield agent.resolve(url)
if rurl == url and has_prefix(lru, spider.discover_prefixes):
rurl = yield agent.resolve(url)
Expand Down
10 changes: 9 additions & 1 deletion hyphe_backend/crawler/hcicrawler/resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
from twisted.internet import reactor, defer
from twisted.internet.endpoints import TCP4ClientEndpoint
from twisted.web.client import Agent, ProxyAgent, RedirectAgent, _HTTP11ClientFactory
from twisted.web.http_headers import Headers
_HTTP11ClientFactory.noisy = False

class ResolverAgent(RedirectAgent):

def __init__(self, redirectLimit=5, connectTimeout=30, proxy=None):
def __init__(self, redirectLimit=5, connectTimeout=30, proxy=None, user_agent=None):
self.lastURI = None
self.user_agent = user_agent
if proxy:
try:
endpoint = TCP4ClientEndpoint(reactor, proxy["host"], proxy["port"], timeout=connectTimeout)
Expand All @@ -26,6 +28,12 @@ def resolve(self, url):
defer.returnValue(self.lastURI)

def _handleRedirect(self, response, method, uri, headers, redirectCount):
if not headers:
headers = Headers({'user-agent': [self.user_agent]})
else:
if headers.hasHeader('user-agent'):
headers.removeHeader('user-agent')
headers.addRawHeader('user-agent', self.user_agent)

if redirectCount >= self._redirectLimit:
# Infinite redirection detected, keep lastURI
Expand Down

0 comments on commit 9ede2be

Please sign in to comment.