Skip to content

Commit

Permalink
fix stack when resolving urls from scrapy
Browse files: browse the repository at this point in the history
  • Loading branch information
boogheta committed Sep 10, 2021
1 parent f659f84 commit 01aac8a
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 4 deletions.
2 changes: 1 addition & 1 deletion hyphe_backend/crawler/hcicrawler/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def process_item(self, item, spider):
lru = url_to_lru_clean(rurl, TLDS_TREE)
spider.resolved_links[url] = lru
except Exception, e:
spider.log("Error resolving redirects from URL %s: %s %s" % (url, type(e), e), logging.INFO)
spider.log("Error resolving redirects for URL %s (found into %s): %s %s" % (url, item['url'], type(e), e), logging.WARNING)
lrulinks.append(lru)
item["lrulinks"] = lrulinks
returnValue(item)
Expand Down
4 changes: 1 addition & 3 deletions hyphe_backend/crawler/hcicrawler/resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
from twisted.web.client import Agent, ProxyAgent, RedirectAgent, _HTTP11ClientFactory
_HTTP11ClientFactory.noisy = False

from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory

class ResolverAgent(RedirectAgent):

def __init__(self, redirectLimit=5, connectTimeout=30, proxy=None):
Expand All @@ -18,7 +16,7 @@ def __init__(self, redirectLimit=5, connectTimeout=30, proxy=None):
raise TypeError("ResolverAgent's proxy argument need to be a dict with fields host and port")
agent = ProxyAgent(endpoint)
else:
agent = Agent(reactor, connectTimeout=connectTimeout, contextFactory=ScrapyClientContextFactory)
agent = Agent(reactor, connectTimeout=connectTimeout)
RedirectAgent.__init__(self, agent, redirectLimit=redirectLimit)

@defer.inlineCallbacks
Expand Down

0 comments on commit 01aac8a

Please sign in to comment.