diff --git a/scrapyrt/cmdline.py b/scrapyrt/cmdline.py
index 203eb6d..2a34a74 100644
--- a/scrapyrt/cmdline.py
+++ b/scrapyrt/cmdline.py
@@ -4,6 +4,10 @@
 import os
 import sys
 
+# WARNING: keep this before any Scrapy-related import
+import scrapyrt.utils
+scrapyrt.utils.patch_logging()
+
 from scrapy.utils.conf import closest_scrapy_cfg
 from scrapy.utils.misc import load_object
 from twisted.application import app
diff --git a/scrapyrt/core.py b/scrapyrt/core.py
index 1688175..28778dd 100644
--- a/scrapyrt/core.py
+++ b/scrapyrt/core.py
@@ -33,6 +33,7 @@ class ScrapyrtCrawler(Crawler):
     def __init__(self, spidercls, crawler_settings, start_requests=False):
         super(ScrapyrtCrawler, self).__init__(spidercls, crawler_settings)
         self.start_requests = start_requests
+        self.errors = []
 
     @defer.inlineCallbacks
     def crawl(self, *args, **kwargs):
@@ -72,8 +73,6 @@ def crawl(self, spidercls, *args, **kwargs):
                                 signals.item_dropped)
         crawler.signals.connect(self.scrapyrt_manager.spider_idle,
                                 signals.spider_idle)
-        crawler.signals.connect(self.scrapyrt_manager.handle_spider_error,
-                                signals.spider_error)
         crawler.signals.connect(self.scrapyrt_manager.handle_scheduling,
                                 signals.request_scheduled)
         dfd = super(ScrapyrtCrawlerProcess, self).crawl(crawler, *args, **kwargs)
@@ -135,7 +134,6 @@ def __init__(self, spider_name, request_kwargs, max_requests=None):
         self.log_dir = settings.LOG_DIR
         self.items = []
         self.items_dropped = []
-        self.errors = []
         self.max_requests = int(max_requests) if max_requests else None
         self.timeout_limit = int(settings.TIMEOUT_LIMIT)
         self.request_count = 0
@@ -228,11 +226,6 @@ def limit_requests(self, spider):
         else:
             self.request_count += 1
 
-    def handle_spider_error(self, failure, spider):
-        if spider is self.crawler.spider and self.debug:
-            fail_data = failure.getTraceback()
-            self.errors.append(fail_data)
-
     def get_item(self, item, response, spider):
         if spider is self.crawler.spider:
             self.items.append(item)
@@ -255,7 +248,7 @@ def return_items(self, result):
             "spider_name": self.spider_name,
         }
         if self.debug:
-            results["errors"] = self.errors
+            results["errors"] = self.crawler.errors
         return results
 
     def create_spider_request(self, kwargs):
diff --git a/scrapyrt/utils.py b/scrapyrt/utils.py
new file mode 100644
index 0000000..298bb8a
--- /dev/null
+++ b/scrapyrt/utils.py
@@ -0,0 +1,78 @@
+# coding: utf8
+
+# Patch `logging` before scrapyrt is loaded: scrapyrt imports Scrapy very
+# early, which makes it too late to patch by the time our customizations
+# (e.g. CrawlResource) are loaded.
+
+import logging
+import inspect
+
+
+def patch_logging():
+    """Patch `logging.getLogger` so returned loggers use ScrapyRTLogger."""
+    old_get_logger = logging.getLogger
+
+    def get_logger(*args, **kwargs):
+        logger = old_get_logger(*args, **kwargs)
+        logger.__class__ = ScrapyRTLogger
+        return logger
+
+    logging.getLogger = get_logger
+
+
+class ScrapyRTLogger(logging.Logger):
+
+    def handle(self, record):
+        """Handle a record, collecting error-like messages on the crawler."""
+        ret = super(ScrapyRTLogger, self).handle(record)
+        stack = inspect.stack()
+        if self._is_error(stack, record):
+            crawler = self._get_crawler(stack)
+            import scrapyrt.core
+            if isinstance(crawler, scrapyrt.core.ScrapyrtCrawler):
+                crawler.errors.append(record.getMessage())
+        return ret
+
+    def _is_error(self, stack, record):
+        """Return True if this record should be reported as an error."""
+        if record.levelno >= logging.ERROR:
+            return True
+        if any(
+                '/scrapy/spidermiddlewares/httperror.py' in x[1]
+                for x in stack):
+            # HttpErrorMiddleware logs at DEBUG, but that kind of log is
+            # important to us as well.
+            return True
+        if record.msg.startswith('Error'):
+            # There are several such logs, e.g. `logger.info('Error ...')`
+            return True
+        return False
+
+    def _get_crawler(self, stack):
+        """Return a crawler instance found via the call stack, or None."""
+        for record in stack:
+            frame, path = record[:2]
+            if '/scrapy/' in path:
+                for key, value in frame.f_locals.items():
+                    crawler = self._get_crawler_from_obj(value)
+                    if crawler:
+                        return crawler
+
+    def _get_crawler_from_obj(self, obj):
+        """Return a crawler instance reachable from the object, or None."""
+        # Import lazily: Scrapy must not be imported at module load time
+        import scrapy.crawler
+        if isinstance(obj, scrapy.crawler.Crawler):
+            return obj
+        if getattr(obj, 'crawler', None):
+            crawler = self._get_crawler_from_obj(obj.crawler)
+            if crawler:
+                return crawler
+        if getattr(obj, 'spider', None):
+            crawler = self._get_crawler_from_obj(obj.spider)
+            if crawler:
+                return crawler
+        return None
+
+
+# WARNING: Do not import any Scrapy-related packages above this line
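
Note on the approach: patch_logging() swaps the class of every logger returned by logging.getLogger, so all subsequent handle() calls run through ScrapyRTLogger without replacing handlers or touching Scrapy's logging setup. A minimal standalone sketch of that class-swap technique (names here are illustrative, not part of the patch):

    import logging

    class CapturingLogger(logging.Logger):
        # Shared list for the sketch; the patch stores errors per-crawler.
        captured = []

        def handle(self, record):
            ret = super(CapturingLogger, self).handle(record)
            if record.levelno >= logging.ERROR:
                self.captured.append(record.getMessage())
            return ret

    logging.basicConfig()
    logger = logging.getLogger('demo')
    # Swap the class in place: existing references now use the subclass.
    logger.__class__ = CapturingLogger
    logger.error('boom')
    assert CapturingLogger.captured == ['boom']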
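
The crawler discovery in _get_crawler works by walking the interpreter stack and scanning frame locals. A self-contained sketch of that heuristic with stand-in types (FakeCrawler and FakeSpider are hypothetical, not Scrapy's):

    import inspect

    class FakeCrawler(object):    # stand-in for scrapy.crawler.Crawler
        pass

    class FakeSpider(object):     # back-reference, like Spider.crawler
        def __init__(self, crawler):
            self.crawler = crawler

    def find_crawler():
        # Walk outer frames and inspect their locals; the real patch
        # additionally filters frames whose path contains '/scrapy/'.
        for frame_record in inspect.stack():
            for value in frame_record[0].f_locals.values():
                if isinstance(value, FakeCrawler):
                    return value
                nested = getattr(value, 'crawler', None)
                if isinstance(nested, FakeCrawler):
                    return nested
        return None

    def scrapy_like_call(spider):
        return find_crawler()

    crawler = FakeCrawler()
    assert scrapy_like_call(FakeSpider(crawler)) is crawler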
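
End to end, the change is only visible in debug mode: results["errors"] is now populated from the patched logger via crawler.errors rather than from the removed spider_error handler, so downloader and middleware errors that never reach the spider_error signal get reported too. A rough usage sketch, assuming a local scrapyrt instance on the default port and a spider named 'example' (both hypothetical here):

    import requests

    resp = requests.get('http://localhost:9080/crawl.json',
                        params={'spider_name': 'example',
                                'url': 'http://example.com/missing'})
    # With debug enabled, the response should now include messages such
    # as HttpErrorMiddleware's "Ignoring response ..." under 'errors'.
    print(resp.json().get('errors'))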