Added: Enhanced spider log/error handling #33

Open · wants to merge 1 commit into base: master
4 changes: 4 additions & 0 deletions scrapyrt/cmdline.py
@@ -4,6 +4,10 @@
import os
import sys

# WARNING: This must run before importing any Scrapy-related packages
import scrapyrt.utils
scrapyrt.utils.patch_logging()

from scrapy.utils.conf import closest_scrapy_cfg
from scrapy.utils.misc import load_object
from twisted.application import app
11 changes: 2 additions & 9 deletions scrapyrt/core.py
@@ -33,6 +33,7 @@ class ScrapyrtCrawler(Crawler):
def __init__(self, spidercls, crawler_settings, start_requests=False):
super(ScrapyrtCrawler, self).__init__(spidercls, crawler_settings)
self.start_requests = start_requests
self.errors = []

@defer.inlineCallbacks
def crawl(self, *args, **kwargs):
@@ -72,8 +73,6 @@ def crawl(self, spidercls, *args, **kwargs):
signals.item_dropped)
crawler.signals.connect(self.scrapyrt_manager.spider_idle,
signals.spider_idle)
crawler.signals.connect(self.scrapyrt_manager.handle_spider_error,
signals.spider_error)
crawler.signals.connect(self.scrapyrt_manager.handle_scheduling,
signals.request_scheduled)
dfd = super(ScrapyrtCrawlerProcess, self).crawl(crawler, *args, **kwargs)
@@ -135,7 +134,6 @@ def __init__(self, spider_name, request_kwargs, max_requests=None):
self.log_dir = settings.LOG_DIR
self.items = []
self.items_dropped = []
self.errors = []
self.max_requests = int(max_requests) if max_requests else None
self.timeout_limit = int(settings.TIMEOUT_LIMIT)
self.request_count = 0
@@ -228,11 +226,6 @@ def limit_requests(self, spider):
else:
self.request_count += 1

def handle_spider_error(self, failure, spider):
if spider is self.crawler.spider and self.debug:
fail_data = failure.getTraceback()
self.errors.append(fail_data)

def get_item(self, item, response, spider):
if spider is self.crawler.spider:
self.items.append(item)
@@ -255,7 +248,7 @@ def return_items(self, result):
"spider_name": self.spider_name,
}
if self.debug:
results["errors"] = self.errors
results["errors"] = self.crawler.errors
return results

def create_spider_request(self, kwargs):
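With handle_spider_error removed, error collection moves from the crawl manager onto the crawler itself: the patched logger in scrapyrt/utils.py (below) appends error messages to crawler.errors, and return_items now reads them from there. A minimal sketch of the resulting debug payload, assuming one download error was captured (field values are illustrative, not taken from the source):

results = {
    "spider_name": "example_spider",  # other return_items fields omitted
    "errors": [
        "Error downloading <GET http://example.com/page>: DNS lookup failed.",
    ],
}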
78 changes: 78 additions & 0 deletions scrapyrt/utils.py
@@ -0,0 +1,78 @@
# coding: utf8

# Patch `logging` before scrapyrt is loaded: scrapyrt imports scrapy very
# early, which makes it too late to patch when our customizations
# (e.g. CrawlResource) are loaded.

import logging
import inspect


def patch_logging():
"""Have `logging.getLogger` patched"""
old_get_logger = logging.getLogger

def get_logger(*args, **kwargs):
logger = old_get_logger(*args, **kwargs)
logger.__class__ = ScrapyRTLogger
return logger

logging.getLogger = get_logger


class ScrapyRTLogger(logging.Logger):

def handle(self, record):
"""Handles a logging record"""
ret = super(ScrapyRTLogger, self).handle(record)
stack = inspect.stack()
if self._is_error(stack, record):
crawler = self._get_crawler(stack)
import scrapyrt.core
assert isinstance(crawler, scrapyrt.core.ScrapyrtCrawler)
crawler.errors.append(record.getMessage())
return ret

def _is_error(self, stack, record):
"""Returns whether we have an error here"""
if record.levelno >= logging.ERROR:
return True
if any(
'/scrapy/spidermiddlewares/httperror.py' in x[1]
for x in stack):
# HttpErrorMiddleware logs to DEBUG, but that kind of log looks
# important to us as well.
return True
if record.msg.startswith('Error'):
# There are several such logs, e.g. `logger.info('Error ...')`
return True
return False

def _get_crawler(self, stack):
"""Returns a cralwer instance found from the stack, or None"""
for record in stack:
frame, path = record[:2]
if '/scrapy/' in path:
for key, value in frame.f_locals.items():
crawler = self._get_crawler_from_obj(value)
if crawler:
return crawler

def _get_crawler_from_obj(self, obj):
"""Returns a cralwer instance found from the object, or None"""
# WARNING: Do not import any Scrapy package too early
import scrapy.crawler
if isinstance(obj, scrapy.crawler.Crawler):
return obj
if getattr(obj, 'crawler', None):
crawler = self._get_crawler_from_obj(obj.crawler)
if crawler:
return crawler
if getattr(obj, 'spider', None):
crawler = self._get_crawler_from_obj(obj.spider)
if crawler:
return crawler
return None


# WARNING: Do not import any Scrapy-related packages above this line
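For context, a minimal sketch of how the patch is exercised, assuming the module layout added by this PR (the logger name and assertion are illustrative; as the warnings above note, no Scrapy-related package may be imported before patch_logging() runs):

import logging

import scrapyrt.utils

# Patch logging.getLogger before anything pulls in Scrapy.
scrapyrt.utils.patch_logging()

# Every logger obtained after the patch is re-classed to ScrapyRTLogger,
# so its handle() can inspect the call stack and collect error-like
# messages on the owning ScrapyrtCrawler.
logger = logging.getLogger("example_spider")
assert isinstance(logger, scrapyrt.utils.ScrapyRTLogger)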