Skip to content

Commit

Permalink
Implement SPLASH_USER and SPLASH_PASS
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio committed Oct 4, 2021
1 parent 783c58d commit 2b253e5
Show file tree
Hide file tree
Showing 9 changed files with 547 additions and 94 deletions.
13 changes: 13 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,19 @@ Changes
0.8.0 (2021-10-04)
------------------

* **Security bug fix:**

If you use HttpAuthMiddleware_ (i.e. the ``http_user`` and
``http_pass`` spider attributes) for Splash authentication, any non-Splash
request will expose your credentials to the request target. This includes
``robots.txt`` requests sent by Scrapy when the ``ROBOTSTXT_OBEY`` setting
is set to ``True``.

Use the new ``SPLASH_USER`` and ``SPLASH_PASS`` settings instead to set
your Splash authentication credentials safely.

.. _HttpAuthMiddleware: http://doc.scrapy.org/en/latest/topics/downloader-middleware.html#module-scrapy.downloadermiddlewares.httpauth

* Responses now expose the HTTP status code and headers from Splash as
``response.splash_response_status`` and
``response.splash_response_headers`` (#158)
Expand Down
25 changes: 22 additions & 3 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -582,12 +582,31 @@ on Splash server and is not sent with each request (it requires Splash 2.1+)::
HTTP Basic Auth
===============

If you need HTTP Basic Authentication to access Splash, use
Scrapy's HttpAuthMiddleware_.
If you need to use HTTP Basic Authentication to access Splash, use the
``SPLASH_USER`` and ``SPLASH_PASS`` optional settings::

SPLASH_USER = 'user'
SPLASH_PASS = 'userpass'

Another option is ``meta['splash']['splash_headers']``: it allows you to set
custom headers which are sent to the Splash server; add an Authorization header
to ``splash_headers`` if you want to change credentials per-request::

import scrapy
from w3lib.http import basic_auth_header

class MySpider(scrapy.Spider):
# ...
def start_requests(self):
auth = basic_auth_header('user', 'userpass')
yield SplashRequest(url, self.parse,
splash_headers={'Authorization': auth})

**WARNING:** Don't use HttpAuthMiddleware_
(i.e. ``http_user`` / ``http_pass`` spider attributes) for Splash
authentication: if you occasionally send a non-Splash request from your spider,
you may expose Splash credentials to a remote website, as HttpAuthMiddleware
sets credentials for all requests unconditionally.

.. _HttpAuthMiddleware: http://doc.scrapy.org/en/latest/topics/downloader-middleware.html#module-scrapy.downloadermiddlewares.httpauth

Expand Down
1 change: 1 addition & 0 deletions example/scrashtest/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,4 @@
# SPLASH_URL = 'http://192.168.59.103:8050/'
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
ROBOTSTXT_OBEY = True
80 changes: 68 additions & 12 deletions scrapy_splash/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,13 @@
from six.moves.urllib.parse import urljoin
from six.moves.http_cookiejar import CookieJar

from w3lib.http import basic_auth_header
import scrapy
from scrapy.exceptions import NotConfigured
from scrapy.exceptions import NotConfigured, IgnoreRequest
from scrapy.http.headers import Headers
from scrapy.http.response.text import TextResponse
from scrapy import signals
from scrapy.downloadermiddlewares.robotstxt import RobotsTxtMiddleware

from scrapy_splash.responsetypes import responsetypes
from scrapy_splash.cookies import jar_to_har, har_to_jar
Expand Down Expand Up @@ -222,26 +224,34 @@ class SplashMiddleware(object):
retry_498_priority_adjust = +50
remote_keys_key = '_splash_remote_keys'

def __init__(self, crawler, splash_base_url, slot_policy, log_400):
def __init__(self, crawler, splash_base_url, slot_policy, log_400, auth):
    """Store configuration and subscribe to the spider_opened signal.

    :param crawler: the running Crawler (used for signals and stats)
    :param splash_base_url: base URL of the Splash instance
    :param slot_policy: one of the ``SlotPolicy`` constants
    :param log_400: whether HTTP 400 responses from Splash are logged
    :param auth: pre-built Basic Auth header value for Splash, or None
    """
    self.crawler = crawler
    self.splash_base_url = splash_base_url
    self.slot_policy = slot_policy
    self.log_400 = log_400
    self.auth = auth
    crawler.signals.connect(self.spider_opened, signals.spider_opened)

@classmethod
def from_crawler(cls, crawler):
    """Build the middleware from crawler settings.

    Reads ``SPLASH_URL``, ``SPLASH_LOG_400``, ``SPLASH_SLOT_POLICY`` and
    the ``SPLASH_USER`` / ``SPLASH_PASS`` credentials.

    :raises NotConfigured: if the configured slot policy is unknown.
    """
    settings = crawler.settings
    splash_base_url = settings.get('SPLASH_URL', cls.default_splash_url)
    log_400 = settings.getbool('SPLASH_LOG_400', True)
    slot_policy = settings.get('SPLASH_SLOT_POLICY', cls.default_policy)
    if slot_policy not in SlotPolicy._known:
        raise NotConfigured("Incorrect slot policy: %r" % slot_policy)

    user = settings.get('SPLASH_USER', '')
    password = settings.get('SPLASH_PASS', '')
    # Only build an Authorization header when credentials are configured.
    auth = basic_auth_header(user, password) if (user or password) else None
    return cls(crawler, splash_base_url, slot_policy, log_400, auth)

def spider_opened(self, spider):
    """Initialise per-spider state when the spider starts.

    If the spider carries HttpAuthMiddleware-style credentials
    (``http_user`` / ``http_pass``), swap RobotsTxtMiddleware for
    SafeRobotsTxtMiddleware so those credentials are not sent with
    robots.txt requests for Splash-bound traffic.
    """
    if _http_auth_enabled(spider):
        replace_downloader_middleware(
            self.crawler, RobotsTxtMiddleware, SafeRobotsTxtMiddleware)
    if not hasattr(spider, 'state'):
        spider.state = {}

Expand All @@ -260,21 +270,24 @@ def _remote_keys(self):
def process_request(self, request, spider):
if 'splash' not in request.meta:
return
splash_options = request.meta['splash']

if request.method not in {'GET', 'POST'}:
logger.warning(
logger.error(
"Currently only GET and POST requests are supported by "
"SplashMiddleware; %(request)s will be handled without Splash",
"SplashMiddleware; %(request)s is dropped",
{'request': request},
extra={'spider': spider}
)
return request
self.crawler.stats.inc_value('splash/dropped/method/{}'.format(
request.method))
raise IgnoreRequest("SplashRequest doesn't support "
"HTTP {} method".format(request.method))

if request.meta.get("_splash_processed"):
# don't process the same request more than once
return

splash_options = request.meta['splash']
request.meta['_splash_processed'] = True

slot_policy = splash_options.get('slot_policy', self.slot_policy)
Expand Down Expand Up @@ -319,6 +332,10 @@ def process_request(self, request, spider):
if not splash_options.get('dont_send_headers'):
headers = scrapy_headers_to_unicode_dict(request.headers)
if headers:
# Headers set by HttpAuthMiddleware should be used for Splash,
# not for the remote website (backwards compatibility).
if _http_auth_enabled(spider):
headers.pop('Authorization', None)
args.setdefault('headers', headers)

body = json.dumps(args, ensure_ascii=False, sort_keys=True, indent=4)
Expand Down Expand Up @@ -353,6 +370,8 @@ def process_request(self, request, spider):
splash_url = urljoin(splash_base_url, endpoint)

headers = Headers({'Content-Type': 'application/json'})
if self.auth is not None:
headers['Authorization'] = self.auth
headers.update(splash_options.get('splash_headers', {}))
new_request = request.replace(
url=splash_url,
Expand All @@ -361,6 +380,7 @@ def process_request(self, request, spider):
headers=headers,
priority=request.priority + self.rescheduling_priority_adjust
)
new_request.meta['dont_obey_robotstxt'] = True
self.crawler.stats.inc_value('splash/%s/request_count' % endpoint)
return new_request

Expand Down Expand Up @@ -478,3 +498,39 @@ def _get_slot_key(self, request_or_response):
return self.crawler.engine.downloader._get_slot_key(
request_or_response, None
)


class SafeRobotsTxtMiddleware(RobotsTxtMiddleware):
    """RobotsTxtMiddleware variant that skips Splash requests.

    Prevents credentials set via ``http_user`` / ``http_pass`` (meant for
    Splash) from being attached to robots.txt downloads.
    """

    def process_request(self, request, spider):
        # Splash-bound requests bypass robots.txt handling entirely.
        is_splash_request = 'splash' in request.meta
        if is_splash_request and _http_auth_enabled(spider):
            return None
        return super(SafeRobotsTxtMiddleware, self).process_request(
            request, spider)


def _http_auth_enabled(spider):
    """Return True if the spider defines HttpAuthMiddleware credentials.

    The original returned the raw attribute value (a truthy string);
    a predicate named ``*_enabled`` should return a bool.

    FIXME: this function should always return False if HttpAuthMiddleware is
    not in a middleware list.
    """
    return bool(getattr(spider, 'http_user', '') or
                getattr(spider, 'http_pass', ''))


def replace_downloader_middleware(crawler, old_cls, new_cls):
    """Replace a downloader middleware of class *old_cls* with *new_cls*.

    Instantiates *new_cls* via ``from_crawler`` and swaps it into both the
    middleware manager's instance tuple and its registered method callbacks.
    Does nothing if *new_cls* raises NotConfigured.
    """
    try:
        replacement = new_cls.from_crawler(crawler)
    except NotConfigured:
        return

    manager = crawler.engine.downloader.middleware
    manager.middlewares = tuple(
        replacement if mw.__class__ is old_cls else mw
        for mw in manager.middlewares
    )
    # The manager also keeps bound methods per hook name; rebind those that
    # point at the old middleware instance.
    for method_name, callbacks in manager.methods.items():
        for index, bound_method in enumerate(callbacks):
            if bound_method.__self__.__class__ is old_cls:
                callbacks[index] = getattr(replacement, method_name)
13 changes: 10 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import os

import pytest
from scrapy.settings import Settings
from .mockserver import MockServer
from .resources import SplashProtected


@pytest.fixture()
def settings(request):
def settings():
""" Default scrapy-splash settings """
s = dict(
# collect scraped items to .collected_items attribute
Expand All @@ -28,6 +29,12 @@ def settings(request):
DUPEFILTER_CLASS='scrapy_splash.SplashAwareDupeFilter',
HTTPCACHE_STORAGE='scrapy_splash.SplashAwareFSCacheStorage',
)
return Settings(s)
return s


@pytest.fixture()
def settings_auth(settings):
    # Extend the base `settings` fixture with a Splash instance sitting
    # behind HTTP Basic Auth (tests.resources.SplashProtected; credentials
    # user / userpass), so tests can exercise SPLASH_USER / SPLASH_PASS.
    with MockServer(SplashProtected) as s:
        print("splash url:", s.root_url)
        settings['SPLASH_URL'] = s.root_url
        yield settings
108 changes: 108 additions & 0 deletions tests/resources.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# -*- coding: utf-8 -*-
import os
from six.moves.urllib.parse import urlparse

from twisted.web.resource import Resource
from zope.interface import implementer
from twisted.web import resource, guard, proxy
from twisted.cred.portal import IRealm, Portal
from twisted.cred.checkers import InMemoryUsernamePasswordDatabaseDontUse

from scrapy_splash.utils import to_bytes


class HtmlResource(Resource):
    """Base Twisted resource that serves a fixed HTML payload on GET.

    Subclasses customise ``html``, ``content_type``, ``extra_headers``
    and ``status_code`` as class attributes.
    """
    isLeaf = True
    content_type = 'text/html'
    html = ''
    extra_headers = {}
    status_code = 200

    def render_GET(self, request):
        request.setHeader(b'content-type', to_bytes(self.content_type))
        for header_name, header_value in self.extra_headers.items():
            request.setHeader(to_bytes(header_name), to_bytes(header_value))
        request.setResponseCode(self.status_code)
        return to_bytes(self.html)


class HelloWorld(HtmlResource):
    # The body text is produced by document.write, so it only appears after
    # JavaScript runs — i.e. when the page is rendered through Splash.
    html = """
<html><body><script>document.write('hello world!');</script></body></html>
"""
    # Extra headers let tests check header/cookie propagation.
    extra_headers = {'X-MyHeader': 'my value', 'Set-Cookie': 'sessionid=ABCD'}


class HelloWorldDisallowByRobots(HelloWorld):
    """HelloWorld that also serves a robots.txt disallowing everything."""
    isLeaf = False

    class RobotsTxt(Resource):
        isLeaf = True

        def render_GET(self, request):
            return b'User-Agent: *\nDisallow: /\n'

    def getChild(self, name, request):
        if name == b"robots.txt":
            return self.RobotsTxt()
        return self


class HelloWorldDisallowAuth(HelloWorldDisallowByRobots):
    """Variant whose disallowing robots.txt is only served to requests that
    carry a Basic Auth header; unauthenticated requests get a 404.
    """

    class RobotsTxt(HelloWorldDisallowByRobots.RobotsTxt):
        def render_GET(self, request):
            authorized = request.requestHeaders.hasHeader('Authorization')
            if not authorized:
                request.setResponseCode(404)
                return b''
            return super(HelloWorldDisallowAuth.RobotsTxt, self).render_GET(
                request)


class Http400Resource(HtmlResource):
    # Always responds with HTTP 400 so error handling can be exercised.
    status_code = 400
    html = "Website returns HTTP 400 error"


class ManyCookies(Resource, object):
    """Two-page site: '/' serves HelloWorld, '/login' sets a login cookie."""

    class SetMyCookie(HtmlResource):
        html = "hello!"
        extra_headers = {'Set-Cookie': 'login=1'}

    def __init__(self):
        super(ManyCookies, self).__init__()
        children = ((b'', HelloWorld()), (b'login', self.SetMyCookie()))
        for path, child in children:
            self.putChild(path, child)


def splash_proxy():
    """Return a factory building a reverse proxy to the Splash instance
    addressed by the ``SPLASH_URL`` environment variable.

    :raises RuntimeError: if ``SPLASH_URL`` is not set.  Previously this
        failed with an obscure ``TypeError`` from ``urlparse(None)``.
    """
    splash_url = os.environ.get('SPLASH_URL')
    if not splash_url:
        raise RuntimeError("SPLASH_URL environment variable is not set")
    parsed = urlparse(splash_url)
    return lambda: proxy.ReverseProxyResource(
        parsed.hostname, int(parsed.port), b'')


def password_protected(resource_cls, username, password):
    """Wrap *resource_cls* behind HTTP Basic Auth with a single account.

    Returns a zero-argument factory producing the guarded resource, built
    from Twisted's cred/guard machinery (realm + portal + credential
    checker + session wrapper).
    """
    @implementer(IRealm)
    class SimpleRealm(object):
        def requestAvatar(self, avatarId, mind, *interfaces):
            if resource.IResource in interfaces:
                return resource.IResource, resource_cls(), lambda: None
            raise NotImplementedError()

    checkers = [InMemoryUsernamePasswordDatabaseDontUse(**{username: password})]
    return lambda: guard.HTTPAuthSessionWrapper(
        Portal(SimpleRealm(), checkers),
        [guard.BasicCredentialFactory(b'example.com')])


# Ready-made password-protected resources (credentials: user / userpass).
# __name__ / __module__ are patched on the returned factories, presumably so
# they display readable names when used as fixtures — TODO confirm.
HelloWorldProtected = password_protected(HelloWorld, 'user', b'userpass')
HelloWorldProtected.__name__ = 'HelloWorldProtected'
HelloWorldProtected.__module__ = __name__

SplashProtected = password_protected(splash_proxy(), 'user', b'userpass')
SplashProtected.__name__ = 'SplashProtected'
SplashProtected.__module__ = __name__
Loading

0 comments on commit 2b253e5

Please sign in to comment.