Skip to content

Commit

Permalink
Implement SPLASH_USER and SPLASH_PASS
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio committed Oct 4, 2021
1 parent 783c58d commit 2b253e5
Show file tree
Hide file tree
Showing 9 changed files with 547 additions and 94 deletions.
13 changes: 13 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,19 @@ Changes
0.8.0 (2021-10-04)
------------------

* **Security bug fix:**

If you use HttpAuthMiddleware_ (i.e. the ``http_user`` and
``http_pass`` spider attributes) for Splash authentication, any non-Splash
request will expose your credentials to the request target. This includes
``robots.txt`` requests sent by Scrapy when the ``ROBOTSTXT_OBEY`` setting
is set to ``True``.

Use the new ``SPLASH_USER`` and ``SPLASH_PASS`` settings instead to set
your Splash authentication credentials safely.

.. _HttpAuthMiddleware: http://doc.scrapy.org/en/latest/topics/downloader-middleware.html#module-scrapy.downloadermiddlewares.httpauth

* Responses now expose the HTTP status code and headers from Splash as
``response.splash_response_status`` and
``response.splash_response_headers`` (#158)
Expand Down
25 changes: 22 additions & 3 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -582,12 +582,31 @@ on Splash server and is not sent with each request (it requires Splash 2.1+)::
HTTP Basic Auth
===============

If you need HTTP Basic Authentication to access Splash, use
Scrapy's HttpAuthMiddleware_.
If you need to use HTTP Basic Authentication to access Splash, use the
``SPLASH_USER`` and ``SPLASH_PASS`` optional settings::

SPLASH_USER = 'user'
SPLASH_PASS = 'userpass'

Another option is ``meta['splash']['splash_headers']``: it allows you to set
custom headers which are sent to the Splash server; add an Authorization header
to ``splash_headers`` if you want to change credentials per-request::

import scrapy
from w3lib.http import basic_auth_header

class MySpider(scrapy.Spider):
# ...
def start_requests(self):
auth = basic_auth_header('user', 'userpass')
yield SplashRequest(url, self.parse,
splash_headers={'Authorization': auth})

**WARNING:** Don't use HttpAuthMiddleware_
(i.e. ``http_user`` / ``http_pass`` spider attributes) for Splash
authentication: if you occasionally send a non-Splash request from your spider,
you may expose Splash credentials to a remote website, as HttpAuthMiddleware
sets credentials for all requests unconditionally.

.. _HttpAuthMiddleware: http://doc.scrapy.org/en/latest/topics/downloader-middleware.html#module-scrapy.downloadermiddlewares.httpauth

Expand Down
1 change: 1 addition & 0 deletions example/scrashtest/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,4 @@
# SPLASH_URL = 'http://192.168.59.103:8050/'
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
ROBOTSTXT_OBEY = True
80 changes: 68 additions & 12 deletions scrapy_splash/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,13 @@
from six.moves.urllib.parse import urljoin
from six.moves.http_cookiejar import CookieJar

from w3lib.http import basic_auth_header
import scrapy
from scrapy.exceptions import NotConfigured
from scrapy.exceptions import NotConfigured, IgnoreRequest
from scrapy.http.headers import Headers
from scrapy.http.response.text import TextResponse
from scrapy import signals
from scrapy.downloadermiddlewares.robotstxt import RobotsTxtMiddleware

from scrapy_splash.responsetypes import responsetypes
from scrapy_splash.cookies import jar_to_har, har_to_jar
Expand Down Expand Up @@ -222,26 +224,34 @@ class SplashMiddleware(object):
retry_498_priority_adjust = +50
remote_keys_key = '_splash_remote_keys'

def __init__(self, crawler, splash_base_url, slot_policy, log_400):
def __init__(self, crawler, splash_base_url, slot_policy, log_400, auth):
    """Store configuration and subscribe to the spider_opened signal.

    :param crawler: the running Crawler (used for signals and stats)
    :param splash_base_url: base URL of the Splash instance
    :param slot_policy: one of the ``SlotPolicy`` constants
    :param log_400: whether HTTP 400 responses from Splash are logged
    :param auth: pre-built Basic Auth header value for Splash, or None
    """
    self.crawler = crawler
    self.splash_base_url = splash_base_url
    self.slot_policy = slot_policy
    self.log_400 = log_400
    self.auth = auth
    crawler.signals.connect(self.spider_opened, signals.spider_opened)

@classmethod
def from_crawler(cls, crawler):
    """Build the middleware from crawler settings.

    Reads ``SPLASH_URL``, ``SPLASH_LOG_400``, ``SPLASH_SLOT_POLICY`` and
    the ``SPLASH_USER`` / ``SPLASH_PASS`` credentials.

    :raises NotConfigured: if the configured slot policy is unknown.
    """
    settings = crawler.settings
    splash_base_url = settings.get('SPLASH_URL', cls.default_splash_url)
    log_400 = settings.getbool('SPLASH_LOG_400', True)
    slot_policy = settings.get('SPLASH_SLOT_POLICY', cls.default_policy)
    if slot_policy not in SlotPolicy._known:
        raise NotConfigured("Incorrect slot policy: %r" % slot_policy)

    user = settings.get('SPLASH_USER', '')
    password = settings.get('SPLASH_PASS', '')
    # Only build an Authorization header when credentials are configured.
    auth = basic_auth_header(user, password) if (user or password) else None
    return cls(crawler, splash_base_url, slot_policy, log_400, auth)

def spider_opened(self, spider):
    """Initialise per-spider state when the spider starts.

    If the spider carries HttpAuthMiddleware-style credentials
    (``http_user`` / ``http_pass``), swap RobotsTxtMiddleware for
    SafeRobotsTxtMiddleware so those credentials are not sent with
    robots.txt requests for Splash-bound traffic.
    """
    if _http_auth_enabled(spider):
        replace_downloader_middleware(
            self.crawler, RobotsTxtMiddleware, SafeRobotsTxtMiddleware)
    if not hasattr(spider, 'state'):
        spider.state = {}

Expand All @@ -260,21 +270,24 @@ def _remote_keys(self):
def process_request(self, request, spider):
if 'splash' not in request.meta:
return
splash_options = request.meta['splash']

if request.method not in {'GET', 'POST'}:
logger.warning(
logger.error(
"Currently only GET and POST requests are supported by "
"SplashMiddleware; %(request)s will be handled without Splash",
"SplashMiddleware; %(request)s is dropped",
{'request': request},
extra={'spider': spider}
)
return request
self.crawler.stats.inc_value('splash/dropped/method/{}'.format(
request.method))
raise IgnoreRequest("SplashRequest doesn't support "
"HTTP {} method".format(request.method))

if request.meta.get("_splash_processed"):
# don't process the same request more than once
return

splash_options = request.meta['splash']
request.meta['_splash_processed'] = True

slot_policy = splash_options.get('slot_policy', self.slot_policy)
Expand Down Expand Up @@ -319,6 +332,10 @@ def process_request(self, request, spider):
if not splash_options.get('dont_send_headers'):
headers = scrapy_headers_to_unicode_dict(request.headers)
if headers:
# Headers set by HttpAuthMiddleware should be used for Splash,
# not for the remote website (backwards compatibility).
if _http_auth_enabled(spider):
headers.pop('Authorization', None)
args.setdefault('headers', headers)

body = json.dumps(args, ensure_ascii=False, sort_keys=True, indent=4)
Expand Down Expand Up @@ -353,6 +370,8 @@ def process_request(self, request, spider):
splash_url = urljoin(splash_base_url, endpoint)

headers = Headers({'Content-Type': 'application/json'})
if self.auth is not None:
headers['Authorization'] = self.auth
headers.update(splash_options.get('splash_headers', {}))
new_request = request.replace(
url=splash_url,
Expand All @@ -361,6 +380,7 @@ def process_request(self, request, spider):
headers=headers,
priority=request.priority + self.rescheduling_priority_adjust
)
new_request.meta['dont_obey_robotstxt'] = True
self.crawler.stats.inc_value('splash/%s/request_count' % endpoint)
return new_request

Expand Down Expand Up @@ -478,3 +498,39 @@ def _get_slot_key(self, request_or_response):
return self.crawler.engine.downloader._get_slot_key(
request_or_response, None
)


class SafeRobotsTxtMiddleware(RobotsTxtMiddleware):
    """RobotsTxtMiddleware variant that skips Splash requests.

    Prevents credentials set via ``http_user`` / ``http_pass`` (meant for
    Splash) from being attached to robots.txt downloads.
    """

    def process_request(self, request, spider):
        # Splash-bound requests bypass robots.txt handling entirely.
        is_splash_request = 'splash' in request.meta
        if is_splash_request and _http_auth_enabled(spider):
            return None
        return super(SafeRobotsTxtMiddleware, self).process_request(
            request, spider)


def _http_auth_enabled(spider):
    """Return True if the spider defines HttpAuthMiddleware credentials.

    The original returned the raw attribute value (a truthy string);
    a predicate named ``*_enabled`` should return a bool.

    FIXME: this function should always return False if HttpAuthMiddleware is
    not in a middleware list.
    """
    return bool(getattr(spider, 'http_user', '') or
                getattr(spider, 'http_pass', ''))


def replace_downloader_middleware(crawler, old_cls, new_cls):
    """Replace a downloader middleware of class *old_cls* with *new_cls*.

    Instantiates *new_cls* via ``from_crawler`` and swaps it into both the
    middleware manager's instance tuple and its registered method callbacks.
    Does nothing if *new_cls* raises NotConfigured.
    """
    try:
        replacement = new_cls.from_crawler(crawler)
    except NotConfigured:
        return

    manager = crawler.engine.downloader.middleware
    manager.middlewares = tuple(
        replacement if mw.__class__ is old_cls else mw
        for mw in manager.middlewares
    )
    # The manager also keeps bound methods per hook name; rebind those that
    # point at the old middleware instance.
    for method_name, callbacks in manager.methods.items():
        for index, bound_method in enumerate(callbacks):
            if bound_method.__self__.__class__ is old_cls:
                callbacks[index] = getattr(replacement, method_name)
13 changes: 10 additions & 3 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import os

import pytest
from scrapy.settings import Settings
from .mockserver import MockServer
from .resources import SplashProtected


@pytest.fixture()
def settings(request):
def settings():
""" Default scrapy-splash settings """
s = dict(
# collect scraped items to .collected_items attribute
Expand All @@ -28,6 +29,12 @@ def settings(request):
DUPEFILTER_CLASS='scrapy_splash.SplashAwareDupeFilter',
HTTPCACHE_STORAGE='scrapy_splash.SplashAwareFSCacheStorage',
)
return Settings(s)
return s


@pytest.fixture()
def settings_auth(settings):
    # Extend the base `settings` fixture with a Splash instance sitting
    # behind HTTP Basic Auth (tests.resources.SplashProtected; credentials
    # user / userpass), so tests can exercise SPLASH_USER / SPLASH_PASS.
    with MockServer(SplashProtected) as s:
        print("splash url:", s.root_url)
        settings['SPLASH_URL'] = s.root_url
        yield settings
108 changes: 108 additions & 0 deletions tests/resources.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# -*- coding: utf-8 -*-
import os
from six.moves.urllib.parse import urlparse

from twisted.web.resource import Resource
from zope.interface import implementer
from twisted.web import resource, guard, proxy
from twisted.cred.portal import IRealm, Portal
from twisted.cred.checkers import InMemoryUsernamePasswordDatabaseDontUse

from scrapy_splash.utils import to_bytes


class HtmlResource(Resource):
    """Base Twisted resource that serves a fixed HTML payload on GET.

    Subclasses customise ``html``, ``content_type``, ``extra_headers``
    and ``status_code`` as class attributes.
    """
    isLeaf = True
    content_type = 'text/html'
    html = ''
    extra_headers = {}
    status_code = 200

    def render_GET(self, request):
        request.setHeader(b'content-type', to_bytes(self.content_type))
        for header_name, header_value in self.extra_headers.items():
            request.setHeader(to_bytes(header_name), to_bytes(header_value))
        request.setResponseCode(self.status_code)
        return to_bytes(self.html)


class HelloWorld(HtmlResource):
    # The body text is produced by document.write, so it only appears after
    # JavaScript runs — i.e. when the page is rendered through Splash.
    html = """
<html><body><script>document.write('hello world!');</script></body></html>
"""
    # Extra headers let tests check header/cookie propagation.
    extra_headers = {'X-MyHeader': 'my value', 'Set-Cookie': 'sessionid=ABCD'}


class HelloWorldDisallowByRobots(HelloWorld):
    """HelloWorld that also serves a robots.txt disallowing everything."""
    isLeaf = False

    class RobotsTxt(Resource):
        isLeaf = True

        def render_GET(self, request):
            return b'User-Agent: *\nDisallow: /\n'

    def getChild(self, name, request):
        if name == b"robots.txt":
            return self.RobotsTxt()
        return self


class HelloWorldDisallowAuth(HelloWorldDisallowByRobots):
    """Variant whose disallowing robots.txt is only served to requests that
    carry a Basic Auth header; unauthenticated requests get a 404.
    """

    class RobotsTxt(HelloWorldDisallowByRobots.RobotsTxt):
        def render_GET(self, request):
            authorized = request.requestHeaders.hasHeader('Authorization')
            if not authorized:
                request.setResponseCode(404)
                return b''
            return super(HelloWorldDisallowAuth.RobotsTxt, self).render_GET(
                request)


class Http400Resource(HtmlResource):
    # Always responds with HTTP 400 so error handling can be exercised.
    status_code = 400
    html = "Website returns HTTP 400 error"


class ManyCookies(Resource, object):
    """Two-page site: '/' serves HelloWorld, '/login' sets a login cookie."""

    class SetMyCookie(HtmlResource):
        html = "hello!"
        extra_headers = {'Set-Cookie': 'login=1'}

    def __init__(self):
        super(ManyCookies, self).__init__()
        children = ((b'', HelloWorld()), (b'login', self.SetMyCookie()))
        for path, child in children:
            self.putChild(path, child)


def splash_proxy():
    """Return a factory building a reverse proxy to the Splash instance
    addressed by the ``SPLASH_URL`` environment variable.

    :raises RuntimeError: if ``SPLASH_URL`` is not set.  Previously this
        failed with an obscure ``TypeError`` from ``urlparse(None)``.
    """
    splash_url = os.environ.get('SPLASH_URL')
    if not splash_url:
        raise RuntimeError("SPLASH_URL environment variable is not set")
    parsed = urlparse(splash_url)
    return lambda: proxy.ReverseProxyResource(
        parsed.hostname, int(parsed.port), b'')


def password_protected(resource_cls, username, password):
    """Wrap *resource_cls* behind HTTP Basic Auth with a single account.

    Returns a zero-argument factory producing the guarded resource, built
    from Twisted's cred/guard machinery (realm + portal + credential
    checker + session wrapper).
    """
    @implementer(IRealm)
    class SimpleRealm(object):
        def requestAvatar(self, avatarId, mind, *interfaces):
            if resource.IResource in interfaces:
                return resource.IResource, resource_cls(), lambda: None
            raise NotImplementedError()

    checkers = [InMemoryUsernamePasswordDatabaseDontUse(**{username: password})]
    return lambda: guard.HTTPAuthSessionWrapper(
        Portal(SimpleRealm(), checkers),
        [guard.BasicCredentialFactory(b'example.com')])


# Ready-made password-protected resources (credentials: user / userpass).
# __name__ / __module__ are patched on the returned factories, presumably so
# they display readable names when used as fixtures — TODO confirm.
HelloWorldProtected = password_protected(HelloWorld, 'user', b'userpass')
HelloWorldProtected.__name__ = 'HelloWorldProtected'
HelloWorldProtected.__module__ = __name__

SplashProtected = password_protected(splash_proxy(), 'user', b'userpass')
SplashProtected.__name__ = 'SplashProtected'
SplashProtected.__module__ = __name__
Loading

0 comments on commit 2b253e5

Please sign in to comment.