Skip to content

Commit

Permalink
Improve error handling for Zyte API (#123)
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio authored Aug 2, 2024
1 parent 3435cd7 commit 0fb96a6
Show file tree
Hide file tree
Showing 7 changed files with 114 additions and 38 deletions.
3 changes: 3 additions & 0 deletions .bandit.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
skips:
- B101 # assert_used, needed for mypy
exclude_dirs: ['tests']
15 changes: 10 additions & 5 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,8 @@ jobs:
- python-version: "3.4"
env:
TOXENV: py34
- python-version: "3.5"
env:
TOXENV: py
# 3.5 cannot be tested in CI
# https://github.com/MatteoH2O1999/setup-python/issues/49#issuecomment-2209940822
- python-version: "3.6"
env:
TOXENV: py
Expand All @@ -33,6 +32,9 @@ jobs:
- python-version: "3.10"
env:
TOXENV: py
- python-version: "3.10"
env:
TOXENV: mypy
- python-version: "3.11"
env:
TOXENV: py
Expand All @@ -41,16 +43,19 @@ jobs:
TOXENV: py
- python-version: "3.12"
env:
TOXENV: security
TOXENV: pre-commit
- python-version: "3.12"
env:
TOXENV: docs
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: MatteoH2O1999/setup-python@v2
uses: MatteoH2O1999/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
allow-build: info
cache-build: true
cache: pip
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
repos:
- repo: https://github.com/PyCQA/bandit
rev: 1.7.9
hooks:
- id: bandit
args: [-r, -c, .bandit.yml]
45 changes: 28 additions & 17 deletions scrapy_zyte_smartproxy/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
import warnings
from base64 import urlsafe_b64decode
from collections import defaultdict
from typing import Dict, List
try:
from urllib.request import _parse_proxy
from urllib.request import _parse_proxy # type: ignore
except ImportError:
from urllib2 import _parse_proxy
from urllib2 import _parse_proxy # type: ignore

from six.moves.urllib.parse import urlparse, urlunparse
from w3lib.http import basic_auth_header
Expand Down Expand Up @@ -41,9 +42,9 @@ class ZyteSmartProxyMiddleware(object):
backoff_step = 15
backoff_max = 180
exp_backoff = None
force_enable_on_http_codes = []
force_enable_on_http_codes = [] # type: List[int]
max_auth_retry_times = 10
enabled_for_domain = {}
enabled_for_domain = {} # type: Dict[str, bool]
apikey = ""
zyte_api_to_spm_translations = {
b"zyte-device": b"x-crawlera-profile",
Expand Down Expand Up @@ -261,14 +262,11 @@ def process_request(self, request, spider):

def _is_banned(self, response):
return (
response.status == self.ban_code and
response.headers.get('X-Crawlera-Error') == b'banned'
)

def _is_no_available_proxies(self, response):
return (
response.status == self.ban_code and
response.headers.get('X-Crawlera-Error') == b'noslaves'
response.status == self.ban_code
and response.headers.get('X-Crawlera-Error') == b'banned'
) or (
response.status in {520, 521}
and response.headers.get('Zyte-Error')
)

def _is_auth_error(self, response):
Expand All @@ -277,6 +275,16 @@ def _is_auth_error(self, response):
response.headers.get('X-Crawlera-Error') == b'bad_proxy_auth'
)

def _throttle_error(self, response):
error = response.headers.get('Zyte-Error') or response.headers.get('X-Crawlera-Error')
if (
response.status in {429, 503}
and error
and error != b"banned"
):
return error.decode()
return None

def _process_error(self, response):
if "Zyte-Error" in response.headers:
value = response.headers.get('Zyte-Error')
Expand All @@ -302,17 +310,20 @@ def process_response(self, request, response, spider):
key = self._get_slot_key(request)
self._restore_original_delay(request)

if self._is_no_available_proxies(response) or self._is_auth_error(response):
if self._is_no_available_proxies(response):
reason = 'noslaves'
else:
is_auth_error = self._is_auth_error(response)
throttle_error = self._throttle_error(response)
if is_auth_error or throttle_error:
if is_auth_error:
reason = 'autherror'
else:
assert throttle_error
reason = throttle_error.lstrip("/")
self._set_custom_delay(request, next(self.exp_backoff), reason=reason, targets_zyte_api=targets_zyte_api)
else:
self._inc_stat("delay/reset_backoff", targets_zyte_api=targets_zyte_api)
self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max)

if self._is_auth_error(response):
if is_auth_error:
# When Zyte Smart Proxy Manager has issues it might not be able to
# authenticate users we must retry
retries = request.meta.get('zyte_smartproxy_auth_retry_times', 0)
Expand Down
14 changes: 14 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -1,2 +1,16 @@
[bdist_wheel]
universal=1

[mypy]

[mypy-pytest.*]
ignore_missing_imports = True

[mypy-scrapy.*]
ignore_missing_imports = True

[mypy-twisted.*]
ignore_missing_imports = True

[mypy-w3lib.*]
ignore_missing_imports = True
49 changes: 34 additions & 15 deletions tests/test_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
from random import choice
from unittest import TestCase
try:
from unittest.mock import call, patch
from unittest.mock import call, patch # type: ignore
except ImportError:
from mock import call, patch
from mock import call, patch # type: ignore

from w3lib.http import basic_auth_header
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
Expand Down Expand Up @@ -666,19 +666,24 @@ def test_is_banned(self):
mw = self.mwcls.from_crawler(crawler)
mw.open_spider(self.spider)
req = self._make_fake_request(self.spider, zyte_smartproxy_enabled=True)

res = Response(req.url, status=200)
res = mw.process_response(req, res, self.spider)
self.assertFalse(mw._is_banned(res))
res = Response(req.url, status=503, headers={'X-Crawlera-Error': 'noslaves'})
res = mw.process_response(req, res, self.spider)
self.assertFalse(mw._is_banned(res))
res = Response(req.url, status=503, headers={'Zyte-Error': 'noslaves'})
res = Response(req.url, status=503, headers={'Zyte-Error': '/limits/over-global-limit'})
res = mw.process_response(req, res, self.spider)
self.assertFalse(mw._is_banned(res))

res = Response(req.url, status=503, headers={'X-Crawlera-Error': 'banned'})
res = mw.process_response(req, res, self.spider)
self.assertTrue(mw._is_banned(res))
res = Response(req.url, status=503, headers={'Zyte-Error': 'banned'})
res = Response(req.url, status=520, headers={'Zyte-Error': '/download/temporary-error'})
res = mw.process_response(req, res, self.spider)
self.assertTrue(mw._is_banned(res))
res = Response(req.url, status=521, headers={'Zyte-Error': '/download/internal-error'})
res = mw.process_response(req, res, self.spider)
self.assertTrue(mw._is_banned(res))

Expand Down Expand Up @@ -709,24 +714,38 @@ def test_noslaves_delays(self, random_uniform_patch):
noslaves_req = Request(url, meta={'download_slot': slot_key})
assert mw.process_request(noslaves_req, self.spider) is None
assert httpproxy.process_request(noslaves_req, self.spider) is None
headers = {'X-Crawlera-Error': 'noslaves'}
noslaves_res = self._mock_zyte_smartproxy_response(

# delays grow exponentially with any throttling error
noslaves_response = self._mock_zyte_smartproxy_response(
ban_url,
status=self.bancode,
headers=headers,
status=503,
headers={'X-Crawlera-Error': 'noslaves'},
)

# delays grow exponentially
mw.process_response(noslaves_req, noslaves_res, self.spider)
mw.process_response(noslaves_req, noslaves_response, self.spider)
self.assertEqual(slot.delay, backoff_step)

mw.process_response(noslaves_req, noslaves_res, self.spider)
over_use_limit_response = self._mock_zyte_smartproxy_response(
ban_url,
status=429,
headers={'Zyte-Error': '/limits/over-user-limit'},
)
mw.process_response(noslaves_req, over_use_limit_response, self.spider)
self.assertEqual(slot.delay, backoff_step * 2 ** 1)

mw.process_response(noslaves_req, noslaves_res, self.spider)
over_domain_limit_response = self._mock_zyte_smartproxy_response(
ban_url,
status=429,
headers={'Zyte-Error': '/limits/over-domain-limit'},
)
mw.process_response(noslaves_req, over_domain_limit_response, self.spider)
self.assertEqual(slot.delay, backoff_step * 2 ** 2)

mw.process_response(noslaves_req, noslaves_res, self.spider)
over_global_limit_response = self._mock_zyte_smartproxy_response(
ban_url,
status=503,
headers={'Zyte-Error': '/limits/over-global-limit'},
)
mw.process_response(noslaves_req, over_global_limit_response, self.spider)
self.assertEqual(slot.delay, max_delay)

# other responses reset delay
Expand All @@ -742,7 +761,7 @@ def test_noslaves_delays(self, random_uniform_patch):
mw.process_response(ban_req, ban_res, self.spider)
self.assertEqual(slot.delay, default_delay)

mw.process_response(noslaves_req, noslaves_res, self.spider)
mw.process_response(noslaves_req, noslaves_response, self.spider)
self.assertEqual(slot.delay, backoff_step)

good_req = Request(url, meta={'download_slot': slot_key})
Expand Down
20 changes: 19 additions & 1 deletion tox.ini
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# tox.ini
[tox]
envlist = min,py27,py34,py35,py36,py37,py38,py39,py310,py311,py312,docs,security
envlist = pre-commit,mypy,min,py27,py34,py35,py36,py37,py38,py39,py310,py311,py312,docs

[testenv]
deps =
Expand All @@ -9,6 +9,24 @@ deps =
commands =
py.test --doctest-modules --cov=scrapy_zyte_smartproxy {posargs:scrapy_zyte_smartproxy tests}

[testenv:pre-commit]
basepython = python3
deps =
pre-commit
commands =
pre-commit run {posargs:--all-files}

[testenv:mypy]
basepython = python3.10
deps =
mypy[python2]<0.980
pytest<4.7
twisted<=20.3.0
types-six<1.16.12
w3lib<2
commands =
mypy --py2 {posargs:scrapy_zyte_smartproxy tests}

[testenv:min]
basepython = python2.7
deps =
Expand Down

0 comments on commit 0fb96a6

Please sign in to comment.