From e26d3c1874a1997dd42e158d8925c41d4b7516cb Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta <1731933+elacuesta@users.noreply.github.com> Date: Wed, 10 Jul 2024 23:02:00 -0300 Subject: [PATCH] Restart on browser crash (#295) * Browser disconnected callback * PLAYWRIGHT_RESTART_DISCONNECTED_BROWSER setting * Document PLAYWRIGHT_RESTART_DISCONNECTED_BROWSER * Readme adjustment * Rename tests * Test browser restart * Simplify * Less lines * Remove browser name from logs --- README.md | 11 ++++ scrapy_playwright/handler.py | 15 +++++ tests/__init__.py | 8 ++- ...er_server.js => launch_chromium_server.js} | 1 + .../{test_remote.py => test_browser.py} | 60 ++++++++++++++++--- 5 files changed, 84 insertions(+), 11 deletions(-) rename tests/{launch_browser_server.js => launch_chromium_server.js} (89%) rename tests/tests_asyncio/{test_remote.py => test_browser.py} (68%) diff --git a/README.md b/README.md index 2126ffc4..1242d669 100644 --- a/README.md +++ b/README.md @@ -324,6 +324,17 @@ def custom_headers( PLAYWRIGHT_PROCESS_REQUEST_HEADERS = custom_headers ``` +### `PLAYWRIGHT_RESTART_DISCONNECTED_BROWSER` +Type `bool`, default `True` + +Whether the browser will be restarted if it gets disconnected, for instance if the local +browser crashes or a remote connection times out. +Implemented by listening to the +[`disconnected` Browser event](https://playwright.dev/python/docs/api/class-browser#browser-event-disconnected), +for this reason it does not apply to persistent contexts since +[BrowserType.launch_persistent_context](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch-persistent-context) +returns the context directly. + ### `PLAYWRIGHT_MAX_PAGES_PER_CONTEXT` Type `int`, defaults to the value of Scrapy's `CONCURRENT_REQUESTS` setting diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 2b2ebd58..c5466254 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -91,6 +91,7 @@ class Config: max_contexts: Optional[int] startup_context_kwargs: dict navigation_timeout: Optional[float] + restart_disconnected_browser: bool @classmethod def from_settings(cls, settings: Settings) -> "Config": @@ -111,6 +112,9 @@ def from_settings(cls, settings: Settings) -> "Config": navigation_timeout=_get_float_setting( settings, "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" ), + restart_disconnected_browser=settings.getbool( + "PLAYWRIGHT_RESTART_DISCONNECTED_BROWSER", default=True + ), ) cfg.cdp_kwargs.pop("endpoint_url", None) cfg.connect_kwargs.pop("ws_endpoint", None) @@ -188,6 +192,7 @@ async def _maybe_launch_browser(self) -> None: logger.info("Launching browser %s", self.browser_type.name) self.browser = await self.browser_type.launch(**self.config.launch_options) logger.info("Browser %s launched", self.browser_type.name) + self.browser.on("disconnected", self._browser_disconnected_callback) async def _maybe_connect_remote_devtools(self) -> None: async with self.browser_launch_lock: @@ -197,6 +202,7 @@ async def _maybe_connect_remote_devtools(self) -> None: self.config.cdp_url, **self.config.cdp_kwargs ) logger.info("Connected using CDP: %s", self.config.cdp_url) + self.browser.on("disconnected", self._browser_disconnected_callback) async def _maybe_connect_remote(self) -> None: async with self.browser_launch_lock: @@ -206,6 +212,7 @@ async def _maybe_connect_remote(self) -> None: self.config.connect_url, **self.config.connect_kwargs ) logger.info("Connected to remote Playwright") + self.browser.on("disconnected", self._browser_disconnected_callback) async def _create_browser_context( self, @@ -599,6 +606,14 @@ def _increment_response_stats(self, response: PlaywrightResponse) -> None: self.stats.inc_value(f"{stats_prefix}/resource_type/{response.request.resource_type}") self.stats.inc_value(f"{stats_prefix}/method/{response.request.method}") + async def _browser_disconnected_callback(self) -> None: + await asyncio.gather( + *[ctx_wrapper.context.close() for ctx_wrapper in self.context_wrappers.values()] + ) + logger.debug("Browser disconnected") + if self.config.restart_disconnected_browser: + del self.browser + def _make_close_page_callback(self, context_name: str) -> Callable: def close_page_callback() -> None: if context_name in self.context_wrappers: diff --git a/tests/__init__.py b/tests/__init__.py index 1a2c5704..7f6a9fbe 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -4,6 +4,7 @@ import platform from contextlib import asynccontextmanager from functools import wraps +from typing import Optional from scrapy import Request from scrapy.http.response.html import HtmlResponse @@ -37,12 +38,13 @@ def allow_windows(test_method): @asynccontextmanager -async def make_handler(settings_dict: dict): +async def make_handler(settings_dict: Optional[dict] = None): """Convenience function to obtain an initialized handler and close it gracefully""" from scrapy_playwright.handler import ScrapyPlaywrightDownloadHandler - settings_dict.setdefault("TELNETCONSOLE_ENABLED", False) - crawler = get_crawler(settings_dict=settings_dict) + settings: dict = settings_dict or {} + settings.setdefault("TELNETCONSOLE_ENABLED", False) + crawler = get_crawler(settings_dict=settings) handler = ScrapyPlaywrightDownloadHandler(crawler=crawler) try: await handler._launch() diff --git a/tests/launch_browser_server.js b/tests/launch_chromium_server.js similarity index 89% rename from tests/launch_browser_server.js rename to tests/launch_chromium_server.js index 92447f00..f8087d75 100644 --- a/tests/launch_browser_server.js +++ b/tests/launch_chromium_server.js @@ -10,4 +10,5 @@ const { chromium } = require('playwright'); // Or 'webkit' or 'firefox'. port: process.argv[2], wsPath: process.argv[3] }); + console.log(browserServer.wsEndpoint()) })(); diff --git a/tests/tests_asyncio/test_remote.py b/tests/tests_asyncio/test_browser.py similarity index 68% rename from tests/tests_asyncio/test_remote.py rename to tests/tests_asyncio/test_browser.py index 69502c0d..78e0b26b 100644 --- a/tests/tests_asyncio/test_remote.py +++ b/tests/tests_asyncio/test_browser.py @@ -42,7 +42,7 @@ async def _run_chromium_devtools() -> Tuple[subprocess.Popen, str]: return proc, devtools_url -def _run_playwright_browser_server() -> Tuple[subprocess.Popen, str]: +def _run_chromium_browser_server() -> Tuple[subprocess.Popen, str]: """Start a Playwright server in a separate process, return the process object and a string with its websocket endpoint. Pass fixed port and ws path as arguments instead of allowing Playwright @@ -50,21 +50,21 @@ def _run_playwright_browser_server() -> Tuple[subprocess.Popen, str]: """ port = str(random.randint(60_000, 63_000)) ws_path = str(uuid.uuid4()) - launch_server_script_path = str(Path(__file__).parent.parent / "launch_browser_server.js") + launch_server_script_path = str(Path(__file__).parent.parent / "launch_chromium_server.js") command = ["node", launch_server_script_path, port, ws_path] proc = subprocess.Popen(command) # pylint: disable=consider-using-with return proc, f"ws://localhost:{port}/{ws_path}" @asynccontextmanager -async def remote_browser(is_chrome_devtools_protocol: bool = True): +async def remote_chromium(with_devtools_protocol: bool = True): """Launch a remote browser that lasts while in the context.""" proc = url = None try: - if is_chrome_devtools_protocol: + if with_devtools_protocol: proc, url = await _run_chromium_devtools() else: - proc, url = _run_playwright_browser_server() + proc, url = _run_chromium_browser_server() await asyncio.sleep(1) # allow some time for the browser to start except Exception: pass @@ -77,7 +77,7 @@ async def remote_browser(is_chrome_devtools_protocol: bool = True): proc.communicate() -class TestRemote(IsolatedAsyncioTestCase): +class TestRemoteBrowser(IsolatedAsyncioTestCase): @pytest.fixture(autouse=True) def inject_fixtures(self, caplog): caplog.set_level(logging.DEBUG) @@ -85,7 +85,7 @@ def inject_fixtures(self, caplog): @allow_windows async def test_connect_devtools(self): - async with remote_browser(is_chrome_devtools_protocol=True) as devtools_url: + async with remote_chromium(with_devtools_protocol=True) as devtools_url: settings_dict = { "PLAYWRIGHT_CDP_URL": devtools_url, "PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True}, @@ -103,7 +103,7 @@ async def test_connect_devtools(self): @allow_windows async def test_connect(self): - async with remote_browser(is_chrome_devtools_protocol=False) as browser_url: + async with remote_chromium(with_devtools_protocol=False) as browser_url: settings_dict = { "PLAYWRIGHT_CONNECT_URL": browser_url, "PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True}, @@ -128,3 +128,47 @@ async def test_connect(self): logging.WARNING, "Connecting to remote browser, ignoring PLAYWRIGHT_LAUNCH_OPTIONS", ) in self._caplog.record_tuples + + +class TestBrowserReconnect(IsolatedAsyncioTestCase): + @pytest.fixture(autouse=True) + def inject_fixtures(self, caplog): + caplog.set_level(logging.DEBUG) + self._caplog = caplog + + @allow_windows + async def test_restart_browser(self): + spider = Spider("foo") + async with make_handler() as handler: + with StaticMockServer() as server: + req1 = Request( + server.urljoin("/index.html"), + meta={"playwright": True, "playwright_include_page": True}, + ) + resp1 = await handler._download_request(req1, spider) + page = resp1.meta["playwright_page"] + await page.context.browser.close() + req2 = Request(server.urljoin("/gallery.html"), meta={"playwright": True}) + resp2 = await handler._download_request(req2, spider) + assert_correct_response(resp1, req1) + assert_correct_response(resp2, req2) + assert ( + self._caplog.record_tuples.count( + ( + "scrapy-playwright", + logging.DEBUG, + "Browser disconnected", + ) + ) + == 2 # one mid-crawl after calling Browser.close() manually, one at the end + ) + assert ( + self._caplog.record_tuples.count( + ( + "scrapy-playwright", + logging.INFO, + "Launching browser chromium", + ) + ) + == 2 # one at the beginning, one after calling Browser.close() manually + )