Skip to content

Commit

Permalink
Restart on browser crash (#295)
Browse files Browse the repository at this point in the history
* Browser disconnected callback

* PLAYWRIGHT_RESTART_DISCONNECTED_BROWSER setting

* Document PLAYWRIGHT_RESTART_DISCONNECTED_BROWSER

* Readme adjustment

* Rename tests

* Test browser restart

* Simplify

* Less lines

* Remove browser name from logs
  • Loading branch information
elacuesta authored Jul 11, 2024
1 parent f3b1b25 commit e26d3c1
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 11 deletions.
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,17 @@ def custom_headers(
PLAYWRIGHT_PROCESS_REQUEST_HEADERS = custom_headers
```

### `PLAYWRIGHT_RESTART_DISCONNECTED_BROWSER`
Type `bool`, default `True`

Whether the browser will be restarted if it gets disconnected, for instance if the local
browser crashes or a remote connection times out.
Implemented by listening to the
[`disconnected` Browser event](https://playwright.dev/python/docs/api/class-browser#browser-event-disconnected),
for this reason it does not apply to persistent contexts since
[BrowserType.launch_persistent_context](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch-persistent-context)
returns the context directly.

### `PLAYWRIGHT_MAX_PAGES_PER_CONTEXT`
Type `int`, defaults to the value of Scrapy's `CONCURRENT_REQUESTS` setting

Expand Down
15 changes: 15 additions & 0 deletions scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ class Config:
max_contexts: Optional[int]
startup_context_kwargs: dict
navigation_timeout: Optional[float]
restart_disconnected_browser: bool

@classmethod
def from_settings(cls, settings: Settings) -> "Config":
Expand All @@ -111,6 +112,9 @@ def from_settings(cls, settings: Settings) -> "Config":
navigation_timeout=_get_float_setting(
settings, "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT"
),
restart_disconnected_browser=settings.getbool(
"PLAYWRIGHT_RESTART_DISCONNECTED_BROWSER", default=True
),
)
cfg.cdp_kwargs.pop("endpoint_url", None)
cfg.connect_kwargs.pop("ws_endpoint", None)
Expand Down Expand Up @@ -188,6 +192,7 @@ async def _maybe_launch_browser(self) -> None:
logger.info("Launching browser %s", self.browser_type.name)
self.browser = await self.browser_type.launch(**self.config.launch_options)
logger.info("Browser %s launched", self.browser_type.name)
self.browser.on("disconnected", self._browser_disconnected_callback)

async def _maybe_connect_remote_devtools(self) -> None:
async with self.browser_launch_lock:
Expand All @@ -197,6 +202,7 @@ async def _maybe_connect_remote_devtools(self) -> None:
self.config.cdp_url, **self.config.cdp_kwargs
)
logger.info("Connected using CDP: %s", self.config.cdp_url)
self.browser.on("disconnected", self._browser_disconnected_callback)

async def _maybe_connect_remote(self) -> None:
async with self.browser_launch_lock:
Expand All @@ -206,6 +212,7 @@ async def _maybe_connect_remote(self) -> None:
self.config.connect_url, **self.config.connect_kwargs
)
logger.info("Connected to remote Playwright")
self.browser.on("disconnected", self._browser_disconnected_callback)

async def _create_browser_context(
self,
Expand Down Expand Up @@ -599,6 +606,14 @@ def _increment_response_stats(self, response: PlaywrightResponse) -> None:
self.stats.inc_value(f"{stats_prefix}/resource_type/{response.request.resource_type}")
self.stats.inc_value(f"{stats_prefix}/method/{response.request.method}")

async def _browser_disconnected_callback(self) -> None:
await asyncio.gather(
*[ctx_wrapper.context.close() for ctx_wrapper in self.context_wrappers.values()]
)
logger.debug("Browser disconnected")
if self.config.restart_disconnected_browser:
del self.browser

def _make_close_page_callback(self, context_name: str) -> Callable:
def close_page_callback() -> None:
if context_name in self.context_wrappers:
Expand Down
8 changes: 5 additions & 3 deletions tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import platform
from contextlib import asynccontextmanager
from functools import wraps
from typing import Optional

from scrapy import Request
from scrapy.http.response.html import HtmlResponse
Expand Down Expand Up @@ -37,12 +38,13 @@ def allow_windows(test_method):


@asynccontextmanager
async def make_handler(settings_dict: dict):
async def make_handler(settings_dict: Optional[dict] = None):
"""Convenience function to obtain an initialized handler and close it gracefully"""
from scrapy_playwright.handler import ScrapyPlaywrightDownloadHandler

settings_dict.setdefault("TELNETCONSOLE_ENABLED", False)
crawler = get_crawler(settings_dict=settings_dict)
settings: dict = settings_dict or {}
settings.setdefault("TELNETCONSOLE_ENABLED", False)
crawler = get_crawler(settings_dict=settings)
handler = ScrapyPlaywrightDownloadHandler(crawler=crawler)
try:
await handler._launch()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@ const { chromium } = require('playwright'); // Or 'webkit' or 'firefox'.
port: process.argv[2],
wsPath: process.argv[3]
});
console.log(browserServer.wsEndpoint())
})();
Original file line number Diff line number Diff line change
Expand Up @@ -42,29 +42,29 @@ async def _run_chromium_devtools() -> Tuple[subprocess.Popen, str]:
return proc, devtools_url


def _run_playwright_browser_server() -> Tuple[subprocess.Popen, str]:
def _run_chromium_browser_server() -> Tuple[subprocess.Popen, str]:
"""Start a Playwright server in a separate process, return the process
object and a string with its websocket endpoint.
Pass fixed port and ws path as arguments instead of allowing Playwright
to choose, for some reason I was unable to capture stdout/stderr :shrug:
"""
port = str(random.randint(60_000, 63_000))
ws_path = str(uuid.uuid4())
launch_server_script_path = str(Path(__file__).parent.parent / "launch_browser_server.js")
launch_server_script_path = str(Path(__file__).parent.parent / "launch_chromium_server.js")
command = ["node", launch_server_script_path, port, ws_path]
proc = subprocess.Popen(command) # pylint: disable=consider-using-with
return proc, f"ws://localhost:{port}/{ws_path}"


@asynccontextmanager
async def remote_browser(is_chrome_devtools_protocol: bool = True):
async def remote_chromium(with_devtools_protocol: bool = True):
"""Launch a remote browser that lasts while in the context."""
proc = url = None
try:
if is_chrome_devtools_protocol:
if with_devtools_protocol:
proc, url = await _run_chromium_devtools()
else:
proc, url = _run_playwright_browser_server()
proc, url = _run_chromium_browser_server()
await asyncio.sleep(1) # allow some time for the browser to start
except Exception:
pass
Expand All @@ -77,15 +77,15 @@ async def remote_browser(is_chrome_devtools_protocol: bool = True):
proc.communicate()


class TestRemote(IsolatedAsyncioTestCase):
class TestRemoteBrowser(IsolatedAsyncioTestCase):
@pytest.fixture(autouse=True)
def inject_fixtures(self, caplog):
caplog.set_level(logging.DEBUG)
self._caplog = caplog

@allow_windows
async def test_connect_devtools(self):
async with remote_browser(is_chrome_devtools_protocol=True) as devtools_url:
async with remote_chromium(with_devtools_protocol=True) as devtools_url:
settings_dict = {
"PLAYWRIGHT_CDP_URL": devtools_url,
"PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True},
Expand All @@ -103,7 +103,7 @@ async def test_connect_devtools(self):

@allow_windows
async def test_connect(self):
async with remote_browser(is_chrome_devtools_protocol=False) as browser_url:
async with remote_chromium(with_devtools_protocol=False) as browser_url:
settings_dict = {
"PLAYWRIGHT_CONNECT_URL": browser_url,
"PLAYWRIGHT_LAUNCH_OPTIONS": {"headless": True},
Expand All @@ -128,3 +128,47 @@ async def test_connect(self):
logging.WARNING,
"Connecting to remote browser, ignoring PLAYWRIGHT_LAUNCH_OPTIONS",
) in self._caplog.record_tuples


class TestBrowserReconnect(IsolatedAsyncioTestCase):
@pytest.fixture(autouse=True)
def inject_fixtures(self, caplog):
caplog.set_level(logging.DEBUG)
self._caplog = caplog

@allow_windows
async def test_restart_browser(self):
spider = Spider("foo")
async with make_handler() as handler:
with StaticMockServer() as server:
req1 = Request(
server.urljoin("/index.html"),
meta={"playwright": True, "playwright_include_page": True},
)
resp1 = await handler._download_request(req1, spider)
page = resp1.meta["playwright_page"]
await page.context.browser.close()
req2 = Request(server.urljoin("/gallery.html"), meta={"playwright": True})
resp2 = await handler._download_request(req2, spider)
assert_correct_response(resp1, req1)
assert_correct_response(resp2, req2)
assert (
self._caplog.record_tuples.count(
(
"scrapy-playwright",
logging.DEBUG,
"Browser disconnected",
)
)
== 2 # one mid-crawl after calling Browser.close() manually, one at the end
)
assert (
self._caplog.record_tuples.count(
(
"scrapy-playwright",
logging.INFO,
"Launching browser chromium",
)
)
== 2 # one at the beginning, one after calling Browser.close() manually
)

0 comments on commit e26d3c1

Please sign in to comment.