Skip to content

Commit

Permalink
Return proper status and headers for downloads
Browse files: browse the repository at this point in the history
  • Loading branch information
elacuesta committed Jul 6, 2024
1 parent ab103fe commit bde837a
Showing 1 changed file with 36 additions and 22 deletions.
58 changes: 36 additions & 22 deletions scrapy_playwright/handler.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,15 +2,15 @@
import logging
import platform
from contextlib import suppress
from dataclasses import dataclass
from dataclasses import dataclass, field as dataclass_field
from ipaddress import ip_address
from time import time
from typing import Awaitable, Callable, Dict, Optional, Tuple, Type, TypeVar, Union

from playwright.async_api import (
BrowserContext,
BrowserType,
Download,
Download as PlaywrightDownload,
Error as PlaywrightError,
Page,
Playwright as AsyncPlaywright,
Expand Down Expand Up @@ -66,6 +66,19 @@ class BrowserContextWrapper:
persistent: bool


@dataclass
class Download:
    """Mutable record describing a download captured while handling a page.

    An instance starts out "empty" (all defaults) and is meant to be filled
    in place as information becomes available. Truthiness tells callers
    whether anything download-related actually happened: see ``__bool__``.
    """

    # Raw content of the downloaded file.
    body: bytes = b""
    # URL the download was served from.
    url: str = ""
    # File name proposed for the download.
    suggested_filename: str = ""
    # Error captured while processing the download, if any.
    exception: Optional[Exception] = None
    # HTTP status reported for the associated response; defaults to 200.
    response_status: int = 200
    # Response headers; a fresh dict per instance (never shared between instances).
    headers: dict = dataclass_field(default_factory=dict)

    def __bool__(self) -> bool:
        """Return True when there is a non-empty body or a recorded exception.

        ``url``, ``suggested_filename``, ``response_status`` and ``headers``
        do not influence the result, so an instance holding only a status
        and headers is still falsy, and so is one with a zero-byte body and
        no exception.
        """
        if self.body:
            return True
        return bool(self.exception)


@dataclass
class Config:
cdp_url: Optional[str]
Expand Down Expand Up @@ -397,7 +410,7 @@ async def _download_request_with_page(
await _set_redirect_meta(request=request, response=response)
headers = Headers(await response.all_headers())
headers.pop("Content-Encoding", None)
elif not download.get("bytes"):
elif not download:
logger.warning(
"Navigating to %s returned None, the response"
" will have empty headers and status 200",
Expand Down Expand Up @@ -428,20 +441,21 @@ async def _download_request_with_page(
server_addr = await response.server_addr()
server_ip_address = ip_address(server_addr["ipAddress"])

if download.get("exception"):
raise download["exception"]
if download and download.exception:
raise download.exception

if not request.meta.get("playwright_include_page"):
await page.close()
self.stats.inc_value("playwright/page_count/closed")

if download.get("bytes"):
request.meta["playwright_suggested_filename"] = download.get("suggested_filename")
respcls = responsetypes.from_args(url=download["url"], body=download["bytes"])
if download:
request.meta["playwright_suggested_filename"] = download.suggested_filename
respcls = responsetypes.from_args(url=download.url, body=download.body)
return respcls(
url=download["url"],
status=200,
body=download["bytes"],
url=download.url,
status=download.response_status,
headers=Headers(download.headers),
body=download.body,
request=request,
flags=["playwright"],
)
Expand All @@ -461,29 +475,29 @@ async def _download_request_with_page(

async def _get_response_and_download(
self, request: Request, page: Page, spider: Spider
) -> Tuple[Optional[PlaywrightResponse], dict]:
) -> Tuple[Optional[PlaywrightResponse], Optional[Download]]:
response: Optional[PlaywrightResponse] = None
download: dict = {} # updated in-place in _handle_download
download: Download = Download() # updated in-place in _handle_download
download_started = asyncio.Event()
download_ready = asyncio.Event()

async def _handle_download(dwnld: Download) -> None:
async def _handle_download(dwnld: PlaywrightDownload) -> None:
download_started.set()
self.stats.inc_value("playwright/download_count")
try:
if failure := await dwnld.failure():
raise RuntimeError(f"Failed to download {dwnld.url}: {failure}")
download_path = await dwnld.path()
download["bytes"] = download_path.read_bytes()
download["url"] = dwnld.url
download["suggested_filename"] = dwnld.suggested_filename
download.body = (await dwnld.path()).read_bytes()
download.url = dwnld.url
download.suggested_filename = dwnld.suggested_filename
except Exception as ex:
download["exception"] = ex
download.exception = ex
finally:
download_ready.set()

async def _handle_response(response: PlaywrightResponse) -> None:
download["response_status"] = response.status
download.response_status = response.status
download.headers = await response.all_headers()
download_started.set()

page_goto_kwargs = request.meta.get("playwright_page_goto_kwargs") or {}
Expand Down Expand Up @@ -513,7 +527,7 @@ async def _handle_response(response: PlaywrightResponse) -> None:
)
await download_started.wait()

if download.get("response_status") == 204:
if download.response_status == 204:
raise err

logger.debug(
Expand All @@ -531,7 +545,7 @@ async def _handle_response(response: PlaywrightResponse) -> None:
page.remove_listener("download", _handle_download)
page.remove_listener("response", _handle_response)

return response, download
return response, download if download else None

async def _apply_page_methods(self, page: Page, request: Request, spider: Spider) -> None:
context_name = request.meta.get("playwright_context")
Expand Down

0 comments on commit bde837a

Please sign in to comment.