From df3f7125c46b8a15a627e673bdc1900652e5ceeb Mon Sep 17 00:00:00 2001 From: LawyZheng Date: Wed, 3 Jul 2024 17:15:18 +0800 Subject: [PATCH] add timeout for page.content() --- skyvern/constants.py | 1 + skyvern/forge/agent.py | 4 ++-- skyvern/webeye/scraper/scraper.py | 19 +++++++++++++++++-- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/skyvern/constants.py b/skyvern/constants.py index cc43eb212..1fcc73ce6 100644 --- a/skyvern/constants.py +++ b/skyvern/constants.py @@ -7,6 +7,7 @@ REPO_ROOT_DIR = SKYVERN_DIR.parent INPUT_TEXT_TIMEOUT = 120000 # 2 minutes +PAGE_CONTENT_TIMEOUT = 300 # 5 mins, in seconds (not ms like INPUT_TEXT_TIMEOUT) class ScrapeType(StrEnum): diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py index 689ff3348..944745a6d 100644 --- a/skyvern/forge/agent.py +++ b/skyvern/forge/agent.py @@ -52,7 +52,7 @@ from skyvern.webeye.actions.models import AgentStepOutput, DetailedAgentStepOutput from skyvern.webeye.actions.responses import ActionResult from skyvern.webeye.browser_factory import BrowserState -from skyvern.webeye.scraper.scraper import ElementTreeFormat, ScrapedPage, scrape_website +from skyvern.webeye.scraper.scraper import ElementTreeFormat, ScrapedPage, get_page_content, scrape_website LOG = structlog.get_logger() @@ -786,7 +786,7 @@ async def record_artifacts_after_action(self, task: Task, step: Step, browser_st ) try: - html = await browser_state.page.content() + html = await get_page_content(browser_state.page) await app.ARTIFACT_MANAGER.create_artifact( step=step, artifact_type=ArtifactType.HTML_ACTION, diff --git a/skyvern/webeye/scraper/scraper.py b/skyvern/webeye/scraper/scraper.py index 7d7fac13b..f765810c5 100644 --- a/skyvern/webeye/scraper/scraper.py +++ b/skyvern/webeye/scraper/scraper.py @@ -9,7 +9,7 @@ from playwright.async_api import Frame, Page from pydantic import BaseModel -from skyvern.constants import SKYVERN_DIR, SKYVERN_ID_ATTR +from skyvern.constants import PAGE_CONTENT_TIMEOUT, SKYVERN_DIR, SKYVERN_ID_ATTR from skyvern.exceptions 
import FailedToTakeScreenshot, UnknownElementTreeFormat from skyvern.forge.sdk.settings_manager import SettingsManager from skyvern.webeye.browser_factory import BrowserState @@ -289,6 +289,16 @@ async def scrape_web_unsafe( text_content = await get_frame_text(page.main_frame) + html = "" + try: + html = await get_page_content(page) + except Exception: + LOG.error( + "Failed to get HTML content", + url=url, + exc_info=True, + ) + return ScrapedPage( elements=elements, id_to_xpath_dict=id_to_xpath_dict, @@ -298,11 +308,16 @@ async def scrape_web_unsafe( element_tree_trimmed=trim_element_tree(copy.deepcopy(element_tree)), screenshots=screenshots, url=page.url, - html=await page.content(), + html=html, extracted_text=text_content, ) +async def get_page_content(page: Page, timeout: float = PAGE_CONTENT_TIMEOUT) -> str: + async with asyncio.timeout(timeout): + return await page.content() + + async def get_select2_options(page: Page) -> list[dict[str, Any]]: await page.evaluate(JS_FUNCTION_DEFS) js_script = "async () => await getSelect2Options()"