Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add timeout for page.content() #541

Merged
merged 1 commit into from
Jul 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions skyvern/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
REPO_ROOT_DIR = SKYVERN_DIR.parent

INPUT_TEXT_TIMEOUT = 120000 # 2 minutes, expressed in milliseconds
PAGE_CONTENT_TIMEOUT = 300 # 5 minutes, expressed in seconds (default timeout of get_page_content)


class ScrapeType(StrEnum):
Expand Down
4 changes: 2 additions & 2 deletions skyvern/forge/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
from skyvern.webeye.actions.models import AgentStepOutput, DetailedAgentStepOutput
from skyvern.webeye.actions.responses import ActionResult
from skyvern.webeye.browser_factory import BrowserState
from skyvern.webeye.scraper.scraper import ElementTreeFormat, ScrapedPage, scrape_website
from skyvern.webeye.scraper.scraper import ElementTreeFormat, ScrapedPage, get_page_content, scrape_website

LOG = structlog.get_logger()

Expand Down Expand Up @@ -786,7 +786,7 @@ async def record_artifacts_after_action(self, task: Task, step: Step, browser_st
)

try:
html = await browser_state.page.content()
html = await get_page_content(browser_state.page)
await app.ARTIFACT_MANAGER.create_artifact(
step=step,
artifact_type=ArtifactType.HTML_ACTION,
Expand Down
19 changes: 17 additions & 2 deletions skyvern/webeye/scraper/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from playwright.async_api import Frame, Page
from pydantic import BaseModel

from skyvern.constants import SKYVERN_DIR, SKYVERN_ID_ATTR
from skyvern.constants import PAGE_CONTENT_TIMEOUT, SKYVERN_DIR, SKYVERN_ID_ATTR
from skyvern.exceptions import FailedToTakeScreenshot, UnknownElementTreeFormat
from skyvern.forge.sdk.settings_manager import SettingsManager
from skyvern.webeye.browser_factory import BrowserState
Expand Down Expand Up @@ -289,6 +289,16 @@ async def scrape_web_unsafe(

text_content = await get_frame_text(page.main_frame)

html = ""
try:
html = await get_page_content(page)
except Exception:
LOG.error(
"Failed out to get HTML content",
url=url,
exc_info=True,
)

return ScrapedPage(
elements=elements,
id_to_xpath_dict=id_to_xpath_dict,
Expand All @@ -298,11 +308,16 @@ async def scrape_web_unsafe(
element_tree_trimmed=trim_element_tree(copy.deepcopy(element_tree)),
screenshots=screenshots,
url=page.url,
html=await page.content(),
html=html,
extracted_text=text_content,
)


async def get_page_content(page: Page, timeout: float = PAGE_CONTENT_TIMEOUT) -> str:
    """Return the result of ``page.content()``, giving up after ``timeout`` seconds.

    Args:
        page: Playwright page whose content is requested.
        timeout: Upper bound, in seconds, on how long to wait for
            ``page.content()`` to finish.

    Returns:
        The string produced by ``page.content()``.

    Raises:
        TimeoutError: If ``page.content()`` does not finish before the
            deadline expires.
    """
    # Bound the call so an unresponsive page cannot stall the caller forever.
    async with asyncio.timeout(timeout):
        html = await page.content()
    return html


async def get_select2_options(page: Page) -> list[dict[str, Any]]:
await page.evaluate(JS_FUNCTION_DEFS)
js_script = "async () => await getSelect2Options()"
Expand Down
Loading