diff --git a/skyvern/exceptions.py b/skyvern/exceptions.py index 1ee85950c..fadc38929 100644 --- a/skyvern/exceptions.py +++ b/skyvern/exceptions.py @@ -464,3 +464,38 @@ def __init__(self, target: str, reason: str | None) -> None: class NoElementBoudingBox(SkyvernException): def __init__(self, element_id: str) -> None: super().__init__(f"Element does not have a bounding box. element_id={element_id}") + + +class NoIncrementalElementFoundForAutoCompletion(SkyvernException): + def __init__(self, element_id: str, text: str) -> None: + super().__init__(f"No auto completion shown up after fill in [{text}]. element_id={element_id}") + + +class NoSuitableAutoCompleteOption(SkyvernException): + def __init__(self, reasoning: str | None, target_value: str) -> None: + super().__init__( + f"No suitable auto complete option to choose. target_value={target_value}, reasoning={reasoning}" + ) + + +class NoAutoCompleteOptionMeetCondition(SkyvernException): + def __init__( + self, reasoning: str | None, required_relevance: float, target_value: str, closest_relevance: float + ) -> None: + super().__init__( + f"No auto complete option meet the condition(relevance_float>{required_relevance}). reasoning={reasoning}, target_value={target_value}, closest_relevance={closest_relevance}" + ) + + +class ErrEmptyTweakValue(SkyvernException): + def __init__(self, reasoning: str | None, current_value: str) -> None: + super().__init__( + f"Empty tweaked value for the current value. reasoning={reasoning}, current_value={current_value}" + ) + + +class FailToFindAutocompleteOption(SkyvernException): + def __init__(self, current_value: str) -> None: + super().__init__( + f"Can't find a suitable auto completion for the current value, maybe retry with another reasonable value. 
current_value={current_value}" + ) diff --git a/skyvern/forge/prompts/skyvern/auto-completion-choose-option.j2 b/skyvern/forge/prompts/skyvern/auto-completion-choose-option.j2 new file mode 100644 index 000000000..ffb445ffc --- /dev/null +++ b/skyvern/forge/prompts/skyvern/auto-completion-choose-option.j2 @@ -0,0 +1,37 @@ +There is an input element on an HTML page. Based on the context and information you're provided, you have two goals: + - Confirm if there is an auto completion attempt showing up after the user inputs the current value. + - If available auto completion suggestions show up, help the user choose the element that's the most relevant to the input value. + +You can confirm an auto completion attempt based on the following rules: + - Several auto completion suggestions show up for the input value. + - Some messages, like "No results", "No match", also indicate an attempt to give auto completion suggestions. + +Potential auto completion suggestions can only be: + - Element with ID from "HTML elements". Don't hallucinate any potential option outside "HTML elements". + +MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. +Each interactable element is tagged with an ID. + +Reply in JSON format with the following keys: +{ + "auto_completion_attempt": bool, // True if there's any auto completion attempt based on the rules. Otherwise, it should be False. + "reasoning": str, // The reasoning behind the decision. Be specific, referencing input value and element ids in your reasoning. Mention why you chose the element id. Keep the reasoning short and to the point. + "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence. + "relevance_float": float, // The relevance between the input value and the element. Pick a number between 0.00 and 1.00. 
0.00 means no relevance, 1.00 means full relevance, the precision is 0.01. + "id": str, // The id of the most relevant and interactable element to take the action. The id must be from "HTML elements". It should be null if no element is relevant or there's no auto completion suggestion. +} + +Context: +``` +{{ context_reasoning }} +``` + +Input value: +``` +{{ filled_value }} +``` + +HTML elements: +``` +{{ elements }} +``` \ No newline at end of file diff --git a/skyvern/forge/prompts/skyvern/auto-completion-potential-answers.j2 b/skyvern/forge/prompts/skyvern/auto-completion-potential-answers.j2 new file mode 100644 index 000000000..23aafabf5 --- /dev/null +++ b/skyvern/forge/prompts/skyvern/auto-completion-potential-answers.j2 @@ -0,0 +1,29 @@ +You're doing an auto completion input action on an HTML page. The current filled value doesn't match any option. +Based on the context and current value, give the ten most likely values with the same meaning as the current value. +You can provide values like: + - A subset or superset of the meaning of the current value + - A summary of the current value + - The current value with overly detailed information removed, making it more general and concise +But don't add any extra information to the value. + +MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. +Reply in JSON format with the following keys: +{ + "potential_values": [ + { + "reasoning": str, // the reasoning why you recommend this value, including the relationship between the value you recommend and the current value. Keep the reasoning short and to the point. + "relevance_float": float, // The relevance between the value you recommend and the current value. Pick a number between 0.00 and 1.00. 0.00 means no relevance, 1.00 means full relevance, the precision is 0.01. + "value": str, // the value you recommend + } + ], // The list of potential values. 
Sorted by the descending order of relevance_float +} + +Context: +``` +{{ context_reasoning }} +``` + +Current Value: +``` +{{ current_value }} +``` \ No newline at end of file diff --git a/skyvern/forge/prompts/skyvern/auto-completion-tweak-value.j2 b/skyvern/forge/prompts/skyvern/auto-completion-tweak-value.j2 new file mode 100644 index 000000000..b8d893355 --- /dev/null +++ b/skyvern/forge/prompts/skyvern/auto-completion-tweak-value.j2 @@ -0,0 +1,38 @@ +You're doing an auto completion input action on HTML page. User has tried several values, but none of them could find a match. +Based on the context, current value, tried values, option elements popped up while typing, tweak the value into a reasonable one based on the information. +You can try to change the value under the following rules: + 1. the value must be reasonably changed from the current value, like superset, subset of the current value + 2. If there're popped up elements, find the common concept among all elements, and then tweak the current value into a reasonable value based on the same concept. + +Don't add any extra information to the value. +Don't use any value from the popped up elements. + +MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. +Reply in JSON format with the following keys: +{ + "is_any_popped_up_elements": bool, // if there's any popped up elements to extract the concept + "common_concept": str, // Simple words to describe the common concept among all elements. null if there's no popped up elements. + "reasoning": str, // The reasoning behind the change. Be specific, referencing tweaked value in your reasoning. Mention why you make this decision. Keep the reasoning short and to the point. + "confidence_float": float, // The confidence of the decision. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence + "tweaked_value": str, // the value tweaked from current value. 
If common_concept is not null, the value should also be under the same concept +} + +Context: +``` +{{ context_reasoning }} +``` + +Current Value: +``` +{{ current_value }} +``` + +Tried Values: +``` +{{ tried_values }} +``` + +Popped up elements: +``` +{{ popped_up_elements }} +``` \ No newline at end of file diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index 15bc4f434..2bec132bd 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -1,4 +1,5 @@ import asyncio +import copy import json import os import urllib.parse @@ -9,13 +10,16 @@ import structlog from deprecation import deprecated from playwright.async_api import FileChooser, Locator, Page, TimeoutError +from pydantic import BaseModel -from skyvern.constants import REPO_ROOT_DIR, VERIFICATION_CODE_POLLING_TIMEOUT_MINS +from skyvern.constants import REPO_ROOT_DIR, SKYVERN_ID_ATTR, VERIFICATION_CODE_POLLING_TIMEOUT_MINS from skyvern.exceptions import ( EmptySelect, + ErrEmptyTweakValue, ErrFoundSelectableElement, FailedToFetchSecret, FailToClick, + FailToFindAutocompleteOption, FailToSelectByIndex, FailToSelectByLabel, FailToSelectByValue, @@ -24,9 +28,12 @@ MissingElement, MissingFileUrl, MultipleElementsFound, + NoAutoCompleteOptionMeetCondition, NoElementMatchedForTargetOption, + NoIncrementalElementFoundForAutoCompletion, NoIncrementalElementFoundForCustomSelection, NoLabelOrValueForCustomSelection, + NoSuitableAutoCompleteOption, OptionIndexOutOfBound, WrongElementToUploadFile, ) @@ -59,7 +66,13 @@ ) from skyvern.webeye.actions.responses import ActionFailure, ActionResult, ActionSuccess from skyvern.webeye.browser_factory import BrowserState, get_download_dir -from skyvern.webeye.scraper.scraper import ElementTreeFormat, IncrementalScrapePage, ScrapedPage +from skyvern.webeye.scraper.scraper import ( + ElementTreeFormat, + IncrementalScrapePage, + ScrapedPage, + json_to_html, + trim_element_tree, +) from skyvern.webeye.utils.dom import 
DomUtil, InteractiveElement, SkyvernElement from skyvern.webeye.utils.page import SkyvernFrame @@ -67,6 +80,12 @@ COMMON_INPUT_TAGS = {"input", "textarea", "select"} +class AutoCompletionResult(BaseModel): + auto_completion_attempt: bool = False + incremental_elements: list[dict] = [] + action_result: ActionResult = ActionSuccess() + + class ActionHandler: _handled_action_types: dict[ ActionType, @@ -290,6 +309,7 @@ async def handle_input_text_action( dom = DomUtil(scraped_page, page) skyvern_element = await dom.get_skyvern_element_by_id(action.element_id) skyvern_frame = await SkyvernFrame.create_instance(skyvern_element.get_frame()) + incremental_scraped = IncrementalScrapePage(skyvern_frame=skyvern_frame) timeout = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS current_text = await get_input_value(skyvern_element.get_tag_name(), skyvern_element.get_locator()) @@ -319,7 +339,6 @@ async def handle_input_text_action( return await handle_select_option_action(select_action, page, scraped_page, task, step) # press arrowdown to watch if there's any options popping up - incremental_scraped = IncrementalScrapePage(skyvern_frame=skyvern_frame) await incremental_scraped.start_listen_dom_increment() await skyvern_element.get_locator().focus(timeout=timeout) await skyvern_element.get_locator().press("ArrowDown", timeout=timeout) @@ -376,12 +395,26 @@ async def handle_input_text_action( LOG.warning("Failed to clear the input field", action=action, exc_info=True) return [ActionFailure(InvalidElementForTextInput(element_id=action.element_id, tag_name=tag_name))] + # TODO: not sure if this case will trigger auto-completion if tag_name not in COMMON_INPUT_TAGS: await skyvern_element.input_fill(text) return [ActionSuccess()] - # If the input is a text input, we type the text character by character - # 3 times the time it takes to type the text so it has time to finish typing + if len(text) == 0: + return [ActionSuccess()] + + if await 
skyvern_element.is_auto_completion_input(): + result = await input_or_auto_complete_input( + action=action, + page=page, + dom=dom, + text=text, + skyvern_element=skyvern_element, + step=step, + task=task, + ) + return [result] + await skyvern_element.input_sequentially(text=text) return [ActionSuccess()] @@ -848,6 +881,282 @@ async def fc_func(fc: FileChooser) -> None: return [ActionFailure(WrongElementToUploadFile(action.element_id))] +def remove_exist_elements(dom: DomUtil, element_tree: list[dict]) -> list[dict]: + new_element_tree = [] + for element in element_tree: + children_elements = element.get("children", []) + if len(children_elements) > 0: + children_elements = remove_exist_elements(dom=dom, element_tree=children_elements) + if dom.check_id_in_dom(element.get("id", "")): + new_element_tree.extend(children_elements) + else: + element["children"] = children_elements + new_element_tree.append(element) + return new_element_tree + + +async def choose_auto_completion_dropdown( + action: actions.InputTextAction, + page: Page, + dom: DomUtil, + text: str, + skyvern_element: SkyvernElement, + step: Step, + task: Task, + preserved_elements: list[dict] | None = None, + relevance_threshold: float = 0.8, +) -> AutoCompletionResult: + preserved_elements = preserved_elements or [] + clear_input = True + result = AutoCompletionResult() + + current_frame = skyvern_element.get_frame() + skyvern_frame = await SkyvernFrame.create_instance(current_frame) + incremental_scraped = IncrementalScrapePage(skyvern_frame=skyvern_frame) + await incremental_scraped.start_listen_dom_increment() + + try: + await skyvern_element.press_fill(text) + # wait for new elemnts to load + await asyncio.sleep(5) + incremental_element = await incremental_scraped.get_incremental_element_tree( + app.AGENT_FUNCTION.cleanup_element_tree_factory(task=task, step=step) + ) + incremental_element = remove_exist_elements(dom=dom, element_tree=incremental_element) + + # check if elements in preserve list 
are still on the page + confirmed_preserved_list: list[dict] = [] + for element in preserved_elements: + element_id = element.get("id") + if not element_id: + continue + locator = current_frame.locator(f'[{SKYVERN_ID_ATTR}="{element_id}"]') + cnt = await locator.count() + if cnt == 0: + continue + + element_handler = await locator.element_handle( + timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS + ) + if not element_handler: + continue + + current_element = await skyvern_frame.parse_element_from_html( + skyvern_element.get_frame_id(), element_handler, skyvern_element.is_interactable() + ) + confirmed_preserved_list.append(current_element) + + if len(confirmed_preserved_list) > 0: + confirmed_preserved_list = await app.AGENT_FUNCTION.cleanup_element_tree_factory(task=task, step=step)( + skyvern_frame.get_frame().url, copy.deepcopy(confirmed_preserved_list) + ) + confirmed_preserved_list = trim_element_tree(copy.deepcopy(confirmed_preserved_list)) + + incremental_element.extend(confirmed_preserved_list) + + result.incremental_elements = copy.deepcopy(incremental_element) + if len(incremental_element) == 0: + raise NoIncrementalElementFoundForAutoCompletion(element_id=skyvern_element.get_id(), text=text) + + html = incremental_scraped.build_html_tree(incremental_element) + auto_completion_confirm_prompt = prompt_engine.load_prompt( + "auto-completion-choose-option", + context_reasoning=action.reasoning, + filled_value=text, + elements=html, + ) + LOG.info( + "Confirm if it's an auto completion dropdown", + step_id=step.step_id, + task_id=task.task_id, + ) + json_response = await app.SECONDARY_LLM_API_HANDLER(prompt=auto_completion_confirm_prompt, step=step) + element_id = json_response.get("id", "") + relevance_float = json_response.get("relevance_float", 0) + if not element_id: + reasoning = json_response.get("reasoning") + raise NoSuitableAutoCompleteOption(reasoning=reasoning, target_value=text) + + if relevance_float < relevance_threshold: + 
LOG.info( + f"The closest option doesn't meet the condition(relevance_float>={relevance_threshold})", + element_id=element_id, + relevance_float=relevance_float, + ) + reasoning = json_response.get("reasoning") + raise NoAutoCompleteOptionMeetCondition( + reasoning=reasoning, + required_relevance=relevance_threshold, + target_value=text, + closest_relevance=relevance_float, + ) + + LOG.info( + "Find a suitable option to choose", + element_id=element_id, + step_id=step.step_id, + task_id=task.task_id, + ) + + locator = current_frame.locator(f'[{SKYVERN_ID_ATTR}="{element_id}"]') + if await locator.count() == 0: + raise MissingElement(element_id=element_id) + + await locator.click(timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS) + clear_input = False + return result + except Exception as e: + LOG.info( + "Failed to choose the auto completion dropdown", + exc_info=True, + input_value=text, + task_id=task.task_id, + step_id=step.step_id, + ) + result.action_result = ActionFailure(exception=e) + return result + finally: + await incremental_scraped.stop_listen_dom_increment() + if clear_input: + await skyvern_element.input_clear() + + +async def input_or_auto_complete_input( + action: actions.InputTextAction, + page: Page, + dom: DomUtil, + text: str, + skyvern_element: SkyvernElement, + step: Step, + task: Task, +) -> ActionResult: + LOG.info( + "Trigger auto completion", + task_id=task.task_id, + step_id=step.step_id, + element_id=skyvern_element.get_id(), + ) + + # 1. press the orignal text to see if there's a match + # 2. call LLM to find 5 potential values based on the orginal text + # 3. try each potential values from #2 + # 4. 
call LLM to tweak the orignal text according to the information from #3, then start #1 again + + # FIXME: try the whole loop for twice now, to prevent too many LLM calls + MAX_AUTO_COMPLETE_ATTEMP = 2 + current_attemp = 0 + context_reasoning = action.reasoning + current_value = text + result = AutoCompletionResult() + + while current_attemp < MAX_AUTO_COMPLETE_ATTEMP: + current_attemp += 1 + whole_new_elements: list[dict] = [] + tried_values: list[str] = [] + + LOG.info( + "Try the potential value for auto completion", + step_id=step.step_id, + task_id=task.task_id, + input_value=current_value, + ) + result = await choose_auto_completion_dropdown( + action=action, + page=page, + dom=dom, + text=current_value, + preserved_elements=result.incremental_elements, + skyvern_element=skyvern_element, + step=step, + task=task, + ) + if isinstance(result.action_result, ActionSuccess): + return ActionSuccess() + + tried_values.append(current_value) + whole_new_elements.extend(result.incremental_elements) + + prompt = prompt_engine.load_prompt( + "auto-completion-potential-answers", + context_reasoning=context_reasoning, + current_value=current_value, + ) + + LOG.info( + "Ask LLM to give 10 potential values based on the current value", + current_value=current_value, + step_id=step.step_id, + task_id=task.task_id, + ) + json_respone = await app.SECONDARY_LLM_API_HANDLER(prompt=prompt, step=step) + values: list[dict] = json_respone.get("potential_values", []) + + for each_value in values: + value: str = each_value.get("value", "") + if not value: + LOG.info( + "Empty potential value, skip this attempt", + step_id=step.step_id, + task_id=task.task_id, + value=each_value, + ) + continue + LOG.info( + "Try the potential value for auto completion", + step_id=step.step_id, + task_id=task.task_id, + input_value=value, + ) + result = await choose_auto_completion_dropdown( + action=action, + page=page, + dom=dom, + text=value, + preserved_elements=result.incremental_elements, + 
skyvern_element=skyvern_element, + step=step, + task=task, + ) + if isinstance(result.action_result, ActionSuccess): + return ActionSuccess() + + tried_values.append(value) + whole_new_elements.extend(result.incremental_elements) + + if current_attemp < MAX_AUTO_COMPLETE_ATTEMP: + LOG.info( + "Ask LLM to tweak the current value based on tried input values", + step_id=step.step_id, + task_id=task.task_id, + current_value=current_value, + current_attemp=current_attemp, + ) + prompt = prompt_engine.load_prompt( + "auto-completion-tweak-value", + context_reasoning=context_reasoning, + current_value=current_value, + tried_values=json.dumps(tried_values), + popped_up_elements="".join([json_to_html(element) for element in whole_new_elements]), + ) + json_respone = await app.SECONDARY_LLM_API_HANDLER(prompt=prompt, step=step) + context_reasoning = json_respone.get("reasoning") + new_current_value = json_respone.get("tweaked_value", "") + if not new_current_value: + return ActionFailure(ErrEmptyTweakValue(reasoning=context_reasoning, current_value=current_value)) + LOG.info( + "Ask LLM tweaked the current value with a new value", + step_id=step.step_id, + task_id=task.task_id, + reasoning=context_reasoning, + current_value=current_value, + new_value=new_current_value, + ) + current_value = new_current_value + + else: + return ActionFailure(FailToFindAutocompleteOption(current_value=text)) + + async def select_from_dropdown( action: SelectOptionAction, page: Page, diff --git a/skyvern/webeye/scraper/domUtils.js b/skyvern/webeye/scraper/domUtils.js index 0c9146caa..d8eca8589 100644 --- a/skyvern/webeye/scraper/domUtils.js +++ b/skyvern/webeye/scraper/domUtils.js @@ -877,108 +877,104 @@ function uniqueId() { return result; } -async function buildTreeFromBody(frame = "main.frame", open_select = false) { - return buildElementTree(document.body, frame, open_select); -} - -async function buildElementTree( - starter = document.body, - frame = "main.frame", - open_select = false, -) 
{ - var elements = []; - var resultArray = []; - - async function buildElementObject(element, interactable) { - var element_id = element.getAttribute("unique_id") ?? uniqueId(); - var elementTagNameLower = element.tagName.toLowerCase(); - element.setAttribute("unique_id", element_id); - - const attrs = {}; - for (const attr of element.attributes) { - var attrValue = attr.value; - if ( - attr.name === "required" || - attr.name === "aria-required" || - attr.name === "checked" || - attr.name === "aria-checked" || - attr.name === "selected" || - attr.name === "aria-selected" || - attr.name === "readonly" || - attr.name === "aria-readonly" - ) { - if (attrValue && attrValue.toLowerCase() === "false") { - attrValue = false; - } else { - attrValue = true; - } - } - attrs[attr.name] = attrValue; - } - +function buildElementObject(frame, element, interactable) { + var element_id = element.getAttribute("unique_id") ?? uniqueId(); + var elementTagNameLower = element.tagName.toLowerCase(); + element.setAttribute("unique_id", element_id); + + const attrs = {}; + for (const attr of element.attributes) { + var attrValue = attr.value; if ( - checkRequiredFromStyle(element) && - !attrs["required"] && - !attrs["aria-required"] + attr.name === "required" || + attr.name === "aria-required" || + attr.name === "checked" || + attr.name === "aria-checked" || + attr.name === "selected" || + attr.name === "aria-selected" || + attr.name === "readonly" || + attr.name === "aria-readonly" ) { - attrs["required"] = true; - } - - if (elementTagNameLower === "input" || elementTagNameLower === "textarea") { - if (element.type === "radio") { - attrs["value"] = "" + element.checked + ""; + if (attrValue && attrValue.toLowerCase() === "false") { + attrValue = false; } else { - attrs["value"] = element.value; + attrValue = true; } } + attrs[attr.name] = attrValue; + } - let elementObj = { - id: element_id, - frame: frame, - interactable: interactable, - tagName: elementTagNameLower, - attributes: 
attrs, - text: getElementContent(element), - children: [], - rect: DomUtils.getVisibleClientRect(element, true), - // don't trim any attr of this element if keepAllAttr=True - keepAllAttr: - elementTagNameLower === "svg" || element.closest("svg") !== null, - isSelectable: - elementTagNameLower === "select" || - isSelect2Dropdown(element) || - isSelect2MultiChoice(element), - isScrollable: isScrollable(element), - }; + if ( + checkRequiredFromStyle(element) && + !attrs["required"] && + !attrs["aria-required"] + ) { + attrs["required"] = true; + } - let isInShadowRoot = element.getRootNode() instanceof ShadowRoot; - if (isInShadowRoot) { - let shadowHostEle = element.getRootNode().host; - let shadowHostId = shadowHostEle.getAttribute("unique_id"); - // assign shadowHostId to the shadowHost element if it doesn't have unique_id - if (!shadowHostId) { - shadowHostId = uniqueId(); - shadowHostEle.setAttribute("unique_id", shadowHostId); - } - elementObj.shadowHost = shadowHostId; + if (elementTagNameLower === "input" || elementTagNameLower === "textarea") { + if (element.type === "radio") { + attrs["value"] = "" + element.checked + ""; + } else { + attrs["value"] = element.value; } + } - // get options for select element or for listbox element - let selectOptions = null; - let selectedValue = ""; - if (elementTagNameLower === "select") { - [selectOptions, selectedValue] = getSelectOptions(element); - } + let elementObj = { + id: element_id, + frame: frame, + interactable: interactable, + tagName: elementTagNameLower, + attributes: attrs, + text: getElementContent(element), + children: [], + rect: DomUtils.getVisibleClientRect(element, true), + // don't trim any attr of this element if keepAllAttr=True + keepAllAttr: + elementTagNameLower === "svg" || element.closest("svg") !== null, + isSelectable: + elementTagNameLower === "select" || + isSelect2Dropdown(element) || + isSelect2MultiChoice(element), + isScrollable: isScrollable(element), + }; - if (selectOptions) { - 
elementObj.options = selectOptions; - } - if (selectedValue) { - elementObj.attributes["selected"] = selectedValue; + let isInShadowRoot = element.getRootNode() instanceof ShadowRoot; + if (isInShadowRoot) { + let shadowHostEle = element.getRootNode().host; + let shadowHostId = shadowHostEle.getAttribute("unique_id"); + // assign shadowHostId to the shadowHost element if it doesn't have unique_id + if (!shadowHostId) { + shadowHostId = uniqueId(); + shadowHostEle.setAttribute("unique_id", shadowHostId); } + elementObj.shadowHost = shadowHostId; + } + + // get options for select element or for listbox element + let selectOptions = null; + let selectedValue = ""; + if (elementTagNameLower === "select") { + [selectOptions, selectedValue] = getSelectOptions(element); + } - return elementObj; + if (selectOptions) { + elementObj.options = selectOptions; } + if (selectedValue) { + elementObj.attributes["selected"] = selectedValue; + } + + return elementObj; +} + +function buildTreeFromBody(frame = "main.frame", open_select = false) { + return buildElementTree(document.body, frame, open_select); +} + +function buildElementTree(starter = document.body, frame = "main.frame") { + var elements = []; + var resultArray = []; function getChildElements(element) { if (element.childElementCount !== 0) { @@ -987,7 +983,7 @@ async function buildElementTree( return []; } } - async function processElement(element, parentId) { + function processElement(element, parentId) { if (element === null) { console.log("get a null element"); return; @@ -1008,7 +1004,7 @@ async function buildElementTree( // Check if the element is interactable if (isInteractable(element)) { - var elementObj = await buildElementObject(element, true); + var elementObj = buildElementObject(frame, element, true); elements.push(elementObj); // If the element is interactable but has no interactable parent, // then it starts a new tree, so add it to the result array @@ -1029,24 +1025,24 @@ async function buildElementTree( 
const children = getChildElements(element); for (let i = 0; i < children.length; i++) { const childElement = children[i]; - await processElement(childElement, elementObj.id); + processElement(childElement, elementObj.id); } return elementObj; } else if (element.tagName.toLowerCase() === "iframe") { - let iframeElementObject = await buildElementObject(element, false); + let iframeElementObject = buildElementObject(frame, element, false); elements.push(iframeElementObject); resultArray.push(iframeElementObject); } else if (element.shadowRoot) { // shadow host element - let shadowHostElement = await buildElementObject(element, false); + let shadowHostElement = buildElementObject(frame, element, false); elements.push(shadowHostElement); resultArray.push(shadowHostElement); const children = getChildElements(element.shadowRoot); for (let i = 0; i < children.length; i++) { const childElement = children[i]; - await processElement(childElement, shadowHostElement.id); + processElement(childElement, shadowHostElement.id); } } else { // For a non-interactable element, if it has direct text, we also tagged @@ -1063,14 +1059,14 @@ async function buildElementTree( let isParentSVG = element.closest("svg"); if (element.tagName.toLowerCase() === "svg") { // if element is we save all attributes and its children - elementObj = await buildElementObject(element, false); + elementObj = buildElementObject(frame, element, false); } else if (isParentSVG && isParentSVG.getAttribute("unique_id")) { // if elemnet is the children of the with an unique_id - elementObj = await buildElementObject(element, false); + elementObj = buildElementObject(frame, element, false); } else if (isTableRelatedElement(element)) { // build all table related elements into skyvern element // we need these elements to preserve the DOM structure - elementObj = await buildElementObject(element, false); + elementObj = buildElementObject(frame, element, false); } else { // character length limit for non-interactable 
elements should be 5000 // we don't use element context in HTML format, @@ -1083,7 +1079,7 @@ async function buildElementTree( } } if (textContent && textContent.length <= 5000) { - elementObj = await buildElementObject(element, false); + elementObj = buildElementObject(frame, element, false); } } @@ -1104,7 +1100,7 @@ async function buildElementTree( const children = getChildElements(element); for (let i = 0; i < children.length; i++) { const childElement = children[i]; - await processElement(childElement, parentId); + processElement(childElement, parentId); } } } @@ -1313,7 +1309,7 @@ async function buildElementTree( }; // setup before parsing the dom - await processElement(starter, null); + processElement(starter, null); for (var element of elements) { if ( @@ -1545,17 +1541,17 @@ function removeBoundingBoxes() { } } -async function scrollToTop(draw_boxes) { +function scrollToTop(draw_boxes) { removeBoundingBoxes(); window.scroll({ left: 0, top: 0, behavior: "instant" }); if (draw_boxes) { - var elementsAndResultArray = await buildTreeFromBody(); + var elementsAndResultArray = buildTreeFromBody(); drawBoundingBoxes(elementsAndResultArray[0]); } return window.scrollY; } -async function scrollToNextPage(draw_boxes) { +function scrollToNextPage(draw_boxes) { // remove bounding boxes, scroll to next page with 200px overlap, then draw bounding boxes again // return true if there is a next page, false otherwise removeBoundingBoxes(); @@ -1565,7 +1561,7 @@ async function scrollToNextPage(draw_boxes) { behavior: "instant", }); if (draw_boxes) { - var elementsAndResultArray = await buildTreeFromBody(); + var elementsAndResultArray = buildTreeFromBody(); drawBoundingBoxes(elementsAndResultArray[0]); } return window.scrollY; @@ -1688,7 +1684,7 @@ function stopGlobalIncrementalObserver() { window.globalOneTimeIncrementElements = []; } -async function getIncrementElements(frame) { +function getIncrementElements(frame) { const domDepthMap = new Map(); for (const element of 
window.globalOneTimeIncrementElements) { @@ -1700,7 +1696,7 @@ async function getIncrementElements(frame) { } for (const child of element.newNodes) { - const [_, newNodeTree] = await buildElementTree(child, frame, false); + const [_, newNodeTree] = buildElementTree(child, frame, false); if (newNodeTree.length > 0) { newNodesTreeList.push(...newNodeTree); } diff --git a/skyvern/webeye/scraper/scraper.py b/skyvern/webeye/scraper/scraper.py index 7d648ba30..cbdeb6f5c 100644 --- a/skyvern/webeye/scraper/scraper.py +++ b/skyvern/webeye/scraper/scraper.py @@ -337,7 +337,7 @@ async def get_interactable_element_tree_in_frame( unique_id = await frame_element.get_attribute("unique_id") - frame_js_script = f"async () => await buildTreeFromBody('{unique_id}', true)" + frame_js_script = f"() => buildTreeFromBody('{unique_id}', true)" await frame.evaluate(JS_FUNCTION_DEFS) frame_elements, frame_element_tree = await frame.evaluate(frame_js_script) @@ -373,7 +373,7 @@ async def get_interactable_element_tree( :return: Tuple containing the element tree and a map of element IDs to elements. 
""" await page.evaluate(JS_FUNCTION_DEFS) - main_frame_js_script = "async () => await buildTreeFromBody('main.frame', true)" + main_frame_js_script = "() => buildTreeFromBody('main.frame', true)" elements, element_tree = await page.evaluate(main_frame_js_script) if len(page.main_frame.child_frames) > 0: @@ -415,7 +415,7 @@ async def get_incremental_element_tree( exc_info=True, ) - js_script = f"async () => await getIncrementElements('{frame_id}')" + js_script = f"() => getIncrementElements('{frame_id}')" incremental_elements, incremental_tree = await frame.evaluate(js_script) # we listen the incremental elements seperated by frames, so all elements will be in the same SkyvernFrame self.id_to_css_dict, self.id_to_element_dict, _ = build_element_dict(incremental_elements) @@ -473,7 +473,8 @@ def trim_element_tree(elements: list[dict]) -> list[dict]: else: del queue_ele["attributes"] # remove the tag, don't need it in the HTML tree - del queue_ele["keepAllAttr"] + if "keepAllAttr" in queue_ele: + del queue_ele["keepAllAttr"] if "children" in queue_ele: queue.extend(queue_ele["children"]) diff --git a/skyvern/webeye/utils/dom.py b/skyvern/webeye/utils/dom.py index 0aaea0b35..b5c105f38 100644 --- a/skyvern/webeye/utils/dom.py +++ b/skyvern/webeye/utils/dom.py @@ -159,6 +159,22 @@ async def is_combobox_dropdown(self) -> bool: haspopup = await self.get_attr("aria-haspopup") return tag_name == InteractiveElement.INPUT and role == "combobox" and haspopup == "listbox" + async def is_auto_completion_input(self) -> bool: + tag_name = self.get_tag_name() + if tag_name != InteractiveElement.INPUT: + return False + + haspopup = await self.get_attr("aria-haspopup") + autocomplete = await self.get_attr("aria-autocomplete") + if haspopup and autocomplete: + return True + + element_id = await self.get_attr("id") + if element_id == "location-input": + return True + + return False + async def is_checkbox(self) -> bool: tag_name = self.get_tag_name() if tag_name != "input": @@ -181,6 
+197,9 @@ def is_interactable(self) -> bool: async def is_selectable(self) -> bool: return self.get_selectable() or self.get_tag_name() in SELECTABLE_ELEMENT + def get_element_dict(self) -> dict: + return self.__static_element + def get_scrollable(self) -> bool: return self.__static_element.get("isScrollable", False) @@ -193,6 +212,9 @@ def get_tag_name(self) -> str: def get_id(self) -> str: return self.__static_element.get("id", "") + def get_frame_id(self) -> str: + return self.__static_element.get("frame", "") + def get_attributes(self) -> typing.Dict: return self.__static_element.get("attributes", {}) @@ -314,10 +336,15 @@ async def input_sequentially( if length > TEXT_PRESS_MAX_LENGTH: # if the text is longer than TEXT_PRESS_MAX_LENGTH characters, we will locator.fill in initial texts until the last TEXT_PRESS_MAX_LENGTH characters # and then type the last TEXT_PRESS_MAX_LENGTH characters with locator.press_sequentially - await self.get_locator().fill(text[: length - TEXT_PRESS_MAX_LENGTH]) + await self.input_fill(text[: length - TEXT_PRESS_MAX_LENGTH]) text = text[length - TEXT_PRESS_MAX_LENGTH :] - await self.get_locator().press_sequentially(text, delay=TEXT_INPUT_DELAY, timeout=default_timeout) + await self.press_fill(text, timeout=default_timeout) + + async def press_fill( + self, text: str, timeout: float = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS + ) -> None: + await self.get_locator().press_sequentially(text, delay=TEXT_INPUT_DELAY, timeout=timeout) async def input_fill( self, text: str, timeout: float = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS @@ -377,6 +404,12 @@ def __init__(self, scraped_page: ScrapedPage, page: Page) -> None: self.scraped_page = scraped_page self.page = page + def check_id_in_dom(self, element_id: str) -> bool: + css_selector = self.scraped_page.id_to_css_dict.get(element_id, "") + if css_selector: + return True + return False + async def get_skyvern_element_by_id(self, element_id: str) -> 
SkyvernElement: element = self.scraped_page.id_to_element_dict.get(element_id) if not element: diff --git a/skyvern/webeye/utils/page.py b/skyvern/webeye/utils/page.py index a9bf8b441..07c2dc0f6 100644 --- a/skyvern/webeye/utils/page.py +++ b/skyvern/webeye/utils/page.py @@ -168,6 +168,10 @@ async def get_combobox_options(self, element: ElementHandle) -> List[Dict[str, A js_script = "async (element) => await getListboxOptions(element)" return await self.frame.evaluate(js_script, element) + async def parse_element_from_html(self, frame: str, element: ElementHandle, interactable: bool) -> Dict: + js_script = "([frame, element, interactable]) => buildElementObject(frame, element, interactable)" + return await self.frame.evaluate(js_script, [frame, element, interactable]) + async def scroll_to_top(self, draw_boxes: bool) -> float: """ Scroll to the top of the page and take a screenshot. @@ -175,7 +179,7 @@ async def scroll_to_top(self, draw_boxes: bool) -> float: :param page: Page instance to take the screenshot from. :return: Screenshot of the page. """ - js_script = f"async () => await scrollToTop({str(draw_boxes).lower()})" + js_script = f"() => scrollToTop({str(draw_boxes).lower()})" scroll_y_px = await self.frame.evaluate(js_script) return scroll_y_px @@ -186,7 +190,7 @@ async def scroll_to_next_page(self, draw_boxes: bool) -> float: :param page: Page instance to take the screenshot from. :return: Screenshot of the page. """ - js_script = f"async () => await scrollToNextPage({str(draw_boxes).lower()})" + js_script = f"() => scrollToNextPage({str(draw_boxes).lower()})" scroll_y_px = await self.frame.evaluate(js_script) return scroll_y_px