From 436b9de50b8125d8fc3fb20f43e99c10d07a6512 Mon Sep 17 00:00:00 2001 From: garyzhang99 <46197280+garyzhang99@users.noreply.github.com> Date: Fri, 6 Sep 2024 11:19:03 +0800 Subject: [PATCH] Enable web browsing ability in agentscope (#296) --------- Co-authored-by: wenhao Co-authored-by: DavdGao --- README.md | 12 + README_ZH.md | 13 + docs/sphinx_doc/en/source/index.rst | 1 + docs/sphinx_doc/en/source/tutorial/211-web.md | 90 ++++ docs/sphinx_doc/zh_CN/source/index.rst | 1 + .../zh_CN/source/tutorial/211-web.md | 92 ++++ .../codeact_agent.py | 2 +- .../code/conversation_with_react_agent.py | 2 +- .../swe_agent_prompts.py | 4 +- .../README.md | 140 ++++++ .../main.py | 44 ++ .../webact_agent.py | 244 +++++++++ setup.py | 6 + src/agentscope/agents/react_agent.py | 260 +++++----- src/agentscope/agents/user_agent.py | 13 +- src/agentscope/models/post_model.py | 11 +- src/agentscope/service/__init__.py | 4 + src/agentscope/service/browser/__init__.py | 0 src/agentscope/service/browser/markpage.js | 136 +++++ src/agentscope/service/browser/web_browser.py | 475 ++++++++++++++++++ src/agentscope/service/service_toolkit.py | 27 +- 21 files changed, 1431 insertions(+), 146 deletions(-) create mode 100644 docs/sphinx_doc/en/source/tutorial/211-web.md create mode 100644 docs/sphinx_doc/zh_CN/source/tutorial/211-web.md create mode 100644 examples/conversation_with_web_browser_agent/README.md create mode 100644 examples/conversation_with_web_browser_agent/main.py create mode 100644 examples/conversation_with_web_browser_agent/webact_agent.py create mode 100644 src/agentscope/service/browser/__init__.py create mode 100644 src/agentscope/service/browser/markpage.js create mode 100644 src/agentscope/service/browser/web_browser.py diff --git a/README.md b/README.md index 17606b654..327999788 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,12 @@ Start building LLM-empowered multi-agent applications in an easier way. 
## News +- new**[2024-09-03]** AgentScope supports **Web Browser Control** now! Refer to our [example](https://github.com/modelscope/agentscope/tree/main/examples/conversation_with_web_browser_agent) for more details. + +
+ +
+ - new**[2024-07-18]** AgentScope supports streaming mode now! Refer to our [tutorial](https://modelscope.github.io/agentscope/en/tutorial/203-stream.html) and example [conversation in stream mode](https://github.com/modelscope/agentscope/tree/main/examples/conversation_in_stream_mode) for more details.
@@ -55,6 +61,9 @@ Start building LLM-empowered multi-agent applications in an easier way. - **[2024-06-09]** We release **AgentScope** v0.0.5 now! In this new version, [**AgentScope Workstation**](https://modelscope.github.io/agentscope/en/tutorial/209-gui.html) (the online version is running on [agentscope.io](https://agentscope.io)) is open-sourced with the refactored [**AgentScope Studio**](https://modelscope.github.io/agentscope/en/tutorial/209-gui.html)! +
+Full News + - **[2024-05-24]** We are pleased to announce that features related to the **AgentScope Workstation** will soon be open-sourced! The online website services are temporarily offline. The online website service will be upgraded and back online shortly. Stay tuned... - **[2024-05-15]** A new **Parser Module** for **formatted response** is added in AgentScope! Refer to our [tutorial](https://modelscope.github.io/agentscope/en/tutorial/203-parser.html) for more details. The [`DictDialogAgent`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/agents/dict_dialog_agent.py) and [werewolf game](https://github.com/modelscope/agentscope/tree/main/examples/game_werewolf) example are updated simultaneously. @@ -87,6 +96,8 @@ available in [PyPI](https://pypi.org/project/agentscope/)! - **[2024-02-14]** We release our paper "AgentScope: A Flexible yet Robust Multi-Agent Platform" in [arXiv](https://arxiv.org/abs/2402.14034) now! +
+ --- ## What's AgentScope? @@ -151,6 +162,7 @@ the following libraries. - Multi Modality - Wikipedia Search and Retrieval - TripAdvisor Search +- Web Browser Control **Example Applications** diff --git a/README_ZH.md b/README_ZH.md index d1704835f..6ac6746ab 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -41,6 +41,12 @@ ## 新闻 +- new**[2024-09-03]** AgentScope 已更新浏览器控制模块,利用 vision 模型实现智能体对浏览器的控制。请参考[**样例**](https://github.com/modelscope/agentscope/tree/main/examples/conversation_with_web_browser_agent) + +
+ +
+ - new**[2024-07-18]** AgentScope 已支持模型流式输出。请参考我们的 [**教程**](https://modelscope.github.io/agentscope/zh_CN/tutorial/203-stream.html) 和 [**流式对话样例**](https://github.com/modelscope/agentscope/tree/main/examples/conversation_in_stream_mode)!
@@ -48,6 +54,7 @@ agentscope-logo
+ - new**[2024-07-15]** AgentScope 中添加了 Mixture of Agents 算法。使用样例请参考 [MoA 示例](https://github.com/modelscope/agentscope/blob/main/examples/conversation_mixture_of_agents)。 - **[2024-06-14]** 新的提示调优(Prompt tuning)模块已经上线 AgentScope,用以帮助开发者生成和优化智能体的 system prompt。更多的细节和使用样例请参考 AgentScope [教程](https://modelscope.github.io/agentscope/en/tutorial/209-prompt_opt.html)! @@ -56,6 +63,9 @@ - **[2024-06-09]** AgentScope v0.0.5 已经更新!在这个新版本中,我们开源了 [**AgentScope Workstation**](https://modelscope.github.io/agentscope/en/tutorial/209-gui.html) (在线版本的网址是[agentscope.io](https://agentscope.io))! +
+完整新闻 + - **[2024-05-24]** 我们很高兴地宣布 **AgentScope Workstation** 相关功能即将开源。我们的网站服务暂时下线。在线服务会很快升级重新上线,敬请期待... - **[2024-05-15]** 用于解析模型格式化输出的**解析器**模块已经上线 AgentScope!更轻松的构建多智能体应用,使用方法请参考[教程](https://modelscope.github.io/agentscope/en/tutorial/203-parser.html)。与此同时,[`DictDialogAgent`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/agents/dict_dialog_agent.py) 类和 [狼人杀游戏](https://github.com/modelscope/agentscope/tree/main/examples/game_werewolf) 样例也已经同步更新! @@ -84,6 +94,8 @@ - **[2024-02-14]** 我们在arXiv上发布了论文“[AgentScope: A Flexible yet Robust Multi-Agent Platform](https://arxiv.org/abs/2402.14034)”! +
+ --- ## 什么是AgentScope? @@ -141,6 +153,7 @@ AgentScope支持使用以下库快速部署本地模型服务。 - 多模态生成 - 维基百科搜索 - TripAdvisor搜索 +- 浏览器控制 **样例应用** diff --git a/docs/sphinx_doc/en/source/index.rst b/docs/sphinx_doc/en/source/index.rst index 871433759..7eeb888f3 100644 --- a/docs/sphinx_doc/en/source/index.rst +++ b/docs/sphinx_doc/en/source/index.rst @@ -34,6 +34,7 @@ AgentScope Documentation tutorial/208-distribute.md tutorial/209-gui.md tutorial/210-rag.md + tutorial/211-web.md tutorial/105-logging.md tutorial/207-monitor.md tutorial/104-usecase.md diff --git a/docs/sphinx_doc/en/source/tutorial/211-web.md b/docs/sphinx_doc/en/source/tutorial/211-web.md new file mode 100644 index 000000000..e77b1d919 --- /dev/null +++ b/docs/sphinx_doc/en/source/tutorial/211-web.md @@ -0,0 +1,90 @@ +(211-web-en)= + +# Web Browser Control + +AgentScope supports web browser control with the `agentscope.service.WebBrowser` module. +It allows agents to interact with web pages, and take actions like clicking, typing and scrolling. + +> Note the current web browser module requires a vision LLM to work properly. We will provide text-based vision in the future. + +> Note the web browser module is still in beta, which will be updated frequently. + + +## Prerequisites + +The `WebBrowser` module is implemented based on [Playwright](https://playwright.dev/). +You need to install the latest AgentScope, as well as the playwright packages as follows: + +```bash +# Install the latest AgentScope from source +git clone https://github.com/modelscope/agentscope.git +cd agentscope +pip install -e . + +# Install playwright +pip install playwright +playwright install +``` + +## Guidance + +Initialize the `WebBrowser` module as follows: + +```python +from agentscope.service import WebBrowser + +browser = WebBrowser() +``` + +The `WebBrowser` module facilitates browser control and state retrieval. +The names of the control functions are all prefixed by "action_", e.g. `action_visit_url`, +and `action_click`.
To see the full list of functions, call the `get_action_functions` method. + +```python +# To see full supported actions +print(browser.get_action_functions()) + +# Visit a new webpage +browser.action_visit_url("https://www.bing.com") +``` + +To monitor the current state of the browser, you can read the attributes prefixed by `page_`, e.g. `page_url`, `page_title`, and `page_html`. + +```python +# The url +print(browser.page_url) + +# The page title +print(browser.page_title) + +# The page in MarkDown format (parsed by markdownify) +print(browser.page_markdown) + +# The page html (maybe too long) +print(browser.page_html) +``` + +Besides, to help vision models to understand the webpage better, we provide the `set_interactive_marks` function, +which will mark all the interactive elements on the current webpage with index labels. +After calling the `set_interactive_marks` function, more actions can be performed on the webpage. +For example, clicking a button, typing in a text box, etc. + +```python +# Set interactive marks with index labels +browser.set_interactive_marks() + +# Remove interactive marks +# browser.remove_interactive_marks() +``` + +## Work with Agent + +The above functions provide basic operations for interactive web browser control. +You can use them to build your own web browsing agent. + +In AgentScope, the web browser actions are also tool functions, so you can use them together with the service toolkit module to build your own agent. +We also provide a [web browser agent](https://github.com/modelscope/agentscope/tree/main/examples/conversation_with_web_browser_agent) in our example. +You can refer to it for more details.
+ + +[[Back to the top]](#211-web-en) diff --git a/docs/sphinx_doc/zh_CN/source/index.rst b/docs/sphinx_doc/zh_CN/source/index.rst index da1ed8b68..ca674a0e1 100644 --- a/docs/sphinx_doc/zh_CN/source/index.rst +++ b/docs/sphinx_doc/zh_CN/source/index.rst @@ -34,6 +34,7 @@ AgentScope 文档 tutorial/208-distribute.md tutorial/209-gui.md tutorial/210-rag.md + tutorial/211-web.md tutorial/105-logging.md tutorial/207-monitor.md tutorial/104-usecase.md diff --git a/docs/sphinx_doc/zh_CN/source/tutorial/211-web.md b/docs/sphinx_doc/zh_CN/source/tutorial/211-web.md new file mode 100644 index 000000000..935aeb887 --- /dev/null +++ b/docs/sphinx_doc/zh_CN/source/tutorial/211-web.md @@ -0,0 +1,92 @@ +(211-web-cn)= + +AgentScope 支持使用 `agentscope.service.WebBrowser` 模块进行 Web 浏览器控制。 +它允许代理与网页进行交互,并执行点击、输入和滚动等网页操作。 + +> 注意当前的 Web 浏览器模块仍处于测试阶段,在未来的一段时间内将会频繁更新和优化。 + +## 预备 + +`WebBrowser` 模块基于 [Playwright](https://playwright.dev/) 实现,需要安装最新版本的 AgentScope 和 playwright 环境: + +```bash +# 从源码安装最新版本的 AgentScope +git clone https://github.com/modelscope/agentscope.git +cd agentscope +pip install -e . + +# 安装 playwright +pip install playwright +playwright install +``` + +## Guidance + +通过以下方式初始化一个 `WebBrowser` 模块实例: + +```python +from agentscope.service import WebBrowser + +browser = WebBrowser() +``` + +The `WebBrowser` module facilitates browser control and state retrieval. +The name of the control functions are all prefixed by "action_", e.g. `action_visit_url`, +and `action_click`. To see the full list of functions, calling the `get_action_functions` method. 
+ +`WebBrowser` 模块提供了浏览器控制和状态检索的功能。 +其中控制函数的名称都以 "action_" 为前缀,例如 `action_visit_url` 和 `action_click`。可以通过调用 `get_action_functions` 方法查看完整的函数列表。 + +```python +# 查看所有支持的操作 +print(browser.get_action_functions()) + +# 访问新的网页 +browser.action_visit_url("https://www.bing.com") +``` + +为了获取当前浏览器的状态,可以调用以 `"page_"` 为前缀的函数,例如 `page_url`、`page_title` 和 `page_html`。 + +```python +# 当前网页的url +print(browser.page_url) + +# 当前网页的标题 +print(browser.page_title) + +# 以 MarkDown 的格式获取当前的页面信息(通过markdownify进行解析) +print(browser.page_markdown) + +# 当前网页的 html 源码(可能会太长) +print(browser.page_html) +``` + +Besides, to help vision models to understand the webpage better, we provide `set_interactive_marks` function, +which will mark all the interactive elements on the current webpage with index labels. +After calling `set_interactive_marks` function, more actions can be performed on the webpage. +For example, clicking a button, typing in a text box, etc. + +此外,为了帮助视觉模型更好地理解网页,我们提供了 `set_interactive_marks` 函数,该函数会把当前网页上所有的可交互元素标记出来,并用序号标签进行标注(从0开始)。 +调用 `set_interactive_marks` 函数标记网页后,我们就可以在网页上执行更多的操作,例如点击指定序号的按钮、在指定序号的文本框中进行输入等。 + +```python +# 为网页上的交互元素添加序号标签 +browser.set_interactive_marks() + +# 删除交互标记 +# browser.remove_interactive_marks() +``` + +## 与智能体结合 + +上述的所有函数为交互式的 Web 浏览器控制提供了基本操作接口。开发者可以使用这些接口来构建自己的 Web 浏览代理。 + +In AgentScope, the web browser is also some kind of tool functions, so you can use it together with the service toolkit module to build your own agent. +We also provide a [web browser agent](https://github.com/modelscope/agentscope/tree/main/examples/conversation_with_web_browser_agent)) in our example. +You can refer to it for more details. 
+ +在 AgentScope 中,Web 浏览器也是一种工具函数,因此可以使用 `agentscope.service.ServiceToolkit` 来处理 `WebBrowser` 模块提供的函数,并构建自己的智能体。 +我们在示例中提供了一个[Web 浏览器智能体](https://github.com/modelscope/agentscope/tree/main/examples/conversation_with_web_browser_agent)的样例。 +可以参考该样例了解更多细节。 + +[[回到顶部]](#211-web-cn) diff --git a/examples/conversation_with_codeact_agent/codeact_agent.py b/examples/conversation_with_codeact_agent/codeact_agent.py index d09b7fda4..70c54713a 100644 --- a/examples/conversation_with_codeact_agent/codeact_agent.py +++ b/examples/conversation_with_codeact_agent/codeact_agent.py @@ -2,7 +2,7 @@ # pylint: disable=C0301 """An agent class that implements the CodeAct agent. This agent can execute code interactively as actions. -More details can be found at the paper of codeact agent +More details can be found at the paper of CodeAct agent https://arxiv.org/abs/2402.01030 and the original repo of codeact https://github.com/xingyaoww/code-act """ diff --git a/examples/conversation_with_react_agent/code/conversation_with_react_agent.py b/examples/conversation_with_react_agent/code/conversation_with_react_agent.py index 7c196dd1e..dc5027920 100644 --- a/examples/conversation_with_react_agent/code/conversation_with_react_agent.py +++ b/examples/conversation_with_react_agent/code/conversation_with_react_agent.py @@ -80,7 +80,7 @@ def execute_python_code(code: str) -> ServiceResponse: # pylint: disable=C0301 verbose=True, service_toolkit=service_toolkit, ) -user = UserAgent(name="User") +user = UserAgent(name="User", input_hint="User Input ('exit' to quit): ") # Build x = None diff --git a/examples/conversation_with_swe-agent/swe_agent_prompts.py b/examples/conversation_with_swe-agent/swe_agent_prompts.py index 4c30e48af..a3a6147d9 100644 --- a/examples/conversation_with_swe-agent/swe_agent_prompts.py +++ b/examples/conversation_with_swe-agent/swe_agent_prompts.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # pylint: disable=C0301 -"""The SWE-agent relay heavily on it's prompts. 
-This file contains the neccessary prompts for the SWE-agent. +"""The SWE-agent relay heavily on its prompts. +This file contains the necessary prompts for the SWE-agent. Some prompts are taken and modified from the original SWE-agent repo or the SWE-agent implementation from Open-Devin. """ diff --git a/examples/conversation_with_web_browser_agent/README.md b/examples/conversation_with_web_browser_agent/README.md new file mode 100644 index 000000000..5462e8a15 --- /dev/null +++ b/examples/conversation_with_web_browser_agent/README.md @@ -0,0 +1,140 @@ +# WebBrowsing in AgentScope + +This example demonstrates how to utilize AgentScope to build a web browsing agent. Throughout this tutorial, you will gain insights into the following features of AgentScope: + +- How to use the [WebBrowser](https://github.com/modelscope/agentscope/blob/main/src/agentscope/service/browser/web_browser.py) module in AgentScope +- How to build a conversation with an agent that can browse the web + +Refer to our tutorial for more details on the `WebBrowser` module. + +> Note: The `WebBrowser` module is currently in beta, and we are continuously working on enhancements, such as +> - allowing the agent to make long-term plan +> - handling web pages with CAPTCHA. +> - enabling text-based models to interact with web pages + +## Tested Models + +This example has been tested with the following model: +- GPT-4o + +We plan to test additional vision models in the near future. Additionally, we will enable web browsing capabilities for text-only models. + +## Prerequisites + +To run this example, you need to: + +1. Ensure you have access to a vision model that can handle vision tasks, and set your api key in the model config. +2. Install the necessary Playwright packages: + - Run `pip install playwright` to set up the Python environment. + - Run `playwright install` to install the required browser for Playwright. +3. 
[Optional] For a better understanding of how web browsing is implemented, refer to the original code in [web_browser.py](../../src/agentscope/service/browser/web_browser.py) and [webact_agent.py](./webact_agent.py). + +## Running the Example + +Follow the steps below to run the example: +1. Fill your OpenAI API key in `main.py`, or provide a new configuration for vision models. +2. Run the `main.py` file directly: +```bash +python main.py +``` + +> Note +> - The screenshots of the web pages will be saved locally. + +## Code Snippets + +The `webact_agent.py` provides an agent `WebActAgent` that integrates the web browsing module into [ReAct algorithm](https://arxiv.org/abs/2210.03629). +It will interact with web pages in a reasoning-acting loop, and provide the final answer to the user by calling a built-in `finish` function. + +The major difference with the traditional ReAct algorithm is that the agent will set interactive marks and obtain the web page screenshot in the reasoning phase. +This allows the agent to interact with the web page more naturally. + +```python + # ... + + def _reasoning(self) -> Union[dict, None]: + """The reasoning process of the agent. + + Returns: + `Union[dict, None]`: + Return `None` if meet parsing error, otherwise return the + parsed function call dictionary. + """ + + # Mark the current interactive elements in the web page + self.browser.set_interactive_marks() + + # After marking, take a screenshot and save it locally + path_img = FileManager.get_instance().save_image( + self.browser.page_screenshot, + ) + + # Assemble the prompt + prompt = self.model.format( + self.memory.get_memory(), + # The observation message won't be stored in memory to avoid too + # many images in prompt + Msg( + "user", + _HINT_PROMPT.format( + url=self.browser.url, + format_instruction=self.parser.format_instruction, + ), + role="user", + url=path_img, + echo=True, + ), + ) + + # ...
+``` + +To save image tokens, the message with the screenshot won't be saved to the agent's memory. +That means, in each reasoning phase, the agent will only have the latest screenshot. +Developers can modify the code to implement a more complex memory mechanism. + +### Example Demonstration + + +https://github.com/user-attachments/assets/6d03caab-6193-4ac6-8b1c-36f152ec02ec + + +In the first iteration of our web browsing agent, the agent opens the default webpage, in this case, the Google webpage. + +We can see from the saved screenshot here that the interactive elements in this webpage are marked with numbers. This is called set-of-mark prompting ([github link](https://github.com/microsoft/SoM), [paper link](https://arxiv.org/abs/2310.11441)). Utilizing set-of-mark prompting, the agent can interact with the webpage more naturally by selecting the elements with the corresponding numbers. + +After receiving the observation, the agent will give its thought and corresponding action. +In this case, the agent selects the search bar (numbered as [4]) and types in it. + +![screenshot_1](https://github.com/garyzhang99/agentscope/assets/46197280/9de208b8-4ef4-4b4f-9328-2f7bb500fcb2) + + +``` +Thought: To find out how many stars the project "agentscope" has on GitHub, I need to search for "agentscope GitHub" on Google first. + +Action: Type [4]; agentscope GitHub +``` + + +In the next iteration, we see that the agent is presented with the search result page, and the agent selects the official GitHub link. +![screenshot_2](https://github.com/garyzhang99/agentscope/assets/46197280/9b6708c6-eced-4d8b-8ebe-cdbd197b40ea) + +``` +Thought: The search results from Google have populated, and I found a link that likely leads to the "agentscope" GitHub project page. + +Action: Click [18] +``` + +As the agent views the GitHub page of agentscope, it notes the number of GitHub stars and answers our question.
+ +![screenshot_3](https://github.com/garyzhang99/agentscope/assets/46197280/5cad5472-b45b-4ef3-a8fa-324d5a20073a) + + +``` +Thought: I can see from the screenshot that the star count for the "agentscope" project on GitHub is listed as "2.9k" stars. + +Action: ANSWER; The project agentscope has received 2.9k stars on GitHub. +``` + +The above content provides a simple example of using the web browsing agent in AgentScope. Feel free to try it out yourself and explore the capabilities of web browsing with AgentScope! + diff --git a/examples/conversation_with_web_browser_agent/main.py b/examples/conversation_with_web_browser_agent/main.py new file mode 100644 index 000000000..d3462f78d --- /dev/null +++ b/examples/conversation_with_web_browser_agent/main.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +""" Example of using web browser module. """ + +from webact_agent import WebActAgent +from agentscope.agents import UserAgent + +import agentscope + +# Fill in your OpenAI API key or create your own vision model configuration +YOUR_OPENAI_API_KEY = "xxx" + +model_config = { + "config_name": "gpt-4o_config", + "model_type": "openai_chat", + "model_name": "gpt-4o", + "api_key": YOUR_OPENAI_API_KEY, + "generate_args": { + "temperature": 0.7, + }, +} + + +agentscope.init( + model_configs=model_config, + project="Conversation with Web Browser", +) + +agent = WebActAgent( + name="assistant", + model_config_name="gpt-4o_config", + verbose=True, +) + +user = UserAgent( + "user", + input_hint="User Input (type 'exit' to break): ", +) + +x = None +while True: + x = user(x) + if x.content == "exit": + break + x = agent(x) diff --git a/examples/conversation_with_web_browser_agent/webact_agent.py b/examples/conversation_with_web_browser_agent/webact_agent.py new file mode 100644 index 000000000..9d9aeb1d8 --- /dev/null +++ b/examples/conversation_with_web_browser_agent/webact_agent.py @@ -0,0 +1,244 @@ +# -*- coding: utf-8 -*- +# pylint: disable=C0301 +"""The web act agent class, whose 
implementation refers to +the ReAct (https://arxiv.org/abs/2210.03629) algorithm and +WebVoyager (https://arxiv.org/abs/2401.13919).""" + +from typing import Optional, Union, Sequence + +from agentscope.agents import AgentBase +from agentscope.exception import ResponseParsingError +from agentscope.manager import FileManager +from agentscope.message import Msg +from agentscope.parsers import RegexTaggedContentParser +from agentscope.service import ( + ServiceToolkit, + ServiceResponse, + ServiceExecStatus, + WebBrowser, +) + +_SYSTEM_PROMPT = """You're an AI assistant named {name}. + +## Your Target +Help the user to solve problems with the given web browsing commands. + +## Key Guidelines You MUST follow +1 To input text, NO need to click textbox first, directly type content. After typing, the system automatically hits `ENTER` key. Sometimes you should click the search button to apply search filters. Try to use simple language when searching. +2 You must Distinguish between textbox and search button, don't type content into the button! If no textbox is found, you may need to click the search button first before the textbox is displayed. +3 Execute only one action per iteration. +4 STRICTLY Avoid repeating the same action if the webpage remains unchanged. You may have selected the wrong web element or numerical label. Continuous use of the Wait is also NOT allowed. Continuous use of the Scroll with no change of webpage is NOT allowed. +5 When a complex Task involves multiple questions or steps, select "ANSWER" only at the very end, after addressing all of these questions (steps). Flexibly combine your own abilities with the information in the web page. Double check the formatting requirements in the task when ANSWER. +6 When the user's task is a simple request, you should select "ANSWER" to complete the task. +7 When you are not sure whether you are making progress, you can select "ANSWER" and ask the user for more infomation. 
+ +## Web Browsing Guidelines +1 Don't interact with useless web elements like Login, Sign-in, donation that appear in Webpages. Pay attention to Key Web Elements like search textbox and menu. +2 Focus on the numerical labels in the TOP LEFT corner of each rectangle (element). Ensure you don't mix them up with other numbers (e.g. Calendar) on the page. +3 Focus on the date in task, you must look for results that match the date. It may be necessary to find the correct year, month and day at calendar. +4 Pay attention to the filter and sort functions on the page, which, combined with scroll, can help you solve conditions like 'highest', 'cheapest', 'lowest', 'earliest', etc. Try your best to find the answer that best fits the task. +5 You can use "Goto" if you need to goto a specific webpage. + +""" # noqa + +_HINT_PROMPT = """You're browsing the URL "{url}". Please analyze the attached screenshot and give the thought and action. + +{format_instruction}""" # noqa + + +class WebActAgent(AgentBase): + """An agent that performs reasoning and acting iteratively based on the + web browser state at that time. The implementation refers to the ReAct + (https://arxiv.org/abs/2210.03629) algorithm and + WebVoyager (https://arxiv.org/abs/2401.13919). + + Note: + This agent requires a vision model to handle the web page screenshot. + """ + + def __init__( + self, + name: str, + model_config_name: str, + max_iters: int = 10, + verbose: bool = True, + ) -> None: + super().__init__(name=name, model_config_name=model_config_name) + + # Init the browser + self.browser = WebBrowser() + + self.browser.action_visit_url("https://www.bing.com") + + # Init the service toolkit with the browser commands. Since they don't + # require developers to specify parameters, we directly place them into + # the toolkit. 
+ self.toolkit = ServiceToolkit() + for func in self.browser.get_action_functions().values(): + self.toolkit.add(func) + + # Add `finish` function to the toolkit to allow agent to end + # the reasoning-acting loop + self.toolkit.add(self.finish) + + # Prepare system prompt, which includes the instruction for the tools + self.sys_prompt = ( + _SYSTEM_PROMPT.format_map({"name": name}) + + self.toolkit.tools_instruction + ) + self.memory.add( + Msg( + name="system", + content=self.sys_prompt, + role="system", + ), + ) + + # Initialize a parser object to formulate the response from the model + self.parser = RegexTaggedContentParser( + format_instruction="""Respond with specific tags as outlined below: +{what you thought} +{the function name you want to call} +<{argument name}>{argument value} +<{argument name}>{argument value} +...""", # noqa + try_parse_json=True, + required_keys=["thought", "function"], + ) + + self.max_iters = max_iters + self.verbose = verbose + + def reply(self, x: Optional[Union[Msg, Sequence[Msg]]] = None) -> Msg: + """The reply method of the agent.""" + self.memory.add(x) + + for _ in range(self.max_iters): + # Step 1: Reasoning: decide what function to call + function_call = self._reasoning() + + if function_call is None: + # Meet parsing error, skip acting to reason the parsing error, + # which has been stored in memory + continue + + # Return the response directly if calling `answer` function. + # If "response" doesn't exist, we leave the error handling in the + # acting step. + if ( + function_call["function"] == "finish" + and "response" in function_call + ): + return Msg( + self.name, + function_call["response"], + "assistant", + ) + + # Step 2: Acting: execute the function accordingly + self._acting(function_call) + + # When exceeding the max iterations + hint_msg = Msg( + "system", + "You have failed to generate response within the maximum " + "iterations. 
Now respond directly by summarizing the current " + "situation.", + role="system", + echo=self.verbose, + ) + + # Generate a reply by summarizing the current situation + prompt = self.model.format(self.memory.get_memory(), hint_msg) + res = self.model(prompt) + self.speak(res.stream or res.text) + res_msg = Msg(self.name, res.text, "assistant") + return res_msg + + def _reasoning(self) -> Union[dict, None]: + """The reasoning process of the agent. + + Returns: + `Union[dict, None]`: + Return `None` if meet parsing error, otherwise return the + parsed function call dictionary. + """ + + # Mark the current interactive elements in the web page + self.browser.set_interactive_marks() + + # After marking, take a screenshot and save it locally + path_img = FileManager.get_instance().save_image( + self.browser.page_screenshot, + ) + + # Assemble the prompt + prompt = self.model.format( + self.memory.get_memory(), + # The observation message won't be stored in memory to avoid too + # many images in prompt + Msg( + "user", + _HINT_PROMPT.format( + url=self.browser.url, + format_instruction=self.parser.format_instruction, + ), + role="user", + url=path_img, + echo=self.verbose, + ), + ) + + # Get the response from the model and print it out + raw_response = self.model(prompt) + self.speak(raw_response.stream or raw_response.text) + self.memory.add(Msg(self.name, raw_response.text, role="assistant")) + + # Try to parse the response into function calling commands + try: + res = self.parser.parse(raw_response) + return res.parsed + except ResponseParsingError as e: + # When failed to parse the response, return the error message to + # the llm + self.memory.add(Msg("system", str(e), "system")) + return None + + def _acting(self, function_call: dict) -> None: + """The acting process of the agent.""" + + function_name = function_call["function"] + arguments = { + k: v + for k, v in function_call.items() + if k not in ["function", "thought"] + } + + formatted_function_call = [ + { + 
"name": function_name, + "arguments": arguments, + }, + ] + + msg_res = self.toolkit.parse_and_call_func( + formatted_function_call, + ) + self.speak(msg_res) + self.memory.add(msg_res) + + @staticmethod + def finish(response: str) -> ServiceResponse: + """Finish reasoning and generate a response to the user. + + Note: + The function won't be executed, actually. + + Args: + response (`str`): + The response to the user. + """ + return ServiceResponse( + status=ServiceExecStatus.SUCCESS, + content=response, + ) diff --git a/setup.py b/setup.py index 8966431d4..9b40356fb 100644 --- a/setup.py +++ b/setup.py @@ -56,6 +56,11 @@ studio_requires = [] +web_requires = [ + "playwright", + "markdownify", +] + # released requires minimal_requires = [ "networkx", @@ -103,6 +108,7 @@ + gradio_requires + rag_requires + studio_requires + + web_requires ) online_requires = full_requires + [ diff --git a/src/agentscope/agents/react_agent.py b/src/agentscope/agents/react_agent.py index b99823e94..a806c40a9 100644 --- a/src/agentscope/agents/react_agent.py +++ b/src/agentscope/agents/react_agent.py @@ -5,13 +5,15 @@ """ from typing import Optional, Union, Sequence -from agentscope.exception import ResponseParsingError, FunctionCallError +from agentscope.exception import ResponseParsingError from agentscope.agents import AgentBase from agentscope.message import Msg -from agentscope.parsers.regex_tagged_content_parser import ( - RegexTaggedContentParser, +from agentscope.parsers import RegexTaggedContentParser +from agentscope.service import ( + ServiceToolkit, + ServiceResponse, + ServiceExecStatus, ) -from agentscope.service import ServiceToolkit INSTRUCTION_PROMPT = """## What You Should Do: 1. First, analyze the current situation, and determine your goal. @@ -31,10 +33,17 @@ class ReActAgent(AgentBase): """An agent class that implements the ReAct algorithm. More details refer to https://arxiv.org/abs/2210.03629. 
- Note this is an example implementation of ReAct algorithm in AgentScope. + This is an example implementation of ReAct algorithm in AgentScope. We follow the idea within the paper, but the detailed prompt engineering maybe different. Developers are encouraged to modify the prompt to fit their own needs. + + Note: + 1. We use the "thought" field in the response to support + Chain-of-Thought, which means the tool functions cannot use "thought" + as their argument name. + 2. The function name "finish" is also a reserved name when using this + agent, which will be used to end the reasoning-acting loop. """ def __init__( @@ -42,7 +51,7 @@ def __init__( name: str, model_config_name: str, service_toolkit: ServiceToolkit, - sys_prompt: str = "You're a helpful assistant. Your name is {name}.", + sys_prompt: str = "You're a helpful assistant named {name}.", max_iters: int = 10, verbose: bool = True, ) -> None: @@ -73,6 +82,11 @@ def __init__( ) self.service_toolkit = service_toolkit + + # Add `finish` function to the toolkit to allow agent to end + # the reasoning-acting loop + self.service_toolkit.add(self.finish) + self.verbose = verbose self.max_iters = max_iters @@ -96,146 +110,50 @@ def __init__( # Initialize a parser object to formulate the response from the model self.parser = RegexTaggedContentParser( format_instruction="""Respond with specific tags as outlined below: - -- When calling tool functions (note the "arg_name" should be replaced with the actual argument name): -what you thought -the function name you want to call -the value of the argument -the value of the argument - -- When you want to generate a final response: -what you thought -what you respond +{what you thought} +{the function name you want to call} +<{argument name}>{argument value} +<{argument name}>{argument value} ...""", # noqa try_parse_json=True, - required_keys=["thought"], - keys_to_content="response", + required_keys=["thought", "function"], ) def reply(self, x: Optional[Union[Msg, 
Sequence[Msg]]] = None) -> Msg: - """The reply function that achieves the ReAct algorithm. - The more details please refer to https://arxiv.org/abs/2210.03629""" - + """The reply method of the agent.""" self.memory.add(x) - for _ in range(self.max_iters): - # Step 1: Thought - if self.verbose: - self.speak(f" ITER {_+1}, STEP 1: REASONING ".center(70, "#")) + for _ in range(10): + # Step 1: Reasoning: decide what function to call + function_call = self._reasoning() - # Prepare hint to remind model what the response format is - # Won't be recorded in memory to save tokens - hint_msg = Msg( - "system", - self.parser.format_instruction, - role="system", - echo=self.verbose, - ) - - # Prepare prompt for the model - prompt = self.model.format(self.memory.get_memory(), hint_msg) - - # Generate and parse the response - try: - raw_response = self.model(prompt) - - # Print out the text generated by llm in non-/streaming mode - if self.verbose: - # To be compatible with streaming and non-streaming mode - self.speak(raw_response.stream or raw_response.text) - - res = self.parser.parse(raw_response) - - # Record the raw text into memory to avoid that LLMs learn - # from the previous response format - self.memory.add(Msg(self.name, res.text, "assistant")) - - # Skip the next steps if no need to call tools - # The parsed field is a dictionary - arg_function = res.parsed.get("function", "") - if ( - isinstance(arg_function, str) - and arg_function in ["[]", ""] - or isinstance(arg_function, list) - and len(arg_function) == 0 - ): - # Only the response field is exposed to users or other - # agents - msg_returned = Msg( - self.name, - res.parsed.get("response", res.text), - "assistant", - ) - - if not self.verbose: - # Print out the returned message - self.speak(msg_returned) - - return msg_returned - - # Only catch the response parsing error and expose runtime - # errors to developers for debugging - except ResponseParsingError as e: - # Print out raw response from models for 
developers to debug - response_msg = Msg(self.name, e.raw_response, "assistant") - if not self.verbose: - self.speak(response_msg) - - # Re-correct by model itself - error_msg = Msg("system", str(e), "system") - self.speak(error_msg) - - self.memory.add([response_msg, error_msg]) - - # Skip acting step to re-correct the response + if function_call is None: + # Meet parsing error, skip acting to reason the parsing error, + # which has been stored in memory continue - # Step 2: Acting - if self.verbose: - self.speak(f" ITER {_+1}, STEP 2: ACTING ".center(70, "#")) - - # Parse, check and execute the tool functions in service toolkit - try: - # Reorganize the parsed response to the required format of the - # service toolkit - res.parsed["function"] = [ - { - "name": res.parsed["function"], - "arguments": { - k: v - for k, v in res.parsed.items() - if k not in ["speak", "thought", "function"] - }, - }, - ] - - # Execute the function - execute_results = self.service_toolkit.parse_and_call_func( - res.parsed["function"], + # Return the response directly if calling `finish` function. + # If the argument "response" doesn't exist, we leave the error + # handling in the acting step. + if ( + function_call["function"] == "finish" + and "response" in function_call + ): + return Msg( + self.name, + function_call["response"], + "assistant", + echo=not self.verbose, ) - # Note: Observing the execution results and generate response - # are finished in the next reasoning step. We just put the - # execution results into memory, and wait for the next loop - # to generate response. 
- - # Record execution results into memory as system message - msg_res = Msg("system", execute_results, "system") - self.speak(msg_res) - self.memory.add(msg_res) - - except FunctionCallError as e: - # Catch the function calling error that can be handled by - # the model - error_msg = Msg("system", str(e), "system") - self.speak(error_msg) - self.memory.add(error_msg) + # Step 2: Acting: execute the function accordingly + self._acting(function_call) - # Exceed the maximum iterations + # When exceeding the max iterations hint_msg = Msg( "system", - "You have failed to generate a response in the maximum " - "iterations. Now generate a reply by summarizing the current " + "You have failed to generate response within the maximum " + "iterations. Now respond directly by summarizing the current " "situation.", role="system", echo=self.verbose, @@ -246,5 +164,83 @@ def reply(self, x: Optional[Union[Msg, Sequence[Msg]]] = None) -> Msg: res = self.model(prompt) self.speak(res.stream or res.text) res_msg = Msg(self.name, res.text, "assistant") - return res_msg + + def _reasoning(self) -> Union[dict, None]: + """The reasoning process of the agent. + + Returns: + `Union[dict, None]`: + Return `None` if meet parsing error, otherwise return the + parsed function call dictionary. 
+ """ + # Assemble the prompt + prompt = self.model.format( + self.memory.get_memory(), + # Hint LLM how to respond without putting hint message into memory + Msg( + "system", + self.parser.format_instruction, + role="system", + echo=self.verbose, + ), + ) + + # Get the response from the model and print it out + raw_response = self.model(prompt) + if self.verbose: + self.speak(raw_response.stream or raw_response.text) + self.memory.add(Msg(self.name, raw_response.text, role="assistant")) + + # Try to parse the response into function calling commands + try: + res = self.parser.parse(raw_response) + return res.parsed + + except ResponseParsingError as e: + # When failed to parse the response, return the error message to + # the llm + self.memory.add(Msg("system", str(e), "system", echo=self.verbose)) + return None + + def _acting(self, function_call: dict) -> None: + """The acting process of the agent.""" + + # Assemble the function call into the format that the toolkit requires + function_name = function_call["function"] + arguments = { + k: v + for k, v in function_call.items() + if k not in ["function", "thought"] + } + + formatted_function_call = [ + { + "name": function_name, + "arguments": arguments, + }, + ] + + # The execution message, may be execution output or error information + msg_execution = self.service_toolkit.parse_and_call_func( + formatted_function_call, + ) + if self.verbose: + self.speak(msg_execution) + self.memory.add(msg_execution) + + @staticmethod + def finish(response: str) -> ServiceResponse: + """Finish reasoning and generate a response to the user. + + Note: + The function won't be executed, actually. + + Args: + response (`str`): + The response to the user. 
+ """ + return ServiceResponse( + status=ServiceExecStatus.SUCCESS, + content=response, + ) diff --git a/src/agentscope/agents/user_agent.py b/src/agentscope/agents/user_agent.py index 12b6a26b4..1cc7f1576 100644 --- a/src/agentscope/agents/user_agent.py +++ b/src/agentscope/agents/user_agent.py @@ -14,12 +14,19 @@ class UserAgent(AgentBase): """User agent class""" - def __init__(self, name: str = "User", require_url: bool = False) -> None: + def __init__( + self, + name: str = "User", + input_hint: str = "User Input: ", + require_url: bool = False, + ) -> None: """Initialize a UserAgent object. Arguments: name (`str`, defaults to `"User"`): The name of the agent. Defaults to "User". + input_hint (`str`, defaults to `"User Input: "`): + The hint of the input. Defaults to "User Input: ". require_url (`bool`, defaults to `False`): Whether the agent requires user to input a URL. Defaults to False. The URL can lead to a website, a file, @@ -29,6 +36,7 @@ def __init__(self, name: str = "User", require_url: bool = False) -> None: super().__init__(name=name) self.name = name + self.input_hint = input_hint self.require_url = require_url def reply( @@ -64,6 +72,7 @@ def reply( if self.memory: self.memory.add(x) + # When AgentScope Studio is active if _studio_client.active: logger.info( f"Waiting for input from:\n\n" @@ -83,7 +92,7 @@ def reply( # TODO: To avoid order confusion, because `input` print much # quicker than logger.chat time.sleep(0.5) - content = user_input(timeout=timeout) + content = user_input(timeout=timeout, prefix=self.input_hint) kwargs = {} if required_keys is not None: if isinstance(required_keys, str): diff --git a/src/agentscope/models/post_model.py b/src/agentscope/models/post_model.py index 385881fd2..7cb1fc25c 100644 --- a/src/agentscope/models/post_model.py +++ b/src/agentscope/models/post_model.py @@ -154,14 +154,21 @@ def __call__(self, input_: str, **kwargs: Any) -> ModelResponse: # step3: record model invocation # record the model api 
invocation, which will be skipped if # `FileManager.save_api_invocation` is `False` + try: + response_json = response.json() + except requests.exceptions.JSONDecodeError as e: + raise RuntimeError( + f"Fail to serialize the response to json: \n{str(response)}", + ) from e + self._save_model_invocation( arguments=request_kwargs, - response=response.json(), + response=response_json, ) # step4: parse the response if response.status_code == requests.codes.ok: - return self._parse_response(response.json()) + return self._parse_response(response_json) else: logger.error(json.dumps(request_kwargs, indent=4)) raise RuntimeError( diff --git a/src/agentscope/service/__init__.py b/src/agentscope/service/__init__.py index bce6878f1..7d33e6501 100644 --- a/src/agentscope/service/__init__.py +++ b/src/agentscope/service/__init__.py @@ -61,6 +61,8 @@ wikipedia_search_categories, ) +from .browser.web_browser import WebBrowser, WebElementInfo + def get_help() -> None: """Get help message.""" @@ -118,6 +120,8 @@ def get_help() -> None: "tripadvisor_search", "tripadvisor_search_location_photos", "tripadvisor_search_location_details", + "WebBrowser", + "WebElementInfo", # to be deprecated "ServiceFactory", ] diff --git a/src/agentscope/service/browser/__init__.py b/src/agentscope/service/browser/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/agentscope/service/browser/markpage.js b/src/agentscope/service/browser/markpage.js new file mode 100644 index 000000000..f1c49cd54 --- /dev/null +++ b/src/agentscope/service/browser/markpage.js @@ -0,0 +1,136 @@ +/* the js code here is referenced from WebVoyager and tarsier +WebVoyager js code: https://github.com/MinorJerry/WebVoyager/blob/main/utils.py +tarsier js code: https://github.com/reworkd/tarsier/blob/main/tarsier/tag_utils.ts +*/ + +//Get the visible interactive elements on the current web page +function getInteractiveElements() { + let allElements = Array.prototype.slice.call( + 
document.querySelectorAll('*') + ) + + var items = allElements.map(function(element) { + var vw = Math.max(document.documentElement.clientWidth || 0, window.innerWidth || 0); + var vh = Math.max(document.documentElement.clientHeight || 0, window.innerHeight || 0); + + var elementClassName = (typeof element.className === 'string' ? element.className : '').replace(/\s+/g, '.'); + + var rects = [...element.getClientRects()].filter(bb => { + var center_x = bb.left + bb.width / 2; + var center_y = bb.top + bb.height / 2; + var elAtCenter = document.elementFromPoint(center_x, center_y); + return elAtCenter === element || element.contains(elAtCenter) + }).map(bb => { + const rect = { + left: Math.max(0, bb.left), + top: Math.max(0, bb.top), + right: Math.min(vw, bb.right), + bottom: Math.min(vh, bb.bottom) + }; + return { + ...rect, + width: rect.right - rect.left, + height: rect.bottom - rect.top + }; + }); + var area = rects.reduce((acc, rect) => acc + rect.width * rect.height, 0); + return { + element: element, + include: + (element.tagName === "INPUT" || element.tagName === "TEXTAREA" || element.tagName === "SELECT") || + (element.tagName === "BUTTON" || element.tagName === "A" || (element.onclick != null) || window.getComputedStyle(element).cursor == "pointer") || + (element.tagName === "IFRAME" || element.tagName === "VIDEO" || element.tagName === "LI" || element.tagName === "TD" || element.tagName === "OPTION"), + area, + rects, + text: element.textContent.trim().replace(/\s{2,}/g, ' ') + }; + }).filter(item => + item.include && (item.area >= 20) + ); + + // Filter out buttons and elements that are contained by buttons or spans + const buttons = Array.from(document.querySelectorAll('button, a, input[type="button"], div[role="button"]' )); + items = items.filter(x => !buttons.some(y => items.some(z => z.element === y) && y.contains(x.element) && !(x.element === y))); + items = items.filter(x => + !(x.element.parentNode && + x.element.parentNode.tagName === 'SPAN' && 
+ x.element.parentNode.children.length === 1 && + x.element.parentNode.getAttribute('role') && + items.some(y => y.element === x.element.parentNode))); + items = items.filter(x => !items.some(y => x.element.contains(y.element) && !(x == y))); + + return items; +} + +// Set interactive marks on the current web page +function setInteractiveMarks() { + var items = getInteractiveElements(); + + // Init an array to record the item information + var itemsInfo = []; + items.forEach(function(item, index) { + item.rects.forEach((bbox) => { + // Create a mark element for each interactive element + let newElement = document.createElement("div"); + newElement.classList.add("agentscope-interactive-mark") + + // border + var borderColor = "#000000"; + newElement.style.outline = `2px dashed ${borderColor}`; + newElement.style.position = "fixed"; + newElement.style.left = bbox.left + "px"; + newElement.style.top = bbox.top + "px"; + newElement.style.width = bbox.width + "px"; + newElement.style.height = bbox.height + "px"; + newElement.style.pointerEvents = "none"; + newElement.style.boxSizing = "border-box"; + newElement.style.zIndex = 2147483647; + + // index label + var label = document.createElement("span"); + label.textContent = index; + label.style.position = "absolute"; + label.style.top = Math.max(-19, -bbox.top) + "px"; + label.style.left = Math.min(Math.floor(bbox.width / 5), 2) + "px"; + label.style.background = borderColor; + label.style.color = "white"; + label.style.padding = "2px 4px"; + label.style.fontSize = "12px"; + label.style.borderRadius = "2px"; + newElement.appendChild(label); + + document.body.appendChild(newElement); + }); + + // Record the item information + itemsInfo[index] = getElementInfo(index, item.element); + }); + return {items, itemsInfo}; +} +// 0) { + marks[0].parentNode.removeChild(marks[0]); + } +} \ No newline at end of file diff --git a/src/agentscope/service/browser/web_browser.py b/src/agentscope/service/browser/web_browser.py new file 
mode 100644 index 000000000..b0958eb3c --- /dev/null +++ b/src/agentscope/service/browser/web_browser.py @@ -0,0 +1,475 @@ +# -*- coding: utf-8 -*- +# pylint: disable=C0301 +"""The web browser module for agent to interact with web pages.""" +import time +from pathlib import Path +from typing import Union, Callable, Optional + +import requests +from loguru import logger +from pydantic import BaseModel + +from agentscope.service import ServiceResponse, ServiceExecStatus + + +class WebElementInfo(BaseModel): + """The information of a web interactive element.""" + + html: str + """The html content of the element.""" + + tag_name: str + """The tage name of the element.""" + + node_name: str + """The node name of the element.""" + + node_value: Union[None, str] + """The node value of the element.""" + + type: Union[None, str] + """The type of the element.""" + + aria_label: Union[None, str] + """The aria label of the element.""" + + is_clickable: Union[str, bool] + """Whether the element is clickable. If clickable, the value is the + link of the element, otherwise, the value is False.""" + + meta_data: list[str] + """The meta data of the elements, e.g. attributes""" + + inner_text: str + """The text content of the element.""" + + origin_x: float + """The x coordinate of the origin of the element.""" + + origin_y: float + """The y coordinate of the origin of the element.""" + + width: float + """The width of the element.""" + + height: float + """The height of the element.""" + + +class WebBrowser: + """The web browser for agent, which is implemented with playwright. This + module allows agent to interact with web pages, such as visiting a web + page, clicking on elements, typing text, scrolling web page, etc. + + Note: + 1. This module is still under development, and changes will be made in + the future. + 2. In Playwright, because of its asynchronous operations, it is + essential to use `if __name__ == "__main__":` to designate the main + entry point of the program. 
This practice ensures that asynchronous + functions are executed correctly within the appropriate context. + + Install: + Execute the following code to install the required packages: + + .. code-block:: bash + + pip install playwright + playwright install + + + Details: + 1. The actions that the agent can take in the web browser include: + "action_click", "action_type", "action_scroll_up", + "action_scroll_down", "action_press_key", and "action_visit_url". + 2. You can extract the html content, title, url, screenshot of the + current web page by calling the corresponding properties, e.g. + `url`, `page_html`, `page_title`, `page_screenshot`. + 3. You can set or remove the interactive marks on the web page by + calling the `set_interactive_marks` and `remove_interactive_marks` + methods. + + + Examples: + .. code-block:: python + + from agentscope.service import WebBrowser + import time + + if __name__ == "__main__": + browser = WebBrowser() + # Visit the specific web page + browser.action_visit_url("https://www.bing.com") + # Set the interactive marks on the web page + browser.set_interactive_marks() + + time.sleep(5) + + browser.close() + + """ + + _actions = [ + "action_click", + "action_type", + "action_scroll_up", + "action_scroll_down", + "action_press_key", + "action_visit_url", + ] + """The available actions for the web browser that agent can takes.""" + + _interactive_elements: list + """To record the interactive elements on the current page, which will be + set after calling the `set_interactive_marks` method, and cleared after + calling the `remove_interactive_marks` method.""" + + def __init__( + self, + timeout: int = 30, + browser_visible: bool = True, + browser_width: int = 1280, + browser_height: int = 1080, + ) -> None: + """Initialize the web browser module. + + Args: + timeout (`int`, defaults to `30`): + The timeout (in seconds) for the browser to wait for the + page to load, defaults to 30s.
+ browser_visible (`bool`, defaults to `True`): + Whether the browser is visible. + browser_width (`int`, defaults to `1280`): + The width of the browser. Defaults to 1280. + browser_height (`int`, defaults to `1080`): + The height of the browser. Defaults to 1080. + """ + + try: + from playwright.sync_api import sync_playwright + except ImportError as e: + raise ImportError( + "Please install the `playwright` package by running `pip " + "install playwright; playwright install` to use the web " + "browser. \n" + "More details refer to " + "https://playwright.dev/python/docs/intro.", + ) from e + + # Init a web page + playwright_process = sync_playwright().start() + self.browser = playwright_process.chromium.launch( + headless=not browser_visible, + ) + + self._page = self.browser.new_page() + self._page.set_default_timeout(timeout * 1000) + self._page.set_viewport_size( + { + "width": browser_width, + "height": browser_height, + }, + ) + + # Init a list to store the interactive elements on the page + self._interactive_elements = [] + + # Load the external JavaScript to crawl the page + self._load_external_js() + + @property + def url(self) -> str: + """The url of current page.""" + return self._page.url + + @property + def page_html(self) -> str: + """The html content of current page.""" + return self._page.content() + + @property + def page_title(self) -> str: + """The title of current page.""" + return self._page.title() + + @property + def page_markdown(self) -> str: + """The content of current page in Markdown format.""" + try: + import markdownify + except ImportError as e: + raise ImportError( + "Please install the `markdownify` package to convert the page" + "content to markdown format.", + ) from e + + return markdownify.markdownify(self.page_html) + + @property + def page_screenshot(self) -> bytes: + """The screenshot of the current page.""" + return self._page.screenshot() + + # ----- Actions which are performed within a web page
--------------------- + def action_click(self, element_id: int) -> ServiceResponse: + """Click on the element with the given id. + + Args: + element_id (`int`): + The id of the element to click. + + Returns: + `ServiceResponse`: + The response of the click action. + """ + if not self._verify_element_id(element_id): + return ServiceResponse( + status=ServiceExecStatus.ERROR, + content=f"ElementIDError: The element id must be in the range " + f"[0, {len(self._interactive_elements) - 1}], but " + f"got {element_id}.", + ) + + element_handle = self._interactive_elements[element_id] + element_handle.evaluate( + "element => element.setAttribute('target', '_self')", + ) + + element_handle.click() + + self._wait_for_load( + "Wait for click event", + "Finished", + ) + + return ServiceResponse( + status=ServiceExecStatus.SUCCESS, + content=f"Click on element {element_id} done", + ) + + def action_type( + self, + element_id: int, + text: str, + submit: bool, + ) -> ServiceResponse: + """Type text into the element with the given id. + + Args: + element_id (`int`): + The id of the element to type text into. + text (`str`): + The text to type into the element. + submit (`bool`): + If press the "Enter" after typing text. + + Returns: + `ServiceResponse`: + The response of the type action. 
+ """ + if not self._verify_element_id(element_id): + return ServiceResponse( + status=ServiceExecStatus.ERROR, + content=f"ElementIDError: The element id must be in the range " + f"[0, {len(self._interactive_elements) - 1}], but " + f"got {element_id}.", + ) + + self.action_click(element_id) + web_ele = self._interactive_elements[element_id] + + # Try to clear the text within the given elements + try: + self._page.evaluate('element => element.value = ""', web_ele) + except Exception as e: + logger.info( + f"Exception {str(e)}, " + "unable to clear the value within the given elements.", + ) + + # Focus on the element to type + self._page.evaluate("element => element.focus()", web_ele) + + # Type in the text + web_ele.type(text) + self._wait_for_load( + "Wait for finish typing", + "Finished", + ) + + if submit: + self.action_press_key("Enter") + + return ServiceResponse( + status=ServiceExecStatus.SUCCESS, + content="Typing done", + ) + + def action_scroll_up(self) -> ServiceResponse: + """Scroll up the current web page.""" + self._page.keyboard.down("Alt") + self._page.keyboard.press("ArrowUp") + self._page.keyboard.up("Alt") + return ServiceResponse( + status=ServiceExecStatus.SUCCESS, + content="Scroll up done", + ) + + def action_scroll_down(self) -> ServiceResponse: + """Scroll down the current web page.""" + self._page.keyboard.down("Alt") + self._page.keyboard.press("ArrowDown") + self._page.keyboard.up("Alt") + return ServiceResponse( + status=ServiceExecStatus.SUCCESS, + content="Scroll down done", + ) + + def action_press_key(self, key: str) -> ServiceResponse: + """Press down a key in the current web page. + + Args: + key (`str`): + Chosen from `F1` - `F12`, `Digit0`- `Digit9`, `KeyA`- `KeyZ`, `Backquote`, `Minus`, `Equal`, `Backslash`, `Backspace`, `Tab`, `Delete`, `Escape`, `ArrowDown`, `End`, `Enter`, `Home`, `Insert`, `PageDown`, `PageUp`, `ArrowRight`, `ArrowUp`, etc. 
+ """ # noqa + self._page.keyboard.press(key) + + # TODO: in a more elegant way to wait for the page to be loaded rather + # than using time.sleep + # Wait for the page to be loaded + time.sleep(2) + + self._wait_for_load( + f"Wait for press key: {key}", + "Finished", + ) + response = ServiceResponse( + status=ServiceExecStatus.SUCCESS, + content=f"Press key: {key} done", + ) + return response + + # ------ Actions which are performed to change the web page --------------- + def action_visit_url(self, url: str) -> ServiceResponse: + """Visit the given url. + + Args: + url (`str`): + The url to visit in browser. + """ + self._page.goto(url) + self._wait_for_load( + f"Wait for page {url} to load.", + ) + + return ServiceResponse( + status=ServiceExecStatus.SUCCESS, + content="Load web page successfully.", + ) + + def get_action_functions(self) -> dict[str, Callable]: + """Return a dictionary of the action functions, where the key is the + action name and the value is the corresponding function.""" + return { + action_name: getattr(self, action_name) + for action_name in self._actions + } + + # ------ Set or remove marks of interactive elements on the web page ------ + def set_interactive_marks(self) -> list[WebElementInfo]: + """Mark the interactive elements on the current web page.""" + + # Remove the existing interactive elements on the current web page + self.remove_interactive_marks() + + # Load the javascript to crawl the page + self._page.evaluate(self._mark_page_js) + + # Mark the interactive elements on the web page by calling the + # JavaScript function `setInteractiveMarks` + result_handle = self._page.evaluate_handle("setInteractiveMarks()") + + # Record the interactive elements on the current page + interactive_items_handle = result_handle.get_property("items") + items_info_handle = result_handle.get_property("itemsInfo") + + js_handles = ( + interactive_items_handle.evaluate_handle("items => items") + .get_properties() + .values() + ) + self._interactive_elements =
[ + item.get_property("element").as_element() for item in js_handles + ] + + # Get the interactive items + return [ + WebElementInfo(**info) for info in items_info_handle.json_value() + ] + + def remove_interactive_marks(self) -> None: + """Remove the interactive marks on the current web page.""" + # Load the javascript to crawl the page + self._page.evaluate(self._mark_page_js) + + # Remove the interactive marks on the web page by calling the + # JavaScript function `removeInteractiveMarks` + self._page.evaluate("removeInteractiveMarks()") + + self._interactive_elements.clear() + + def close(self) -> None: + """Close the browser""" + self.browser.close() + + def _wait_for_load( + self, + hint_s: str, + hint_e: str = "Page loaded.", + timeout: Optional[int] = None, + ) -> None: + """Wait to ensure the page is loaded after certain actions. + + Args: + hint_s (`str`): + The hint message before waiting. + hint_e (`str`): + The hint message after waiting. + timeout (`Optional[int]`, defaults to `None`): + The timeout for the page to load (in seconds) + """ + logger.debug(hint_s) + + if timeout is not None: + timeout = timeout * 1000 + + self._page.wait_for_load_state("load", timeout=timeout) + logger.debug(hint_e) + + def _verify_element_id(self, element_id: int) -> bool: + """Verify the given element id is valid or not.""" + return 0 <= element_id < len(self._interactive_elements) + + def _load_external_js(self) -> None: + """Read the JavaScript from local file. The JavaScript written in + markpage.js will crawl all the interactive elements on the page, + and mark them with square marks.
+ """ + current_file_path = Path(__file__) + js_file_path = current_file_path.parent / "markpage.js" + + with open(js_file_path, "r", encoding="utf-8") as file: + self._mark_page_js = file.read() + + def _get_jina_page(self) -> str: + """Return the formatted current page text, using api from jina""" + jina_url = "https://r.jina.ai/" + self.url + try: + page_text = requests.get(jina_url).text + return page_text + except Exception as e: + return ( + f"Encounter exception {str(e)}" + f"The page in {self.url} might not be loaded yet" + "you might want to check the request connection or api quota." + ) diff --git a/src/agentscope/service/service_toolkit.py b/src/agentscope/service/service_toolkit.py index e5fc47de2..8f0013bda 100644 --- a/src/agentscope/service/service_toolkit.py +++ b/src/agentscope/service/service_toolkit.py @@ -21,9 +21,11 @@ JsonParsingError, FunctionNotFoundError, FunctionCallFormatError, + FunctionCallError, ) from .service_response import ServiceResponse from .service_response import ServiceExecStatus +from ..message import Msg try: from docstring_parser import parse @@ -414,17 +416,30 @@ def _execute_func(self, cmds: List[dict]) -> str: return execute_results_prompt - def parse_and_call_func(self, text_cmd: Union[list[dict], str]) -> str: + def parse_and_call_func( + self, + text_cmd: Union[list[dict], str], + raise_exception: bool = False, + ) -> Msg: """Parse, check the text and call the function.""" - # --- Step 1: Parse the text according to the tools_call_format - cmds = self._parse_and_check_text(text_cmd) + try: + # --- Step 1: Parse the text according to the tools_call_format + cmds = self._parse_and_check_text(text_cmd) - # --- Step 2: Call the service function --- + # --- Step 2: Call the service function --- - execute_results_prompt = self._execute_func(cmds) + execute_results_prompt = self._execute_func(cmds) - return execute_results_prompt + except FunctionCallError as e: + # Catch the function calling error that can be handled by + 
# the model + if raise_exception: + raise e from None + + execute_results_prompt = str(e) + + return Msg("system", execute_results_prompt, "system") @classmethod def get(