langgenius · crazywoola · Nov 18, 2024 · Nov 17, 2024 · Nov 17, 2024
diff --git a/api/core/tools/provider/builtin/tavily/tavily.yaml b/api/core/tools/provider/builtin/tavily/tavily.yaml
@@ -1,14 +1,12 @@
 identity:
-  author: Yash Parmar
+  author: Yash Parmar, Kalo Chin
   name: tavily
   label:
-    en_US: Tavily
-    zh_Hans: Tavily
-    pt_BR: Tavily
+    en_US: Tavily Search & Extract
+    zh_Hans: Tavily 搜索和提取
   description:
-    en_US: Tavily
-    zh_Hans: Tavily
-    pt_BR: Tavily
+    en_US: A powerful AI-native search engine and web content extraction tool that provides highly relevant search results and raw content extraction from web pages.
+    zh_Hans: 一个强大的原生AI搜索引擎和网页内容提取工具，提供高度相关的搜索结果和网页原始内容提取。
   icon: icon.png
   tags:
     - search
@@ -19,13 +17,10 @@ credentials_for_provider:
     label:
       en_US: Tavily API key
       zh_Hans: Tavily API key
-      pt_BR: Tavily API key
     placeholder:
       en_US: Please input your Tavily API key
       zh_Hans: 请输入你的 Tavily API key
-      pt_BR: Please input your Tavily API key
     help:
       en_US: Get your Tavily API key from Tavily
       zh_Hans: 从 TavilyApi 获取您的 Tavily API key
-      pt_BR: Get your Tavily API key from Tavily
-    url: https://docs.tavily.com/docs/welcome
+    url: https://app.tavily.com/home
diff --git a/api/core/tools/provider/builtin/tavily/tools/tavily_extract.py b/api/core/tools/provider/builtin/tavily/tools/tavily_extract.py
@@ -0,0 +1,145 @@
+from typing import Any
+
+import requests
+
+from core.tools.entities.tool_entities import ToolInvokeMessage
+from core.tools.tool.builtin_tool import BuiltinTool
+
+# Base URL of the Tavily REST API; endpoint paths such as "/extract" are appended to it.
+TAVILY_API_URL = "https://api.tavily.com"
+
+
+class TavilyExtract:
+    """
+    A class for extracting content from web pages using the Tavily Extract API.
+
+    Args:
+        api_key (str): The API key for accessing the Tavily Extract API.
+        timeout (float): Seconds to wait for the API before the request is aborted.
+
+    Methods:
+        extract_content: Retrieves extracted content from the Tavily Extract API.
+    """
+
+    def __init__(self, api_key: str, timeout: float = 60.0) -> None:
+        self.api_key = api_key
+        self.timeout = timeout
+
+    def extract_content(self, params: dict[str, Any]) -> dict:
+        """
+        Retrieves extracted content from the Tavily Extract API.
+
+        The caller's ``params`` dict is never modified; in particular the API key
+        is not written back into it.
+
+        Args:
+            params (Dict[str, Any]): The extraction parameters. Must contain 'urls';
+                may contain 'api_key' to override the key given at construction.
+
+        Returns:
+            dict: The decoded JSON body of the API response.
+
+        Raises:
+            ValueError: If 'urls' is missing or contains no usable URL.
+            requests.RequestException: If the request fails, times out, or the API
+                answers with an HTTP error status.
+        """
+        # Builds a fresh payload, falling back to self.api_key when none is supplied.
+        processed_params = self._process_params(params)
+
+        # Without an explicit timeout, requests would wait indefinitely on a stalled server.
+        response = requests.post(f"{TAVILY_API_URL}/extract", json=processed_params, timeout=self.timeout)
+        response.raise_for_status()
+        return response.json()
+
+    def _process_params(self, params: dict[str, Any]) -> dict:
+        """
+        Processes and validates the extraction parameters.
+
+        Args:
+            params (Dict[str, Any]): The extraction parameters. 'urls' may be a string
+                of URLs separated by commas and/or whitespace, or a list of URL strings.
+
+        Returns:
+            dict: A new dict holding only 'urls' (a non-empty list of strings) and 'api_key'.
+
+        Raises:
+            ValueError: If 'urls' is missing, has an unsupported type, or yields no URL.
+        """
+        if "urls" not in params:
+            raise ValueError("The 'urls' parameter is required.")
+
+        urls = params["urls"]
+        if isinstance(urls, str):
+            # split() without arguments already discards surrounding whitespace and empty pieces.
+            candidates = urls.replace(",", " ").split()
+        elif isinstance(urls, list) and all(isinstance(url, str) for url in urls):
+            candidates = [url.strip() for url in urls]
+        else:
+            raise ValueError("The 'urls' parameter must be a string or a list of strings.")
+
+        cleaned_urls = [url for url in candidates if url]
+        if not cleaned_urls:
+            raise ValueError("The 'urls' parameter must contain at least one URL.")
+
+        # Only 'urls' and 'api_key' are forwarded; any other key in params is dropped.
+        return {
+            "urls": cleaned_urls,
+            "api_key": params.get("api_key", self.api_key),
+        }
+
+
+class TavilyExtractTool(BuiltinTool):
+    """
+    A tool for extracting content from web pages using Tavily Extract.
+    """
+
+    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> ToolInvokeMessage | list[ToolInvokeMessage]:
+        """
+        Invokes the Tavily Extract tool with the given user ID and tool parameters.
+
+        Args:
+            user_id (str): The ID of the user invoking the tool.
+            tool_parameters (Dict[str, Any]): The parameters for the Tavily Extract tool.
+                Only 'urls' is read here; the dict itself is left unmodified.
+
+        Returns:
+            ToolInvokeMessage | list[ToolInvokeMessage]: A single text message when the
+            key or URLs are missing, the request fails, or nothing was extracted;
+            otherwise a JSON message with the raw response followed by a markdown text message.
+        """
+        urls = tool_parameters.get("urls", "")
+        api_key = self.runtime.credentials.get("tavily_api_key")
+        if not api_key:
+            return self.create_text_message(
+                "Tavily API key is missing. Please set the 'tavily_api_key' in credentials."
+            )
+        # A whitespace-only string is as empty as "" for our purposes.
+        if not urls or (isinstance(urls, str) and not urls.strip()):
+            return self.create_text_message("Please input at least one URL to extract.")
+
+        tavily_extract = TavilyExtract(api_key)
+        try:
+            # Hand over a copy so the client cannot modify the caller's parameters.
+            raw_results = tavily_extract.extract_content(dict(tool_parameters))
+        except (requests.RequestException, ValueError) as e:
+            # RequestException covers HTTP error statuses as well as connection failures
+            # and timeouts; ValueError covers rejected parameters and undecodable responses.
+            return self.create_text_message(f"Error occurred while extracting content: {e}")
+
+        if not raw_results.get("results"):
+            return self.create_text_message("No content could be extracted from the provided URLs.")
+
+        # Always return the JSON message with all data, followed by a readable rendering.
+        json_message = self.create_json_message(raw_results)
+        text_message = self.create_text_message(text=self._format_results_as_text(raw_results))
+
+        return [json_message, text_message]
+
+    def _format_results_as_text(self, raw_results: dict) -> str:
+        """
+        Formats the raw extraction results into a markdown text.
+
+        Args:
+            raw_results (dict): The raw extraction results. 'results' entries are read for
+                'url' and 'raw_content'; 'failed_results' entries for 'url' and 'error'.
+
+        Returns:
+            str: The formatted markdown text: one section per extracted page, then a
+            list of failed URLs when there are any.
+        """
+        output_lines = []
+
+        for idx, result in enumerate(raw_results.get("results", []), 1):
+            url = result.get("url", "")
+            raw_content = result.get("raw_content", "")
+
+            output_lines.append(f"## Extracted Content {idx}: {url}\n")
+            output_lines.append(f"**Raw Content:**\n{raw_content}\n")
+            output_lines.append("---\n")
+
+        failed_results = raw_results.get("failed_results")
+        if failed_results:
+            output_lines.append("## Failed URLs:\n")
+            for failed in failed_results:
+                url = failed.get("url", "")
+                error = failed.get("error", "Unknown error")
+                output_lines.append(f"- {url}: {error}\n")
+
+        return "\n".join(output_lines)
diff --git a/api/core/tools/provider/builtin/tavily/tools/tavily_extract.yaml b/api/core/tools/provider/builtin/tavily/tools/tavily_extract.yaml
@@ -0,0 +1,23 @@
+identity:
+  name: tavily_extract
+  author: Kalo Chin
+  label:
+    en_US: Tavily Extract
+    zh_Hans: Tavily Extract
+description:
+  human:
+    en_US: A web extraction tool built specifically for AI agents (LLMs), delivering raw content from web pages.
+    zh_Hans: 专为人工智能代理 (LLM) 构建的网页提取工具，提供网页的原始内容。
+  llm: A tool for extracting raw content from web pages, designed for AI agents (LLMs).
+parameters:
+  - name: urls
+    type: string
+    required: true
+    label:
+      en_US: URLs
+      zh_Hans: URLs
+    human_description:
+      en_US: A comma-separated list of URLs to extract content from.
+      zh_Hans: 要从中提取内容的 URL 的逗号分隔列表。
+    llm_description: A comma-separated list of URLs to extract content from.
+    form: llm