From bd6dd82e53fa5657ddc434912a3195e70b247830 Mon Sep 17 00:00:00 2001
From: ahasasjeb
Date: Tue, 2 Jul 2024 12:49:18 +0000
Subject: [PATCH 01/21] Remove firecrawl-py and optimize code

---
 .../builtin/firecrawl/firecrawl_appx.py       | 148 +++++++++---------
 api/poetry.lock                               |  16 +-
 api/pyproject.toml                            |   1 -
 3 files changed, 73 insertions(+), 92 deletions(-)

diff --git a/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py b/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
index a28f479170a267..a0241c7ff1e6e5 100644
--- a/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
+++ b/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
@@ -1,98 +1,94 @@
+import os
+import logging
 import time
-
 import requests
+from requests.exceptions import HTTPError
 
 
 class FirecrawlApp:
-    def __init__(self, api_key=None, base_url=None):
+    def __init__(self, api_key=None, api_url=None):
         self.api_key = api_key
-        self.base_url = base_url or 'https://api.firecrawl.dev'
-        if self.api_key is None and self.base_url == 'https://api.firecrawl.dev':
-            raise ValueError('No API key provided')
+        if not self.api_key:
+            raise ValueError("API key is required")
+        self.api_url = api_url or 'https://api.firecrawl.dev'
+        self.logger = logging.getLogger(__name__)
 
-    def scrape_url(self, url, params=None) -> dict:
+    def _prepare_headers(self, idempotency_key=None):
         headers = {
             'Content-Type': 'application/json',
             'Authorization': f'Bearer {self.api_key}'
         }
-        json_data = {'url': url}
-        if params:
-            json_data.update(params)
-        response = requests.post(
-            f'{self.base_url}/v0/scrape',
-            headers=headers,
-            json=json_data
-        )
-        if response.status_code == 200:
-            response = response.json()
-            if response['success'] == True:
-                return response['data']
-            else:
-                raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
+        if idempotency_key:
+            headers['Idempotency-Key'] = idempotency_key
+        return headers
 
-        elif response.status_code in [402, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
-        else:
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
+    def _request(self, method, url, data=None, headers=None, retries=3, backoff_factor=0.3):
+        for i in range(retries):
+            try:
+                response = requests.request(method, url, json=data, headers=headers)
+                response.raise_for_status()
+                return response.json()
+            except HTTPError as e:
+                if response.status_code == 502 and i < retries - 1:
+                    time.sleep(backoff_factor * (2 ** i))
+                else:
+                    self._handle_error(response)
+        return None
 
-    def crawl_url(self, url, params=None, wait_until_done=True, timeout=2) -> str:
-        headers = self._prepare_headers()
-        json_data = {'url': url}
-        if params:
-            json_data.update(params)
-        response = self._post_request(f'{self.base_url}/v0/crawl', json_data, headers)
-        if response.status_code == 200:
-            job_id = response.json().get('jobId')
-            if wait_until_done:
-                return self._monitor_job_status(job_id, headers, timeout)
-            else:
-                return {'jobId': job_id}
+    def _handle_error(self, response):
+        try:
+            error_detail = response.json()
+        except ValueError:
+            error_detail = response.text
+        if response.status_code == 500:
+            raise HTTPError(f'Server Error: {error_detail}')
         else:
-            self._handle_error(response, 'start crawl job')
+            raise HTTPError(f'{response.status_code} Error: {error_detail}')
 
-    def check_crawl_status(self, job_id) -> dict:
+    def scrape_url(self, url, **kwargs):
+        endpoint = f'{self.api_url}/v0/scrape'
         headers = self._prepare_headers()
-        response = self._get_request(f'{self.base_url}/v0/crawl/status/{job_id}', headers)
-        if response.status_code == 200:
-            return response.json()
-        else:
-            self._handle_error(response, 'check crawl status')
+        data = {'url': url, **kwargs}
+        return self._request('POST', endpoint, data, headers)
 
-    def _prepare_headers(self):
-        return {
-            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}'
-        }
+    def search(self, query, **kwargs):
+        endpoint = f'{self.api_url}/v0/search'
+        headers = self._prepare_headers()
+        data = {'query': query, **kwargs}
 
-    def _post_request(self, url, data, headers):
-        return requests.post(url, headers=headers, json=data)
+    def crawl_url(self, url, wait=False, poll_interval=5, idempotency_key=None, **kwargs):
+        endpoint = f'{self.api_url}/v0/crawl'
+        headers = self._prepare_headers(idempotency_key)
+        data = {'url': url, **kwargs}
+        job_id = self._request('POST', endpoint, data, headers)
+        if wait:
+            return self._monitor_job_status(job_id, headers, poll_interval)
+        return job_id
 
-    def _get_request(self, url, headers):
-        return requests.get(url, headers=headers)
+    def check_crawl_status(self, job_id):
+        endpoint = f'{self.api_url}/v0/crawl/status/{job_id}'
+        headers = self._prepare_headers()
+        return self._request('GET', endpoint, headers=headers)
 
-    def _monitor_job_status(self, job_id, headers, timeout):
+    def _monitor_job_status(self, job_id, headers, poll_interval):
         while True:
-            status_response = self._get_request(f'{self.base_url}/v0/crawl/status/{job_id}', headers)
-            if status_response.status_code == 200:
-                status_data = status_response.json()
-                if status_data['status'] == 'completed':
-                    if 'data' in status_data:
-                        return status_data['data']
-                    else:
-                        raise Exception('Crawl job completed but no data was returned')
-                elif status_data['status'] in ['active', 'paused', 'pending', 'queued']:
-                    if timeout < 2:
-                        timeout = 2
-                    time.sleep(timeout)  # Wait for the specified timeout before checking again
-                else:
-                    raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
-            else:
-                self._handle_error(status_response, 'check crawl status')
+            status = self.check_crawl_status(job_id)
+            if status['status'] == 'completed':
+                return status
+            elif status['status'] == 'failed':
+                raise HTTPError(f'Job {job_id} failed: {status["error"]}')
+            time.sleep(poll_interval)
 
-    def _handle_error(self, response, action):
-        if response.status_code in [402, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
-        else:
-            raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')
+
+# Example usage
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.DEBUG)
+    api_key = os.getenv('FIRECRAWL_API_KEY')
+    app = DifyFirecrawlApp(api_key)
+    try:
+        result = app.scrape_url('https://example.com')
+        print(result)
+        exit(0)
+    except HTTPError as e:
+        print("Error:", e)
\ No newline at end of file
diff --git a/api/poetry.lock b/api/poetry.lock
index 961b2748b44f76..1bfa9716815805 100644
--- a/api/poetry.lock
+++ b/api/poetry.lock
@@ -2083,20 +2083,6 @@ files = [
     {file = "filetype-1.2.0.tar.gz", hash = "sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb"},
 ]
 
-[[package]]
-name = "firecrawl-py"
-version = "0.0.5"
-description = "Python SDK for Firecrawl API"
-optional = false
-python-versions = "*"
-files = [
-    {file = "firecrawl-py-0.0.5.tar.gz", hash = "sha256:3d1cc30b7d86c12aa06e6434ebb526072cd70ab9a0c8b145008efe044a1cd09c"},
-    {file = "firecrawl_py-0.0.5-py3-none-any.whl", hash = "sha256:476694345141c0145a1bee9c01a8ad0103f75892c12a122dc511a3adad0785e7"},
-]
-
-[package.dependencies]
-requests = "*"
-
 [[package]]
 name = "flask"
 version = "3.0.3"
@@ -9095,4 +9081,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "90f0e77567fbe5100d15bf2bc9472007aafc53c2fd594b6a90dd8455dea58582"
+content-hash = "420c866aaff914d48c00c443a59f181c778690c24f81a955b1f970729bb441b7"
diff --git a/api/pyproject.toml b/api/pyproject.toml
index c919b33856af28..f157fab34637ff 100644
--- a/api/pyproject.toml
+++ b/api/pyproject.toml
@@ -115,7 +115,6 @@ chardet = "~5.1.0"
 cohere = "~5.2.4"
 cos-python-sdk-v5 = "1.9.30"
 dashscope = { version = "~1.17.0", extras = ["tokenizer"] }
-firecrawl-py = "0.0.5"
 flask = "~3.0.1"
 flask-compress = "~1.14"
 flask-cors = "~4.0.0"

From 3d8c0aacd1a72906f714565b044a51cd55df2695 Mon Sep 17 00:00:00 2001
From: ahasasjeb
Date: Tue, 2 Jul 2024 13:34:01 +0000
Subject: [PATCH 02/21] update

---
 .../provider/builtin/firecrawl/tools/crawl.py | 27 ++++---------------
 1 file changed, 5 insertions(+), 22 deletions(-)

diff --git a/api/core/tools/provider/builtin/firecrawl/tools/crawl.py b/api/core/tools/provider/builtin/firecrawl/tools/crawl.py
index ab3a73dd03a792..30ffcb0b48262c 100644
--- a/api/core/tools/provider/builtin/firecrawl/tools/crawl.py
+++ b/api/core/tools/provider/builtin/firecrawl/tools/crawl.py
@@ -7,8 +7,7 @@ class CrawlTool(BuiltinTool):
     def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
-        # initialize the app object with the api key
-        app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'], base_url=self.runtime.credentials['base_url'])
+        app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'], api_url=self.runtime.credentials['base_url'])
 
         options = {
             'crawlerOptions': {
@@ -21,29 +20,13 @@ def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolIn
             }
         }
 
-        # crawl the url
         crawl_result = app.crawl_url(
             url=tool_parameters['url'],
             params=options,
-            wait_until_done=True,
+            wait=True
         )
-
-        # reformat crawl result
-        crawl_output = "**Crawl Result**\n\n"
-        try:
-            for result in crawl_result:
-                crawl_output += f"**- Title:** {result.get('metadata', {}).get('title', '')}\n"
-                crawl_output += f"**- Description:** {result.get('metadata', {}).get('description', '')}\n"
-                crawl_output += f"**- URL:** {result.get('metadata', {}).get('ogUrl', '')}\n\n"
-                crawl_output += f"**- Web Content:**\n{result.get('markdown', '')}\n\n"
-                crawl_output += "---\n\n"
-        except Exception as e:
-            crawl_output += f"An error occurred: {str(e)}\n"
-            crawl_output += f"**- Title:** {result.get('metadata', {}).get('title', '')}\n"
-            crawl_output += f"**- Description:** {result.get('metadata', {}).get('description','')}\n"
-            crawl_output += f"**- URL:** {result.get('metadata', {}).get('ogUrl', '')}\n\n"
-            crawl_output += f"**- Web Content:**\n{result.get('markdown', '')}\n\n"
-            crawl_output += "---\n\n"
+        if not crawl_result:
+            return self.create_text_message("Crawl request failed.")
 
-        return self.create_text_message(crawl_output)
\ No newline at end of file
+        return self.create_text_message(crawl_result)
\ No newline at end of file

From cd733c5646364b9c97cb2d4d85b405a9da4c5e09 Mon Sep 17 00:00:00 2001
From: ahasasjeb
Date: Tue, 2 Jul 2024 23:22:03 +0000
Subject: [PATCH 03/21] Update

---
 .../builtin/firecrawl/firecrawl_appx.py       | 25 ++++---------------
 .../provider/builtin/firecrawl/tools/crawl.py | 10 +++++---
 2 files changed, 12 insertions(+), 23 deletions(-)

diff --git a/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py b/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
index a0241c7ff1e6e5..9cfd000801f86c 100644
--- a/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
+++ b/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
@@ -1,17 +1,14 @@
 import os
-import logging
 import time
 import requests
 from requests.exceptions import HTTPError
 
-
 class FirecrawlApp:
     def __init__(self, api_key=None, api_url=None):
         self.api_key = api_key
         if not self.api_key:
             raise ValueError("API key is required")
         self.api_url = api_url or 'https://api.firecrawl.dev'
-        self.logger = logging.getLogger(__name__)
 
     def _prepare_headers(self, idempotency_key=None):
         headers = {
@@ -32,19 +29,9 @@ def _request(self, method, url, data=None, headers=None, retries=3, backoff_fact
                 if response.status_code == 502 and i < retries - 1:
                     time.sleep(backoff_factor * (2 ** i))
                 else:
-                    self._handle_error(response)
+                    raise
         return None
 
-    def _handle_error(self, response):
-        try:
-            error_detail = response.json()
-        except ValueError:
-            error_detail = response.text
-        if response.status_code == 500:
-            raise HTTPError(f'Server Error: {error_detail}')
-        else:
-            raise HTTPError(f'{response.status_code} Error: {error_detail}')
-
     def scrape_url(self, url, **kwargs):
         endpoint = f'{self.api_url}/v0/scrape'
         headers = self._prepare_headers()
@@ -61,7 +48,8 @@ def crawl_url(self, url, wait=False, poll_interval=5, idempotency_key=None, **kw
         endpoint = f'{self.api_url}/v0/crawl'
         headers = self._prepare_headers(idempotency_key)
         data = {'url': url, **kwargs}
-        job_id = self._request('POST', endpoint, data, headers)
+        response = self._request('POST', endpoint, data, headers)
+        job_id = response['jobId']  # make sure the correct key name is used
         if wait:
             return self._monitor_job_status(job_id, headers, poll_interval)
         return job_id
@@ -80,15 +68,12 @@ def _monitor_job_status(self, job_id, headers, poll_interval):
             elif status['status'] == 'failed':
                 raise HTTPError(f'Job {job_id} failed: {status["error"]}')
             time.sleep(poll_interval)
 
-# Example usage
 if __name__ == "__main__":
-    logging.basicConfig(level=logging.DEBUG)
     api_key = os.getenv('FIRECRAWL_API_KEY')
-    app = DifyFirecrawlApp(api_key)
+    app = FirecrawlApp(api_key)
     try:
         result = app.scrape_url('https://example.com')
         print(result)
-        exit(0)
     except HTTPError as e:
-        print("Error:", e)
\ No newline at end of file
+        print("Error:", e)
diff --git a/api/core/tools/provider/builtin/firecrawl/tools/crawl.py b/api/core/tools/provider/builtin/firecrawl/tools/crawl.py
index 30ffcb0b48262c..37a57c201e3226 100644
--- a/api/core/tools/provider/builtin/firecrawl/tools/crawl.py
+++ b/api/core/tools/provider/builtin/firecrawl/tools/crawl.py
@@ -1,10 +1,10 @@
+import json
 from typing import Any, Union
 
 from core.tools.entities.tool_entities import ToolInvokeMessage
 from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp
 from core.tools.tool.builtin_tool import BuiltinTool
 
-
 class CrawlTool(BuiltinTool):
     def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
         app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'], api_url=self.runtime.credentials['base_url'])
@@ -22,11 +22,15 @@ def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolIn
 
         crawl_result = app.crawl_url(
             url=tool_parameters['url'],
-            params=options,
             wait=True
         )
 
+        if isinstance(crawl_result, dict):
+            result_message = json.dumps(crawl_result, ensure_ascii=False, indent=4)
+        else:
+            result_message = str(crawl_result)
+
         if not crawl_result:
             return self.create_text_message("Crawl request failed.")
 
-        return self.create_text_message(crawl_result)
\ No newline at end of file
+        return self.create_text_message(result_message)

From e4640e0b31c8fae558f93482e2c5edca15f33317 Mon Sep 17 00:00:00 2001
From: ahasasjeb
Date: Wed, 3 Jul 2024 01:16:21 +0000
Subject: [PATCH 04/21] Update

---
 .../provider/builtin/firecrawl/firecrawl_appx.py | 15 +++++++--------
 .../provider/builtin/firecrawl/tools/crawl.py    |  2 +-
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py b/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
index 9cfd000801f86c..94364a8742348c 100644
--- a/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
+++ b/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
@@ -4,11 +4,11 @@ from requests.exceptions import HTTPError
 
 class FirecrawlApp:
-    def __init__(self, api_key=None, api_url=None):
+    def __init__(self, api_key=None, base_url=None):
         self.api_key = api_key
         if not self.api_key:
             raise ValueError("API key is required")
-        self.api_url = api_url or 'https://api.firecrawl.dev'
+        self.base_url = base_url or 'https://api.firecrawl.dev'
 
     def _prepare_headers(self, idempotency_key=None):
         headers = {
@@ -33,29 +33,29 @@ def _request(self, method, url, data=None, headers=None, retries=3, backoff_fact
         return None
 
     def scrape_url(self, url, **kwargs):
-        endpoint = f'{self.api_url}/v0/scrape'
+        endpoint = f'{self.base_url}/v0/scrape'
         headers = self._prepare_headers()
         data = {'url': url, **kwargs}
         return self._request('POST', endpoint, data, headers)
 
     def search(self, query, **kwargs):
-        endpoint = f'{self.api_url}/v0/search'
+        endpoint = f'{self.base_url}/v0/search'
         headers = self._prepare_headers()
         data = {'query': query, **kwargs}
         return self._request('POST', endpoint, data, headers)
 
     def crawl_url(self, url, wait=False, poll_interval=5, idempotency_key=None, **kwargs):
-        endpoint = f'{self.api_url}/v0/crawl'
+        endpoint = f'{self.base_url}/v0/crawl'
         headers = self._prepare_headers(idempotency_key)
         data = {'url': url, **kwargs}
         response = self._request('POST', endpoint, data, headers)
-        job_id = response['jobId']  # make sure the correct key name is used
+        job_id = response['jobId']
         if wait:
             return self._monitor_job_status(job_id, headers, poll_interval)
         return job_id
 
     def check_crawl_status(self, job_id):
-        endpoint = f'{self.api_url}/v0/crawl/status/{job_id}'
+        endpoint = f'{self.base_url}/v0/crawl/status/{job_id}'
         headers = self._prepare_headers()
         return self._request('GET', endpoint, headers=headers)
diff --git a/api/core/tools/provider/builtin/firecrawl/tools/crawl.py b/api/core/tools/provider/builtin/firecrawl/tools/crawl.py
index 37a57c201e3226..5c08d9865c581e 100644
--- a/api/core/tools/provider/builtin/firecrawl/tools/crawl.py
+++ b/api/core/tools/provider/builtin/firecrawl/tools/crawl.py
@@ -7,7 +7,7 @@ class CrawlTool(BuiltinTool):
     def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
-        app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'], api_url=self.runtime.credentials['base_url'])
+        app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'], base_url=self.runtime.credentials['base_url'])
 
         options = {

From 8b130cc09d4b9a205849d8d727967cb232c76fc9 Mon Sep 17 00:00:00 2001
From: ahasasjeb
Date: Wed, 3 Jul 2024 01:19:55 +0000
Subject: [PATCH 05/21] self-hosted api key

---
 api/core/tools/provider/builtin/firecrawl/firecrawl.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/api/core/tools/provider/builtin/firecrawl/firecrawl.yaml b/api/core/tools/provider/builtin/firecrawl/firecrawl.yaml
index edd28f7d22b88e..999aa8a4ba545c 100644
--- a/api/core/tools/provider/builtin/firecrawl/firecrawl.yaml
+++ b/api/core/tools/provider/builtin/firecrawl/firecrawl.yaml
@@ -20,10 +20,10 @@ credentials_for_provider:
       zh_CN: Firecrawl API 密钥
     placeholder:
       en_US: Please input your Firecrawl API key
-      zh_CN: 请输入您的 Firecrawl API 密钥
+      zh_CN: 请输入您的 Firecrawl API 密钥,如果是自托管版本,可以随意填写密钥
     help:
-      en_US: Get your Firecrawl API key from your Firecrawl account settings.
-      zh_CN: 从您的 Firecrawl 账户设置中获取 Firecrawl API 密钥。
+      en_US: Get your Firecrawl API key from your Firecrawl account settings. If you are using a self-hosted version, you may enter any key at your convenience.
+      zh_CN: 从您的 Firecrawl 账户设置中获取 Firecrawl API 密钥。如果是自托管版本,可以随意填写密钥。
     url: https://www.firecrawl.dev/account
   base_url:
     type: text-input

From 75e82a89a3acb31e142b01e09b4eabfff46f480d Mon Sep 17 00:00:00 2001
From: ahasasjeb
Date: Wed, 3 Jul 2024 01:21:31 +0000
Subject: [PATCH 06/21] Update

---
 api/core/tools/provider/builtin/firecrawl/tools/crawl.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/api/core/tools/provider/builtin/firecrawl/tools/crawl.py b/api/core/tools/provider/builtin/firecrawl/tools/crawl.py
index 5c08d9865c581e..627fd1771959f4 100644
--- a/api/core/tools/provider/builtin/firecrawl/tools/crawl.py
+++ b/api/core/tools/provider/builtin/firecrawl/tools/crawl.py
@@ -22,6 +22,7 @@ def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolIn
 
         crawl_result = app.crawl_url(
             url=tool_parameters['url'],
+            params=options,
             wait=True
         )

From a92a21f69e7778d0b6f76d850aff9cea0210586c Mon Sep 17 00:00:00 2001
From: ahasasjeb
Date: Thu, 4 Jul 2024 08:51:05 +0000
Subject: [PATCH 07/21] update

---
 api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py | 4 +++-
 api/core/tools/provider/builtin/firecrawl/tools/crawl.py    | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py b/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
index 94364a8742348c..b712e561f6293c 100644
--- a/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
+++ b/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
@@ -1,14 +1,16 @@
 import os
 import time
+
 import requests
 from requests.exceptions import HTTPError
 
+
 class FirecrawlApp:
     def __init__(self, api_key=None, base_url=None):
         self.api_key = api_key
+        self.base_url = base_url or 'https://api.firecrawl.dev'
         if not self.api_key:
             raise ValueError("API key is required")
-        self.base_url = base_url or 'https://api.firecrawl.dev'
 
     def _prepare_headers(self, idempotency_key=None):
         headers = {
diff --git a/api/core/tools/provider/builtin/firecrawl/tools/crawl.py b/api/core/tools/provider/builtin/firecrawl/tools/crawl.py
index 627fd1771959f4..7611f29ea6e21c 100644
--- a/api/core/tools/provider/builtin/firecrawl/tools/crawl.py
+++ b/api/core/tools/provider/builtin/firecrawl/tools/crawl.py
@@ -5,6 +5,7 @@
 from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp
 from core.tools.tool.builtin_tool import BuiltinTool
 
+
 class CrawlTool(BuiltinTool):
     def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
         app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'], base_url=self.runtime.credentials['base_url'])

From d0a017b3996d05f744cb4b8fc5252b494bd616ae Mon Sep 17 00:00:00 2001
From: ahasasjeb
Date: Fri, 5 Jul 2024 13:19:25 +0800
Subject: [PATCH 08/21] Update firecrawl.yaml

---
 .../tools/provider/builtin/firecrawl/firecrawl.yaml    | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/api/core/tools/provider/builtin/firecrawl/firecrawl.yaml b/api/core/tools/provider/builtin/firecrawl/firecrawl.yaml
index 999aa8a4ba545c..613a0e4679f165 100644
--- a/api/core/tools/provider/builtin/firecrawl/firecrawl.yaml
+++ b/api/core/tools/provider/builtin/firecrawl/firecrawl.yaml
@@ -6,7 +6,7 @@ identity:
     zh_CN: Firecrawl
   description:
     en_US: Firecrawl API integration for web crawling and scraping.
-    zh_CN: Firecrawl API 集成,用于网页爬取和数据抓取。
+    zh_Hans: Firecrawl API 集成,用于网页爬取和数据抓取。
   icon: icon.svg
   tags:
     - search
@@ -17,20 +17,22 @@ credentials_for_provider:
     required: true
     label:
       en_US: Firecrawl API Key
-      zh_CN: Firecrawl API 密钥
+      zh_Hans: Firecrawl API 密钥
     placeholder:
       en_US: Please input your Firecrawl API key
-      zh_CN: 请输入您的 Firecrawl API 密钥,如果是自托管版本,可以随意填写密钥
+      zh_Hans: 请输入您的 Firecrawl API 密钥,如果是自托管版本,可以随意填写密钥
     help:
       en_US: Get your Firecrawl API key from your Firecrawl account settings. If you are using a self-hosted version, you may enter any key at your convenience.
-      zh_CN: 从您的 Firecrawl 账户设置中获取 Firecrawl API 密钥。如果是自托管版本,可以随意填写密钥。
+      zh_Hans: 从您的 Firecrawl 账户设置中获取 Firecrawl API 密钥。如果是自托管版本,可以随意填写密钥。
     url: https://www.firecrawl.dev/account
   base_url:
     type: text-input
     required: false
     label:
       en_US: Firecrawl server's Base URL
+      zh_Hans: Firecrawl服务器的API URL
       pt_BR: Firecrawl server's Base URL
     placeholder:
       en_US: https://www.firecrawl.dev
+      zh_Hans: https://www.firecrawl.dev
       pt_BR: https://www.firecrawl.dev

From 47f27fa71027e5b03aac2c1f6523627cb26f798a Mon Sep 17 00:00:00 2001
From: ahasasjeb
Date: Fri, 5 Jul 2024 17:16:45 +0800
Subject: [PATCH 09/21] Update firecrawl_appx.py

---
 api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py b/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
index b712e561f6293c..0a736f20cd92b0 100644
--- a/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
+++ b/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
@@ -27,13 +27,14 @@ def _request(self, method, url, data=None, headers=None, retries=3, backoff_fact
                 response = requests.request(method, url, json=data, headers=headers)
                 response.raise_for_status()
                 return response.json()
-            except HTTPError as e:
-                if response.status_code == 502 and i < retries - 1:
+            except requests.exceptions.RequestException as e:
+                if i < retries - 1:
                     time.sleep(backoff_factor * (2 ** i))
                 else:
                     raise
         return None
 
+
     def scrape_url(self, url, **kwargs):
         endpoint = f'{self.base_url}/v0/scrape'
         headers = self._prepare_headers()

From bd8b884d9057a7ecd922e5bd28d6053fac2d15a0 Mon Sep 17 00:00:00 2001
From: ahasasjeb
Date: Fri, 5 Jul 2024 17:22:42 +0800
Subject: [PATCH 10/21] Update firecrawl_appx.py

---
 .../builtin/firecrawl/firecrawl_appx.py       | 20 ++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py b/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
index 0a736f20cd92b0..9446ddbfe74ace 100644
--- a/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
+++ b/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
@@ -34,25 +34,32 @@ def _request(self, method, url, data=None, headers=None, retries=3, backoff_fact
                     raise
         return None
 
-
     def scrape_url(self, url, **kwargs):
         endpoint = f'{self.base_url}/v0/scrape'
         headers = self._prepare_headers()
         data = {'url': url, **kwargs}
-        return self._request('POST', endpoint, data, headers)
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError("Failed to scrape URL after multiple retries")
+        return response
 
     def search(self, query, **kwargs):
         endpoint = f'{self.base_url}/v0/search'
         headers = self._prepare_headers()
         data = {'query': query, **kwargs}
-        return self._request('POST', endpoint, data, headers)
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError("Failed to perform search after multiple retries")
+        return response
 
     def crawl_url(self, url, wait=False, poll_interval=5, idempotency_key=None, **kwargs):
         endpoint = f'{self.base_url}/v0/crawl'
         headers = self._prepare_headers(idempotency_key)
         data = {'url': url, **kwargs}
         response = self._request('POST', endpoint, data, headers)
-        job_id = response['jobId']
+        if response is None:
+            raise HTTPError("Failed to initiate crawl after multiple retries")
+        job_id = response['jobId']
         if wait:
             return self._monitor_job_status(job_id, headers, poll_interval)
         return job_id
@@ -60,7 +67,10 @@ def crawl_url(self, url, wait=False, poll_interval=5, idempotency_key=None, **kw
     def check_crawl_status(self, job_id):
         endpoint = f'{self.base_url}/v0/crawl/status/{job_id}'
         headers = self._prepare_headers()
-        return self._request('GET', endpoint, headers=headers)
+        response = self._request('GET', endpoint, headers=headers)
+        if response is None:
+            raise HTTPError(f"Failed to check status for job {job_id} after multiple retries")
+        return response
 
     def _monitor_job_status(self, job_id, headers, poll_interval):
         while True:

From bbb6c514a6c2792ab7b005ec99f7af53a051d1d3 Mon Sep 17 00:00:00 2001
From: -LAN-
Date: Fri, 5 Jul 2024 18:01:15 +0800
Subject: [PATCH 11/21] refactor: Add and check type hints.

---
 .../builtin/firecrawl/firecrawl_appx.py       | 42 ++++++++++---------
 .../provider/builtin/firecrawl/tools/crawl.py |  8 ++--
 2 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py b/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
index 9446ddbfe74ace..23cb65965229b5 100644
--- a/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
+++ b/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
@@ -1,18 +1,19 @@
-import os
 import time
+from collections.abc import Mapping
+from typing import Any
 
 import requests
 from requests.exceptions import HTTPError
 
 
 class FirecrawlApp:
-    def __init__(self, api_key=None, base_url=None):
+    def __init__(self, api_key: str | None = None, base_url: str | None = None):
         self.api_key = api_key
         self.base_url = base_url or 'https://api.firecrawl.dev'
         if not self.api_key:
             raise ValueError("API key is required")
 
-    def _prepare_headers(self, idempotency_key=None):
+    def _prepare_headers(self, idempotency_key: str | None = None):
         headers = {
             'Content-Type': 'application/json',
             'Authorization': f'Bearer {self.api_key}'
         }
@@ -21,7 +22,15 @@ def _prepare_headers(self, idempotency_key=None):
             headers['Idempotency-Key'] = idempotency_key
         return headers
 
-    def _request(self, method, url, data=None, headers=None, retries=3, backoff_factor=0.3):
+    def _request(
+        self,
+        method: str,
+        url: str,
+        data: Mapping[str, Any] | None = None,
+        headers: Mapping[str, str] | None = None,
+        retries: int = 3,
+        backoff_factor: float = 0.3,
+    ) -> Mapping[str, Any] | None:
         for i in range(retries):
             try:
                 response = requests.request(method, url, json=data, headers=headers)
@@ -34,7 +43,7 @@ def _request(self, method, url, data=None, headers=None, retries=3, backoff_fact
                     raise
         return None
 
-    def scrape_url(self, url, **kwargs):
+    def scrape_url(self, url: str, **kwargs):
         endpoint = f'{self.base_url}/v0/scrape'
         headers = self._prepare_headers()
         data = {'url': url, **kwargs}
@@ -43,7 +52,7 @@ def scrape_url(self, url, **kwargs):
             raise HTTPError("Failed to scrape URL after multiple retries")
         return response
 
-    def search(self, query, **kwargs):
+    def search(self, query: str, **kwargs):
         endpoint = f'{self.base_url}/v0/search'
         headers = self._prepare_headers()
         data = {'query': query, **kwargs}
@@ -52,19 +61,21 @@ def search(self, query, **kwargs):
             raise HTTPError("Failed to perform search after multiple retries")
         return response
 
-    def crawl_url(self, url, wait=False, poll_interval=5, idempotency_key=None, **kwargs):
+    def crawl_url(
+        self, url: str, wait: bool = False, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs
+    ):
         endpoint = f'{self.base_url}/v0/crawl'
         headers = self._prepare_headers(idempotency_key)
         data = {'url': url, **kwargs}
         response = self._request('POST', endpoint, data, headers)
         if response is None:
             raise HTTPError("Failed to initiate crawl after multiple retries")
-        job_id = response['jobId']
+        job_id: str = response['jobId']
         if wait:
-            return self._monitor_job_status(job_id, headers, poll_interval)
+            return self._monitor_job_status(job_id=job_id, poll_interval=poll_interval)
         return job_id
 
-    def check_crawl_status(self, job_id):
+    def check_crawl_status(self, job_id: str):
         endpoint = f'{self.base_url}/v0/crawl/status/{job_id}'
         headers = self._prepare_headers()
         response = self._request('GET', endpoint, headers=headers)
@@ -72,7 +83,7 @@ def check_crawl_status(self, job_id):
             raise HTTPError(f"Failed to check status for job {job_id} after multiple retries")
         return response
 
-    def _monitor_job_status(self, job_id, headers, poll_interval):
+    def _monitor_job_status(self, job_id: str, poll_interval: int):
         while True:
             status = self.check_crawl_status(job_id)
             if status['status'] == 'completed':
@@ -80,12 +91,3 @@ def _monitor_job_status(self, job_id, headers, poll_interval):
             elif status['status'] == 'failed':
                 raise HTTPError(f'Job {job_id} failed: {status["error"]}')
             time.sleep(poll_interval)
-
-if __name__ == "__main__":
-    api_key = os.getenv('FIRECRAWL_API_KEY')
-    app = FirecrawlApp(api_key)
-    try:
-        result = app.scrape_url('https://example.com')
-        print(result)
-    except HTTPError as e:
-        print("Error:", e)
diff --git a/api/core/tools/provider/builtin/firecrawl/tools/crawl.py b/api/core/tools/provider/builtin/firecrawl/tools/crawl.py
index 7611f29ea6e21c..b000c1c6ce5cb7 100644
--- a/api/core/tools/provider/builtin/firecrawl/tools/crawl.py
+++ b/api/core/tools/provider/builtin/firecrawl/tools/crawl.py
@@ -27,12 +27,10 @@ def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolIn
             wait=True
         )
 
-        if isinstance(crawl_result, dict):
-            result_message = json.dumps(crawl_result, ensure_ascii=False, indent=4)
-        else:
-            result_message = str(crawl_result)
+        if not isinstance(crawl_result, str):
+            crawl_result = json.dumps(crawl_result, ensure_ascii=False, indent=4)
 
         if not crawl_result:
             return self.create_text_message("Crawl request failed.")
 
-        return self.create_text_message(result_message)
+        return self.create_text_message(crawl_result)

From 3fc2bc33f9232b8ec589e5b685eed10d778b2d82 Mon Sep 17 00:00:00 2001
From: ahasasjeb
Date: Fri, 5 Jul 2024 18:46:45 +0800
Subject: [PATCH 12/21] feat(firecrawl): add scrape tool implementation and
 yaml config

Implement the ScrapeTool class inheriting from BuiltinTool in scrape.py,
which uses the FirecrawlApp to scrape data from a given URL. The tool
supports custom scraping preferences and can return either scraped
documents or URLs. Also, include the scrape.yaml configuration file to
define the tool's identity, description, and parameters.

BREAKING CHANGE: The addition of the scrape tool may affect existing
workflows that do not account for this new tool.
Ensure that your environment is prepared to handle the scrape tool before
deploying this change.

By tongyi
---
 .../builtin/firecrawl/tools/scrape.py         | 26 +++++++++++++++++++
 .../builtin/firecrawl/tools/scrape.yaml       | 23 ++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100644 api/core/tools/provider/builtin/firecrawl/tools/scrape.py
 create mode 100644 api/core/tools/provider/builtin/firecrawl/tools/scrape.yaml

diff --git a/api/core/tools/provider/builtin/firecrawl/tools/scrape.py b/api/core/tools/provider/builtin/firecrawl/tools/scrape.py
new file mode 100644
index 00000000000000..3a78dce8d09bff
--- /dev/null
+++ b/api/core/tools/provider/builtin/firecrawl/tools/scrape.py
@@ -0,0 +1,26 @@
+import json
+from typing import Any, Union
+
+from core.tools.entities.tool_entities import ToolInvokeMessage
+from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp
+from core.tools.tool.builtin_tool import BuiltinTool
+
+
+class ScrapeTool(BuiltinTool):
+    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
+        app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'], base_url=self.runtime.credentials['base_url'])
+
+        crawl_result = app.scrape_url(
+            url=tool_parameters['url'],
+            wait=True
+        )
+
+        if isinstance(crawl_result, dict):
+            result_message = json.dumps(crawl_result, ensure_ascii=False, indent=4)
+        else:
+            result_message = str(crawl_result)
+
+        if not crawl_result:
+            return self.create_text_message("Scrape request failed.")
+
+        return self.create_text_message(result_message)
diff --git a/api/core/tools/provider/builtin/firecrawl/tools/scrape.yaml b/api/core/tools/provider/builtin/firecrawl/tools/scrape.yaml
new file mode 100644
index 00000000000000..65580900c49df8
--- /dev/null
+++ b/api/core/tools/provider/builtin/firecrawl/tools/scrape.yaml
@@ -0,0 +1,23 @@
+identity:
+  name: crawl
+  author: Richards Tu
+  label:
+    en_US: Scrape
+    zh_Hans: 抓取
+description:
+  human:
+    en_US: Extract data from a single URL.
+    zh_Hans: 从单个URL抓取数据。
+  llm: This utility initiates a web scraping process to harvest data from a designated URL. It facilitates customization of scraping preferences, including the option to include or exclude specific URL patterns, enhance image alt text generation leveraging advanced LLM technology (available with a premium plan), impose a limit on the maximum pages to scrape, and extract solely the primary content of each webpage. Depending on your preferences, the tool is capable of delivering either an inventory of scraped documents or a compilation of URLs.
+parameters:
+  - name: url
+    type: string
+    required: true
+    label:
+      en_US: URL to scrape
+      zh_Hans: 要抓取的URL
+    human_description:
+      en_US: The URL of the website to scrape and extract data from.
+      zh_Hans: 要抓取并提取数据的网站URL。
+    llm_description: The URL of the website that needs to be crawled. This is a required parameter.
+    form: llm
\ No newline at end of file

From bef367f7d80e6f1368c463344b0dab0e1df8da7e Mon Sep 17 00:00:00 2001
From: ahasasjeb
Date: Fri, 5 Jul 2024 11:11:45 +0000
Subject: [PATCH 13/21] update

---
 api/core/tools/provider/builtin/firecrawl/tools/scrape.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/api/core/tools/provider/builtin/firecrawl/tools/scrape.yaml b/api/core/tools/provider/builtin/firecrawl/tools/scrape.yaml
index 65580900c49df8..6f3f4565d37ffd 100644
--- a/api/core/tools/provider/builtin/firecrawl/tools/scrape.yaml
+++ b/api/core/tools/provider/builtin/firecrawl/tools/scrape.yaml
@@ -1,5 +1,5 @@
 identity:
-  name: crawl
+  name: scrape
   author: Richards Tu
   label:
     en_US: Scrape

From d8e5b534900507a1cd6c6d242f7b34abc9b98eab Mon Sep 17 00:00:00 2001
From: ahasasjeb
Date: Fri, 5 Jul 2024 19:37:44 +0800
Subject: [PATCH 14/21] feat(firecrawl): add search tool and update author in
 scrape tool

Introduce the SearchTool class within the firecrawl tools, implementing
functionality for searching data using the Firecrawl API. This update
also changes the author field for the scrape tool from 'Richards Tu' to
'ahasasjeb'.

BREAKING CHANGE: The addition of the search tool and modification of the
scrape tool's author field may affect existing configurations or
dependencies. Ensure to review and update accordingly before deploying.

By tongyi
---
 .../builtin/firecrawl/tools/scrape.yaml       |  2 +-
 .../builtin/firecrawl/tools/search.py         | 26 +++++++++++++++++++
 .../builtin/firecrawl/tools/search.yaml       | 23 ++++++++++++++++
 3 files changed, 50 insertions(+), 1 deletion(-)
 create mode 100644 api/core/tools/provider/builtin/firecrawl/tools/search.py
 create mode 100644 api/core/tools/provider/builtin/firecrawl/tools/search.yaml

diff --git a/api/core/tools/provider/builtin/firecrawl/tools/scrape.yaml b/api/core/tools/provider/builtin/firecrawl/tools/scrape.yaml
index 6f3f4565d37ffd..9befe8382937b6 100644
--- a/api/core/tools/provider/builtin/firecrawl/tools/scrape.yaml
+++ b/api/core/tools/provider/builtin/firecrawl/tools/scrape.yaml
@@ -1,6 +1,6 @@
 identity:
   name: scrape
-  author: Richards Tu
+  author: ahasasjeb
   label:
     en_US: Scrape
     zh_Hans: 抓取
diff --git a/api/core/tools/provider/builtin/firecrawl/tools/search.py b/api/core/tools/provider/builtin/firecrawl/tools/search.py
new file mode 100644
index 00000000000000..b8bda857377ac2
--- /dev/null
+++ b/api/core/tools/provider/builtin/firecrawl/tools/search.py
@@ -0,0 +1,26 @@
+import json
+from typing import Any, Union
+
+from core.tools.entities.tool_entities import ToolInvokeMessage
+from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp
+from core.tools.tool.builtin_tool import BuiltinTool
+
+
+class SearchTool(BuiltinTool):
+    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
+        app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'], base_url=self.runtime.credentials['base_url'])
+
+        crawl_result = app.search(
+            query=tool_parameters['url'],
+            wait=True
+        )
+
+        if isinstance(crawl_result, dict):
+            result_message = json.dumps(crawl_result, ensure_ascii=False, indent=4)
+        else:
+            result_message = str(crawl_result)
+
+        if not crawl_result:
+            return self.create_text_message("Search request failed.")
+
+        return self.create_text_message(result_message)
diff --git a/api/core/tools/provider/builtin/firecrawl/tools/search.yaml b/api/core/tools/provider/builtin/firecrawl/tools/search.yaml
new file mode 100644
index 00000000000000..afe0301c6fe3c5
--- /dev/null
+++ b/api/core/tools/provider/builtin/firecrawl/tools/search.yaml
@@ -0,0 +1,23 @@
+identity:
+  name: search
+  author: ahasasjeb
+  label:
+    en_US: Search
+    zh_Hans: 搜索
+description:
+  human:
+    en_US: Extract data from a single URL.
+    zh_Hans: 从单个URL抓取数据。
+  llm: This tool can perform online searches and convert the results to Markdown format.
+  parameters:
+  - name: Keyword
+    type: string
+    required: true
+    label:
+      en_US: Keyword
+      zh_Hans: 关键词
+    human_description:
+      en_US: Input keywords to use Firecrawl API for search.
+      zh_Hans: 输入关键词即可使用Firecrawl API进行搜索。
+    llm_description: Efficiently extract keywords from user text.
+  form: llm
\ No newline at end of file

From a44053c4bb4fcb7ae663dbda1c2d0469f6534d61 Mon Sep 17 00:00:00 2001
From: ahasasjeb
Date: Fri, 5 Jul 2024 19:51:22 +0800
Subject: [PATCH 15/21] Update

---
 api/core/tools/provider/builtin/firecrawl/tools/search.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/api/core/tools/provider/builtin/firecrawl/tools/search.yaml b/api/core/tools/provider/builtin/firecrawl/tools/search.yaml
index afe0301c6fe3c5..b25c81145101a8 100644
--- a/api/core/tools/provider/builtin/firecrawl/tools/search.yaml
+++ b/api/core/tools/provider/builtin/firecrawl/tools/search.yaml
@@ -6,8 +6,8 @@ identity:
     zh_Hans: 搜索
 description:
   human:
-    en_US: Extract data from a single URL.
-    zh_Hans: 从单个URL抓取数据。
+    en_US: Search, and output in Markdown format
+    zh_Hans: 搜索,并且以Markdown格式输出
   llm: This tool can perform online searches and convert the results to Markdown format.
   parameters:
   - name: Keyword

From 86fc60ad2ab9fc2a0ae429f8aad9022da5da7e27 Mon Sep 17 00:00:00 2001
From: ahasasjeb
Date: Fri, 5 Jul 2024 19:52:41 +0800
Subject: [PATCH 16/21] Update

---
 api/core/tools/provider/builtin/firecrawl/tools/search.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/api/core/tools/provider/builtin/firecrawl/tools/search.yaml b/api/core/tools/provider/builtin/firecrawl/tools/search.yaml
index b25c81145101a8..3b11180d501d4d 100644
--- a/api/core/tools/provider/builtin/firecrawl/tools/search.yaml
+++ b/api/core/tools/provider/builtin/firecrawl/tools/search.yaml
@@ -10,11 +10,11 @@ description:
     zh_Hans: 搜索,并且以Markdown格式输出
   llm: This tool can perform online searches and convert the results to Markdown format.
   parameters:
-  - name: Keyword
+  - name: keyword
     type: string
     required: true
     label:
-      en_US: Keyword
+      en_US: keyword
       zh_Hans: 关键词
     human_description:
       en_US: Input keywords to use Firecrawl API for search.

From a0cc7bd2a5e47c13668359d154609874ba753b43 Mon Sep 17 00:00:00 2001
From: ahasasjeb
Date: Fri, 5 Jul 2024 19:54:44 +0800
Subject: [PATCH 17/21] Update

---
 api/core/tools/provider/builtin/firecrawl/tools/search.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/api/core/tools/provider/builtin/firecrawl/tools/search.yaml b/api/core/tools/provider/builtin/firecrawl/tools/search.yaml
index 3b11180d501d4d..9a3cd8d9df4825 100644
--- a/api/core/tools/provider/builtin/firecrawl/tools/search.yaml
+++ b/api/core/tools/provider/builtin/firecrawl/tools/search.yaml
@@ -9,7 +9,7 @@ description:
     en_US: Search, and output in Markdown format
     zh_Hans: 搜索,并且以Markdown格式输出
   llm: This tool can perform online searches and convert the results to Markdown format.
-  parameters:
+parameters:
   - name: keyword
     type: string
     required: true

From 2af1a7a71eee06bd318b7102a51d64215c480692 Mon Sep 17 00:00:00 2001
From: ahasasjeb
Date: Fri, 5 Jul 2024 20:00:08 +0800
Subject: [PATCH 18/21] Update

---
 api/core/tools/provider/builtin/firecrawl/tools/search.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/api/core/tools/provider/builtin/firecrawl/tools/search.py b/api/core/tools/provider/builtin/firecrawl/tools/search.py
index b8bda857377ac2..cd6178189f61ed 100644
--- a/api/core/tools/provider/builtin/firecrawl/tools/search.py
+++ b/api/core/tools/provider/builtin/firecrawl/tools/search.py
@@ -11,7 +11,7 @@ def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolIn
         app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'], base_url=self.runtime.credentials['base_url'])
 
         crawl_result = app.search(
-            query=tool_parameters['url'],
+            query=tool_parameters['query'],
             wait=True
         )

From e330c72b671c027ef2c87ecd76b383aaa2214b5b Mon Sep 17 00:00:00 2001
From: ahasasjeb
Date: Fri, 5 Jul 2024 20:06:42 +0800
Subject: [PATCH 19/21] Update

---
 api/core/tools/provider/builtin/firecrawl/tools/search.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/api/core/tools/provider/builtin/firecrawl/tools/search.py b/api/core/tools/provider/builtin/firecrawl/tools/search.py
index cd6178189f61ed..0b118aa5f18dd2 100644
--- a/api/core/tools/provider/builtin/firecrawl/tools/search.py
+++ b/api/core/tools/provider/builtin/firecrawl/tools/search.py
@@ -11,7 +11,7 @@ def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolIn
         app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'], base_url=self.runtime.credentials['base_url'])
 
         crawl_result = app.search(
-            query=tool_parameters['query'],
+            query=tool_parameters['keyword'],
             wait=True
         )

From 28c29bec515adf711a74e41b1e1d56a935bcf9e1 Mon Sep 17 00:00:00 2001
From: ahasasjeb
Date: Fri, 5 Jul 2024 20:23:12 +0800
Subject: [PATCH 20/21] Update

---
 api/core/tools/provider/builtin/firecrawl/tools/scrape.yaml | 2 +-
 api/core/tools/provider/builtin/firecrawl/tools/search.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/api/core/tools/provider/builtin/firecrawl/tools/scrape.yaml b/api/core/tools/provider/builtin/firecrawl/tools/scrape.yaml
index 9befe8382937b6..e973afed2bd5b7 100644
--- a/api/core/tools/provider/builtin/firecrawl/tools/scrape.yaml
+++ b/api/core/tools/provider/builtin/firecrawl/tools/scrape.yaml
@@ -20,4 +20,4 @@ parameters:
       en_US: The URL of the website to scrape and extract data from.
       zh_Hans: 要抓取并提取数据的网站URL。
     llm_description: The URL of the website that needs to be crawled. This is a required parameter.
-    form: llm
\ No newline at end of file
+    form: llm
diff --git a/api/core/tools/provider/builtin/firecrawl/tools/search.yaml b/api/core/tools/provider/builtin/firecrawl/tools/search.yaml
index 9a3cd8d9df4825..b1513c914ec31f 100644
--- a/api/core/tools/provider/builtin/firecrawl/tools/search.yaml
+++ b/api/core/tools/provider/builtin/firecrawl/tools/search.yaml
@@ -20,4 +20,4 @@ parameters:
       en_US: Input keywords to use Firecrawl API for search.
      zh_Hans: 输入关键词即可使用Firecrawl API进行搜索。
     llm_description: Efficiently extract keywords from user text.
-  form: llm
\ No newline at end of file
+  form: llm

From 5eeea25ce18674cd6e4a657a940bbc38fe0674e5 Mon Sep 17 00:00:00 2001
From: ahasasjeb
Date: Fri, 5 Jul 2024 20:33:16 +0800
Subject: [PATCH 21/21] Update description

---
 api/core/tools/provider/builtin/firecrawl/tools/scrape.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/api/core/tools/provider/builtin/firecrawl/tools/scrape.yaml b/api/core/tools/provider/builtin/firecrawl/tools/scrape.yaml
index e973afed2bd5b7..29aa5991aa8bf4 100644
--- a/api/core/tools/provider/builtin/firecrawl/tools/scrape.yaml
+++ b/api/core/tools/provider/builtin/firecrawl/tools/scrape.yaml
@@ -8,7 +8,7 @@ description:
   human:
     en_US: Extract data from a single URL.
     zh_Hans: 从单个URL抓取数据。
-  llm: This utility initiates a web scraping process to harvest data from a designated URL. It facilitates customization of scraping preferences, including the option to include or exclude specific URL patterns, enhance image alt text generation leveraging advanced LLM technology (available with a premium plan), impose a limit on the maximum pages to scrape, and extract solely the primary content of each webpage. Depending on your preferences, the tool is capable of delivering either an inventory of scraped documents or a compilation of URLs.
+  llm: This tool is designed to scrape URL and output the content in Markdown format.
 parameters:
   - name: url
     type: string
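
After the final patch, the provider consists of the `FirecrawlApp` client plus the `crawl`, `scrape`, and `search` tools. As a reading aid, here is a minimal sketch of how the finished client (post patch 11) could be driven on its own; the API key environment variable and example URL are illustrative assumptions, and `base_url` may point at a self-hosted server, in which case any non-empty key passes the constructor check:

```python
import os

from requests.exceptions import HTTPError

from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp

# Assumed for this sketch: FIRECRAWL_API_KEY is set in the environment.
app = FirecrawlApp(api_key=os.getenv('FIRECRAWL_API_KEY'),
                   base_url='https://api.firecrawl.dev')

try:
    # POST /v0/scrape; _request retries up to three times, then the
    # wrapper raises HTTPError if no JSON body was obtained.
    page = app.scrape_url('https://example.com')
    print(page)

    # POST /v0/crawl; wait=True polls /v0/crawl/status/<jobId> every
    # poll_interval seconds until the job is 'completed' or 'failed'.
    result = app.crawl_url('https://example.com', wait=True, poll_interval=5)
    print(result)
except HTTPError as exc:
    print('Firecrawl request failed:', exc)
```

Note that the `wait=True` path blocks the calling thread inside `_monitor_job_status`; passing `wait=False` instead returns the `jobId` so the caller can poll `check_crawl_status` on its own schedule.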