From cc63af8e72d213c28b1c0972ff4cda1ff957d1f1 Mon Sep 17 00:00:00 2001
From: ahasasjeb
Date: Fri, 5 Jul 2024 18:04:51 +0800
Subject: [PATCH] Removed firecrawl-py, fixed and improved firecrawl tool
 (#5896)

Co-authored-by: -LAN-
---
 .../provider/builtin/firecrawl/firecrawl.yaml |  12 +-
 .../builtin/firecrawl/firecrawl_appx.py       | 147 +++++++++---------
 .../provider/builtin/firecrawl/tools/crawl.py |  33 ++--
 api/poetry.lock                               |  16 +-
 api/pyproject.toml                            |   1 -
 5 files changed, 89 insertions(+), 120 deletions(-)

diff --git a/api/core/tools/provider/builtin/firecrawl/firecrawl.yaml b/api/core/tools/provider/builtin/firecrawl/firecrawl.yaml
index edd28f7d22b88e..613a0e4679f165 100644
--- a/api/core/tools/provider/builtin/firecrawl/firecrawl.yaml
+++ b/api/core/tools/provider/builtin/firecrawl/firecrawl.yaml
@@ -6,7 +6,7 @@ identity:
     zh_CN: Firecrawl
   description:
     en_US: Firecrawl API integration for web crawling and scraping.
-    zh_CN: Firecrawl API 集成,用于网页爬取和数据抓取。
+    zh_Hans: Firecrawl API 集成,用于网页爬取和数据抓取。
   icon: icon.svg
   tags:
     - search
@@ -17,20 +17,22 @@ credentials_for_provider:
     required: true
     label:
       en_US: Firecrawl API Key
-      zh_CN: Firecrawl API 密钥
+      zh_Hans: Firecrawl API 密钥
    placeholder:
      en_US: Please input your Firecrawl API key
-      zh_CN: 请输入您的 Firecrawl API 密钥
+      zh_Hans: 请输入您的 Firecrawl API 密钥,如果是自托管版本,可以随意填写密钥
    help:
-      en_US: Get your Firecrawl API key from your Firecrawl account settings.
-      zh_CN: 从您的 Firecrawl 账户设置中获取 Firecrawl API 密钥。
+      en_US: Get your Firecrawl API key from your Firecrawl account settings. If you are using a self-hosted version, you may enter any key at your convenience.
+      zh_Hans: 从您的 Firecrawl 账户设置中获取 Firecrawl API 密钥。如果是自托管版本,可以随意填写密钥。
     url: https://www.firecrawl.dev/account
   base_url:
     type: text-input
     required: false
     label:
       en_US: Firecrawl server's Base URL
+      zh_Hans: Firecrawl服务器的API URL
       pt_BR: Firecrawl server's Base URL
     placeholder:
       en_US: https://www.firecrawl.dev
+      zh_Hans: https://www.firecrawl.dev
       pt_BR: https://www.firecrawl.dev
diff --git a/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py b/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
index a28f479170a267..23cb65965229b5 100644
--- a/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
+++ b/api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
@@ -1,98 +1,93 @@
 import time
+from collections.abc import Mapping
+from typing import Any
 
 import requests
+from requests.exceptions import HTTPError
 
 
 class FirecrawlApp:
-    def __init__(self, api_key=None, base_url=None):
+    def __init__(self, api_key: str | None = None, base_url: str | None = None):
         self.api_key = api_key
         self.base_url = base_url or 'https://api.firecrawl.dev'
-        if self.api_key is None and self.base_url == 'https://api.firecrawl.dev':
-            raise ValueError('No API key provided')
+        if not self.api_key:
+            raise ValueError("API key is required")
 
-    def scrape_url(self, url, params=None) -> dict:
+    def _prepare_headers(self, idempotency_key: str | None = None):
         headers = {
             'Content-Type': 'application/json',
             'Authorization': f'Bearer {self.api_key}'
         }
-        json_data = {'url': url}
-        if params:
-            json_data.update(params)
-        response = requests.post(
-            f'{self.base_url}/v0/scrape',
-            headers=headers,
-            json=json_data
-        )
-        if response.status_code == 200:
-            response = response.json()
-            if response['success'] == True:
-                return response['data']
-            else:
-                raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
+        if idempotency_key:
+            headers['Idempotency-Key'] = idempotency_key
+        return headers
 
-        elif response.status_code in [402, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
-        else:
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
+    def _request(
+        self,
+        method: str,
+        url: str,
+        data: Mapping[str, Any] | None = None,
+        headers: Mapping[str, str] | None = None,
+        retries: int = 3,
+        backoff_factor: float = 0.3,
+    ) -> Mapping[str, Any] | None:
+        for i in range(retries):
+            try:
+                response = requests.request(method, url, json=data, headers=headers)
+                response.raise_for_status()
+                return response.json()
+            except requests.exceptions.RequestException:
+                if i < retries - 1:
+                    time.sleep(backoff_factor * (2 ** i))
+                else:
+                    raise
+        return None
 
-    def crawl_url(self, url, params=None, wait_until_done=True, timeout=2) -> str:
+    def scrape_url(self, url: str, **kwargs):
+        endpoint = f'{self.base_url}/v0/scrape'
         headers = self._prepare_headers()
-        json_data = {'url': url}
-        if params:
-            json_data.update(params)
-        response = self._post_request(f'{self.base_url}/v0/crawl', json_data, headers)
-        if response.status_code == 200:
-            job_id = response.json().get('jobId')
-            if wait_until_done:
-                return self._monitor_job_status(job_id, headers, timeout)
-            else:
-                return {'jobId': job_id}
-        else:
-            self._handle_error(response, 'start crawl job')
+        data = {'url': url, **kwargs}
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError("Failed to scrape URL after multiple retries")
+        return response
 
-    def check_crawl_status(self, job_id) -> dict:
+    def search(self, query: str, **kwargs):
+        endpoint = f'{self.base_url}/v0/search'
         headers = self._prepare_headers()
-        response = self._get_request(f'{self.base_url}/v0/crawl/status/{job_id}', headers)
-        if response.status_code == 200:
-            return response.json()
-        else:
-            self._handle_error(response, 'check crawl status')
-
-    def _prepare_headers(self):
-        return {
-            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}'
-        }
+        data = {'query': query, **kwargs}
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError("Failed to perform search after multiple retries")
+        return response
 
-    def _post_request(self, url, data, headers):
-        return requests.post(url, headers=headers, json=data)
+    def crawl_url(
+        self, url: str, wait: bool = False, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs
+    ):
+        endpoint = f'{self.base_url}/v0/crawl'
+        headers = self._prepare_headers(idempotency_key)
+        data = {'url': url, **kwargs}
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError("Failed to initiate crawl after multiple retries")
+        job_id: str = response['jobId']
+        if wait:
+            return self._monitor_job_status(job_id=job_id, poll_interval=poll_interval)
+        return job_id
 
-    def _get_request(self, url, headers):
-        return requests.get(url, headers=headers)
+    def check_crawl_status(self, job_id: str):
+        endpoint = f'{self.base_url}/v0/crawl/status/{job_id}'
+        headers = self._prepare_headers()
+        response = self._request('GET', endpoint, headers=headers)
+        if response is None:
+            raise HTTPError(f"Failed to check status for job {job_id} after multiple retries")
+        return response
 
-    def _monitor_job_status(self, job_id, headers, timeout):
+    def _monitor_job_status(self, job_id: str, poll_interval: int):
         while True:
-            status_response = self._get_request(f'{self.base_url}/v0/crawl/status/{job_id}', headers)
-            if status_response.status_code == 200:
-                status_data = status_response.json()
-                if status_data['status'] == 'completed':
-                    if 'data' in status_data:
-                        return status_data['data']
-                    else:
-                        raise Exception('Crawl job completed but no data was returned')
-                elif status_data['status'] in ['active', 'paused', 'pending', 'queued']:
-                    if timeout < 2:
-                        timeout = 2
-                    time.sleep(timeout)  # Wait for the specified timeout before checking again
-                else:
-                    raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
-            else:
-                self._handle_error(status_response, 'check crawl status')
-
-    def _handle_error(self, response, action):
-        if response.status_code in [402, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
-        else:
-            raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')
+            status = self.check_crawl_status(job_id)
+            if status['status'] == 'completed':
+                return status
+            elif status['status'] == 'failed':
+                raise HTTPError(f'Job {job_id} failed: {status["error"]}')
+            time.sleep(poll_interval)
diff --git a/api/core/tools/provider/builtin/firecrawl/tools/crawl.py b/api/core/tools/provider/builtin/firecrawl/tools/crawl.py
index ab3a73dd03a792..b000c1c6ce5cb7 100644
--- a/api/core/tools/provider/builtin/firecrawl/tools/crawl.py
+++ b/api/core/tools/provider/builtin/firecrawl/tools/crawl.py
@@ -1,3 +1,4 @@
+import json
 from typing import Any, Union
 
 from core.tools.entities.tool_entities import ToolInvokeMessage
@@ -7,7 +8,6 @@ class CrawlTool(BuiltinTool):
     def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
-        # initialize the app object with the api key
         app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'],
                            base_url=self.runtime.credentials['base_url'])
         options = {
@@ -21,29 +21,16 @@ def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolIn
             }
         }
 
-        # crawl the url
         crawl_result = app.crawl_url(
             url=tool_parameters['url'],
             params=options,
-            wait_until_done=True,
+            wait=True
         )
-
-        # reformat crawl result
-        crawl_output = "**Crawl Result**\n\n"
-        try:
-            for result in crawl_result:
-                crawl_output += f"**- Title:** {result.get('metadata', {}).get('title', '')}\n"
-                crawl_output += f"**- Description:** {result.get('metadata', {}).get('description', '')}\n"
-                crawl_output += f"**- URL:** {result.get('metadata', {}).get('ogUrl', '')}\n\n"
-                crawl_output += f"**- Web Content:**\n{result.get('markdown', '')}\n\n"
-                crawl_output += "---\n\n"
-        except Exception as e:
-            crawl_output += f"An error occurred: {str(e)}\n"
-            crawl_output += f"**- Title:** {result.get('metadata', {}).get('title', '')}\n"
-            crawl_output += f"**- Description:** {result.get('metadata', {}).get('description','')}\n"
-            crawl_output += f"**- URL:** {result.get('metadata', {}).get('ogUrl', '')}\n\n"
-            crawl_output += f"**- Web Content:**\n{result.get('markdown', '')}\n\n"
-            crawl_output += "---\n\n"
-
-
-        return self.create_text_message(crawl_output)
\ No newline at end of file
+
+        if not isinstance(crawl_result, str):
+            crawl_result = json.dumps(crawl_result, ensure_ascii=False, indent=4)
+
+        if not crawl_result:
self.create_text_message("Crawl request failed.") + + return self.create_text_message(crawl_result) diff --git a/api/poetry.lock b/api/poetry.lock index 961b2748b44f76..1bfa9716815805 100644 --- a/api/poetry.lock +++ b/api/poetry.lock @@ -2083,20 +2083,6 @@ files = [ {file = "filetype-1.2.0.tar.gz", hash = "sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb"}, ] -[[package]] -name = "firecrawl-py" -version = "0.0.5" -description = "Python SDK for Firecrawl API" -optional = false -python-versions = "*" -files = [ - {file = "firecrawl-py-0.0.5.tar.gz", hash = "sha256:3d1cc30b7d86c12aa06e6434ebb526072cd70ab9a0c8b145008efe044a1cd09c"}, - {file = "firecrawl_py-0.0.5-py3-none-any.whl", hash = "sha256:476694345141c0145a1bee9c01a8ad0103f75892c12a122dc511a3adad0785e7"}, -] - -[package.dependencies] -requests = "*" - [[package]] name = "flask" version = "3.0.3" @@ -9095,4 +9081,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "90f0e77567fbe5100d15bf2bc9472007aafc53c2fd594b6a90dd8455dea58582" +content-hash = "420c866aaff914d48c00c443a59f181c778690c24f81a955b1f970729bb441b7" diff --git a/api/pyproject.toml b/api/pyproject.toml index c919b33856af28..f157fab34637ff 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -115,7 +115,6 @@ chardet = "~5.1.0" cohere = "~5.2.4" cos-python-sdk-v5 = "1.9.30" dashscope = { version = "~1.17.0", extras = ["tokenizer"] } -firecrawl-py = "0.0.5" flask = "~3.0.1" flask-compress = "~1.14" flask-cors = "~4.0.0"