Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Removed firecrawl-py, fixed and improved firecrawl tool #5896

Merged
merged 11 commits into from
Jul 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions api/core/tools/provider/builtin/firecrawl/firecrawl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ identity:
zh_CN: Firecrawl
description:
en_US: Firecrawl API integration for web crawling and scraping.
zh_CN: Firecrawl API 集成,用于网页爬取和数据抓取。
zh_Hans: Firecrawl API 集成,用于网页爬取和数据抓取。
icon: icon.svg
tags:
- search
Expand All @@ -17,20 +17,22 @@ credentials_for_provider:
required: true
label:
en_US: Firecrawl API Key
zh_CN: Firecrawl API 密钥
zh_Hans: Firecrawl API 密钥
placeholder:
en_US: Please input your Firecrawl API key
zh_CN: 请输入您的 Firecrawl API 密钥
zh_Hans: 请输入您的 Firecrawl API 密钥,如果是自托管版本,可以随意填写密钥
help:
en_US: Get your Firecrawl API key from your Firecrawl account settings.
zh_CN: 从您的 Firecrawl 账户设置中获取 Firecrawl API 密钥。
en_US: Get your Firecrawl API key from your Firecrawl account settings. If you are using a self-hosted version, you may enter any key at your convenience.
zh_Hans: 从您的 Firecrawl 账户设置中获取 Firecrawl API 密钥。如果是自托管版本,可以随意填写密钥
url: https://www.firecrawl.dev/account
base_url:
type: text-input
required: false
label:
en_US: Firecrawl server's Base URL
zh_Hans: Firecrawl服务器的API URL
pt_BR: Firecrawl server's Base URL
placeholder:
en_US: https://www.firecrawl.dev
zh_Hans: https://www.firecrawl.dev
pt_BR: https://www.firecrawl.dev
147 changes: 71 additions & 76 deletions api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
Original file line number Diff line number Diff line change
@@ -1,98 +1,93 @@
import time
from collections.abc import Mapping
from typing import Any

import requests
from requests.exceptions import HTTPError


class FirecrawlApp:
def __init__(self, api_key=None, base_url=None):
def __init__(self, api_key: str | None = None, base_url: str | None = None):
self.api_key = api_key
self.base_url = base_url or 'https://api.firecrawl.dev'
if self.api_key is None and self.base_url == 'https://api.firecrawl.dev':
raise ValueError('No API key provided')
if not self.api_key:
raise ValueError("API key is required")

def scrape_url(self, url, params=None) -> dict:
def _prepare_headers(self, idempotency_key: str | None = None):
headers = {
'Content-Type': 'application/json',
'Authorization': f'Bearer {self.api_key}'
}
json_data = {'url': url}
if params:
json_data.update(params)
response = requests.post(
f'{self.base_url}/v0/scrape',
headers=headers,
json=json_data
)
if response.status_code == 200:
response = response.json()
if response['success'] == True:
return response['data']
else:
raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
if idempotency_key:
headers['Idempotency-Key'] = idempotency_key
return headers

elif response.status_code in [402, 409, 500]:
error_message = response.json().get('error', 'Unknown error occurred')
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
else:
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
def _request(
self,
method: str,
url: str,
data: Mapping[str, Any] | None = None,
headers: Mapping[str, str] | None = None,
retries: int = 3,
backoff_factor: float = 0.3,
) -> Mapping[str, Any] | None:
for i in range(retries):
try:
response = requests.request(method, url, json=data, headers=headers)
response.raise_for_status()
return response.json()
except requests.exceptions.RequestException as e:
if i < retries - 1:
time.sleep(backoff_factor * (2 ** i))
else:
raise
return None

def crawl_url(self, url, params=None, wait_until_done=True, timeout=2) -> str:
def scrape_url(self, url: str, **kwargs):
endpoint = f'{self.base_url}/v0/scrape'
headers = self._prepare_headers()
json_data = {'url': url}
if params:
json_data.update(params)
response = self._post_request(f'{self.base_url}/v0/crawl', json_data, headers)
if response.status_code == 200:
job_id = response.json().get('jobId')
if wait_until_done:
return self._monitor_job_status(job_id, headers, timeout)
else:
return {'jobId': job_id}
else:
self._handle_error(response, 'start crawl job')
data = {'url': url, **kwargs}
response = self._request('POST', endpoint, data, headers)
if response is None:
raise HTTPError("Failed to scrape URL after multiple retries")
return response

def check_crawl_status(self, job_id) -> dict:
def search(self, query: str, **kwargs):
endpoint = f'{self.base_url}/v0/search'
headers = self._prepare_headers()
response = self._get_request(f'{self.base_url}/v0/crawl/status/{job_id}', headers)
if response.status_code == 200:
return response.json()
else:
self._handle_error(response, 'check crawl status')

def _prepare_headers(self):
return {
'Content-Type': 'application/json',
'Authorization': f'Bearer {self.api_key}'
}
data = {'query': query, **kwargs}
response = self._request('POST', endpoint, data, headers)
if response is None:
raise HTTPError("Failed to perform search after multiple retries")
return response

def _post_request(self, url, data, headers):
return requests.post(url, headers=headers, json=data)
def crawl_url(
self, url: str, wait: bool = False, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs
):
endpoint = f'{self.base_url}/v0/crawl'
headers = self._prepare_headers(idempotency_key)
data = {'url': url, **kwargs}
response = self._request('POST', endpoint, data, headers)
if response is None:
raise HTTPError("Failed to initiate crawl after multiple retries")
job_id: str = response['jobId']
if wait:
return self._monitor_job_status(job_id=job_id, poll_interval=poll_interval)
return job_id

def _get_request(self, url, headers):
return requests.get(url, headers=headers)
def check_crawl_status(self, job_id: str):
endpoint = f'{self.base_url}/v0/crawl/status/{job_id}'
headers = self._prepare_headers()
response = self._request('GET', endpoint, headers=headers)
if response is None:
raise HTTPError(f"Failed to check status for job {job_id} after multiple retries")
return response

def _monitor_job_status(self, job_id, headers, timeout):
def _monitor_job_status(self, job_id: str, poll_interval: int):
while True:
status_response = self._get_request(f'{self.base_url}/v0/crawl/status/{job_id}', headers)
if status_response.status_code == 200:
status_data = status_response.json()
if status_data['status'] == 'completed':
if 'data' in status_data:
return status_data['data']
else:
raise Exception('Crawl job completed but no data was returned')
elif status_data['status'] in ['active', 'paused', 'pending', 'queued']:
if timeout < 2:
timeout = 2
time.sleep(timeout) # Wait for the specified timeout before checking again
else:
raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
else:
self._handle_error(status_response, 'check crawl status')

def _handle_error(self, response, action):
if response.status_code in [402, 409, 500]:
error_message = response.json().get('error', 'Unknown error occurred')
raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
else:
raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')
status = self.check_crawl_status(job_id)
if status['status'] == 'completed':
return status
elif status['status'] == 'failed':
raise HTTPError(f'Job {job_id} failed: {status["error"]}')
time.sleep(poll_interval)
33 changes: 10 additions & 23 deletions api/core/tools/provider/builtin/firecrawl/tools/crawl.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
from typing import Any, Union

from core.tools.entities.tool_entities import ToolInvokeMessage
Expand All @@ -7,7 +8,6 @@

class CrawlTool(BuiltinTool):
def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
# initialize the app object with the api key
app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'], base_url=self.runtime.credentials['base_url'])

options = {
Expand All @@ -21,29 +21,16 @@ def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolIn
}
}

# crawl the url
crawl_result = app.crawl_url(
url=tool_parameters['url'],
params=options,
wait_until_done=True,
wait=True
)

# reformat crawl result
crawl_output = "**Crawl Result**\n\n"
try:
for result in crawl_result:
crawl_output += f"**- Title:** {result.get('metadata', {}).get('title', '')}\n"
crawl_output += f"**- Description:** {result.get('metadata', {}).get('description', '')}\n"
crawl_output += f"**- URL:** {result.get('metadata', {}).get('ogUrl', '')}\n\n"
crawl_output += f"**- Web Content:**\n{result.get('markdown', '')}\n\n"
crawl_output += "---\n\n"
except Exception as e:
crawl_output += f"An error occurred: {str(e)}\n"
crawl_output += f"**- Title:** {result.get('metadata', {}).get('title', '')}\n"
crawl_output += f"**- Description:** {result.get('metadata', {}).get('description','')}\n"
crawl_output += f"**- URL:** {result.get('metadata', {}).get('ogUrl', '')}\n\n"
crawl_output += f"**- Web Content:**\n{result.get('markdown', '')}\n\n"
crawl_output += "---\n\n"


return self.create_text_message(crawl_output)

if not isinstance(crawl_result, str):
crawl_result = json.dumps(crawl_result, ensure_ascii=False, indent=4)

if not crawl_result:
return self.create_text_message("Crawl request failed.")

return self.create_text_message(crawl_result)
16 changes: 1 addition & 15 deletions api/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion api/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,6 @@ chardet = "~5.1.0"
cohere = "~5.2.4"
cos-python-sdk-v5 = "1.9.30"
dashscope = { version = "~1.17.0", extras = ["tokenizer"] }
firecrawl-py = "0.0.5"
flask = "~3.0.1"
flask-compress = "~1.14"
flask-cors = "~4.0.0"
Expand Down