Skip to content

Commit

Permalink
Removed firecrawl-py, fixed and improved firecrawl tool (#5896)
Browse files Browse the repository at this point in the history
Co-authored-by: -LAN- <laipz8200@outlook.com>
  • Loading branch information
ahasasjeb and laipz8200 authored Jul 5, 2024
1 parent bf2268b commit cc63af8
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 120 deletions.
12 changes: 7 additions & 5 deletions api/core/tools/provider/builtin/firecrawl/firecrawl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ identity:
zh_CN: Firecrawl
description:
en_US: Firecrawl API integration for web crawling and scraping.
zh_CN: Firecrawl API 集成,用于网页爬取和数据抓取。
zh_Hans: Firecrawl API 集成,用于网页爬取和数据抓取。
icon: icon.svg
tags:
- search
Expand All @@ -17,20 +17,22 @@ credentials_for_provider:
required: true
label:
en_US: Firecrawl API Key
zh_CN: Firecrawl API 密钥
zh_Hans: Firecrawl API 密钥
placeholder:
en_US: Please input your Firecrawl API key
zh_CN: 请输入您的 Firecrawl API 密钥
zh_Hans: 请输入您的 Firecrawl API 密钥,如果是自托管版本,可以随意填写密钥
help:
en_US: Get your Firecrawl API key from your Firecrawl account settings.
zh_CN: 从您的 Firecrawl 账户设置中获取 Firecrawl API 密钥。
en_US: Get your Firecrawl API key from your Firecrawl account settings. If you are using a self-hosted version, you may enter any key at your convenience.
zh_Hans: 从您的 Firecrawl 账户设置中获取 Firecrawl API 密钥。如果是自托管版本,可以随意填写密钥
url: https://www.firecrawl.dev/account
base_url:
type: text-input
required: false
label:
en_US: Firecrawl server's Base URL
zh_Hans: Firecrawl服务器的API URL
pt_BR: Firecrawl server's Base URL
placeholder:
en_US: https://www.firecrawl.dev
zh_Hans: https://www.firecrawl.dev
pt_BR: https://www.firecrawl.dev
147 changes: 71 additions & 76 deletions api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
Original file line number Diff line number Diff line change
@@ -1,98 +1,93 @@
import time
from collections.abc import Mapping
from typing import Any

import requests
from requests.exceptions import HTTPError


class FirecrawlApp:
def __init__(self, api_key=None, base_url=None):
def __init__(self, api_key: str | None = None, base_url: str | None = None):
self.api_key = api_key
self.base_url = base_url or 'https://api.firecrawl.dev'
if self.api_key is None and self.base_url == 'https://api.firecrawl.dev':
raise ValueError('No API key provided')
if not self.api_key:
raise ValueError("API key is required")

def scrape_url(self, url, params=None) -> dict:
def _prepare_headers(self, idempotency_key: str | None = None):
headers = {
'Content-Type': 'application/json',
'Authorization': f'Bearer {self.api_key}'
}
json_data = {'url': url}
if params:
json_data.update(params)
response = requests.post(
f'{self.base_url}/v0/scrape',
headers=headers,
json=json_data
)
if response.status_code == 200:
response = response.json()
if response['success'] == True:
return response['data']
else:
raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
if idempotency_key:
headers['Idempotency-Key'] = idempotency_key
return headers

elif response.status_code in [402, 409, 500]:
error_message = response.json().get('error', 'Unknown error occurred')
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
else:
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
def _request(
self,
method: str,
url: str,
data: Mapping[str, Any] | None = None,
headers: Mapping[str, str] | None = None,
retries: int = 3,
backoff_factor: float = 0.3,
) -> Mapping[str, Any] | None:
for i in range(retries):
try:
response = requests.request(method, url, json=data, headers=headers)
response.raise_for_status()
return response.json()
except requests.exceptions.RequestException as e:
if i < retries - 1:
time.sleep(backoff_factor * (2 ** i))
else:
raise
return None

def crawl_url(self, url, params=None, wait_until_done=True, timeout=2) -> str:
def scrape_url(self, url: str, **kwargs):
endpoint = f'{self.base_url}/v0/scrape'
headers = self._prepare_headers()
json_data = {'url': url}
if params:
json_data.update(params)
response = self._post_request(f'{self.base_url}/v0/crawl', json_data, headers)
if response.status_code == 200:
job_id = response.json().get('jobId')
if wait_until_done:
return self._monitor_job_status(job_id, headers, timeout)
else:
return {'jobId': job_id}
else:
self._handle_error(response, 'start crawl job')
data = {'url': url, **kwargs}
response = self._request('POST', endpoint, data, headers)
if response is None:
raise HTTPError("Failed to scrape URL after multiple retries")
return response

def check_crawl_status(self, job_id) -> dict:
def search(self, query: str, **kwargs):
endpoint = f'{self.base_url}/v0/search'
headers = self._prepare_headers()
response = self._get_request(f'{self.base_url}/v0/crawl/status/{job_id}', headers)
if response.status_code == 200:
return response.json()
else:
self._handle_error(response, 'check crawl status')

def _prepare_headers(self):
return {
'Content-Type': 'application/json',
'Authorization': f'Bearer {self.api_key}'
}
data = {'query': query, **kwargs}
response = self._request('POST', endpoint, data, headers)
if response is None:
raise HTTPError("Failed to perform search after multiple retries")
return response

def _post_request(self, url, data, headers):
return requests.post(url, headers=headers, json=data)
def crawl_url(
self, url: str, wait: bool = False, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs
):
endpoint = f'{self.base_url}/v0/crawl'
headers = self._prepare_headers(idempotency_key)
data = {'url': url, **kwargs}
response = self._request('POST', endpoint, data, headers)
if response is None:
raise HTTPError("Failed to initiate crawl after multiple retries")
job_id: str = response['jobId']
if wait:
return self._monitor_job_status(job_id=job_id, poll_interval=poll_interval)
return job_id

def _get_request(self, url, headers):
return requests.get(url, headers=headers)
def check_crawl_status(self, job_id: str):
endpoint = f'{self.base_url}/v0/crawl/status/{job_id}'
headers = self._prepare_headers()
response = self._request('GET', endpoint, headers=headers)
if response is None:
raise HTTPError(f"Failed to check status for job {job_id} after multiple retries")
return response

def _monitor_job_status(self, job_id, headers, timeout):
def _monitor_job_status(self, job_id: str, poll_interval: int):
while True:
status_response = self._get_request(f'{self.base_url}/v0/crawl/status/{job_id}', headers)
if status_response.status_code == 200:
status_data = status_response.json()
if status_data['status'] == 'completed':
if 'data' in status_data:
return status_data['data']
else:
raise Exception('Crawl job completed but no data was returned')
elif status_data['status'] in ['active', 'paused', 'pending', 'queued']:
if timeout < 2:
timeout = 2
time.sleep(timeout) # Wait for the specified timeout before checking again
else:
raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
else:
self._handle_error(status_response, 'check crawl status')

def _handle_error(self, response, action):
if response.status_code in [402, 409, 500]:
error_message = response.json().get('error', 'Unknown error occurred')
raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
else:
raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')
status = self.check_crawl_status(job_id)
if status['status'] == 'completed':
return status
elif status['status'] == 'failed':
raise HTTPError(f'Job {job_id} failed: {status["error"]}')
time.sleep(poll_interval)
33 changes: 10 additions & 23 deletions api/core/tools/provider/builtin/firecrawl/tools/crawl.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
from typing import Any, Union

from core.tools.entities.tool_entities import ToolInvokeMessage
Expand All @@ -7,7 +8,6 @@

class CrawlTool(BuiltinTool):
def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
# initialize the app object with the api key
app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'], base_url=self.runtime.credentials['base_url'])

options = {
Expand All @@ -21,29 +21,16 @@ def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolIn
}
}

# crawl the url
crawl_result = app.crawl_url(
url=tool_parameters['url'],
params=options,
wait_until_done=True,
wait=True
)

# reformat crawl result
crawl_output = "**Crawl Result**\n\n"
try:
for result in crawl_result:
crawl_output += f"**- Title:** {result.get('metadata', {}).get('title', '')}\n"
crawl_output += f"**- Description:** {result.get('metadata', {}).get('description', '')}\n"
crawl_output += f"**- URL:** {result.get('metadata', {}).get('ogUrl', '')}\n\n"
crawl_output += f"**- Web Content:**\n{result.get('markdown', '')}\n\n"
crawl_output += "---\n\n"
except Exception as e:
crawl_output += f"An error occurred: {str(e)}\n"
crawl_output += f"**- Title:** {result.get('metadata', {}).get('title', '')}\n"
crawl_output += f"**- Description:** {result.get('metadata', {}).get('description','')}\n"
crawl_output += f"**- URL:** {result.get('metadata', {}).get('ogUrl', '')}\n\n"
crawl_output += f"**- Web Content:**\n{result.get('markdown', '')}\n\n"
crawl_output += "---\n\n"


return self.create_text_message(crawl_output)

if not isinstance(crawl_result, str):
crawl_result = json.dumps(crawl_result, ensure_ascii=False, indent=4)

if not crawl_result:
return self.create_text_message("Crawl request failed.")

return self.create_text_message(crawl_result)
16 changes: 1 addition & 15 deletions api/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion api/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,6 @@ chardet = "~5.1.0"
cohere = "~5.2.4"
cos-python-sdk-v5 = "1.9.30"
dashscope = { version = "~1.17.0", extras = ["tokenizer"] }
firecrawl-py = "0.0.5"
flask = "~3.0.1"
flask-compress = "~1.14"
flask-cors = "~4.0.0"
Expand Down

0 comments on commit cc63af8

Please sign in to comment.