Commit
Removed firecrawl-py, fixed and improved firecrawl tool (#5896)
Co-authored-by: -LAN- <laipz8200@outlook.com>
Showing 5 changed files with 89 additions and 120 deletions.
api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py — 147 changes: 71 additions & 76 deletions
@@ -1,98 +1,93 @@
 import time
+from collections.abc import Mapping
+from typing import Any

 import requests
+from requests.exceptions import HTTPError


 class FirecrawlApp:
-    def __init__(self, api_key=None, base_url=None):
+    def __init__(self, api_key: str | None = None, base_url: str | None = None):
         self.api_key = api_key
         self.base_url = base_url or 'https://api.firecrawl.dev'
-        if self.api_key is None and self.base_url == 'https://api.firecrawl.dev':
-            raise ValueError('No API key provided')
+        if not self.api_key:
+            raise ValueError("API key is required")

-    def scrape_url(self, url, params=None) -> dict:
+    def _prepare_headers(self, idempotency_key: str | None = None):
         headers = {
             'Content-Type': 'application/json',
             'Authorization': f'Bearer {self.api_key}'
         }
-        json_data = {'url': url}
-        if params:
-            json_data.update(params)
-        response = requests.post(
-            f'{self.base_url}/v0/scrape',
-            headers=headers,
-            json=json_data
-        )
-        if response.status_code == 200:
-            response = response.json()
-            if response['success'] == True:
-                return response['data']
-            else:
-                raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
+        if idempotency_key:
+            headers['Idempotency-Key'] = idempotency_key
+        return headers

-        elif response.status_code in [402, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
-        else:
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
+    def _request(
+        self,
+        method: str,
+        url: str,
+        data: Mapping[str, Any] | None = None,
+        headers: Mapping[str, str] | None = None,
+        retries: int = 3,
+        backoff_factor: float = 0.3,
+    ) -> Mapping[str, Any] | None:
+        for i in range(retries):
+            try:
+                response = requests.request(method, url, json=data, headers=headers)
+                response.raise_for_status()
+                return response.json()
+            except requests.exceptions.RequestException as e:
+                if i < retries - 1:
+                    time.sleep(backoff_factor * (2 ** i))
+                else:
+                    raise
+        return None

-    def crawl_url(self, url, params=None, wait_until_done=True, timeout=2) -> str:
+    def scrape_url(self, url: str, **kwargs):
+        endpoint = f'{self.base_url}/v0/scrape'
         headers = self._prepare_headers()
-        json_data = {'url': url}
-        if params:
-            json_data.update(params)
-        response = self._post_request(f'{self.base_url}/v0/crawl', json_data, headers)
-        if response.status_code == 200:
-            job_id = response.json().get('jobId')
-            if wait_until_done:
-                return self._monitor_job_status(job_id, headers, timeout)
-            else:
-                return {'jobId': job_id}
-        else:
-            self._handle_error(response, 'start crawl job')
+        data = {'url': url, **kwargs}
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError("Failed to scrape URL after multiple retries")
+        return response

-    def check_crawl_status(self, job_id) -> dict:
+    def search(self, query: str, **kwargs):
+        endpoint = f'{self.base_url}/v0/search'
         headers = self._prepare_headers()
-        response = self._get_request(f'{self.base_url}/v0/crawl/status/{job_id}', headers)
-        if response.status_code == 200:
-            return response.json()
-        else:
-            self._handle_error(response, 'check crawl status')
-
-    def _prepare_headers(self):
-        return {
-            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}'
-        }
+        data = {'query': query, **kwargs}
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError("Failed to perform search after multiple retries")
+        return response

-    def _post_request(self, url, data, headers):
-        return requests.post(url, headers=headers, json=data)
+    def crawl_url(
+        self, url: str, wait: bool = False, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs
+    ):
+        endpoint = f'{self.base_url}/v0/crawl'
+        headers = self._prepare_headers(idempotency_key)
+        data = {'url': url, **kwargs}
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError("Failed to initiate crawl after multiple retries")
+        job_id: str = response['jobId']
+        if wait:
+            return self._monitor_job_status(job_id=job_id, poll_interval=poll_interval)
+        return job_id

-    def _get_request(self, url, headers):
-        return requests.get(url, headers=headers)
+    def check_crawl_status(self, job_id: str):
+        endpoint = f'{self.base_url}/v0/crawl/status/{job_id}'
+        headers = self._prepare_headers()
+        response = self._request('GET', endpoint, headers=headers)
+        if response is None:
+            raise HTTPError(f"Failed to check status for job {job_id} after multiple retries")
+        return response

-    def _monitor_job_status(self, job_id, headers, timeout):
+    def _monitor_job_status(self, job_id: str, poll_interval: int):
         while True:
-            status_response = self._get_request(f'{self.base_url}/v0/crawl/status/{job_id}', headers)
-            if status_response.status_code == 200:
-                status_data = status_response.json()
-                if status_data['status'] == 'completed':
-                    if 'data' in status_data:
-                        return status_data['data']
-                    else:
-                        raise Exception('Crawl job completed but no data was returned')
-                elif status_data['status'] in ['active', 'paused', 'pending', 'queued']:
-                    if timeout < 2:
-                        timeout = 2
-                    time.sleep(timeout)  # Wait for the specified timeout before checking again
-                else:
-                    raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
-            else:
-                self._handle_error(status_response, 'check crawl status')
-
-    def _handle_error(self, response, action):
-        if response.status_code in [402, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
-        else:
-            raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')
+            status = self.check_crawl_status(job_id)
+            if status['status'] == 'completed':
+                return status
+            elif status['status'] == 'failed':
+                raise HTTPError(f'Job {job_id} failed: {status["error"]}')
+            time.sleep(poll_interval)
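
Taken together, the commit replaces the per-method requests calls and the _handle_error/_post_request/_get_request helpers with a single retrying _request helper plus a shared _prepare_headers. As a quick orientation, here is a minimal usage sketch of the resulting client, based only on the signatures in the diff above: the API key and URLs are placeholders, and the import path assumes Dify's api/ directory is the package root.

    # Minimal usage sketch of the refactored client (placeholder key and URLs).
    from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp

    app = FirecrawlApp(api_key='fc-...')  # raises ValueError when the key is missing

    # Single-page scrape; extra keyword arguments are merged into the request body.
    page = app.scrape_url('https://example.com')

    # Fire-and-forget crawl: returns the job id immediately (wait defaults to False).
    job_id = app.crawl_url('https://example.com')
    status = app.check_crawl_status(job_id)

    # Blocking crawl: polls check_crawl_status every 5 seconds until the job
    # completes or fails.
    result = app.crawl_url('https://example.com', wait=True, poll_interval=5)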
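
One design note on _request: it sleeps backoff_factor * (2 ** i) after each failed attempt, so with the defaults (retries=3, backoff_factor=0.3) a failing call waits 0.3 s and then 0.6 s before the third and final attempt re-raises the underlying requests exception. A standalone sketch of that schedule, separate from the commit itself:

    # Illustrative helper (not part of the commit): the delays _request sleeps
    # between attempts, given its retries/backoff_factor defaults.
    def backoff_delays(retries: int = 3, backoff_factor: float = 0.3) -> list[float]:
        # A delay is slept only after a failed attempt that is not the last one.
        return [backoff_factor * (2 ** i) for i in range(retries - 1)]

    print(backoff_delays())  # [0.3, 0.6]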