diff --git a/Cargo.lock b/Cargo.lock index 281f8f59d..2142877ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3329,7 +3329,7 @@ dependencies = [ [[package]] name = "spider" -version = "1.86.15" +version = "1.87.3" dependencies = [ "ahash", "async-openai", @@ -3377,7 +3377,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "1.86.15" +version = "1.87.3" dependencies = [ "clap", "env_logger", @@ -3400,7 +3400,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "1.86.15" +version = "1.87.3" dependencies = [ "env_logger", "lazy_static", diff --git a/spider/Cargo.toml b/spider/Cargo.toml index 639485dc4..b84ae7bbf 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "1.86.15" +version = "1.87.3" authors = [ "madeindjs ", "j-mendez ", @@ -84,7 +84,7 @@ lol_html = { version = "1.2.1", optional = true } tikv-jemallocator = { version = "0.5.0", optional = true } [features] -default = ["sync", "cookies", "reqwest/native-tls-alpn"] +default = ["sync", "reqwest/native-tls-alpn", "cookies"] regex = ["dep:regex"] glob = ["dep:regex", "dep:itertools"] ua_generator = ["dep:ua_generator"] @@ -120,5 +120,6 @@ http3 = ["reqwest/http3"] smart = ["chrome", "dep:regex"] encoding = ["dep:encoding_rs"] headers = [] +real_browser = [] openai = ["chrome", "chrome_intercept", "dep:async-openai", "dep:tiktoken-rs", "dep:lol_html"] decentralized_headers = ["dep:const_format", "dep:itertools"] diff --git a/spider/README.md b/spider/README.md index 690c1f223..e9c02d4eb 100644 --- a/spider/README.md +++ b/spider/README.md @@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom ```toml [dependencies] -spider = "1.86.15" +spider = "1.87.3" ``` And then the code: @@ -93,7 +93,7 @@ We have the following optional feature flags. 
```toml [dependencies] -spider = { version = "1.86.15", features = ["regex", "ua_generator"] } +spider = { version = "1.87.3", features = ["regex", "ua_generator"] } ``` 1. `ua_generator`: Enables auto generating a random real User-Agent. @@ -122,6 +122,7 @@ spider = { version = "1.86.15", features = ["regex", "ua_generator"] } 1. `chrome_stealth`: Enables stealth mode to make it harder to be detected as a bot. 1. `chrome_intercept`: Allows intercepting network request to speed up processing. 1. `cookies`: Enables cookies storing and setting to use for request. +1. `real_browser`: Enables the ability to bypass Cloudflare-protected pages. Requires the `chrome` feature. 1. `cron`: Enables the ability to start cron jobs for the website. 1. `openai`: Enables OpenAI to generate dynamic browser executable scripts. Make sure to use the env var `OPENAI_API_KEY`. 1. `smart`: Enables smart mode. This runs request as HTTP until JavaScript rendering is needed. This avoids sending multiple network request by re-using the content. @@ -136,7 +137,7 @@ Move processing to a worker, drastically increases performance even if worker is ```toml [dependencies] -spider = { version = "1.86.15", features = ["decentralized"] } +spider = { version = "1.87.3", features = ["decentralized"] } ``` ```sh @@ -167,7 +168,7 @@ Use the subscribe method to get a broadcast channel. 
```toml [dependencies] -spider = { version = "1.86.15", features = ["sync"] } +spider = { version = "1.87.3", features = ["sync"] } ``` ```rust,no_run @@ -197,7 +198,7 @@ Allow regex for blacklisting routes ```toml [dependencies] -spider = { version = "1.86.15", features = ["regex"] } +spider = { version = "1.87.3", features = ["regex"] } ``` ```rust,no_run @@ -224,7 +225,7 @@ If you are performing large workloads you may need to control the crawler by ena ```toml [dependencies] -spider = { version = "1.86.15", features = ["control"] } +spider = { version = "1.87.3", features = ["control"] } ``` ```rust @@ -294,7 +295,7 @@ Use cron jobs to run crawls continuously at anytime. ```toml [dependencies] -spider = { version = "1.86.15", features = ["sync", "cron"] } +spider = { version = "1.87.3", features = ["sync", "cron"] } ``` ```rust,no_run @@ -333,7 +334,7 @@ the feature flag [`chrome_intercept`] to possibly speed up request using Network ```toml [dependencies] -spider = { version = "1.86.15", features = ["chrome", "chrome_intercept"] } +spider = { version = "1.87.3", features = ["chrome", "chrome_intercept"] } ``` You can use `website.crawl_concurrent_raw` to perform a crawl without chromium when needed. Use the feature flag `chrome_headed` to enable headful browser usage if needed to debug. @@ -363,7 +364,7 @@ Enabling HTTP cache can be done with the feature flag [`cache`] or [`cache_mem`] ```toml [dependencies] -spider = { version = "1.86.15", features = ["cache"] } +spider = { version = "1.87.3", features = ["cache"] } ``` You need to set `website.cache` to true to enable as well. @@ -394,7 +395,7 @@ Intelligently run crawls using HTTP and JavaScript Rendering when needed. 
The be ```toml [dependencies] -spider = { version = "1.86.15", features = ["smart"] } +spider = { version = "1.87.3", features = ["smart"] } ``` ```rust,no_run @@ -420,7 +421,7 @@ Use OpenAI to generate dynamic scripts to drive the browser done with the featur ```toml [dependencies] -spider = { version = "1.86.15", features = ["openai"] } +spider = { version = "1.87.3", features = ["openai"] } ``` ```rust @@ -445,7 +446,7 @@ Set a depth limit to prevent forwarding. ```toml [dependencies] -spider = { version = "1.86.15", features = ["budget"] } +spider = { version = "1.87.3", features = ["budget"] } ``` ```rust,no_run diff --git a/spider/src/features/chrome.rs b/spider/src/features/chrome.rs index 9396312a6..e1f3840e9 100644 --- a/spider/src/features/chrome.rs +++ b/spider/src/features/chrome.rs @@ -239,9 +239,80 @@ pub async fn close_browser(browser_handle: JoinHandle<()>) { } } -#[cfg(not(feature = "chrome_cpu"))] +/// static chrome arguments to start +#[cfg(all(feature = "chrome_cpu", feature = "real_browser"))] +pub static CHROME_ARGS: [&'static str; 27] = [ + if cfg!(feature = "chrome_headless_new") { + "--headless=new" + } else { + "--headless" + }, + "--disable-extensions", + "--disable-component-extensions-with-background-pages", + "--disable-background-networking", + "--disable-component-update", + "--disable-client-side-phishing-detection", + "--disable-sync", + "--metrics-recording-only", + "--disable-default-apps", + "--mute-audio", + "--no-default-browser-check", + "--no-first-run", + "--disable-gpu", + "--disable-gpu-sandbox", + "--disable-setuid-sandbox", + "--disable-dev-shm-usage", + "--disable-backgrounding-occluded-windows", + "--disable-renderer-backgrounding", + "--disable-background-timer-throttling", + "--disable-ipc-flooding-protection", + "--password-store=basic", + "--use-mock-keychain", + "--force-fieldtrials=*BackgroundTracing/default/", + "--disable-hang-monitor", + "--disable-prompt-on-repost", + "--disable-domain-reliability", + 
"--disable-features=InterestFeedContentSuggestions,PrivacySandboxSettings4,AutofillServerCommunication,CalculateNativeWinOcclusion,OptimizationHints,AudioServiceOutOfProcess,IsolateOrigins,site-per-process,ImprovedCookieControls,LazyFrameLoading,GlobalMediaControls,DestroyProfileOnBrowserClose,MediaRouter,DialMediaRouteProvider,AcceptCHFrame,AutoExpandDetailsElement,CertificateTransparencyComponentUpdater,AvoidUnnecessaryBeforeUnloadCheckSync,Translate" +]; + +/// static chrome arguments to start +#[cfg(all(not(feature = "chrome_cpu"), feature = "real_browser"))] +pub static CHROME_ARGS: [&'static str; 24] = [ + if cfg!(feature = "chrome_headless_new") { + "--headless=new" + } else { + "--headless" + }, + "--disable-extensions", + "--disable-component-extensions-with-background-pages", + "--disable-background-networking", + "--disable-component-update", + "--disable-client-side-phishing-detection", + "--disable-sync", + "--disable-dev-shm-usage", + "--metrics-recording-only", + "--disable-default-apps", + "--mute-audio", + "--no-default-browser-check", + "--no-first-run", + "--disable-backgrounding-occluded-windows", + "--disable-renderer-backgrounding", + "--disable-background-timer-throttling", + "--disable-ipc-flooding-protection", + "--password-store=basic", + "--use-mock-keychain", + "--force-fieldtrials=*BackgroundTracing/default/", + "--disable-hang-monitor", + "--disable-prompt-on-repost", + "--disable-domain-reliability", + "--disable-features=InterestFeedContentSuggestions,PrivacySandboxSettings4,AutofillServerCommunication,CalculateNativeWinOcclusion,OptimizationHints,AudioServiceOutOfProcess,IsolateOrigins,site-per-process,ImprovedCookieControls,LazyFrameLoading,GlobalMediaControls,DestroyProfileOnBrowserClose,MediaRouter,DialMediaRouteProvider,AcceptCHFrame,AutoExpandDetailsElement,CertificateTransparencyComponentUpdater,AvoidUnnecessaryBeforeUnloadCheckSync,Translate" +]; + +// One of the configs below is detected by CF bots. 
We need to take a look at the optimal args 03/25/24. + +#[cfg(all(not(feature = "chrome_cpu"), not(feature = "real_browser")))] /// static chrome arguments to start application ref [https://github.com/a11ywatch/chrome/blob/main/src/main.rs#L13] -static CHROME_ARGS: [&'static str; 59] = [ +static CHROME_ARGS: [&'static str; 60] = [ if cfg!(feature = "chrome_headless_new") { "--headless=new" } else { "--headless" }, "--no-sandbox", "--no-first-run", @@ -289,6 +360,7 @@ static CHROME_ARGS: [&'static str; 59] = [ "--disable-field-trial-config", "--disable-back-forward-cache", "--disable-backgrounding-occluded-windows", + "--force-fieldtrials=*BackgroundTracing/default/", // "--enable-automation", "--log-level=3", "--enable-logging=stderr", @@ -303,12 +375,12 @@ static CHROME_ARGS: [&'static str; 59] = [ "--no-pings", "--use-gl=swiftshader", "--window-size=1920,1080", - "--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process,ImprovedCookieControls,LazyFrameLoading,GlobalMediaControls,DestroyProfileOnBrowserClose,MediaRouter,DialMediaRouteProvider,AcceptCHFrame,AutoExpandDetailsElement,CertificateTransparencyComponentUpdater,AvoidUnnecessaryBeforeUnloadCheckSync,Translate" + "--disable-features=InterestFeedContentSuggestions,PrivacySandboxSettings4,AutofillServerCommunication,CalculateNativeWinOcclusion,OptimizationHints,AudioServiceOutOfProcess,IsolateOrigins,site-per-process,ImprovedCookieControls,LazyFrameLoading,GlobalMediaControls,DestroyProfileOnBrowserClose,MediaRouter,DialMediaRouteProvider,AcceptCHFrame,AutoExpandDetailsElement,CertificateTransparencyComponentUpdater,AvoidUnnecessaryBeforeUnloadCheckSync,Translate" ]; -#[cfg(feature = "chrome_cpu")] +#[cfg(all(feature = "chrome_cpu", not(feature = "real_browser")))] /// static chrome arguments to start application ref [https://github.com/a11ywatch/chrome/blob/main/src/main.rs#L13] -static CHROME_ARGS: [&'static str; 62] = [ +static CHROME_ARGS: [&'static str; 63] = [ if cfg!(feature = 
"chrome_headless_new") { "--headless=new" } else { "--headless" }, "--no-sandbox", "--no-first-run", @@ -359,6 +431,7 @@ static CHROME_ARGS: [&'static str; 62] = [ "--disable-field-trial-config", "--disable-back-forward-cache", "--disable-backgrounding-occluded-windows", + "--force-fieldtrials=*BackgroundTracing/default/", // "--enable-automation", "--log-level=3", "--enable-logging=stderr", @@ -373,5 +446,5 @@ static CHROME_ARGS: [&'static str; 62] = [ "--no-pings", "--use-gl=swiftshader", "--window-size=1920,1080", - "--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process,ImprovedCookieControls,LazyFrameLoading,GlobalMediaControls,DestroyProfileOnBrowserClose,MediaRouter,DialMediaRouteProvider,AcceptCHFrame,AutoExpandDetailsElement,CertificateTransparencyComponentUpdater,AvoidUnnecessaryBeforeUnloadCheckSync,Translate" + "--disable-features=InterestFeedContentSuggestions,PrivacySandboxSettings4,AutofillServerCommunication,CalculateNativeWinOcclusion,OptimizationHints,AudioServiceOutOfProcess,IsolateOrigins,site-per-process,ImprovedCookieControls,LazyFrameLoading,GlobalMediaControls,DestroyProfileOnBrowserClose,MediaRouter,DialMediaRouteProvider,AcceptCHFrame,AutoExpandDetailsElement,CertificateTransparencyComponentUpdater,AvoidUnnecessaryBeforeUnloadCheckSync,Translate" ]; diff --git a/spider/src/lib.rs b/spider/src/lib.rs index 556de9c7d..e48bcf18f 100644 --- a/spider/src/lib.rs +++ b/spider/src/lib.rs @@ -70,6 +70,7 @@ //! - `chrome_intercept`: Allows intercepting network request to speed up processing. //! - `chrome_headless_new`: Use headless=new to launch the chrome instance. //! - `cookies`: Enables cookies storing and setting to use for request. +//! - `real_browser`: Enables the ability to bypass cloudflare protected pages. //! - `cron`: Enables the ability to start cron jobs for the website. //! - `openai`: Enables OpenAI to generate dynamic browser executable scripts. Make sure to use the env var `OPENAI_API_KEY`. //! 
- `http3`: Enables experimental HTTP/3 client. diff --git a/spider/src/utils.rs b/spider/src/utils.rs index de87c43c6..5a4572694 100644 --- a/spider/src/utils.rs +++ b/spider/src/utils.rs @@ -5,6 +5,50 @@ use log::{info, log_enabled, Level}; use reqwest::header::HeaderMap; use reqwest::{Error, Response, StatusCode}; +/// Handle cloudflare protected pages via chrome. This does nothing without the real_browser feature enabled. +#[cfg(all(feature = "chrome", feature = "real_browser"))] +async fn cf_handle( + b: bytes::Bytes, + page: &chromiumoxide::Page, +) -> Result<bytes::Bytes, chromiumoxide::error::CdpError> { + use crate::configuration::{WaitFor, WaitForDelay, WaitForIdleNetwork}; + lazy_static! { + static ref CF_END: &'static [u8; 62] = + b"target=\"_blank\">Cloudflare</a></div></div></div></body></html>"; + }; + let cf = CF_END.as_ref(); + + if b.ends_with(cf) { + let mut wait_for = WaitFor::default(); + wait_for.delay = WaitForDelay::new(Some(core::time::Duration::from_secs(1))).into(); + wait_for.idle_network = + WaitForIdleNetwork::new(core::time::Duration::from_secs(8).into()).into(); + page_wait(&page, &Some(wait_for.clone())).await; + page.find_element("iframe").await?.click().await?; + wait_for.page_navigations = true; + page_wait(&page, &Some(wait_for.clone())).await; + let next_content = page.content_bytes().await?; + Ok(if next_content.ends_with(cf) { + wait_for.delay = WaitForDelay::new(Some(core::time::Duration::from_secs(4))).into(); + page_wait(&page, &Some(wait_for)).await; + page.content_bytes().await? + } else { + next_content + }) + } else { + Ok(b) + } +} + +/// Handle cloudflare protected pages via chrome. This does nothing without the real_browser feature enabled. +#[cfg(all(feature = "chrome", not(feature = "real_browser")))] +async fn cf_handle( + b: bytes::Bytes, + _page: &chromiumoxide::Page, +) -> Result<bytes::Bytes, chromiumoxide::error::CdpError> { + Ok(b) +} + /// The response of a web page. 
#[derive(Debug, Default)] pub struct PageResponse { @@ -192,6 +236,12 @@ pub async fn fetch_page_html_chrome_base( let page = page.activate().await?; let res: bytes::Bytes = page.content_bytes().await?; + let res = if cfg!(feature = "real_browser") { + cf_handle(res, &page).await? + } else { + res + }; + let ok = res.len() > 0; let mut page_response = PageResponse { diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index ca87852b6..79d428f3a 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "1.86.15" +version = "1.87.3" authors = [ "madeindjs ", "j-mendez ", @@ -29,7 +29,7 @@ quote = "1.0.18" failure_derive = "0.1.8" [dependencies.spider] -version = "1.86.15" +version = "1.87.3" path = "../spider" [[bin]] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 5f11300e1..5a3b229a7 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "1.86.15" +version = "1.87.3" authors = [ "madeindjs ", "j-mendez ", @@ -25,7 +25,7 @@ lazy_static = "1.4.0" env_logger = "0.11.3" [dependencies.spider] -version = "1.86.15" +version = "1.87.3" path = "../spider" features = ["serde", "flexbuffers"]