diff --git a/Cargo.lock b/Cargo.lock
index 96c603665..a6c1d29c2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4280,7 +4280,7 @@ dependencies = [
 
 [[package]]
 name = "spider"
-version = "1.80.1"
+version = "1.80.3"
 dependencies = [
  "ahash",
  "async-trait",
@@ -4323,7 +4323,7 @@ dependencies = [
 
 [[package]]
 name = "spider_cli"
-version = "1.80.1"
+version = "1.80.3"
 dependencies = [
  "clap 4.4.11",
  "env_logger 0.9.3",
@@ -4335,7 +4335,7 @@ dependencies = [
 
 [[package]]
 name = "spider_examples"
-version = "1.80.1"
+version = "1.80.3"
 dependencies = [
  "convert_case 0.5.0",
  "env_logger 0.9.3",
@@ -4356,7 +4356,7 @@ dependencies = [
 
 [[package]]
 name = "spider_worker"
-version = "1.80.1"
+version = "1.80.3"
 dependencies = [
  "env_logger 0.10.1",
  "lazy_static",
diff --git a/examples/Cargo.toml b/examples/Cargo.toml
index d80024f95..3879e3a25 100644
--- a/examples/Cargo.toml
+++ b/examples/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_examples"
-version = "1.80.1"
+version = "1.80.3"
 authors = ["madeindjs ", "j-mendez "]
 description = "Multithreaded web crawler written in Rust."
 repository = "https://github.com/spider-rs/spider"
@@ -22,7 +22,7 @@ htr = "0.5.27"
 flexbuffers = "2.0.0"
 
 [dependencies.spider]
-version = "1.80.1"
+version = "1.80.3"
 path = "../spider"
 features = ["serde"]
diff --git a/spider/Cargo.toml b/spider/Cargo.toml
index 0649b33a0..f2856931b 100644
--- a/spider/Cargo.toml
+++ b/spider/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider"
-version = "1.80.1"
+version = "1.80.3"
 authors = ["madeindjs ", "j-mendez "]
 description = "The fastest web crawler written in Rust."
 repository = "https://github.com/spider-rs/spider"
diff --git a/spider/README.md b/spider/README.md
index 9d00ba349..e7ca953e6 100644
--- a/spider/README.md
+++ b/spider/README.md
@@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom
 
 ```toml
 [dependencies]
-spider = "1.80.1"
+spider = "1.80.3"
 ```
 
 And then the code:
@@ -91,7 +91,7 @@ We have a couple optional feature flags. Regex blacklisting, jemaloc backend, gl
 
 ```toml
 [dependencies]
-spider = { version = "1.80.1", features = ["regex", "ua_generator"] }
+spider = { version = "1.80.3", features = ["regex", "ua_generator"] }
 ```
 
 1. `ua_generator`: Enables auto generating a random real User-Agent.
@@ -128,7 +128,7 @@ Move processing to a worker, drastically increases performance even if worker is
 
 ```toml
 [dependencies]
-spider = { version = "1.80.1", features = ["decentralized"] }
+spider = { version = "1.80.3", features = ["decentralized"] }
 ```
 
 ```sh
@@ -148,7 +148,7 @@ Use the subscribe method to get a broadcast channel.
 
 ```toml
 [dependencies]
-spider = { version = "1.80.1", features = ["sync"] }
+spider = { version = "1.80.3", features = ["sync"] }
 ```
 
 ```rust,no_run
@@ -178,7 +178,7 @@ Allow regex for blacklisting routes
 
 ```toml
 [dependencies]
-spider = { version = "1.80.1", features = ["regex"] }
+spider = { version = "1.80.3", features = ["regex"] }
 ```
 
 ```rust,no_run
@@ -205,7 +205,7 @@ If you are performing large workloads you may need to control the crawler by ena
 
 ```toml
 [dependencies]
-spider = { version = "1.80.1", features = ["control"] }
+spider = { version = "1.80.3", features = ["control"] }
 ```
 
 ```rust
@@ -275,7 +275,7 @@ Use cron jobs to run crawls continuously at anytime.
 
 ```toml
 [dependencies]
-spider = { version = "1.80.1", features = ["sync", "cron"] }
+spider = { version = "1.80.3", features = ["sync", "cron"] }
 ```
 
 ```rust,no_run
@@ -314,7 +314,7 @@ the feature flag [`chrome_intercept`] to possibly speed up request using Network
 
 ```toml
 [dependencies]
-spider = { version = "1.80.1", features = ["chrome", "chrome_intercept"] }
+spider = { version = "1.80.3", features = ["chrome", "chrome_intercept"] }
 ```
 
 You can use `website.crawl_concurrent_raw` to perform a crawl without chromium when needed. Use the feature flag `chrome_headed` to enable headful browser usage if needed to debug.
@@ -346,7 +346,7 @@ Enabling HTTP cache can be done with the feature flag [`cache`] or [`cache_mem`]
 
 ```toml
 [dependencies]
-spider = { version = "1.80.1", features = ["cache"] }
+spider = { version = "1.80.3", features = ["cache"] }
 ```
 
 You need to set `website.cache` to true to enable as well.
@@ -377,7 +377,7 @@ Intelligently run crawls using HTTP and JavaScript Rendering when needed. The be
 
 ```toml
 [dependencies]
-spider = { version = "1.80.1", features = ["smart"] }
+spider = { version = "1.80.3", features = ["smart"] }
 ```
 
 ```rust,no_run
diff --git a/spider/src/packages/robotparser/parser.rs b/spider/src/packages/robotparser/parser.rs
index 0f58e1f36..ae60b897a 100644
--- a/spider/src/packages/robotparser/parser.rs
+++ b/spider/src/packages/robotparser/parser.rs
@@ -25,8 +25,8 @@
 //! }
 //! ```
 
-use compact_str::CompactString;
 use crate::Client;
+use compact_str::CompactString;
 use reqwest::Response;
 use reqwest::StatusCode;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
diff --git a/spider/src/website.rs b/spider/src/website.rs
index bf2191b6b..9fdb985f4 100644
--- a/spider/src/website.rs
+++ b/spider/src/website.rs
@@ -194,7 +194,7 @@ pub struct Website {
     pub chrome_intercept: bool,
     /// Block all images from rendering in Chrome.
     #[cfg(feature = "chrome_intercept")]
-    pub chrome_intercept_block_images: bool,
+    pub chrome_intercept_block_visuals: bool,
     /// Cache the page following HTTP Caching rules.
#[cfg(feature = "cache")] pub cache: bool, @@ -1024,14 +1024,27 @@ impl Website { .await { Ok(mut rp) => { - let host_name = self.domain.inner().to_string(); + let mut host_name = self.domain.inner().to_string(); let intercept_page = chrome_page.clone(); - let ignore_images = self.chrome_intercept_block_images; + let ignore_visuals = self.chrome_intercept_block_visuals; let ih = task::spawn(async move { + let mut first_rq = true; while let Some(event) = rp.next().await { let u = &event.request.url; - if ignore_images && ResourceType::Image == event.resource_type || !u.starts_with(&host_name) && !crate::page::JS_FRAMEWORK_ALLOW.contains(&u.as_str()) { + + if first_rq { + if ResourceType::Document == event.resource_type { + host_name = u.into(); + } + first_rq = false; + } + + if ignore_visuals && (ResourceType::Image == event.resource_type || ResourceType::Media == event.resource_type || ResourceType::Stylesheet == event.resource_type) || + ResourceType::Script == event.resource_type && !u.starts_with(&host_name) && !crate::page::JS_FRAMEWORK_ALLOW.contains(&u.as_str()) || + ResourceType::Prefetch == event.resource_type || + ResourceType::Ping == event.resource_type + { match chromiumoxide::cdp::browser_protocol::fetch::FulfillRequestParams::builder() .request_id(event.request_id.clone()) .response_code(200) @@ -3003,7 +3016,7 @@ impl Website { block_images: bool, ) -> &mut Self { self.chrome_intercept = chrome_intercept; - self.chrome_intercept_block_images = block_images; + self.chrome_intercept_block_visuals = block_images; self } diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index 1c758a2e7..7cdc5d526 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "1.80.1" +version = "1.80.3" authors = ["madeindjs ", "j-mendez "] description = "The fastest web crawler CLI written in Rust." repository = "https://github.com/spider-rs/spider" @@ -26,7 +26,7 @@ quote = "1.0.18" failure_derive = "0.1.8" [dependencies.spider] -version = "1.80.1" +version = "1.80.3" path = "../spider" [[bin]] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 88cbc3385..83715a42a 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "1.80.1" +version = "1.80.3" authors = ["madeindjs ", "j-mendez "] description = "The fastest web crawler as a worker or proxy." repository = "https://github.com/spider-rs/spider" @@ -22,7 +22,7 @@ lazy_static = "1.4.0" env_logger = "0.10.0" [dependencies.spider] -version = "1.80.1" +version = "1.80.3" path = "../spider" features = ["serde", "flexbuffers"]