chore(chrome_intercept): fix intercept redirect initial domain
j-mendez committed Dec 17, 2023
1 parent e303747 commit 0fe4e85
Showing 8 changed files with 40 additions and 27 deletions.
8 changes: 4 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions examples/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "1.80.1"
version = "1.80.3"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
@@ -22,7 +22,7 @@ htr = "0.5.27"
flexbuffers = "2.0.0"

[dependencies.spider]
version = "1.80.1"
version = "1.80.3"
path = "../spider"
features = ["serde"]

2 changes: 1 addition & 1 deletion spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "1.80.1"
version = "1.80.3"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "The fastest web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
20 changes: 10 additions & 10 deletions spider/README.md
@@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom

```toml
[dependencies]
spider = "1.80.1"
spider = "1.80.3"
```

And then the code:
@@ -91,7 +91,7 @@ We have a couple optional feature flags. Regex blacklisting, jemaloc backend, gl

```toml
[dependencies]
spider = { version = "1.80.1", features = ["regex", "ua_generator"] }
spider = { version = "1.80.3", features = ["regex", "ua_generator"] }
```

1. `ua_generator`: Enables auto generating a random real User-Agent.
@@ -128,7 +128,7 @@ Move processing to a worker, drastically increases performance even if worker is

```toml
[dependencies]
spider = { version = "1.80.1", features = ["decentralized"] }
spider = { version = "1.80.3", features = ["decentralized"] }
```

```sh
@@ -148,7 +148,7 @@ Use the subscribe method to get a broadcast channel.

```toml
[dependencies]
spider = { version = "1.80.1", features = ["sync"] }
spider = { version = "1.80.3", features = ["sync"] }
```

```rust,no_run
@@ -178,7 +178,7 @@ Allow regex for blacklisting routes

```toml
[dependencies]
spider = { version = "1.80.1", features = ["regex"] }
spider = { version = "1.80.3", features = ["regex"] }
```

```rust,no_run
@@ -205,7 +205,7 @@ If you are performing large workloads you may need to control the crawler by ena

```toml
[dependencies]
spider = { version = "1.80.1", features = ["control"] }
spider = { version = "1.80.3", features = ["control"] }
```

```rust
@@ -275,7 +275,7 @@ Use cron jobs to run crawls continuously at anytime.

```toml
[dependencies]
spider = { version = "1.80.1", features = ["sync", "cron"] }
spider = { version = "1.80.3", features = ["sync", "cron"] }
```

```rust,no_run
@@ -314,7 +314,7 @@ the feature flag [`chrome_intercept`] to possibly speed up request using Network

```toml
[dependencies]
spider = { version = "1.80.1", features = ["chrome", "chrome_intercept"] }
spider = { version = "1.80.3", features = ["chrome", "chrome_intercept"] }
```

You can use `website.crawl_concurrent_raw` to perform a crawl without chromium when needed. Use the feature flag `chrome_headed` to enable headful browser usage if needed to debug.
@@ -346,7 +346,7 @@ Enabling HTTP cache can be done with the feature flag [`cache`] or [`cache_mem`]

```toml
[dependencies]
spider = { version = "1.80.1", features = ["cache"] }
spider = { version = "1.80.3", features = ["cache"] }
```

You need to set `website.cache` to true to enable as well.
@@ -377,7 +377,7 @@ Intelligently run crawls using HTTP and JavaScript Rendering when needed. The be

```toml
[dependencies]
spider = { version = "1.80.1", features = ["smart"] }
spider = { version = "1.80.3", features = ["smart"] }
```

```rust,no_run
2 changes: 1 addition & 1 deletion spider/src/packages/robotparser/parser.rs
@@ -25,8 +25,8 @@
//! }
//! ```
-use compact_str::CompactString;
use crate::Client;
+use compact_str::CompactString;
use reqwest::Response;
use reqwest::StatusCode;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
23 changes: 18 additions & 5 deletions spider/src/website.rs
@@ -194,7 +194,7 @@ pub struct Website {
pub chrome_intercept: bool,
/// Block all images from rendering in Chrome.
#[cfg(feature = "chrome_intercept")]
-pub chrome_intercept_block_images: bool,
+pub chrome_intercept_block_visuals: bool,
/// Cache the page following HTTP Caching rules.
#[cfg(feature = "cache")]
pub cache: bool,
@@ -1024,14 +1024,27 @@ impl Website {
.await
{
Ok(mut rp) => {
-let host_name = self.domain.inner().to_string();
+let mut host_name = self.domain.inner().to_string();
let intercept_page = chrome_page.clone();
-let ignore_images = self.chrome_intercept_block_images;
+let ignore_visuals = self.chrome_intercept_block_visuals;

let ih = task::spawn(async move {
+let mut first_rq = true;
while let Some(event) = rp.next().await {
let u = &event.request.url;
-if ignore_images && ResourceType::Image == event.resource_type || !u.starts_with(&host_name) && !crate::page::JS_FRAMEWORK_ALLOW.contains(&u.as_str()) {
+
+if first_rq {
+    if ResourceType::Document == event.resource_type {
+        host_name = u.into();
+    }
+    first_rq = false;
+}
+
+if ignore_visuals && (ResourceType::Image == event.resource_type || ResourceType::Media == event.resource_type || ResourceType::Stylesheet == event.resource_type) ||
+    ResourceType::Script == event.resource_type && !u.starts_with(&host_name) && !crate::page::JS_FRAMEWORK_ALLOW.contains(&u.as_str()) ||
+    ResourceType::Prefetch == event.resource_type ||
+    ResourceType::Ping == event.resource_type
+{
match chromiumoxide::cdp::browser_protocol::fetch::FulfillRequestParams::builder()
.request_id(event.request_id.clone())
.response_code(200)
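
To make the change easier to follow outside the diff, here is a minimal, self-contained sketch of the decision logic this hunk introduces. The `ResourceType` enum, the event struct, and the allow-list are simplified stand-ins rather than the real chromiumoxide/spider types; only the control flow mirrors the code above: the first intercepted `Document` request may already be the redirect target, so its URL becomes the effective host before visual assets, third-party scripts, prefetches, and pings are fulfilled with a stub 200 response.

```rust
// Sketch only: simplified stand-ins for the chromiumoxide event types used by
// spider's chrome_intercept handler. The control flow mirrors the hunk above.

#[derive(PartialEq)]
enum ResourceType {
    Document,
    Image,
    Media,
    Stylesheet,
    Script,
    Prefetch,
    Ping,
}

struct InterceptEvent {
    url: String,
    resource_type: ResourceType,
}

// Stand-in for `crate::page::JS_FRAMEWORK_ALLOW` (an allow-list of CDN scripts).
const JS_FRAMEWORK_ALLOW: [&str; 1] = ["https://cdn.example.com/framework.js"];

/// Returns true when the intercepted request should be answered with a stub
/// response instead of being fetched.
fn should_block(
    event: &InterceptEvent,
    host_name: &mut String,
    first_rq: &mut bool,
    ignore_visuals: bool,
) -> bool {
    // The fix: the first Document request is the page itself, and after a
    // redirect its URL differs from the configured domain, so adopt it as the
    // effective host before filtering third-party scripts.
    if *first_rq {
        if event.resource_type == ResourceType::Document {
            *host_name = event.url.clone();
        }
        *first_rq = false;
    }

    let u = event.url.as_str();
    (ignore_visuals
        && matches!(
            event.resource_type,
            ResourceType::Image | ResourceType::Media | ResourceType::Stylesheet
        ))
        || (event.resource_type == ResourceType::Script
            && !u.starts_with(host_name.as_str())
            && !JS_FRAMEWORK_ALLOW.contains(&u))
        || event.resource_type == ResourceType::Prefetch
        || event.resource_type == ResourceType::Ping
}

fn main() {
    let mut host = String::from("https://example.com");
    let mut first_rq = true;

    // A redirected initial document: previously its host would not match the
    // configured domain and same-origin scripts could be blocked; now the
    // host is updated before any filtering happens.
    let doc = InterceptEvent {
        url: String::from("https://www.example.com/"),
        resource_type: ResourceType::Document,
    };
    assert!(!should_block(&doc, &mut host, &mut first_rq, true));
    assert_eq!(host, "https://www.example.com/");
}
```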
@@ -3003,7 +3016,7 @@ impl Website {
block_images: bool,
) -> &mut Self {
self.chrome_intercept = chrome_intercept;
-self.chrome_intercept_block_images = block_images;
+self.chrome_intercept_block_visuals = block_images;
self
}

Expand Down
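The setter in the last hunk simply forwards its second flag into the renamed field. As a rough usage sketch — assuming the method shown is the public `with_chrome_intercept` builder (its name is cut off in the hunk) and that the crate is compiled with the `chrome` and `chrome_intercept` features — enabling interception with visual blocking on a crawl might look like this:

```rust
use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website = Website::new("https://choosealicense.com");

    // First flag enables request interception; the second now maps to the
    // renamed `chrome_intercept_block_visuals` (images, media, stylesheets).
    website.with_chrome_intercept(true, true);

    website.crawl().await;

    for link in website.get_links() {
        println!("- {:?}", link.as_ref());
    }
}
```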
4 changes: 2 additions & 2 deletions spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "1.80.1"
version = "1.80.3"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "The fastest web crawler CLI written in Rust."
repository = "https://github.com/spider-rs/spider"
@@ -26,7 +26,7 @@ quote = "1.0.18"
failure_derive = "0.1.8"

[dependencies.spider]
version = "1.80.1"
version = "1.80.3"
path = "../spider"

[[bin]]
4 changes: 2 additions & 2 deletions spider_worker/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "1.80.1"
version = "1.80.3"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "The fastest web crawler as a worker or proxy."
repository = "https://github.com/spider-rs/spider"
@@ -22,7 +22,7 @@ lazy_static = "1.4.0"
env_logger = "0.10.0"

[dependencies.spider]
version = "1.80.1"
version = "1.80.3"
path = "../spider"
features = ["serde", "flexbuffers"]

