From 33953c2b2a94e6029d250c35cb3cddc0e90c8e89 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Thu, 14 Dec 2023 13:50:29 -0500 Subject: [PATCH] feat(redirect): add transparent top redirect handling --- Cargo.lock | 8 +- examples/Cargo.toml | 4 +- spider/Cargo.toml | 2 +- spider/README.md | 18 ++-- spider/src/website.rs | 183 +++++++++++++++++++++++++++++++-------- spider_cli/Cargo.toml | 4 +- spider_worker/Cargo.toml | 4 +- 7 files changed, 166 insertions(+), 57 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b159917fc..e2118533f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3960,7 +3960,7 @@ dependencies = [ [[package]] name = "spider" -version = "1.60.12" +version = "1.60.13" dependencies = [ "ahash", "async-trait", @@ -4001,7 +4001,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "1.60.12" +version = "1.60.13" dependencies = [ "clap 4.4.11", "env_logger 0.9.3", @@ -4013,7 +4013,7 @@ dependencies = [ [[package]] name = "spider_examples" -version = "1.60.12" +version = "1.60.13" dependencies = [ "convert_case 0.5.0", "env_logger 0.9.3", @@ -4034,7 +4034,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "1.60.12" +version = "1.60.13" dependencies = [ "env_logger 0.10.1", "lazy_static", diff --git a/examples/Cargo.toml b/examples/Cargo.toml index 763722789..94b3ad110 100644 --- a/examples/Cargo.toml +++ b/examples/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_examples" -version = "1.60.12" +version = "1.60.13" authors = ["madeindjs ", "j-mendez "] description = "Multithreaded web crawler written in Rust." repository = "https://github.com/spider-rs/spider" @@ -22,7 +22,7 @@ htr = "0.5.27" flexbuffers = "2.0.0" [dependencies.spider] -version = "1.60.12" +version = "1.60.13" path = "../spider" features = ["serde"] diff --git a/spider/Cargo.toml b/spider/Cargo.toml index 342badbb1..807ffc6d4 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "1.60.12" +version = "1.60.13" authors = ["madeindjs ", "j-mendez "] description = "The fastest web crawler written in Rust." repository = "https://github.com/spider-rs/spider" diff --git a/spider/README.md b/spider/README.md index e81989a09..dd635eca0 100644 --- a/spider/README.md +++ b/spider/README.md @@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom ```toml [dependencies] -spider = "1.60.12" +spider = "1.60.13" ``` And then the code: @@ -91,7 +91,7 @@ We have a couple optional feature flags. Regex blacklisting, jemaloc backend, gl ```toml [dependencies] -spider = { version = "1.60.12", features = ["regex", "ua_generator"] } +spider = { version = "1.60.13", features = ["regex", "ua_generator"] } ``` 1. `ua_generator`: Enables auto generating a random real User-Agent. @@ -125,7 +125,7 @@ Move processing to a worker, drastically increases performance even if worker is ```toml [dependencies] -spider = { version = "1.60.12", features = ["decentralized"] } +spider = { version = "1.60.13", features = ["decentralized"] } ``` ```sh @@ -145,7 +145,7 @@ Use the subscribe method to get a broadcast channel. 
```toml [dependencies] -spider = { version = "1.60.12", features = ["sync"] } +spider = { version = "1.60.13", features = ["sync"] } ``` ```rust,no_run @@ -175,7 +175,7 @@ Allow regex for blacklisting routes ```toml [dependencies] -spider = { version = "1.60.12", features = ["regex"] } +spider = { version = "1.60.13", features = ["regex"] } ``` ```rust,no_run @@ -202,7 +202,7 @@ If you are performing large workloads you may need to control the crawler by ena ```toml [dependencies] -spider = { version = "1.60.12", features = ["control"] } +spider = { version = "1.60.13", features = ["control"] } ``` ```rust @@ -272,7 +272,7 @@ Use cron jobs to run crawls continuously at anytime. ```toml [dependencies] -spider = { version = "1.60.12", features = ["sync", "cron"] } +spider = { version = "1.60.13", features = ["sync", "cron"] } ``` ```rust,no_run @@ -310,7 +310,7 @@ Connecting to Chrome can be done using the ENV variable `CHROME_URL`, if no conn ```toml [dependencies] -spider = { version = "1.60.12", features = ["chrome"] } +spider = { version = "1.60.13", features = ["chrome"] } ``` You can use `website.crawl_concurrent_raw` to perform a crawl without chromium when needed. Use the feature flag `chrome_headed` to enable headful browser usage if needed to debug. @@ -322,7 +322,7 @@ Intelligently run crawls using HTTP and JavaScript Rendering when needed. The be ```toml [dependencies] -spider = { version = "1.60.12", features = ["smart"] } +spider = { version = "1.60.13", features = ["smart"] } ``` ```rust,no_run diff --git a/spider/src/website.rs b/spider/src/website.rs index bfde068f7..c20fd6dd2 100644 --- a/spider/src/website.rs +++ b/spider/src/website.rs @@ -529,16 +529,52 @@ impl Website { /// build the http client #[cfg(not(feature = "decentralized"))] fn configure_http_client_builder(&mut self) -> reqwest::ClientBuilder { + use crate::page::domain_name; + use reqwest::redirect::Attempt; + use std::sync::atomic::AtomicU8; + let host_str = self.domain_parsed.as_deref().cloned(); let default_policy = reqwest::redirect::Policy::default(); + let policy = match host_str { - Some(host_s) => reqwest::redirect::Policy::custom(move |attempt| { - if attempt.url().host_str() != host_s.host_str() { - attempt.stop() + Some(host_s) => { + let initial_redirect = Arc::new(AtomicU8::new(0)); + let initial_redirect_limit = if self.configuration.respect_robots_txt { + 2 } else { - default_policy.redirect(attempt) - } - }), + 1 + }; + + let subdomains = self.configuration.subdomains; + let tld = self.configuration.tld; + let host_domain_name = if tld { + domain_name(&host_s).to_string() + } else { + Default::default() + }; + + let custom_policy = { + move |attempt: Attempt| { + if tld && domain_name(attempt.url()).ends_with(&host_domain_name) + || subdomains && attempt.url().as_str().ends_with(host_s.as_str()) + || attempt.url().host() == host_s.host() + { + default_policy.redirect(attempt) + } else if attempt.previous().len() > 7 { + attempt.error("too many redirects") + } else if attempt.status().is_redirection() + && (0..initial_redirect_limit) + .contains(&initial_redirect.load(Ordering::Relaxed)) + { + initial_redirect.fetch_add(1, Ordering::Relaxed); + default_policy.redirect(attempt) + } else { + attempt.stop() + } + } + }; + reqwest::redirect::Policy::custom(custom_policy) + } _ => default_policy, }; @@ -812,7 +848,7 @@ impl Website { async fn _crawl_establish( &mut self, client: &Client, - base: &(CompactString, smallvec::SmallVec<[CompactString; 2]>), + base: &mut (CompactString, 
smallvec::SmallVec<[CompactString; 2]>), _: bool, ) -> HashSet { let links: HashSet = if self @@ -820,6 +856,27 @@ impl Website { { let page = Page::new_page(&self.domain.inner(), &client).await; + // allow initial page mutation + match page.final_redirect_destination { + Some(ref domain) => { + let domain: Box = + CaseInsensitiveString::new(&domain).into(); + self.domain_parsed = match url::Url::parse(&domain.inner()) { + Ok(u) => Some(Box::new(crate::page::convert_abs_path(&u, "/"))), + _ => None, + }; + self.domain = domain; + match self.setup_selectors() { + Some(s) => { + base.0 = s.0; + base.1 = s.1; + } + _ => (), + } + } + _ => (), + }; + if !self.external_domains.is_empty() { self.external_domains_caseless = self .external_domains @@ -875,10 +932,10 @@ impl Website { async fn crawl_establish( &mut self, client: &Client, - base: &(CompactString, smallvec::SmallVec<[CompactString; 2]>), + base: &mut (CompactString, smallvec::SmallVec<[CompactString; 2]>), selector: bool, ) -> HashSet { - self._crawl_establish(&client, &base, selector).await + self._crawl_establish(&client, base, selector).await } /// expand links for crawl @@ -890,7 +947,7 @@ impl Website { async fn crawl_establish( &mut self, client: &Client, - base: &(CompactString, smallvec::SmallVec<[CompactString; 2]>), + base: &mut (CompactString, smallvec::SmallVec<[CompactString; 2]>), _: bool, page: &chromiumoxide::Page, ) -> HashSet { @@ -899,6 +956,26 @@ impl Website { { let page = Page::new(&self.domain.inner(), &client, &page).await; + match page.final_redirect_destination { + Some(ref domain) => { + let domain: Box = + CaseInsensitiveString::new(&domain).into(); + self.domain_parsed = match url::Url::parse(&domain.inner()) { + Ok(u) => Some(Box::new(crate::page::convert_abs_path(&u, "/"))), + _ => None, + }; + self.domain = domain; + match self.setup_selectors() { + Some(s) => { + base.0 = s.0; + base.1 = s.1; + } + _ => (), + } + } + _ => (), + } + if !self.external_domains.is_empty() { self.external_domains_caseless = self .external_domains @@ -971,7 +1048,7 @@ impl Website { } _ => *self.domain.to_owned(), }); - + let page_links = HashSet::from(page.links.clone()); channel_send_page(&self.channel, page); @@ -1028,12 +1105,11 @@ impl Website { }; self.links_visited.insert(link_result.0); - + channel_send_page(&self.channel, page.clone()); let page_links = HashSet::from(page.links); - links.extend(page_links); } } @@ -1046,7 +1122,7 @@ impl Website { async fn crawl_establish( &mut self, client: &Client, - base: &(CompactString, smallvec::SmallVec<[CompactString; 2]>), + base: &mut (CompactString, smallvec::SmallVec<[CompactString; 2]>), _: bool, ) -> HashSet { let mut links: HashSet = HashSet::new(); @@ -1068,6 +1144,26 @@ impl Website { if self.is_allowed_default(&link.inner(), &blacklist_url) { let page = Page::new(&link.inner(), &client).await; + match page.final_redirect_destination { + Some(ref domain) => { + let domain: Box = + CaseInsensitiveString::new(&domain).into(); + self.domain_parsed = match url::Url::parse(&domain.inner()) { + Ok(u) => Some(Box::new(crate::page::convert_abs_path(&u, "/"))), + _ => None, + }; + self.domain = domain; + match self.setup_selectors() { + Some(s) => { + base.0 = s.0; + base.1 = s.1; + } + _ => (), + } + } + _ => (), + } + if !page.is_empty() { let u = page.get_url().into(); let link_result = match self.on_link_find_callback { @@ -1266,11 +1362,14 @@ impl Website { async fn crawl_concurrent_raw(&mut self, client: &Client, handle: &Option>) { self.start(); match 
self.setup_selectors() { - Some(selector) => { + Some(mut selector) => { let (mut interval, throttle) = self.setup_crawl(); let blacklist_url = self.configuration.get_blacklist(); - let on_link_find_callback = self.on_link_find_callback; + + let mut links: HashSet = + self._crawl_establish(&client, &mut selector, false).await; + let shared = Arc::new(( client.to_owned(), selector, @@ -1278,9 +1377,6 @@ impl Website { self.external_domains_caseless.clone(), )); - let mut links: HashSet = - self._crawl_establish(&shared.0, &shared.1, false).await; - if !links.is_empty() { let mut set: JoinSet> = JoinSet::new(); let chandle = Handle::current(); @@ -1381,6 +1477,7 @@ impl Website { let on_link_find_callback = self.on_link_find_callback; let mut interval = tokio::time::interval(Duration::from_millis(10)); let selectors = Arc::new(unsafe { selectors.unwrap_unchecked() }); + let throttle = Duration::from_millis(delay); let mut links: HashSet = HashSet::from([*self.domain.clone()]); @@ -1502,18 +1599,22 @@ impl Website { }); } + let mut selectors = unsafe { selectors.unwrap_unchecked() }; + + let chrome_page = Arc::new(new_page.clone()); + + let mut links: HashSet = self + .crawl_establish(&client, &mut selectors, false, &chrome_page) + .await; + let shared = Arc::new(( client.to_owned(), - unsafe { selectors.unwrap_unchecked() }, + selectors, self.channel.clone(), - Arc::new(new_page.clone()), + chrome_page, self.external_domains_caseless.clone(), )); - let mut links: HashSet = self - .crawl_establish(&shared.0, &shared.1, false, &shared.3) - .await; - let add_external = shared.4.len() > 0; if !links.is_empty() { @@ -1652,18 +1753,22 @@ impl Website { }); } + let mut selectors = unsafe { selectors.unwrap_unchecked() }; + + let chrome_page = Arc::new(new_page.clone()); + + let mut links: HashSet = self + .crawl_establish(&client, &mut selectors, false, &chrome_page) + .await; + let shared = Arc::new(( client.to_owned(), - unsafe { selectors.unwrap_unchecked() }, + selectors, self.channel.clone(), - Arc::new(new_page.clone()), + chrome_page, self.external_domains_caseless.clone(), )); - let mut links: HashSet = self - .crawl_establish(&shared.0, &shared.1, false, &shared.3) - .await; - let add_external = shared.4.len() > 0; if !links.is_empty() { @@ -1780,12 +1885,15 @@ impl Website { self.start(); // crawl if valid selector match self.setup_selectors() { - Some(selector) => { + Some(mut selector) => { let (mut interval, throttle) = self.setup_crawl(); let blacklist_url = self.configuration.get_blacklist(); - let on_link_find_callback = self.on_link_find_callback; + let mut links: HashSet = + self.crawl_establish(&client, &mut selector, false).await; + + let shared = Arc::new(( client.to_owned(), selector, @@ -1793,9 +1901,6 @@ impl Website { self.external_domains_caseless.clone(), )); - let mut links: HashSet = - self.crawl_establish(&shared.0, &shared.1, false).await; - let add_external = shared.3.len() > 0; if !links.is_empty() { @@ -1899,7 +2004,11 @@ impl Website { .starts_with("http:"); let mut links: HashSet = self - .crawl_establish(&client, &(domain.into(), Default::default()), http_worker) + .crawl_establish( + &client, + &mut (domain.into(), Default::default()), + http_worker, + ) .await; let mut set: JoinSet> = JoinSet::new(); diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index 3bb9f6316..bb7c7b7d3 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "1.60.12" +version = "1.60.13" authors = ["madeindjs ", 
"j-mendez "] description = "The fastest web crawler CLI written in Rust." repository = "https://github.com/spider-rs/spider" @@ -26,7 +26,7 @@ quote = "1.0.18" failure_derive = "0.1.8" [dependencies.spider] -version = "1.60.12" +version = "1.60.13" path = "../spider" [[bin]] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 5331c1a5c..0e83abfb7 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "1.60.12" +version = "1.60.13" authors = ["madeindjs ", "j-mendez "] description = "The fastest web crawler as a worker or proxy." repository = "https://github.com/spider-rs/spider" @@ -22,7 +22,7 @@ lazy_static = "1.4.0" env_logger = "0.10.0" [dependencies.spider] -version = "1.60.12" +version = "1.60.13" path = "../spider" features = ["serde", "flexbuffers"]