diff --git a/Cargo.lock b/Cargo.lock index a1e825a60..4e900d8e0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4295,7 +4295,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.13.88" +version = "2.13.90" dependencies = [ "ahash", "aho-corasick", @@ -4357,7 +4357,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.13.88" +version = "2.13.90" dependencies = [ "adblock", "async-tungstenite", @@ -4392,7 +4392,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.13.88" +version = "2.13.90" dependencies = [ "clap", "env_logger", @@ -4417,7 +4417,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.13.88" +version = "2.13.90" dependencies = [ "aho-corasick", "fast_html2md", @@ -4439,7 +4439,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.13.88" +version = "2.13.90" dependencies = [ "indexmap 1.9.3", "serde", @@ -4451,7 +4451,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.13.88" +version = "2.13.90" dependencies = [ "env_logger", "lazy_static", diff --git a/spider/Cargo.toml b/spider/Cargo.toml index d6c082d93..f918b35df 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.13.88" +version = "2.13.90" authors = [ "j-mendez " ] diff --git a/spider/src/page.rs b/spider/src/page.rs index 571a82c53..a85f341ea 100644 --- a/spider/src/page.rs +++ b/spider/src/page.rs @@ -223,11 +223,11 @@ pub fn push_link>( let mut abs = convert_abs_path(b, href); let scheme = abs.scheme(); - if scheme == "https" || scheme == "http" { - if let Some(link_map) = links_pages { - link_map.insert(A::from(abs.as_str().to_string())); - } + if let Some(link_map) = links_pages { + link_map.insert(A::from(abs.as_str().to_string())); + } + if scheme == "https" || scheme == "http" { let host_name = abs.host_str(); let mut can_process = parent_host_match( host_name, @@ -570,21 +570,38 @@ impl Page { res.content_length().unwrap_or(DEFAULT_BYTE_CAPACITY) as usize, ); - if url != res.url().as_str() { - let domain = res.url().as_str(); - let mut url = Box::new(CaseInsensitiveString::new(&url)); + let target_url = res.url().as_str(); - modify_selectors( - prior_domain, - domain, - domain_parsed, - &mut url, - selectors, - AllowedDomainTypes::new(r_settings.subdomains, r_settings.tld), - ); + // handle redirects + if url != target_url { + let mut url = Box::new(CaseInsensitiveString::new(&url)); + let end_target_slash = target_url.ends_with("/"); + let main_slash = url.ends_with("/"); + + let exact_match = end_target_slash + && !main_slash + && target_url[..target_url.len() - 1] == *url + || !end_target_slash && main_slash && url[..url.len() - 1] == *target_url; + + if !exact_match { + modify_selectors( + prior_domain, + target_url, + domain_parsed, + &mut url, + selectors, + AllowedDomainTypes::new(r_settings.subdomains, r_settings.tld), + ); + } }; - let base = domain_parsed.as_deref(); + // always use a base url. + let base = if domain_parsed.is_none() { + prior_domain + } else { + domain_parsed + } + .as_deref(); let parent_host = &selectors.1[0]; // the host schemes diff --git a/spider/src/website.rs b/spider/src/website.rs index b27193608..b10625117 100644 --- a/spider/src/website.rs +++ b/spider/src/website.rs @@ -6,7 +6,7 @@ use crate::configuration::{ use crate::features::chrome_common::RequestInterceptConfiguration; use crate::packages::robotparser::parser::RobotFileParser; use crate::page::{Page, PageLinkBuildSettings}; -use crate::utils::abs::parse_absolute_url; +use crate::utils::abs::{convert_abs_url, parse_absolute_url}; use crate::utils::{ emit_log, emit_log_shutdown, setup_website_selectors, spawn_set, spawn_task, AllowedDomainTypes, }; @@ -1288,12 +1288,19 @@ impl Website { &page_links_settings, &mut links, Some(&mut links_ssg), - &mut domain_parsed, + &mut domain_parsed, // original domain &mut self.domain_parsed, &mut links_pages, ) .await; + if self.domain_parsed.is_none() { + if let Some(mut domain_parsed) = domain_parsed.take() { + convert_abs_url(&mut domain_parsed); + self.domain_parsed.replace(domain_parsed); + } + } + let mut retry_count = self.configuration.retry; let domains_caseless = &self.configuration.external_domains_caseless; @@ -1366,9 +1373,6 @@ impl Website { if self.configuration.return_page_links { page.page_links = links_pages.filter(|pages| !pages.is_empty()).map(Box::new); - if let Some(page_links) = page.page_links.as_mut() { - page_links.extend(links_ssg.clone()); - } } links.extend(links_ssg); diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index 91cc466cd..41ea5773f 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.13.88" +version = "2.13.90" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index 7c96f4f1a..6ef68c4ed 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.13.88" +version = "2.13.90" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index 8e2e7c039..85f350811 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.13.88" +version = "2.13.90" authors = [ "j-mendez " ] diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index 7a690f055..39aa5a8ef 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.13.88" +version = "2.13.90" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 06dec1a78..2b5d828fc 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.13.88" +version = "2.13.90" authors = [ "j-mendez " ]