Skip to content

Commit

Permalink
chore(website): fix crawl establish domain removal [#233]
Browse files: browse the repository at this point in the history
  • Loading branch information
j-mendez committed Dec 1, 2024
1 parent f8a9fca commit f184914
Show file tree
Hide file tree
Showing 9 changed files with 54 additions and 33 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.13.88"
version = "2.13.90"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
49 changes: 33 additions & 16 deletions spider/src/page.rs
Original file line number Diff line number Diff line change
Expand Up @@ -223,11 +223,11 @@ pub fn push_link<A: PartialEq + Eq + std::hash::Hash + From<String>>(
let mut abs = convert_abs_path(b, href);
let scheme = abs.scheme();

if scheme == "https" || scheme == "http" {
if let Some(link_map) = links_pages {
link_map.insert(A::from(abs.as_str().to_string()));
}
if let Some(link_map) = links_pages {
link_map.insert(A::from(abs.as_str().to_string()));
}

if scheme == "https" || scheme == "http" {
let host_name = abs.host_str();
let mut can_process = parent_host_match(
host_name,
Expand Down Expand Up @@ -570,21 +570,38 @@ impl Page {
res.content_length().unwrap_or(DEFAULT_BYTE_CAPACITY) as usize,
);

if url != res.url().as_str() {
let domain = res.url().as_str();
let mut url = Box::new(CaseInsensitiveString::new(&url));
let target_url = res.url().as_str();

modify_selectors(
prior_domain,
domain,
domain_parsed,
&mut url,
selectors,
AllowedDomainTypes::new(r_settings.subdomains, r_settings.tld),
);
// handle redirects
if url != target_url {
let mut url = Box::new(CaseInsensitiveString::new(&url));
let end_target_slash = target_url.ends_with("/");
let main_slash = url.ends_with("/");

let exact_match = end_target_slash
&& !main_slash
&& target_url[..target_url.len() - 1] == *url
|| !end_target_slash && main_slash && url[..url.len() - 1] == *target_url;

if !exact_match {
modify_selectors(
prior_domain,
target_url,
domain_parsed,
&mut url,
selectors,
AllowedDomainTypes::new(r_settings.subdomains, r_settings.tld),
);
}
};

let base = domain_parsed.as_deref();
// always use a base url.
let base = if domain_parsed.is_none() {
prior_domain
} else {
domain_parsed
}
.as_deref();

let parent_host = &selectors.1[0];
// the host schemes
Expand Down
14 changes: 9 additions & 5 deletions spider/src/website.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use crate::configuration::{
use crate::features::chrome_common::RequestInterceptConfiguration;
use crate::packages::robotparser::parser::RobotFileParser;
use crate::page::{Page, PageLinkBuildSettings};
use crate::utils::abs::parse_absolute_url;
use crate::utils::abs::{convert_abs_url, parse_absolute_url};
use crate::utils::{
emit_log, emit_log_shutdown, setup_website_selectors, spawn_set, spawn_task, AllowedDomainTypes,
};
Expand Down Expand Up @@ -1288,12 +1288,19 @@ impl Website {
&page_links_settings,
&mut links,
Some(&mut links_ssg),
&mut domain_parsed,
&mut domain_parsed, // original domain
&mut self.domain_parsed,
&mut links_pages,
)
.await;

if self.domain_parsed.is_none() {
if let Some(mut domain_parsed) = domain_parsed.take() {
convert_abs_url(&mut domain_parsed);
self.domain_parsed.replace(domain_parsed);
}
}

let mut retry_count = self.configuration.retry;
let domains_caseless = &self.configuration.external_domains_caseless;

Expand Down Expand Up @@ -1366,9 +1373,6 @@ impl Website {

if self.configuration.return_page_links {
page.page_links = links_pages.filter(|pages| !pages.is_empty()).map(Box::new);
if let Some(page_links) = page.page_links.as_mut() {
page_links.extend(links_ssg.clone());
}
}

links.extend(links_ssg);
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.13.88"
version = "2.13.90"
rust-version = "1.70"
authors = [
"j-mendez <jeff@spider.cloud>"
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.13.88"
version = "2.13.90"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.13.88"
version = "2.13.90"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.13.88"
version = "2.13.90"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.13.88"
version = "2.13.90"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down

0 comments on commit f184914

Please sign in to comment.