From c14cd6c21a24ccffc74dd0a58c9ddbb0cd5ca241 Mon Sep 17 00:00:00 2001
From: Jeff Mendez
Date: Sat, 28 Oct 2023 13:51:10 -0400
Subject: [PATCH] chore(page): fix subdomain entry point handling root (#146)

---
 Cargo.lock                    |  8 ++++----
 examples/Cargo.toml           | 10 ++++------
 examples/download.rs          |  2 +-
 examples/download_to_react.rs |  2 +-
 spider/Cargo.toml             |  2 +-
 spider/README.md              | 12 ++++++------
 spider/src/page.rs            | 32 ++++++++++++++++++++++++++++----
 spider_cli/Cargo.toml         |  4 ++--
 spider_worker/Cargo.toml      |  4 ++--
 9 files changed, 49 insertions(+), 27 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 1091d9818..3b8fa6851 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3569,7 +3569,7 @@ dependencies = [
 
 [[package]]
 name = "spider"
-version = "1.46.4"
+version = "1.46.5"
 dependencies = [
  "ahash",
  "bytes",
@@ -3604,7 +3604,7 @@ dependencies = [
 
 [[package]]
 name = "spider_cli"
-version = "1.46.4"
+version = "1.46.5"
 dependencies = [
  "clap 4.4.4",
  "env_logger 0.9.3",
@@ -3616,7 +3616,7 @@ dependencies = [
 
 [[package]]
 name = "spider_examples"
-version = "1.46.4"
+version = "1.46.5"
 dependencies = [
  "convert_case",
  "env_logger 0.9.3",
@@ -3637,7 +3637,7 @@ dependencies = [
 
 [[package]]
 name = "spider_worker"
-version = "1.46.4"
+version = "1.46.5"
 dependencies = [
  "env_logger 0.10.0",
  "lazy_static",
diff --git a/examples/Cargo.toml b/examples/Cargo.toml
index 9aaafe565..7eead062c 100644
--- a/examples/Cargo.toml
+++ b/examples/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_examples"
-version = "1.46.4"
+version = "1.46.5"
 authors = ["madeindjs ", "j-mendez "]
 description = "Multithreaded web crawler written in Rust."
 repository = "https://github.com/spider-rs/spider"
@@ -22,7 +22,7 @@ htr = "0.5.27"
 flexbuffers = "2.0.0"
 
 [dependencies.spider]
-version = "1.46.4"
+version = "1.46.5"
 path = "../spider"
 features = ["serde"]
 
@@ -61,7 +61,7 @@ path = "callback.rs"
 [[example]]
 name = "sitemap"
 path = "sitemap.rs"
-features = ["sitemap"]
+required-features = ["spider/sitemap"]
 
 [[example]]
 name = "configuration"
@@ -70,6 +70,4 @@ path = "configuration.rs"
 [[example]]
 name = "budget"
 path = "budget.rs"
-
-[features]
-default = ["spider/budget", "spider/sync"]
\ No newline at end of file
+required-features = ["spider/budget", "spider/sync"]
\ No newline at end of file
diff --git a/examples/download.rs b/examples/download.rs
index b1eb3f938..287756621 100644
--- a/examples/download.rs
+++ b/examples/download.rs
@@ -30,7 +30,7 @@ async fn main() {
     website.scrape().await;
 
     for page in website.get_pages().unwrap().iter() {
-        let download_file = page.get_url().clone();
+        let download_file = page.get_url();
         let download_file = download_file.replace(website_name, "");
         let download_file = download_file.replace(".", "-");
         let download_file = download_file.replace("/", "-");
diff --git a/examples/download_to_react.rs b/examples/download_to_react.rs
index 0f8d0891b..8658052bf 100644
--- a/examples/download_to_react.rs
+++ b/examples/download_to_react.rs
@@ -39,7 +39,7 @@ async fn main() {
     match website.get_pages() {
         Some(pages) => {
             for page in pages.iter() {
-                let download_file = page.get_url().clone();
+                let download_file = page.get_url();
                 let download_file = download_file.replace(website_name, "");
                 let download_file = download_file.replace(".", "-");
                 let download_file = download_file.replace("/", "-");
diff --git a/spider/Cargo.toml b/spider/Cargo.toml
index 9a94c84e1..b6b883678 100644
--- a/spider/Cargo.toml
+++ b/spider/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider"
-version = "1.46.4"
+version = "1.46.5"
 authors = ["madeindjs ", "j-mendez "]
 description = "The fastest web crawler written in Rust."
 repository = "https://github.com/spider-rs/spider"
diff --git a/spider/README.md b/spider/README.md
index 8d42471be..88d5533f3 100644
--- a/spider/README.md
+++ b/spider/README.md
@@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom
 
 ```toml
 [dependencies]
-spider = "1.46.4"
+spider = "1.46.5"
 ```
 
 And then the code:
@@ -87,7 +87,7 @@ We have a couple optional feature flags. Regex blacklisting, jemaloc backend, gl
 
 ```toml
 [dependencies]
-spider = { version = "1.46.4", features = ["regex", "ua_generator"] }
+spider = { version = "1.46.5", features = ["regex", "ua_generator"] }
 ```
 
 1. `ua_generator`: Enables auto generating a random real User-Agent.
@@ -116,7 +116,7 @@ Move processing to a worker, drastically increases performance even if worker is
 
 ```toml
 [dependencies]
-spider = { version = "1.46.4", features = ["decentralized"] }
+spider = { version = "1.46.5", features = ["decentralized"] }
 ```
 
 ```sh
@@ -136,7 +136,7 @@ Use the subscribe method to get a broadcast channel.
 
 ```toml
 [dependencies]
-spider = { version = "1.46.4", features = ["sync"] }
+spider = { version = "1.46.5", features = ["sync"] }
 ```
 
 ```rust,no_run
@@ -166,7 +166,7 @@ Allow regex for blacklisting routes
 
 ```toml
 [dependencies]
-spider = { version = "1.46.4", features = ["regex"] }
+spider = { version = "1.46.5", features = ["regex"] }
 ```
 
 ```rust,no_run
@@ -193,7 +193,7 @@ If you are performing large workloads you may need to control the crawler by ena
 
 ```toml
 [dependencies]
-spider = { version = "1.46.4", features = ["control"] }
+spider = { version = "1.46.5", features = ["control"] }
 ```
 
 ```rust
diff --git a/spider/src/page.rs b/spider/src/page.rs
index 0313520f9..4e610663a 100644
--- a/spider/src/page.rs
+++ b/spider/src/page.rs
@@ -312,7 +312,13 @@ impl Page {
         let mut abs = self.abs_path(href);
         let host_name = abs.host_str();
         let mut can_process = match host_name {
-            Some(host) => parent_host.ends_with(host),
+            Some(host) => {
+                if base_domain.is_empty() {
+                    parent_host.eq(&host)
+                } else {
+                    parent_host.ends_with(host)
+                }
+            }
             _ => false,
         };
         if !can_process
@@ -409,7 +415,13 @@ impl Page {
         let mut abs = self.abs_path(href.inner());
         let host_name = abs.host_str();
         let mut can_process = match host_name {
-            Some(host) => parent_host.ends_with(host),
+            Some(host) => {
+                if base_domain.is_empty() {
+                    parent_host.eq(&host)
+                } else {
+                    parent_host.ends_with(host)
+                }
+            }
             _ => false,
         };
         if !can_process && host_name.is_some() && !self.external_domains_caseless.is_empty()
@@ -494,7 +506,13 @@ impl Page {
 
         // determine if the crawl can continue based on host match
        let mut can_process = match abs.host_str() {
-            Some(host) => parent_host.ends_with(host),
+            Some(host) => {
+                if base_domain.is_empty() {
+                    parent_host.eq(&host)
+                } else {
+                    parent_host.ends_with(host)
+                }
+            }
             _ => false,
         };
 
@@ -567,7 +585,13 @@ impl Page {
         let mut abs = self.abs_path(href);
 
         let can_process = match abs.host_str() {
-            Some(host) => parent_host.ends_with(host),
+            Some(host) => {
+                if base_domain.is_empty() {
+                    parent_host.eq(&host)
+                } else {
+                    parent_host.ends_with(host)
+                }
+            }
             _ => false,
         };
 
diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml
index 6bf555477..b5b7dfaf3 100644
--- a/spider_cli/Cargo.toml
+++ b/spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_cli"
-version = "1.46.4"
+version = "1.46.5"
 authors = ["madeindjs ", "j-mendez "]
 description = "The fastest web crawler CLI written in Rust."
 repository = "https://github.com/spider-rs/spider"
@@ -26,7 +26,7 @@ quote = "1.0.18"
 failure_derive = "0.1.8"
 
 [dependencies.spider]
-version = "1.46.4"
+version = "1.46.5"
 path = "../spider"
 
 [[bin]]
diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml
index bf47c8ea4..c81c61bfa 100644
--- a/spider_worker/Cargo.toml
+++ b/spider_worker/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_worker"
-version = "1.46.4"
+version = "1.46.5"
 authors = ["madeindjs ", "j-mendez "]
 description = "The fastest web crawler as a worker or proxy."
 repository = "https://github.com/spider-rs/spider"
@@ -22,7 +22,7 @@ lazy_static = "1.4.0"
 env_logger = "0.10.0"
 
 [dependencies.spider]
-version = "1.46.4"
+version = "1.46.5"
 path = "../spider"
 features = ["serde", "flexbuffers"]
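
The behavioral change in this patch is the host check repeated across the `spider/src/page.rs` hunks: when `base_domain` is empty, the parent host must equal the link's host exactly; otherwise the previous `ends_with` suffix match is kept. A minimal standalone sketch of that predicate, assuming a hypothetical free function with plain `&str` hosts rather than the crate's surrounding link-extraction code:

```rust
/// Hypothetical helper (not the crate's API) mirroring the host check added
/// in the `page.rs` hunks above.
fn can_process(parent_host: &str, base_domain: &str, host: Option<&str>) -> bool {
    match host {
        Some(host) => {
            if base_domain.is_empty() {
                // No base domain tracked: only an exact host match is accepted,
                // so a crawl entered on a subdomain no longer walks up to the root.
                parent_host == host
            } else {
                // Base domain tracked: keep the original suffix match.
                parent_host.ends_with(host)
            }
        }
        None => false,
    }
}

fn main() {
    // Entry point docs.example.com with no base domain: a link to the bare
    // root example.com is now rejected instead of suffix-matching.
    assert!(!can_process("docs.example.com", "", Some("example.com")));
    // The same host is still crawlable.
    assert!(can_process("docs.example.com", "", Some("docs.example.com")));
    // With a base domain, the pre-1.46.5 suffix behaviour is unchanged.
    assert!(can_process("docs.example.com", "example.com", Some("example.com")));
}
```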