diff --git a/Cargo.lock b/Cargo.lock index 309e3f4b1..7edd9ac7e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3961,7 +3961,7 @@ dependencies = [ [[package]] name = "spider" -version = "1.50.19" +version = "1.50.20" dependencies = [ "ahash", "async-trait", @@ -4002,7 +4002,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "1.50.19" +version = "1.50.20" dependencies = [ "clap 4.4.8", "env_logger 0.9.3", @@ -4014,7 +4014,7 @@ dependencies = [ [[package]] name = "spider_examples" -version = "1.50.19" +version = "1.50.20" dependencies = [ "convert_case 0.5.0", "env_logger 0.9.3", @@ -4035,7 +4035,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "1.50.19" +version = "1.50.20" dependencies = [ "env_logger 0.10.1", "lazy_static", diff --git a/examples/Cargo.toml b/examples/Cargo.toml index b0e00d705..b11eb12b6 100644 --- a/examples/Cargo.toml +++ b/examples/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_examples" -version = "1.50.19" +version = "1.50.20" authors = ["madeindjs ", "j-mendez "] description = "Multithreaded web crawler written in Rust." repository = "https://github.com/spider-rs/spider" @@ -22,7 +22,7 @@ htr = "0.5.27" flexbuffers = "2.0.0" [dependencies.spider] -version = "1.50.19" +version = "1.50.20" path = "../spider" features = ["serde"] diff --git a/spider/Cargo.toml b/spider/Cargo.toml index ea5a3f00b..c9c439637 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "1.50.19" +version = "1.50.20" authors = ["madeindjs ", "j-mendez "] description = "The fastest web crawler written in Rust." repository = "https://github.com/spider-rs/spider" diff --git a/spider/README.md b/spider/README.md index 5e226e25b..967a7c028 100644 --- a/spider/README.md +++ b/spider/README.md @@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom ```toml [dependencies] -spider = "1.50.19" +spider = "1.50.20" ``` And then the code: @@ -91,7 +91,7 @@ We have a couple optional feature flags. Regex blacklisting, jemaloc backend, gl ```toml [dependencies] -spider = { version = "1.50.19", features = ["regex", "ua_generator"] } +spider = { version = "1.50.20", features = ["regex", "ua_generator"] } ``` 1. `ua_generator`: Enables auto generating a random real User-Agent. @@ -123,7 +123,7 @@ Move processing to a worker, drastically increases performance even if worker is ```toml [dependencies] -spider = { version = "1.50.19", features = ["decentralized"] } +spider = { version = "1.50.20", features = ["decentralized"] } ``` ```sh @@ -143,7 +143,7 @@ Use the subscribe method to get a broadcast channel. ```toml [dependencies] -spider = { version = "1.50.19", features = ["sync"] } +spider = { version = "1.50.20", features = ["sync"] } ``` ```rust,no_run @@ -173,7 +173,7 @@ Allow regex for blacklisting routes ```toml [dependencies] -spider = { version = "1.50.19", features = ["regex"] } +spider = { version = "1.50.20", features = ["regex"] } ``` ```rust,no_run @@ -200,7 +200,7 @@ If you are performing large workloads you may need to control the crawler by ena ```toml [dependencies] -spider = { version = "1.50.19", features = ["control"] } +spider = { version = "1.50.20", features = ["control"] } ``` ```rust @@ -270,7 +270,7 @@ Use cron jobs to run crawls continuously at anytime. 
```toml [dependencies] -spider = { version = "1.50.19", features = ["sync", "cron"] } +spider = { version = "1.50.20", features = ["sync", "cron"] } ``` ```rust,no_run @@ -306,7 +306,7 @@ async fn main() { ```toml [dependencies] -spider = { version = "1.50.19", features = ["chrome"] } +spider = { version = "1.50.20", features = ["chrome"] } ``` You can use `website.crawl_concurrent_raw` to perform a crawl without chromium when needed. Use the feature flag `chrome_headed` to enable headful browser usage if needed to debug. diff --git a/spider/src/website.rs b/spider/src/website.rs index ad27f91a6..8e04c2b36 100644 --- a/spider/src/website.rs +++ b/spider/src/website.rs @@ -698,14 +698,15 @@ impl Website { /// setup atomic controller #[cfg(feature = "control")] - fn configure_handler(&self) -> Arc<AtomicI8> { + fn configure_handler(&self) -> (Arc<AtomicI8>, tokio::task::JoinHandle<()>) { use crate::utils::{Handler, CONTROLLER}; - - let paused = Arc::new(AtomicI8::new(0)); - let handle = paused.clone(); + let c: Arc<AtomicI8> = Arc::new(AtomicI8::new(0)); + let handle = c.clone(); let domain = self.domain.inner().clone(); - tokio::spawn(async move { + // we should probably assign a temp-uid with the domain name to make controlling spawns easier + + let join_handle = tokio::spawn(async move { let mut l = CONTROLLER.lock().await.1.to_owned(); while l.changed().await.is_ok() { @@ -714,24 +715,24 @@ impl Website { if domain.eq_ignore_ascii_case(&target) { if rest == &Handler::Resume { - paused.store(0, Ordering::Relaxed); + c.store(0, Ordering::Relaxed); } if rest == &Handler::Pause { - paused.store(1, Ordering::Relaxed); + c.store(1, Ordering::Relaxed); } if rest == &Handler::Shutdown { - paused.store(2, Ordering::Relaxed); + c.store(2, Ordering::Relaxed); } } } }); - handle + (handle, join_handle) } /// setup config for crawl #[cfg(feature = "control")] - async fn setup(&mut self) -> (Client, Option<Arc<AtomicI8>>) { + async fn setup(&mut self) -> (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) { if self.status == CrawlStatus::Idle { self.clear(); } @@ -750,7 +751,7 @@ impl Website { /// setup config for crawl #[cfg(not(feature = "control"))] - async fn setup(&mut self) -> (Client, Option<AtomicI8>) { + async fn setup(&mut self) -> (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) { if self.status == CrawlStatus::Idle { self.clear(); } @@ -1128,8 +1129,16 @@ impl Website { pub async fn crawl(&mut self) { self.start(); let (client, handle) = self.setup().await; + let (handle, join_handle) = match handle { + Some(h) => (Some(h.0), Some(h.1)), + _ => (None, None), + }; self.crawl_concurrent(&client, &handle).await; self.set_crawl_status(); + match join_handle { + Some(h) => h.abort(), + _ => (), + }; } #[cfg(all(not(feature = "sitemap"), feature = "chrome"))] @@ -1137,8 +1146,16 @@ impl Website { pub async fn crawl_raw(&mut self) { self.start(); let (client, handle) = self.setup().await; + let (handle, join_handle) = match handle { + Some(h) => (Some(h.0), Some(h.1)), + _ => (None, None), + }; self.crawl_concurrent_raw(&client, &handle).await; self.set_crawl_status(); + match join_handle { + Some(h) => h.abort(), + _ => (), + }; } #[cfg(not(feature = "sitemap"))] @@ -1146,8 +1163,16 @@ impl Website { pub async fn scrape(&mut self) { self.start(); let (client, handle) = self.setup().await; + let (handle, join_handle) = match handle { + Some(h) => (Some(h.0), Some(h.1)), + _ => (None, None), + }; self.scrape_concurrent(&client, &handle).await; self.set_crawl_status(); + match join_handle { + Some(h) => h.abort(), + _ => (), + }; } #[cfg(all(not(feature = 
"sitemap"), feature = "chrome"))] @@ -1155,8 +1180,16 @@ impl Website { pub async fn scrape_raw(&mut self) { self.start(); let (client, handle) = self.setup().await; + let (handle, join_handle) = match handle { + Some(h) => (Some(h.0), Some(h.1)), + _ => (None, None), + }; self.scrape_concurrent_raw(&client, &handle).await; self.set_crawl_status(); + match join_handle { + Some(h) => h.abort(), + _ => (), + }; } #[cfg(feature = "sitemap")] @@ -1164,9 +1197,17 @@ impl Website { pub async fn crawl(&mut self) { self.start(); let (client, handle) = self.setup().await; + let (handle, join_handle) = match handle { + Some(h) => (Some(h.0), Some(h.1)), + _ => (None, None), + }; self.crawl_concurrent(&client, &handle).await; self.sitemap_crawl(&client, &handle, false).await; self.set_crawl_status(); + match join_handle { + Some(h) => h.abort(), + _ => (), + }; } #[cfg(all(feature = "sitemap", feature = "chrome"))] @@ -1174,9 +1215,17 @@ impl Website { pub async fn crawl_raw(&mut self) { self.start(); let (client, handle) = self.setup().await; + let (handle, join_handle) = match handle { + Some(h) => (Some(h.0), Some(h.1)), + _ => (None, None), + }; self.crawl_concurrent_raw(&client, &handle).await; self.sitemap_crawl(&client, &handle, false).await; self.set_crawl_status(); + match join_handle { + Some(h) => h.abort(), + _ => (), + }; } #[cfg(all(feature = "sitemap", feature = "chrome"))] @@ -1184,9 +1233,17 @@ impl Website { pub async fn scrape_raw(&mut self) { self.start(); let (client, handle) = self.setup().await; + let (handle, join_handle) = match handle { + Some(h) => (Some(h.0), Some(h.1)), + _ => (None, None), + }; self.scrape_concurrent_raw(&client, &handle).await; self.sitemap_crawl(&client, &handle, false).await; self.set_crawl_status(); + match join_handle { + Some(h) => h.abort(), + _ => (), + }; } #[cfg(feature = "sitemap")] @@ -1194,9 +1251,17 @@ impl Website { pub async fn scrape(&mut self) { self.start(); let (client, handle) = self.setup().await; + let (handle, join_handle) = match handle { + Some(h) => (Some(h.0), Some(h.1)), + _ => (None, None), + }; self.scrape_concurrent(&client, &handle).await; self.sitemap_crawl(&client, &handle, true).await; self.set_crawl_status(); + match join_handle { + Some(h) => h.abort(), + _ => (), + }; } /// Start to crawl website concurrently - used mainly for chrome instances to connect to default raw HTTP @@ -2621,7 +2686,8 @@ async fn test_respect_robots_txt() { website.configuration.respect_robots_txt = true; website.configuration.user_agent = Some(Box::new("*".into())); - let (client, _): (Client, Option>) = website.setup().await; + let (client, _): (Client, Option<(Arc, tokio::task::JoinHandle<()>)>) = + website.setup().await; website.configure_robots_parser(client).await; @@ -2637,7 +2703,8 @@ async fn test_respect_robots_txt() { website_second.configuration.respect_robots_txt = true; website_second.configuration.user_agent = Some(Box::new("bingbot".into())); - let (client_second, _): (Client, Option>) = website_second.setup().await; + let (client_second, _): (Client, Option<(Arc, tokio::task::JoinHandle<()>)>) = + website_second.setup().await; website_second.configure_robots_parser(client_second).await; assert_eq!(website_second.configuration.delay, 60000); // should equal one minute in ms @@ -2645,7 +2712,8 @@ async fn test_respect_robots_txt() { // test crawl delay with wildcard agent [DOES not work when using set agent] let mut website_third: Website = Website::new("https://www.mongodb.com"); website_third.configuration.respect_robots_txt 
= true; - let (client_third, _): (Client, Option<Arc<AtomicI8>>) = website_third.setup().await; + let (client_third, _): (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) = + website_third.setup().await; website_third.configure_robots_parser(client_third).await; @@ -2802,8 +2870,8 @@ async fn test_crawl_budget() { assert!(website.links_visited.len() <= 1); } -#[cfg(feature = "control")] #[tokio::test] +#[cfg(feature = "control")] #[ignore] async fn test_crawl_pause_resume() { use crate::utils::{pause, resume}; @@ -2824,7 +2892,7 @@ async fn test_crawl_pause_resume() { let duration = start.elapsed(); - assert!(duration.as_secs() > 5, "{:?}", duration); + assert!(duration.as_secs() >= 5, "{:?}", duration); assert!( website diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index b569d090b..05192044f 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "1.50.19" +version = "1.50.20" authors = ["madeindjs ", "j-mendez "] description = "The fastest web crawler CLI written in Rust." repository = "https://github.com/spider-rs/spider" @@ -26,7 +26,7 @@ quote = "1.0.18" failure_derive = "0.1.8" [dependencies.spider] -version = "1.50.19" +version = "1.50.20" path = "../spider" [[bin]] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 4a01bcc2d..23387dba3 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "1.50.19" +version = "1.50.20" authors = ["madeindjs ", "j-mendez "] description = "The fastest web crawler as a worker or proxy." repository = "https://github.com/spider-rs/spider" @@ -22,7 +22,7 @@ lazy_static = "1.4.0" env_logger = "0.10.0" [dependencies.spider] -version = "1.50.19" +version = "1.50.20" path = "../spider" features = ["serde", "flexbuffers"]
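Note on the `website.rs` change above: `configure_handler` now returns the spawned control-listener's `tokio::task::JoinHandle<()>` alongside the shared `Arc<AtomicI8>` flag, and each `crawl`/`scrape` variant aborts that task once the crawl completes so the listener does not outlive the crawl. Below is a minimal, self-contained sketch of that spawn-and-abort pattern; it is not the spider API itself, it assumes the `tokio` crate with the `rt-multi-thread`, `macros`, and `time` features, and its watcher loop is a stand-in for the real `CONTROLLER` channel.

```rust
use std::sync::{
    atomic::{AtomicI8, Ordering},
    Arc,
};
use tokio::task::JoinHandle;

/// Spawn a background watcher and hand back both the shared state and the
/// task handle so the caller can abort the watcher when the work is done.
fn configure_handler() -> (Arc<AtomicI8>, JoinHandle<()>) {
    let state = Arc::new(AtomicI8::new(0));
    let shared = state.clone();

    let join_handle = tokio::spawn(async move {
        loop {
            // A real implementation would await a watch/broadcast channel here
            // and store 0 (resume), 1 (pause), or 2 (shutdown) into `shared`.
            tokio::time::sleep(std::time::Duration::from_millis(50)).await;
            if shared.load(Ordering::Relaxed) == 2 {
                break;
            }
        }
    });

    (state, join_handle)
}

#[tokio::main]
async fn main() {
    let (state, join_handle) = configure_handler();

    // ... run the crawl here, checking `state` between requests ...
    assert_eq!(state.load(Ordering::Relaxed), 0);

    // Abort the watcher so the spawned task is cleaned up with the crawl,
    // mirroring the `h.abort()` calls added at the end of each crawl method.
    join_handle.abort();
}
```

The per-method `match join_handle { Some(h) => h.abort(), _ => () }` blocks in the patch could equally be written as `if let Some(h) = join_handle { h.abort(); }`; the behavior is the same either way.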