chore(control): fix control task abort after crawl

j-mendez committed Dec 4, 2023
1 parent 45793bb commit fdbc211
Showing 7 changed files with 103 additions and 35 deletions.
8 changes: 4 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions examples/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "1.50.19"
version = "1.50.20"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
@@ -22,7 +22,7 @@ htr = "0.5.27"
flexbuffers = "2.0.0"

[dependencies.spider]
version = "1.50.19"
version = "1.50.20"
path = "../spider"
features = ["serde"]

2 changes: 1 addition & 1 deletion spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "1.50.19"
version = "1.50.20"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "The fastest web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
16 changes: 8 additions & 8 deletions spider/README.md
@@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom

```toml
[dependencies]
spider = "1.50.19"
spider = "1.50.20"
```

And then the code:
@@ -91,7 +91,7 @@ We have a couple of optional feature flags: regex blacklisting, jemalloc backend, gl

```toml
[dependencies]
spider = { version = "1.50.19", features = ["regex", "ua_generator"] }
spider = { version = "1.50.20", features = ["regex", "ua_generator"] }
```

1. `ua_generator`: Enables auto generating a random real User-Agent.
@@ -123,7 +123,7 @@ Move processing to a worker; this drastically increases performance even if the worker is

```toml
[dependencies]
spider = { version = "1.50.19", features = ["decentralized"] }
spider = { version = "1.50.20", features = ["decentralized"] }
```

```sh
@@ -143,7 +143,7 @@ Use the subscribe method to get a broadcast channel.

```toml
[dependencies]
spider = { version = "1.50.19", features = ["sync"] }
spider = { version = "1.50.20", features = ["sync"] }
```

```rust,no_run
@@ -173,7 +173,7 @@ Allow regex for blacklisting routes

```toml
[dependencies]
spider = { version = "1.50.19", features = ["regex"] }
spider = { version = "1.50.20", features = ["regex"] }
```

```rust,no_run
@@ -200,7 +200,7 @@ If you are performing large workloads you may need to control the crawler by ena

```toml
[dependencies]
spider = { version = "1.50.19", features = ["control"] }
spider = { version = "1.50.20", features = ["control"] }
```

```rust
@@ -270,7 +270,7 @@ Use cron jobs to run crawls continuously at any time.

```toml
[dependencies]
spider = { version = "1.50.19", features = ["sync", "cron"] }
spider = { version = "1.50.20", features = ["sync", "cron"] }
```

```rust,no_run
@@ -306,7 +306,7 @@ async fn main() {

```toml
[dependencies]
spider = { version = "1.50.19", features = ["chrome"] }
spider = { version = "1.50.20", features = ["chrome"] }
```

You can use `website.crawl_concurrent_raw` to perform a crawl without chromium when needed. Use the `chrome_headed` feature flag to enable headful browser usage if you need to debug.
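As a quick illustration of the paragraph above, here is a minimal sketch of a raw (non-chromium) crawl. It assumes the `chrome` feature from the preceding `Cargo.toml` snippet, a tokio runtime with macros enabled, the `get_links` accessor used elsewhere in the crate, and `https://example.com` as a placeholder URL; `crawl_raw` is the public entry point shown in the `website.rs` diff below, wrapping the internal `crawl_concurrent_raw`.

```rust
// Minimal sketch: crawl with the raw HTTP implementation even though the
// "chrome" feature is compiled in. The URL is a placeholder.
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website: Website = Website::new("https://example.com");
    // `crawl_raw` skips chromium and uses the basic fetcher.
    website.crawl_raw().await;
    for link in website.get_links() {
        println!("- {}", link.as_ref());
    }
}
```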
100 changes: 84 additions & 16 deletions spider/src/website.rs
@@ -698,14 +698,15 @@ impl Website {

/// setup atomic controller
#[cfg(feature = "control")]
fn configure_handler(&self) -> Arc<AtomicI8> {
fn configure_handler(&self) -> (Arc<AtomicI8>, tokio::task::JoinHandle<()>) {
use crate::utils::{Handler, CONTROLLER};

let paused = Arc::new(AtomicI8::new(0));
let handle = paused.clone();
let c: Arc<AtomicI8> = Arc::new(AtomicI8::new(0));
let handle = c.clone();
let domain = self.domain.inner().clone();

tokio::spawn(async move {
// we should probably assign a temp uid with the domain name to make controlling spawns easier

let join_handle = tokio::spawn(async move {
let mut l = CONTROLLER.lock().await.1.to_owned();

while l.changed().await.is_ok() {
@@ -714,24 +715,24 @@ impl Website {

if domain.eq_ignore_ascii_case(&target) {
if rest == &Handler::Resume {
paused.store(0, Ordering::Relaxed);
c.store(0, Ordering::Relaxed);
}
if rest == &Handler::Pause {
paused.store(1, Ordering::Relaxed);
c.store(1, Ordering::Relaxed);
}
if rest == &Handler::Shutdown {
paused.store(2, Ordering::Relaxed);
c.store(2, Ordering::Relaxed);
}
}
}
});

handle
(handle, join_handle)
}

/// setup config for crawl
#[cfg(feature = "control")]
async fn setup(&mut self) -> (Client, Option<Arc<AtomicI8>>) {
async fn setup(&mut self) -> (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) {
if self.status == CrawlStatus::Idle {
self.clear();
}
@@ -750,7 +751,7 @@

/// setup config for crawl
#[cfg(not(feature = "control"))]
async fn setup<T>(&mut self) -> (Client, Option<T>) {
async fn setup(&mut self) -> (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) {
if self.status == CrawlStatus::Idle {
self.clear();
}
@@ -1128,75 +1129,139 @@ impl Website {
pub async fn crawl(&mut self) {
self.start();
let (client, handle) = self.setup().await;
let (handle, join_handle) = match handle {
Some(h) => (Some(h.0), Some(h.1)),
_ => (None, None),
};
self.crawl_concurrent(&client, &handle).await;
self.set_crawl_status();
match join_handle {
Some(h) => h.abort(),
_ => (),
};
}

#[cfg(all(not(feature = "sitemap"), feature = "chrome"))]
/// Start to crawl website with async concurrency using the base raw functionality. Useful when using the "chrome" feature and defaulting to the basic implementation.
pub async fn crawl_raw(&mut self) {
self.start();
let (client, handle) = self.setup().await;
let (handle, join_handle) = match handle {
Some(h) => (Some(h.0), Some(h.1)),
_ => (None, None),
};
self.crawl_concurrent_raw(&client, &handle).await;
self.set_crawl_status();
match join_handle {
Some(h) => h.abort(),
_ => (),
};
}

#[cfg(not(feature = "sitemap"))]
/// Start to scrape/download website with async concurrency
pub async fn scrape(&mut self) {
self.start();
let (client, handle) = self.setup().await;
let (handle, join_handle) = match handle {
Some(h) => (Some(h.0), Some(h.1)),
_ => (None, None),
};
self.scrape_concurrent(&client, &handle).await;
self.set_crawl_status();
match join_handle {
Some(h) => h.abort(),
_ => (),
};
}

#[cfg(all(not(feature = "sitemap"), feature = "chrome"))]
/// Start to crawl website with async concurrency using the base raw functionality. Useful when using the "chrome" feature and defaulting to the basic implementation.
pub async fn scrape_raw(&mut self) {
self.start();
let (client, handle) = self.setup().await;
let (handle, join_handle) = match handle {
Some(h) => (Some(h.0), Some(h.1)),
_ => (None, None),
};
self.scrape_concurrent_raw(&client, &handle).await;
self.set_crawl_status();
match join_handle {
Some(h) => h.abort(),
_ => (),
};
}

#[cfg(feature = "sitemap")]
/// Start to crawl website and include sitemap links
pub async fn crawl(&mut self) {
self.start();
let (client, handle) = self.setup().await;
let (handle, join_handle) = match handle {
Some(h) => (Some(h.0), Some(h.1)),
_ => (None, None),
};
self.crawl_concurrent(&client, &handle).await;
self.sitemap_crawl(&client, &handle, false).await;
self.set_crawl_status();
match join_handle {
Some(h) => h.abort(),
_ => (),
};
}

#[cfg(all(feature = "sitemap", feature = "chrome"))]
/// Start to crawl website and include sitemap links with async concurrency using the base raw functionality. Useful when using the "chrome" feature and defaulting to the basic implementation.
pub async fn crawl_raw(&mut self) {
self.start();
let (client, handle) = self.setup().await;
let (handle, join_handle) = match handle {
Some(h) => (Some(h.0), Some(h.1)),
_ => (None, None),
};
self.crawl_concurrent_raw(&client, &handle).await;
self.sitemap_crawl(&client, &handle, false).await;
self.set_crawl_status();
match join_handle {
Some(h) => h.abort(),
_ => (),
};
}

#[cfg(all(feature = "sitemap", feature = "chrome"))]
/// Start to crawl website and include sitemap links with async concurrency using the base raw functionality. Useful when using the "chrome" feature and defaulting to the basic implementation.
pub async fn scrape_raw(&mut self) {
self.start();
let (client, handle) = self.setup().await;
let (handle, join_handle) = match handle {
Some(h) => (Some(h.0), Some(h.1)),
_ => (None, None),
};
self.scrape_concurrent_raw(&client, &handle).await;
self.sitemap_crawl(&client, &handle, false).await;
self.set_crawl_status();
match join_handle {
Some(h) => h.abort(),
_ => (),
};
}

#[cfg(feature = "sitemap")]
/// Start to scrape/download website with async concurrency
pub async fn scrape(&mut self) {
self.start();
let (client, handle) = self.setup().await;
let (handle, join_handle) = match handle {
Some(h) => (Some(h.0), Some(h.1)),
_ => (None, None),
};
self.scrape_concurrent(&client, &handle).await;
self.sitemap_crawl(&client, &handle, true).await;
self.set_crawl_status();
match join_handle {
Some(h) => h.abort(),
_ => (),
};
}

/// Start to crawl website concurrently - used mainly for chrome instances to connect to default raw HTTP
@@ -2621,7 +2686,8 @@ async fn test_respect_robots_txt() {
website.configuration.respect_robots_txt = true;
website.configuration.user_agent = Some(Box::new("*".into()));

let (client, _): (Client, Option<Arc<AtomicI8>>) = website.setup().await;
let (client, _): (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) =
website.setup().await;

website.configure_robots_parser(client).await;

@@ -2637,15 +2703,17 @@ async fn test_respect_robots_txt() {
website_second.configuration.respect_robots_txt = true;
website_second.configuration.user_agent = Some(Box::new("bingbot".into()));

let (client_second, _): (Client, Option<Arc<AtomicI8>>) = website_second.setup().await;
let (client_second, _): (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) =
website_second.setup().await;
website_second.configure_robots_parser(client_second).await;

assert_eq!(website_second.configuration.delay, 60000); // should equal one minute in ms

// test crawl delay with wildcard agent [DOES not work when using set agent]
let mut website_third: Website = Website::new("https://www.mongodb.com");
website_third.configuration.respect_robots_txt = true;
let (client_third, _): (Client, Option<Arc<AtomicI8>>) = website_third.setup().await;
let (client_third, _): (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) =
website_third.setup().await;

website_third.configure_robots_parser(client_third).await;

@@ -2802,8 +2870,8 @@ async fn test_crawl_budget() {
assert!(website.links_visited.len() <= 1);
}

#[cfg(feature = "control")]
#[tokio::test]
#[cfg(feature = "control")]
#[ignore]
async fn test_crawl_pause_resume() {
use crate::utils::{pause, resume};
Expand All @@ -2824,7 +2892,7 @@ async fn test_crawl_pause_resume() {

let duration = start.elapsed();

assert!(duration.as_secs() > 5, "{:?}", duration);
assert!(duration.as_secs() >= 5, "{:?}", duration);

assert!(
website
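The heart of this change is that `configure_handler` now returns the control task's `JoinHandle` together with the shared `AtomicI8`, and every crawl/scrape entry point aborts that task once it finishes so the listener no longer outlives the crawl. Below is a standalone sketch of the same pattern under stated assumptions: `spawn_listener` and its sleep loop are illustrative stand-ins, not part of the spider API, and a tokio dependency with the runtime, macros, and time features is assumed.

```rust
use std::sync::{
    atomic::{AtomicI8, Ordering},
    Arc,
};
use std::time::Duration;

// Illustrative stand-in for `configure_handler`: spawn a listener that
// writes control states into a shared flag and hand back its JoinHandle.
fn spawn_listener(flag: Arc<AtomicI8>) -> tokio::task::JoinHandle<()> {
    tokio::spawn(async move {
        loop {
            // ... in spider this awaits CONTROLLER commands and stores
            // 0 (resume), 1 (pause) or 2 (shutdown) into the flag ...
            flag.store(0, Ordering::Relaxed);
            tokio::time::sleep(Duration::from_millis(100)).await;
        }
    })
}

#[tokio::main]
async fn main() {
    let flag = Arc::new(AtomicI8::new(0));
    let listener = spawn_listener(flag.clone());

    // ... run the crawl, checking `flag` between requests ...
    let _paused = flag.load(Ordering::Relaxed);

    // The fix: abort the listener once the crawl completes instead of
    // leaving it running in the background.
    listener.abort();
}
```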
4 changes: 2 additions & 2 deletions spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "1.50.19"
version = "1.50.20"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "The fastest web crawler CLI written in Rust."
repository = "https://github.com/spider-rs/spider"
@@ -26,7 +26,7 @@ quote = "1.0.18"
failure_derive = "0.1.8"

[dependencies.spider]
version = "1.50.19"
version = "1.50.20"
path = "../spider"

[[bin]]
