chore(control): fix control task abort after crawl

j-mendez committed Dec 4, 2023
1 parent 45793bb commit fdbc211
Showing 7 changed files with 103 additions and 35 deletions.
8 changes: 4 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions examples/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "1.50.19"
version = "1.50.20"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
@@ -22,7 +22,7 @@ htr = "0.5.27"
flexbuffers = "2.0.0"

[dependencies.spider]
version = "1.50.19"
version = "1.50.20"
path = "../spider"
features = ["serde"]

2 changes: 1 addition & 1 deletion spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "1.50.19"
version = "1.50.20"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "The fastest web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
16 changes: 8 additions & 8 deletions spider/README.md
@@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom

```toml
[dependencies]
spider = "1.50.19"
spider = "1.50.20"
```

And then the code:
@@ -91,7 +91,7 @@ We have a couple of optional feature flags: regex blacklisting, jemalloc backend, gl

```toml
[dependencies]
spider = { version = "1.50.19", features = ["regex", "ua_generator"] }
spider = { version = "1.50.20", features = ["regex", "ua_generator"] }
```

1. `ua_generator`: Enables auto generating a random real User-Agent.
@@ -123,7 +123,7 @@ Move processing to a worker; this drastically increases performance even if the worker is

```toml
[dependencies]
spider = { version = "1.50.19", features = ["decentralized"] }
spider = { version = "1.50.20", features = ["decentralized"] }
```

```sh
@@ -143,7 +143,7 @@ Use the subscribe method to get a broadcast channel.

```toml
[dependencies]
spider = { version = "1.50.19", features = ["sync"] }
spider = { version = "1.50.20", features = ["sync"] }
```

```rust,no_run
@@ -173,7 +173,7 @@ Allow regex for blacklisting routes

```toml
[dependencies]
spider = { version = "1.50.19", features = ["regex"] }
spider = { version = "1.50.20", features = ["regex"] }
```

```rust,no_run
@@ -200,7 +200,7 @@ If you are performing large workloads you may need to control the crawler by ena

```toml
[dependencies]
spider = { version = "1.50.19", features = ["control"] }
spider = { version = "1.50.20", features = ["control"] }
```

```rust
@@ -270,7 +270,7 @@ Use cron jobs to run crawls continuously at any time.

```toml
[dependencies]
spider = { version = "1.50.19", features = ["sync", "cron"] }
spider = { version = "1.50.20", features = ["sync", "cron"] }
```

```rust,no_run
@@ -306,7 +306,7 @@ async fn main() {

```toml
[dependencies]
spider = { version = "1.50.19", features = ["chrome"] }
spider = { version = "1.50.20", features = ["chrome"] }
```

You can use `website.crawl_concurrent_raw` to perform a crawl without chromium when needed. Use the `chrome_headed` feature flag to enable headful browser usage if you need to debug.
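As a quick illustration of the paragraph above, here is a minimal sketch of a raw (non-chromium) crawl. It assumes the `chrome` feature from the preceding `Cargo.toml` snippet, a tokio runtime with macros enabled, the `get_links` accessor used elsewhere in the crate, and `https://example.com` as a placeholder URL; `crawl_raw` is the public entry point shown in the `website.rs` diff below, wrapping the internal `crawl_concurrent_raw`.

```rust
// Minimal sketch: crawl with the raw HTTP implementation even though the
// "chrome" feature is compiled in. The URL is a placeholder.
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website: Website = Website::new("https://example.com");
    // `crawl_raw` skips chromium and uses the basic fetcher.
    website.crawl_raw().await;
    for link in website.get_links() {
        println!("- {}", link.as_ref());
    }
}
```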
100 changes: 84 additions & 16 deletions spider/src/website.rs
@@ -698,14 +698,15 @@ impl Website {

/// setup atomic controller
#[cfg(feature = "control")]
fn configure_handler(&self) -> Arc<AtomicI8> {
fn configure_handler(&self) -> (Arc<AtomicI8>, tokio::task::JoinHandle<()>) {
use crate::utils::{Handler, CONTROLLER};

let paused = Arc::new(AtomicI8::new(0));
let handle = paused.clone();
let c: Arc<AtomicI8> = Arc::new(AtomicI8::new(0));
let handle = c.clone();
let domain = self.domain.inner().clone();

tokio::spawn(async move {
// we should probably assign a temp uid with the domain name to make controlling spawns easier

let join_handle = tokio::spawn(async move {
let mut l = CONTROLLER.lock().await.1.to_owned();

while l.changed().await.is_ok() {
@@ -714,24 +715,24 @@ impl Website {

if domain.eq_ignore_ascii_case(&target) {
if rest == &Handler::Resume {
paused.store(0, Ordering::Relaxed);
c.store(0, Ordering::Relaxed);
}
if rest == &Handler::Pause {
paused.store(1, Ordering::Relaxed);
c.store(1, Ordering::Relaxed);
}
if rest == &Handler::Shutdown {
paused.store(2, Ordering::Relaxed);
c.store(2, Ordering::Relaxed);
}
}
}
});

handle
(handle, join_handle)
}

/// setup config for crawl
#[cfg(feature = "control")]
async fn setup(&mut self) -> (Client, Option<Arc<AtomicI8>>) {
async fn setup(&mut self) -> (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) {
if self.status == CrawlStatus::Idle {
self.clear();
}
@@ -750,7 +751,7 @@

/// setup config for crawl
#[cfg(not(feature = "control"))]
async fn setup<T>(&mut self) -> (Client, Option<T>) {
async fn setup(&mut self) -> (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) {
if self.status == CrawlStatus::Idle {
self.clear();
}
@@ -1128,75 +1129,139 @@ impl Website {
pub async fn crawl(&mut self) {
self.start();
let (client, handle) = self.setup().await;
let (handle, join_handle) = match handle {
Some(h) => (Some(h.0), Some(h.1)),
_ => (None, None),
};
self.crawl_concurrent(&client, &handle).await;
self.set_crawl_status();
match join_handle {
Some(h) => h.abort(),
_ => (),
};
}

#[cfg(all(not(feature = "sitemap"), feature = "chrome"))]
/// Start to crawl website with async concurrency using the base raw functionality. Useful when using the "chrome" feature and defaulting to the basic implementation.
pub async fn crawl_raw(&mut self) {
self.start();
let (client, handle) = self.setup().await;
let (handle, join_handle) = match handle {
Some(h) => (Some(h.0), Some(h.1)),
_ => (None, None),
};
self.crawl_concurrent_raw(&client, &handle).await;
self.set_crawl_status();
match join_handle {
Some(h) => h.abort(),
_ => (),
};
}

#[cfg(not(feature = "sitemap"))]
/// Start to scrape/download website with async concurrency
pub async fn scrape(&mut self) {
self.start();
let (client, handle) = self.setup().await;
let (handle, join_handle) = match handle {
Some(h) => (Some(h.0), Some(h.1)),
_ => (None, None),
};
self.scrape_concurrent(&client, &handle).await;
self.set_crawl_status();
match join_handle {
Some(h) => h.abort(),
_ => (),
};
}

#[cfg(all(not(feature = "sitemap"), feature = "chrome"))]
/// Start to crawl website with async concurrency using the base raw functionality. Useful when using the "chrome" feature and defaulting to the basic implementation.
pub async fn scrape_raw(&mut self) {
self.start();
let (client, handle) = self.setup().await;
let (handle, join_handle) = match handle {
Some(h) => (Some(h.0), Some(h.1)),
_ => (None, None),
};
self.scrape_concurrent_raw(&client, &handle).await;
self.set_crawl_status();
match join_handle {
Some(h) => h.abort(),
_ => (),
};
}

#[cfg(feature = "sitemap")]
/// Start to crawl website and include sitemap links
pub async fn crawl(&mut self) {
self.start();
let (client, handle) = self.setup().await;
let (handle, join_handle) = match handle {
Some(h) => (Some(h.0), Some(h.1)),
_ => (None, None),
};
self.crawl_concurrent(&client, &handle).await;
self.sitemap_crawl(&client, &handle, false).await;
self.set_crawl_status();
match join_handle {
Some(h) => h.abort(),
_ => (),
};
}

#[cfg(all(feature = "sitemap", feature = "chrome"))]
/// Start to crawl website and include sitemap links with async concurrency using the base raw functionality. Useful when using the "chrome" feature and defaulting to the basic implementation.
pub async fn crawl_raw(&mut self) {
self.start();
let (client, handle) = self.setup().await;
let (handle, join_handle) = match handle {
Some(h) => (Some(h.0), Some(h.1)),
_ => (None, None),
};
self.crawl_concurrent_raw(&client, &handle).await;
self.sitemap_crawl(&client, &handle, false).await;
self.set_crawl_status();
match join_handle {
Some(h) => h.abort(),
_ => (),
};
}

#[cfg(all(feature = "sitemap", feature = "chrome"))]
/// Start to crawl website and include sitemap links with async concurrency using the base raw functionality. Useful when using the "chrome" feature and defaulting to the basic implementation.
pub async fn scrape_raw(&mut self) {
self.start();
let (client, handle) = self.setup().await;
let (handle, join_handle) = match handle {
Some(h) => (Some(h.0), Some(h.1)),
_ => (None, None),
};
self.scrape_concurrent_raw(&client, &handle).await;
self.sitemap_crawl(&client, &handle, false).await;
self.set_crawl_status();
match join_handle {
Some(h) => h.abort(),
_ => (),
};
}

#[cfg(feature = "sitemap")]
/// Start to scrape/download website with async concurrency
pub async fn scrape(&mut self) {
self.start();
let (client, handle) = self.setup().await;
let (handle, join_handle) = match handle {
Some(h) => (Some(h.0), Some(h.1)),
_ => (None, None),
};
self.scrape_concurrent(&client, &handle).await;
self.sitemap_crawl(&client, &handle, true).await;
self.set_crawl_status();
match join_handle {
Some(h) => h.abort(),
_ => (),
};
}

/// Start to crawl website concurrently - used mainly for chrome instances to connect to default raw HTTP
@@ -2621,7 +2686,8 @@ async fn test_respect_robots_txt() {
website.configuration.respect_robots_txt = true;
website.configuration.user_agent = Some(Box::new("*".into()));

let (client, _): (Client, Option<Arc<AtomicI8>>) = website.setup().await;
let (client, _): (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) =
website.setup().await;

website.configure_robots_parser(client).await;

@@ -2637,15 +2703,17 @@ async fn test_respect_robots_txt() {
website_second.configuration.respect_robots_txt = true;
website_second.configuration.user_agent = Some(Box::new("bingbot".into()));

let (client_second, _): (Client, Option<Arc<AtomicI8>>) = website_second.setup().await;
let (client_second, _): (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) =
website_second.setup().await;
website_second.configure_robots_parser(client_second).await;

assert_eq!(website_second.configuration.delay, 60000); // should equal one minute in ms

// test crawl delay with wildcard agent [DOES not work when using set agent]
let mut website_third: Website = Website::new("https://www.mongodb.com");
website_third.configuration.respect_robots_txt = true;
let (client_third, _): (Client, Option<Arc<AtomicI8>>) = website_third.setup().await;
let (client_third, _): (Client, Option<(Arc<AtomicI8>, tokio::task::JoinHandle<()>)>) =
website_third.setup().await;

website_third.configure_robots_parser(client_third).await;

@@ -2802,8 +2870,8 @@ async fn test_crawl_budget() {
assert!(website.links_visited.len() <= 1);
}

#[cfg(feature = "control")]
#[tokio::test]
#[cfg(feature = "control")]
#[ignore]
async fn test_crawl_pause_resume() {
use crate::utils::{pause, resume};
Expand All @@ -2824,7 +2892,7 @@ async fn test_crawl_pause_resume() {

let duration = start.elapsed();

assert!(duration.as_secs() > 5, "{:?}", duration);
assert!(duration.as_secs() >= 5, "{:?}", duration);

assert!(
website
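The heart of this change is that `configure_handler` now returns the control task's `JoinHandle` together with the shared `AtomicI8`, and every crawl/scrape entry point aborts that task once it finishes so the listener no longer outlives the crawl. Below is a standalone sketch of the same pattern under stated assumptions: `spawn_listener` and its sleep loop are illustrative stand-ins, not part of the spider API, and a tokio dependency with the runtime, macros, and time features is assumed.

```rust
use std::sync::{
    atomic::{AtomicI8, Ordering},
    Arc,
};
use std::time::Duration;

// Illustrative stand-in for `configure_handler`: spawn a listener that
// writes control states into a shared flag and hand back its JoinHandle.
fn spawn_listener(flag: Arc<AtomicI8>) -> tokio::task::JoinHandle<()> {
    tokio::spawn(async move {
        loop {
            // ... in spider this awaits CONTROLLER commands and stores
            // 0 (resume), 1 (pause) or 2 (shutdown) into the flag ...
            flag.store(0, Ordering::Relaxed);
            tokio::time::sleep(Duration::from_millis(100)).await;
        }
    })
}

#[tokio::main]
async fn main() {
    let flag = Arc::new(AtomicI8::new(0));
    let listener = spawn_listener(flag.clone());

    // ... run the crawl, checking `flag` between requests ...
    let _paused = flag.load(Ordering::Relaxed);

    // The fix: abort the listener once the crawl completes instead of
    // leaving it running in the background.
    listener.abort();
}
```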
4 changes: 2 additions & 2 deletions spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "1.50.19"
version = "1.50.20"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "The fastest web crawler CLI written in Rust."
repository = "https://github.com/spider-rs/spider"
@@ -26,7 +26,7 @@ quote = "1.0.18"
failure_derive = "0.1.8"

[dependencies.spider]
version = "1.50.19"
version = "1.50.20"
path = "../spider"

[[bin]]
