feat(delay): add non blocking delay scheduling (#43)
j-mendez authored Apr 24, 2022
1 parent 0530467 commit b44dff5
Showing 5 changed files with 42 additions and 19 deletions.
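In short: before this commit the crawl loop paced itself with std::thread::sleep, parking the crawling thread ahead of every fetch. The commit computes the configured delay once before the loop and pauses through tokio's async timer instead, scheduled alongside the spawned fetch via rayon::join. A minimal sketch of the two pausing styles, assuming a tokio dependency with the "time", "macros", and "rt-multi-thread" features (the helper name here is illustrative, not spider's API):

```rust
use std::time::Duration;

// New style: the pause runs as an async task on a tokio runtime.
// #[tokio::main] wraps the async fn so synchronous code can call it.
#[tokio::main]
async fn non_blocking_delay(delay: Duration) {
    tokio::time::sleep(delay).await;
}

fn main() {
    let delay = Duration::from_millis(250);

    // Old style (pre-commit): park the calling OS thread outright.
    std::thread::sleep(delay);

    // New style (this commit): drive tokio's timer to completion instead.
    non_blocking_delay(delay);
}
```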
2 changes: 1 addition & 1 deletion benches/Cargo.toml
@@ -5,7 +5,7 @@ publish = false
edition = "2021"

[dependencies]
-spider = { version = "1.6.3", path = "../spider" }
+spider = { version = "1.6.4", path = "../spider" }
criterion = "0.3"

[[bench]]
4 changes: 2 additions & 2 deletions examples/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "1.6.3"
version = "1.6.4"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider"
@@ -15,7 +15,7 @@ publish = false
maintenance = { status = "as-is" }

[dependencies.spider]
version = "1.6.3"
version = "1.6.4"
path = "../spider"
default-features = false

4 changes: 2 additions & 2 deletions spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "1.6.3"
version = "1.6.4"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider"
@@ -21,7 +21,7 @@ robotparser-fork = "0.10.5"
url = "2.2"
rayon = "1.5"
num_cpus = "1.13.0"
-tokio = { version = "^1.17.0", features = ["rt-multi-thread", "net", "macros"] }
+tokio = { version = "^1.17.0", features = [ "rt-multi-thread", "net", "macros", "time" ] }
regex = { version = "^1.5.0", optional = true }
hashbrown = { version = "0.12" }
log = "0.4.16"
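The functional change here is the added "time" feature: tokio only compiles its tokio::time module, including the tokio::time::sleep that website.rs imports below, when that feature is enabled. A minimal compile check of the gate, assuming the same feature set as the diff:

```rust
// Builds only when Cargo.toml enables tokio's "time" feature
// (plus "macros" and "rt-multi-thread" for #[tokio::main]).
use tokio::time::{sleep, Duration};

#[tokio::main]
async fn main() {
    sleep(Duration::from_millis(50)).await;
}
```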
47 changes: 35 additions & 12 deletions spider/src/website.rs
@@ -8,11 +8,12 @@ use rayon::ThreadPoolBuilder;
use robotparser_fork::RobotFileParser;

use hashbrown::HashSet;
-use std::{sync, thread, time::Duration};
+use std::{sync, time::{Duration}};
use reqwest::header::CONNECTION;
use reqwest::header;
use sync::mpsc::{channel, Sender, Receiver};
use log::{log_enabled, info, Level};
+use tokio::time::sleep;

/// Represent a website to crawl. To start crawling, instantiate a new `struct` using
/// <pre>
@@ -117,6 +118,14 @@ impl<'a> Website<'a> {
let on_link_find_callback = self.on_link_find_callback;
let pool = self.create_thread_pool();

+// get delay time duration as ms [TODO: move delay checking outside while and use method defined prior]
+let delay_enabled = &(self.configuration.delay > 0);
+let delay: Duration = if *delay_enabled {
+    Some(self.get_delay())
+} else {
+    None
+}.unwrap();
+
// crawl while links exists
while !self.links.is_empty() {
let (tx, rx): (Sender<Message>, Receiver<Message>) = channel();
@@ -125,24 +134,32 @@
if !self.is_allowed(link) {
continue;
}
-if self.configuration.delay > 0 {
-    thread::sleep(self.get_delay());
-}
log("- fetch {}", link);

self.links_visited.insert(link.into());

let link = link.clone();
let tx = tx.clone();
let cx = client.clone();

-pool.spawn(move || {
-    let link_result = on_link_find_callback(link);
-    let mut page = Page::new(&link_result, &cx);
-    let links = page.links();
-
-    tx.send((page, links)).unwrap();
-});
+if *delay_enabled {
+    let pspawn = pool.spawn(move || {
+        let link_result = on_link_find_callback(link);
+        let mut page = Page::new(&link_result, &cx);
+        let links = page.links();
+
+        tx.send((page, links)).unwrap();
+    });
+
+    rayon::join(|| tokio_sleep(&delay), || pspawn);
+} else {
+    pool.spawn(move || {
+        let link_result = on_link_find_callback(link);
+        let mut page = Page::new(&link_result, &cx);
+        let links = page.links();
+
+        tx.send((page, links)).unwrap();
+    });
+}
}

drop(tx);
@@ -201,6 +218,12 @@ pub fn log(message: &str, data: impl AsRef<str>) {
}
}

+// delay the process duration and send
+#[tokio::main]
+async fn tokio_sleep(delay: &Duration) {
+    sleep(*delay).await;
+}

#[test]
fn crawl() {
let mut website: Website = Website::new("https://choosealicense.com");
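Two pieces carry the feature. tokio_sleep is a synchronous wrapper: #[tokio::main] expands the async fn into a plain fn that stands up a runtime per call and blocks until the sleep completes, so non-async code such as a rayon closure can call it. In the crawl loop, rayon::join then runs that sleep alongside the closure that queues the fetch, pacing requests without thread::sleep. A self-contained sketch of the scheduling, with rayon and tokio (features "time", "macros", "rt-multi-thread") as assumed dependencies; fetch and the link list are stand-ins, not spider's types:

```rust
use std::time::Duration;
use tokio::time::sleep;

// Synchronous wrapper: #[tokio::main] builds a runtime per call and
// blocks until the async sleep completes, so plain threads can call it.
#[tokio::main]
async fn tokio_sleep(delay: &Duration) {
    sleep(*delay).await;
}

// Stand-in for the per-link work spider queues on its pool.
fn fetch(link: &str) {
    println!("- fetch {}", link);
}

fn main() {
    let delay = Duration::from_millis(100);
    let pool = rayon::ThreadPoolBuilder::new()
        .num_threads(2)
        .build()
        .unwrap();

    for link in ["https://example.com/a", "https://example.com/b"] {
        // Queue the fetch and tick the delay side by side; join returns
        // once both closures finish, pacing the loop per link.
        rayon::join(|| tokio_sleep(&delay), || pool.spawn(move || fetch(link)));
    }

    // Sketch only: give queued fetches a moment to drain before exit.
    std::thread::sleep(Duration::from_millis(300));
}
```

Note that join only waits for its two closures, not for work already queued on the pool; spider's own loop collects results over its mpsc channel rather than sleeping at the end as this sketch does.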
4 changes: 2 additions & 2 deletions spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "1.6.3"
version = "1.6.4"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider"
@@ -23,7 +23,7 @@ quote = "1.0.18"
failure_derive = "0.1.8"

[dependencies.spider]
version = "1.6.3"
version = "1.6.4"
path = "../spider"
default-features = false

