feat(delay): add non blocking delay scheduling #43

Merged
merged 1 commit on Apr 24, 2022
2 changes: 1 addition & 1 deletion benches/Cargo.toml
@@ -5,7 +5,7 @@ publish = false
edition = "2021"

[dependencies]
spider = { version = "1.6.3", path = "../spider" }
spider = { version = "1.6.4", path = "../spider" }
criterion = "0.3"

[[bench]]
4 changes: 2 additions & 2 deletions examples/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "1.6.3"
version = "1.6.4"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider"
@@ -15,7 +15,7 @@ publish = false
maintenance = { status = "as-is" }

[dependencies.spider]
version = "1.6.3"
version = "1.6.4"
path = "../spider"
default-features = false

4 changes: 2 additions & 2 deletions spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "1.6.3"
version = "1.6.4"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider"
@@ -21,7 +21,7 @@ robotparser-fork = "0.10.5"
url = "2.2"
rayon = "1.5"
num_cpus = "1.13.0"
tokio = { version = "^1.17.0", features = ["rt-multi-thread", "net", "macros"] }
tokio = { version = "^1.17.0", features = [ "rt-multi-thread", "net", "macros", "time" ] }
regex = { version = "^1.5.0", optional = true }
hashbrown = { version = "0.12" }
log = "0.4.16"
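Note on the dependency change above: tokio's "time" feature is what gates `tokio::time::sleep`, which the non-blocking delay below relies on; without it the new code would not compile. A minimal sketch of the feature and the call it enables (the 250 ms value and the standalone `main` are illustrative, not the crate's code):

```rust
// Sketch only; assumes a Cargo.toml line like:
// tokio = { version = "1", features = ["rt-multi-thread", "macros", "time"] }
use std::time::Duration;
use tokio::time::sleep;

#[tokio::main]
async fn main() {
    // sleep() is only available when tokio's "time" feature is enabled
    sleep(Duration::from_millis(250)).await;
}
```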
47 changes: 35 additions & 12 deletions spider/src/website.rs
@@ -8,11 +8,12 @@ use rayon::ThreadPoolBuilder;
use robotparser_fork::RobotFileParser;

use hashbrown::HashSet;
use std::{sync, thread, time::Duration};
use std::{sync, time::{Duration}};
use reqwest::header::CONNECTION;
use reqwest::header;
use sync::mpsc::{channel, Sender, Receiver};
use log::{log_enabled, info, Level};
use tokio::time::sleep;

/// Represent a website to scrawl. To start crawling, instanciate a new `struct` using
/// <pre>
@@ -117,6 +118,14 @@ impl<'a> Website<'a> {
let on_link_find_callback = self.on_link_find_callback;
let pool = self.create_thread_pool();

// get delay time duration as ms [TODO: move delay checking outside while and use method defined prior]
let delay_enabled = &(self.configuration.delay > 0);
let delay: Duration = if *delay_enabled {
Some(self.get_delay())
} else {
None
}.unwrap();

// crawl while links exists
while !self.links.is_empty() {
let (tx, rx): (Sender<Message>, Receiver<Message>) = channel();
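The hunk above hoists the delay check out of the per-link loop: whether a delay is configured, and the corresponding `Duration`, are now computed once before crawling starts. A minimal sketch of that hoisting, assuming (as the comment above says) the configured value is a millisecond count; the `resolve_delay` name is illustrative, not part of the crate:

```rust
use std::time::Duration;

// Sketch only: `delay_ms` stands in for the configured `delay` value,
// with 0 meaning "no delay", as in the crawl loop below.
fn resolve_delay(delay_ms: u64) -> Option<Duration> {
    if delay_ms > 0 {
        Some(Duration::from_millis(delay_ms))
    } else {
        None
    }
}
```

Returning an `Option` keeps the disabled case explicit for the caller rather than relying on a separate enabled flag.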
@@ -125,24 +134,32 @@ impl<'a> Website<'a> {
if !self.is_allowed(link) {
continue;
}
if self.configuration.delay > 0 {
thread::sleep(self.get_delay());
}
log("- fetch {}", link);

self.links_visited.insert(link.into());

let link = link.clone();
let tx = tx.clone();
let cx = client.clone();

pool.spawn(move || {
let link_result = on_link_find_callback(link);
let mut page = Page::new(&link_result, &cx);
let links = page.links();

tx.send((page, links)).unwrap();
});
if *delay_enabled {
let pspawn = pool.spawn(move || {
let link_result = on_link_find_callback(link);
let mut page = Page::new(&link_result, &cx);
let links = page.links();

tx.send((page, links)).unwrap();
});

rayon::join(|| tokio_sleep(&delay), || pspawn);
} else {
pool.spawn(move || {
let link_result = on_link_find_callback(link);
let mut page = Page::new(&link_result, &cx);
let links = page.links();

tx.send((page, links)).unwrap();
});
}
}

drop(tx);
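In the new delayed branch above, the fetch closure is handed to the thread pool and `rayon::join` then drives a Tokio-backed sleep on the current worker, so the producer loop waits out the configured delay before queueing the next link while fetches that were already spawned keep running. A self-contained sketch of that shape, not the crate's code: a rayon scope stands in for the website's pool and channel plumbing, a `println!` stands in for the page fetch, and the 250 ms value is made up:

```rust
use std::time::Duration;
use tokio::time::sleep;

// Blocks the calling thread for `delay` by driving a Tokio sleep to
// completion (mirrors the helper added further down in this diff).
#[tokio::main]
async fn tokio_sleep(delay: &Duration) {
    sleep(*delay).await;
}

fn main() {
    let delay = Duration::from_millis(250); // illustrative value
    let links = ["a", "b", "c"];

    rayon::scope(|s| {
        for link in links {
            // Queue the "fetch" on the pool and wait out the delay before
            // queueing the next one; tasks already queued keep running.
            rayon::join(
                || tokio_sleep(&delay),
                || s.spawn(move |_| println!("fetching {}", link)),
            );
        }
    }); // the scope waits for every spawned fetch before returning
}
```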
@@ -201,6 +218,12 @@ pub fn log(message: &str, data: impl AsRef<str>) {
}
}

// delay the process duration and send
#[tokio::main]
async fn tokio_sleep(delay: &Duration){
sleep(*delay).await;
}

#[test]
fn crawl() {
let mut website: Website = Website::new("https://choosealicense.com");
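A note on the helper at the end of the diff: `#[tokio::main]` is not limited to `fn main`; applied to `tokio_sleep` it turns the async body into an ordinary blocking function that builds a runtime per call and drives the sleep future to completion, which is what lets rayon's synchronous worker closures call it. A rough hand-written equivalent (an approximation, not the macro's literal expansion):

```rust
use std::time::Duration;
use tokio::runtime::Runtime;

// Approximation of what the attribute generates: build a runtime for the
// call and block the current (synchronous) thread on the sleep future.
fn tokio_sleep(delay: &Duration) {
    Runtime::new()
        .expect("failed to build Tokio runtime")
        .block_on(tokio::time::sleep(*delay));
}
```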
4 changes: 2 additions & 2 deletions spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "1.6.3"
version = "1.6.4"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider"
@@ -23,7 +23,7 @@ quote = "1.0.18"
failure_derive = "0.1.8"

[dependencies.spider]
version = "1.6.3"
version = "1.6.4"
path = "../spider"
default-features = false
