perf(client): fix blocking and async mixture
j-mendez committed Apr 21, 2022
1 parent 5ee0f7a commit 2d7471d
Showing 12 changed files with 92 additions and 50 deletions.
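The substance of the change: fetch_page_html was an async fn annotated with #[tokio::main], so every call from the crawler's worker threads constructed and tore down a full Tokio runtime to serve a single request. This commit replaces it with reqwest::blocking::Client, built once and cloned into each worker. A minimal sketch of the two shapes (illustrative only, not the crate's exact code; assumes tokio with the macros and rt-multi-thread features and reqwest with the blocking feature):

// Before: #[tokio::main] turns this into a sync fn that builds a fresh
// Tokio runtime on every call, a heavy setup cost paid once per page.
#[tokio::main]
async fn fetch_async(url: &str, client: &reqwest::Client) -> Result<String, reqwest::Error> {
    let body = client.get(url).send().await?.text().await?;
    Ok(body)
}

// After: a plain blocking call; the client is built once and cloned into
// each worker thread (clones share the same connection pool).
fn fetch_blocking(url: &str, client: &reqwest::blocking::Client) -> Result<String, reqwest::Error> {
    client.get(url).send()?.text()
}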
42 changes: 42 additions & 0 deletions .github/workflows/bench.yml
@@ -0,0 +1,42 @@
name: Benches
on:
  push:
    branches: [master]
  pull_request:
    branches: [master]

env:
  CARGO_TERM_COLOR: always
  RUST_LOG: "off"

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/cache@v2
        id: cache
        with:
          path: |
            ~/.cargo/bin/
            ~/.cargo/registry/index/
            ~/.cargo/registry/cache/
            ~/.cargo/git/db/
            target/
          key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
      - name: Install Benchmark Dependencies
        run: |
          # install node-crawler
          npm install -g crawler
          # install go and deps
          go mod init example.com/spider
          cat go.mod
          echo "github.com/gocolly/colly/v2 v2.1.0" >> ./go.mod
          cat go.mod
          go mod tidy
          # install the local cli latest
          cd ./spider_cli && cargo install --path . && cd ../
      - name: Run Benchmarks
        run: cargo bench
24 changes: 10 additions & 14 deletions .github/workflows/rust.yml
@@ -13,21 +13,17 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
+      - uses: actions/cache@v2
+        id: cache
+        with:
+          path: |
+            ~/.cargo/bin/
+            ~/.cargo/registry/index/
+            ~/.cargo/registry/cache/
+            ~/.cargo/git/db/
+            target/
+          key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
       - name: Build
         run: cargo build --verbose
       - name: Run tests
         run: cargo test --verbose --all-features
-      - name: Install Benchmark Dependencies
-        run: |
-          sudo apt update && sudo apt-get install build-essential
-          # install node-crawler
-          npm install crawler -g
-          # install the local cli latest
-          cd ./spider_cli && cargo install --path . && cd ../
-          # install go and deps
-          echo "module github.com/x/y" >> go.mod
-          echo "go 1.14" >> go.mod
-          echo "require github.com/gocolly/colly/v2 v2.1.0" >> go.mod
-          go mod tidy
-      - name: Run Benchmarks
-        run: cargo bench
3 changes: 2 additions & 1 deletion benches/.gitignore
@@ -4,4 +4,5 @@ node_modules
 node-crawler.js
 go-crolly.go
 go.mod
-go.sum
+go.sum
+output.txt
2 changes: 1 addition & 1 deletion benches/Cargo.toml
@@ -5,7 +5,7 @@ publish = false
edition = "2021"

[dependencies]
-spider = { version = "1.5.5", path = "../spider" }
+spider = { version = "1.6.0", path = "../spider" }
criterion = "0.3"

[[bench]]
2 changes: 1 addition & 1 deletion benches/crawl.rs
@@ -10,7 +10,7 @@ pub fn bench_speed(c: &mut Criterion) {
     let go_crawl_script = go_crolly::gen_crawl();
     let mut group = c.benchmark_group("crawl-speed");

-    group.sample_size(10).measurement_time(Duration::new(85, 0) + Duration::from_millis(500));
+    group.sample_size(10).measurement_time(Duration::new(180, 0) + Duration::from_millis(500));
     group.bench_function("Rust[spider]: with crawl 10 times", |b| b.iter(||Command::new("spider")
         .args(["--delay", "0", "--domain", "https://rsseau.fr", "crawl"])
         .output()
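The bench is truncated here. For orientation, a trimmed-down reconstruction of how such a shell-out benchmark is wired into criterion 0.3; the closing lines and the error message are assumed, not copied from the repo:

use criterion::{criterion_group, criterion_main, Criterion};
use std::process::Command;
use std::time::Duration;

pub fn bench_speed(c: &mut Criterion) {
    let mut group = c.benchmark_group("crawl-speed");
    // 10 samples over a ~180.5s window, since every iteration shells out a full crawl
    group
        .sample_size(10)
        .measurement_time(Duration::new(180, 0) + Duration::from_millis(500));
    group.bench_function("Rust[spider]: with crawl 10 times", |b| {
        b.iter(|| {
            Command::new("spider")
                .args(["--delay", "0", "--domain", "https://rsseau.fr", "crawl"])
                .output()
                .expect("spider CLI failed to run")
        })
    });
    group.finish();
}

criterion_group!(benches, bench_speed);
criterion_main!(benches);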
2 changes: 1 addition & 1 deletion benches/go_crolly.rs
@@ -3,7 +3,7 @@ use std::io::{BufWriter, Write};

pub fn crawl_stub() -> String {
r#"
package main
package spider
import (
"fmt"
4 changes: 2 additions & 2 deletions examples/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
-version = "1.5.5"
+version = "1.6.0"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider"
@@ -15,7 +15,7 @@ publish = false
maintenance = { status = "as-is" }

[dependencies.spider]
-version = "1.5.5"
+version = "1.6.0"
path = "../spider"
default-features = false

4 changes: 2 additions & 2 deletions spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
-version = "1.5.5"
+version = "1.6.0"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider"
@@ -15,7 +15,7 @@ edition = "2018"
maintenance = { status = "as-is" }

[dependencies]
-reqwest = { version = "0.11.10" }
+reqwest = { version = "0.11.10", features = ["blocking"] }
scraper = "0.12"
robotparser-fork = "0.10.5"
url = "2.2"
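Enabling the blocking feature is what makes the reqwest::blocking module used elsewhere in this commit compile at all. A quick sketch of the kind of client it unlocks, mirroring the builder configured in website.rs further down; the user-agent string here is illustrative, not the crate's:

use reqwest::blocking::Client;
use reqwest::header::{self, CONNECTION};

fn build_client() -> Client {
    let mut headers = header::HeaderMap::new();
    // reuse connections across requests instead of reconnecting per page
    headers.insert(CONNECTION, header::HeaderValue::from_static("keep-alive"));

    Client::builder()
        .default_headers(headers)
        .user_agent("spider/1.6.0") // illustrative; the crate reads this from its Configuration
        .build()
        .expect("Failed building client.")
}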
4 changes: 2 additions & 2 deletions spider/src/page.rs
@@ -91,7 +91,7 @@ fn parse_links() {
         .unwrap();

     let link_result = "https://choosealicense.com/";
-    let html = fetch_page_html(&link_result, &client).unwrap();
+    let html = fetch_page_html(&link_result, &client);
     let page: Page = Page::new(&link_result, &html);

     assert!(
@@ -111,7 +111,7 @@ fn test_abs_path() {
         .build()
         .unwrap();
     let link_result = "https://choosealicense.com/";
-    let html = fetch_page_html(&link_result, &client).unwrap();
+    let html = fetch_page_html(&link_result, &client);
     let page: Page = Page::new(&link_result, &html);

     assert_eq!(
20 changes: 15 additions & 5 deletions spider/src/utils.rs
@@ -1,8 +1,18 @@
-pub use crate::reqwest::{Client, Error};
+pub use crate::reqwest::blocking::{Client};
+use reqwest::StatusCode;

-#[tokio::main]
-pub async fn fetch_page_html(url: &str, client: &Client) -> Result<String, Error> {
-    let body = client.get(url).send().await?.text().await?;
+pub fn fetch_page_html(url: &str, client: &Client) -> String {
+    let mut body = String::new();

-    Ok(body)
+    // silence errors for top level logging
+    match client.get(url).send() {
+        Ok(res) if res.status() == StatusCode::OK => match res.text() {
+            Ok(text) => body = text,
+            Err(_) => {},
+        },
+        Ok(_) => (),
+        Err(_) => {}
+    }
+
+    body
 }
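The new fetch_page_html deliberately swallows transport, status, and decode errors and hands back an empty body, which the crawler then treats as a page with no links. The same behavior can be written with Option combinators; a sketch assuming reqwest 0.11's blocking API:

use reqwest::blocking::Client;
use reqwest::StatusCode;

// Behaviorally equivalent to the match above: any failure, or any non-200
// response, collapses into an empty String.
pub fn fetch_page_html(url: &str, client: &Client) -> String {
    client
        .get(url)
        .send()
        .ok()
        .filter(|res| res.status() == StatusCode::OK)
        .and_then(|res| res.text().ok())
        .unwrap_or_default()
}

Whether the explicit match or the combinator chain reads better is taste; the commit's match keeps the arms open for per-case logging later, which its "silence errors for top level logging" comment hints at.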
31 changes: 12 additions & 19 deletions spider/src/website.rs
@@ -39,8 +39,6 @@ pub struct Website<'a> {
     pub on_link_find_callback: fn(String) -> String,
     /// Robot.txt parser holder
     robot_file_parser: RobotFileParser<'a>,
-    // fetch client
-    client: Client,
     // ignore holding page in memory, pages will always be empty
     pub page_store_ignore: bool,
 }
@@ -50,14 +48,13 @@ impl<'a> Website<'a> {
     pub fn new(domain: &str) -> Self {
         Self {
             configuration: Configuration::new(),
-            domain: domain.to_string(),
-            links: HashSet::from([format!("{}/", domain)]),
             links_visited: HashSet::new(),
             pages: Vec::new(),
             robot_file_parser: RobotFileParser::new(&format!("{}/robots.txt", domain)), // TODO: lazy establish
+            links: HashSet::from([format!("{}/", domain)]),
             on_link_find_callback: |s| s,
-            client: Client::new(),
-            page_store_ignore: false
+            page_store_ignore: false,
+            domain: domain.to_owned(),
         }
     }

@@ -85,14 +82,13 @@ impl<'a> Website<'a> {
     }

     /// configure http client
-    pub fn configure_http_client(&mut self, user_agent: Option<String>) {
+    fn configure_http_client(&mut self, user_agent: Option<String>) -> Client {
         let mut headers = header::HeaderMap::new();
         headers.insert(CONNECTION, header::HeaderValue::from_static("keep-alive"));

-        self.client = Client::builder()
+        Client::builder()
             .default_headers(headers)
             .user_agent(user_agent.unwrap_or(self.configuration.user_agent.to_string()))
-            .pool_max_idle_per_host(0)
             .build()
             .expect("Failed building client.")
     }
@@ -108,7 +104,7 @@ impl<'a> Website<'a> {
     /// Start to crawl website
     pub fn crawl(&mut self) {
         self.configure_robots_parser();
-        self.configure_http_client(None);
+        let client = self.configure_http_client(None);
         let delay = self.get_delay();
         let on_link_find_callback = self.on_link_find_callback;
         let pool = self.create_thread_pool();
@@ -122,16 +118,15 @@
                 continue;
             }
             self.log(&format!("- fetch {}", &link));
-            self.links_visited.insert(link.to_string());
-
-            let thread_link = link.to_string();
+            self.links_visited.insert(String::from(link));

+            let link = link.clone();
             let tx = tx.clone();
-            let cx = self.client.clone();
+            let cx = client.clone();

             pool.spawn(move || {
-                let link_result = on_link_find_callback(thread_link);
-                let html = fetch_page_html(&link_result, &cx).unwrap_or_default();
+                let link_result = on_link_find_callback(link);
+                let html = fetch_page_html(&link_result, &cx);
                 let page = Page::new(&link_result, &html);
                 let links = page.links();
@@ -146,7 +141,7 @@
         rx.into_iter().for_each(|page| {
             let (page, links) = page;
             self.log(&format!("- parse {}", page.get_url()));
-
+            new_links.extend(links);

             if !self.page_store_ignore {
@@ -156,11 +151,9 @@
             if self.configuration.delay > 0 {
                 thread::sleep(delay);
             }
-
         });

         self.links = new_links;
-
     }
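With the client owned by crawl rather than the Website struct, each pooled task receives a cheap clone (reqwest clients share one connection pool across clones) and reports back over a channel. A simplified sketch of one crawl round, assuming rayon's ThreadPool (whose spawn matches the pool.spawn(move || ...) calls above) and std::sync::mpsc; the crate itself sends (Page, links) pairs rather than raw HTML:

use std::sync::mpsc;

fn crawl_round(links: Vec<String>, client: &reqwest::blocking::Client) -> Vec<(String, String)> {
    let pool = rayon::ThreadPoolBuilder::new()
        .num_threads(4)
        .build()
        .expect("failed building thread pool");
    let (tx, rx) = mpsc::channel();

    for link in links {
        let tx = tx.clone();
        let cx = client.clone(); // cheap: clones share the underlying pool
        pool.spawn(move || {
            let html = cx
                .get(&link)
                .send()
                .ok()
                .and_then(|res| res.text().ok())
                .unwrap_or_default();
            tx.send((link, html)).ok();
        });
    }
    drop(tx); // close the channel so the receive loop below terminates

    rx.into_iter().collect()
}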
}

4 changes: 2 additions & 2 deletions spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
-version = "1.5.5"
+version = "1.6.0"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider"
@@ -23,7 +23,7 @@ quote = "1.0.18"
failure_derive = "0.1.8"

[dependencies.spider]
-version = "1.5.5"
+version = "1.6.0"
path = "../spider"
default-features = false

