diff --git a/CHANGELOG.md b/CHANGELOG.md
index 203272cf9..7c22156eb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,12 @@

 ## Unreleased

+## v1.6.1
+
+- perf(links): filter dup links after async batch
+- chore(delay): fix crawl delay thread groups
+- perf(page): slim channel page sending required props
+
 ## v1.5.3

 - feat(regex): add optional regex black listing
diff --git a/benches/Cargo.toml b/benches/Cargo.toml
index 8a1e2d5c3..93555789a 100644
--- a/benches/Cargo.toml
+++ b/benches/Cargo.toml
@@ -5,7 +5,7 @@ publish = false
 edition = "2021"

 [dependencies]
-spider = { version = "1.6.0", path = "../spider" }
+spider = { version = "1.6.1", path = "../spider" }
 criterion = "0.3"

 [[bench]]
diff --git a/benches/README.md b/benches/README.md
index 6d7a06e49..391e212ef 100644
--- a/benches/README.md
+++ b/benches/README.md
@@ -10,7 +10,7 @@ We have comparisons set against 3 different languages and libs that can be used

 How fast can we crawl all pages on a medium sized website. Tests are ordered between the largest to smallest runtimes needed. All examples use the same html selector to gather the pages for a website.

-### v1.6.0
+### v1.6.1

 Case: `https://rsseau.fr`

diff --git a/examples/Cargo.toml b/examples/Cargo.toml
index 5a4bdc417..7997f26ac 100644
--- a/examples/Cargo.toml
+++ b/examples/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_examples"
-version = "1.6.0"
+version = "1.6.1"
 authors = ["madeindjs ", "j-mendez "]
 description = "Multithreaded web crawler written in Rust."
 repository = "https://github.com/madeindjs/spider"
@@ -15,7 +15,7 @@ publish = false
 maintenance = { status = "as-is" }

 [dependencies.spider]
-version = "1.6.0"
+version = "1.6.1"
 path = "../spider"
 default-features = false

diff --git a/spider/Cargo.toml b/spider/Cargo.toml
index 78d6469ae..3448bded4 100644
--- a/spider/Cargo.toml
+++ b/spider/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider"
-version = "1.6.0"
+version = "1.6.1"
 authors = ["madeindjs ", "j-mendez "]
 description = "Multithreaded web crawler written in Rust."
 repository = "https://github.com/madeindjs/spider"
diff --git a/spider/src/website.rs b/spider/src/website.rs
index eed25d515..fd4cbade3 100644
--- a/spider/src/website.rs
+++ b/spider/src/website.rs
@@ -11,6 +11,7 @@ use std::collections::HashSet;
 use std::{sync, thread, time::Duration};
 use reqwest::header::CONNECTION;
 use reqwest::header;
+use sync::mpsc::{channel, Sender, Receiver};

 /// Represent a website to scrawl. To start crawling, instanciate a new `struct` using
 ///
@@ -43,6 +44,8 @@ pub struct Website<'a> {
     pub page_store_ignore: bool,
 }
 
+type Message = (Page, Vec<String>);
+
 impl<'a> Website<'a> {
     /// Initialize Website object with a start link to scrawl.
     pub fn new(domain: &str) -> Self {
@@ -110,7 +113,7 @@ impl<'a> Website<'a> {
         
         // crawl while links exists
         while !self.links.is_empty() {
-            let (tx, rx) = sync::mpsc::channel();
+            let (tx, rx): (Sender<Message>, Receiver<Message>) = channel();
 
             for link in self.links.iter() {
                 if !self.is_allowed(link) {
@@ -152,7 +155,7 @@ impl<'a> Website<'a> {
 
             });
 
-            self.links = new_links;
+            self.links = new_links.difference(&self.links_visited).cloned().collect();
         }
     }
 
diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml
index ac547ace7..729536daa 100644
--- a/spider_cli/Cargo.toml
+++ b/spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_cli"
-version = "1.6.0"
+version = "1.6.1"
 authors = ["madeindjs ", "j-mendez "]
 description = "Multithreaded web crawler written in Rust."
 repository = "https://github.com/madeindjs/spider"
@@ -23,7 +23,7 @@ quote = "1.0.18"
 failure_derive = "0.1.8"
 
 [dependencies.spider]
-version = "1.6.0"
+version = "1.6.1"
 path = "../spider"
 default-features = false
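
For context on the `spider/src/website.rs` hunks above, here is a minimal, self-contained sketch of the batch crawl-loop pattern they touch: worker threads send `(Page, Vec<String>)` messages over a typed mpsc channel, and the next batch of links is the set difference of the newly found links against `links_visited`. The `Page` struct, the fake link extraction, and the bookkeeping below are simplified stand-ins for illustration only, not the crate's actual implementation.

```rust
// Minimal sketch (not the crate's real code): a batch crawl loop where worker
// threads send (page, extracted links) over a typed mpsc channel, and the next
// batch is the set difference of the newly found links against links_visited.
use std::collections::HashSet;
use std::sync::mpsc::{channel, Receiver, Sender};
use std::thread;

/// Simplified stand-in for the crawler's page type.
struct Page {
    url: String,
}

type Message = (Page, Vec<String>);

fn main() {
    let mut links: HashSet<String> = HashSet::from(["https://example.com/".to_string()]);
    let mut links_visited: HashSet<String> = HashSet::new();

    while !links.is_empty() {
        let (tx, rx): (Sender<Message>, Receiver<Message>) = channel();
        let batch_size = links.len();

        for link in links.iter() {
            let tx = tx.clone();
            let target = link.clone();
            // A real crawler would fetch `target` and parse its links here.
            thread::spawn(move || {
                let found = vec![format!("{}child", target)]; // pretend-extracted links
                tx.send((Page { url: target }, found)).unwrap();
            });
            links_visited.insert(link.clone());
        }
        drop(tx);

        let mut new_links: HashSet<String> = HashSet::new();
        for (page, found) in rx.iter().take(batch_size) {
            println!("crawled {}", page.url);
            new_links.extend(found);
        }

        // The v1.6.1-style step: keep only links we have not visited yet,
        // so the next batch never re-queues pages from earlier passes.
        links = new_links.difference(&links_visited).cloned().collect();

        if links_visited.len() >= 4 {
            break; // keep the demo bounded
        }
    }

    println!("visited {} pages", links_visited.len());
}
```

Filtering with `difference` after each async batch, rather than assigning the raw set of discovered links, keeps already-crawled URLs out of subsequent passes, which is what the "filter dup links after async batch" changelog entry refers to.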