feat(crawl): add subdomain crawling with tld ignore
j-mendez committed Jun 24, 2022
1 parent 7b0524f commit 8e79647
Showing 11 changed files with 98 additions and 35 deletions.
4 changes: 2 additions & 2 deletions examples/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "1.8.3"
version = "1.9.0"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider"
@@ -15,7 +15,7 @@ publish = false
maintenance = { status = "as-is" }

[dependencies.spider]
version = "1.8.3"
version = "1.9.0"
path = "../spider"
default-features = false

1 change: 1 addition & 0 deletions examples/example.rs
@@ -6,6 +6,7 @@ fn main() {
let mut website: Website = Website::new("https://rsseau.fr");
website.configuration.blacklist_url.push("https://rsseau.fr/resume".to_string());
website.configuration.respect_robots_txt = true;
website.configuration.subdomains = false;
website.configuration.delay = 15; // Defaults to 250 ms
website.configuration.concurrency = 10; // Defaults to number of cpus available
website.configuration.user_agent = "SpiderBot".into(); // Defaults to spider/x.y.z, where x.y.z is the library version
2 changes: 1 addition & 1 deletion spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "1.8.3"
version = "1.9.0"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider"
5 changes: 3 additions & 2 deletions spider/README.md
@@ -16,7 +16,7 @@ This is a basic blocking example crawling a web page, add spider to your `Cargo.

```toml
[dependencies]
spider = "1.8.3"
spider = "1.9.0"
```

And then the code:
@@ -43,6 +43,7 @@ You can use `Configuration` object to configure your crawler:
let mut website: Website = Website::new("https://choosealicense.com");
website.configuration.blacklist_url.push("https://choosealicense.com/licenses/".to_string());
website.configuration.respect_robots_txt = true;
website.configuration.subdomains = true;
website.configuration.delay = 2000; // Defaults to 250 ms
website.configuration.concurrency = 10; // Defaults to number of cpus available * 4
website.configuration.user_agent = "myapp/version".to_string(); // Defaults to spider/x.y.z, where x.y.z is the library version
@@ -57,7 +58,7 @@ There is an optional "regex" crate that can be enabled:

```toml
[dependencies]
spider = { version = "1.8.3", features = ["regex"] }
spider = { version = "1.9.0", features = ["regex"] }
```

```rust,no_run
3 changes: 3 additions & 0 deletions spider/src/configuration.rs
@@ -7,12 +7,15 @@ use std::env;
/// let mut website: Website = Website::new("https://choosealicense.com");
/// website.configuration.blacklist_url.push("https://choosealicense.com/licenses/".to_string());
/// website.configuration.respect_robots_txt = true;
/// website.configuration.subdomains = true;
/// website.crawl();
/// ```
#[derive(Debug, Default)]
pub struct Configuration {
/// Respect robots.txt file and not scrape not allowed files.
pub respect_robots_txt: bool,
/// Allow sub-domains and tld crawling.
pub subdomains: bool,
/// List of pages to not crawl. [optional: regex pattern matching]
pub blacklist_url: Vec<String>,
/// User-Agent
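For reference, a minimal usage sketch assembled from the doc comment above: the new `subdomains` field defaults to `false` through `#[derive(Default)]`, so opting in is a single assignment before `crawl()`.

```rust
use spider::website::Website;

fn main() {
    let mut website: Website = Website::new("https://choosealicense.com");
    website.configuration.respect_robots_txt = true;
    website.configuration.subdomains = true; // new in this release; off by default

    website.crawl();

    // `Website` is filled with `Pages` once crawled.
    for _page in website.get_pages() {
        // do something
    }
}
```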
74 changes: 56 additions & 18 deletions spider/src/page.rs
@@ -1,5 +1,5 @@
use scraper::{Html, Selector};
use url::Url;
use url::{Url};
use crate::utils::{fetch_page_html};
use reqwest::blocking::{Client};
use hashbrown::HashSet;
@@ -11,7 +11,7 @@ pub struct Page {
url: String,
/// HTML parsed with [scraper](https://crates.io/crates/scraper) lib. The html is not stored and only used to parse links.
html: String,
/// Base absolute url for domain.
/// Base absolute url for page.
base: Url
}

@@ -38,7 +38,7 @@ impl Page {
/// Instantiate a new page and start to scrape it.
pub fn new(url: &str, client: &Client) -> Self {
let html = fetch_page_html(&url, &client); // TODO: remove heavy cpu / network from new

Page::build(url, &html)
}

@@ -71,14 +71,38 @@ impl Page {
self.html.clear();
}

/// get the host name for url without tld
fn domain_name(&self, domain: &Url) -> String {
let b = domain.host_str().unwrap_or("").to_string();
let mut b = b.split(".").collect::<Vec<&str>>();
if b.len() >= 2 {
b.pop(); // remove the tld
}
let b = b[b.len() - 1];

b.to_string()
}

/// html selector for valid web pages for domain.
pub fn get_page_selectors(&self, domain: &str) -> Selector {
pub fn get_page_selectors(&self, url: &str, subdomains: bool) -> Selector {
// select all absolute links
let absolute_selector = &format!(
r#"a[href^="{}"]{}"#,
domain,
*MEDIA_IGNORE_SELECTOR,
);
let absolute_selector = &if subdomains {
let dname = self.domain_name(&self.base);
let scheme = self.base.scheme();

format!(
r#"a[href^="{url}"]{}, a[href^="https://{dname}"]{}, a[href^="http://{dname}"]{}, a[href^="{scheme}"][href*=".{dname}."]{}"#,
*MEDIA_IGNORE_SELECTOR,
*MEDIA_IGNORE_SELECTOR,
*MEDIA_IGNORE_SELECTOR,
*MEDIA_IGNORE_SELECTOR,
)
} else {
format!(
r#"a[href^="{url}"]{}"#,
*MEDIA_IGNORE_SELECTOR,
)
};
// allow relative and absolute .html files
let static_html_selector = &format!(
r#"{} {}, {} {}"#,
@@ -98,13 +122,27 @@ impl Page {
}

/// Find all href links and return them using CSS selectors.
pub fn links(&self) -> HashSet<String> {
let selector = self.get_page_selectors(&self.url);
pub fn links(&self, subdomains: bool) -> HashSet<String> {
let selector = self.get_page_selectors(&self.url, subdomains);
let html = self.parse_html();

html.select(&selector)
.map(|a| self.abs_path(a.value().attr("href").unwrap_or_default()).to_string())
.collect()
let anchors = html.select(&selector);

if subdomains {
let base_domain = self.domain_name(&self.base);

anchors.filter_map(|a| {
let abs = self.abs_path(a.value().attr("href").unwrap_or_default()).to_string();
let url_domain = self.domain_name(&Url::parse(&abs).unwrap());

if base_domain == url_domain {
Some(abs)
} else {
None
}
}).collect()
} else {
anchors.map(|a| self.abs_path(a.value().attr("href").unwrap_or("")).to_string()).collect()
}
}

/// Convert a URL to its absolute path without any fragments or params.
@@ -124,8 +162,8 @@ fn parse_links() {
.unwrap();

let link_result = "https://choosealicense.com/";
let page: Page = Page::new(&link_result, &client);
let links = page.links();
let page: Page = Page::new(&link_result, &client, &link_result);
let links = page.links(false);

assert!(
links
@@ -143,7 +181,7 @@ fn test_abs_path() {
.build()
.unwrap();
let link_result = "https://choosealicense.com/";
let page: Page = Page::new(&link_result, &client);
let page: Page = Page::new(&link_result, &client, &link_result);

assert_eq!(
page.abs_path("/page"),
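The new `domain_name` helper is the piece that implements the "tld ignore" part of this commit: it reduces a host to the label just before its TLD, so sub-domains and alternate TLDs of the start URL compare equal while unrelated hosts do not. Below is a standalone sketch of that logic as a hypothetical free function (illustration only; in the crate it is a private method on `Page` operating on `self.base`, and the `docs.` sub-domain host here is made up).

```rust
use url::Url;

/// Hypothetical free-function copy of `Page::domain_name`:
/// drop the trailing TLD label and keep the label right before it.
fn domain_name(domain: &Url) -> String {
    let host = domain.host_str().unwrap_or("").to_string();
    let mut parts = host.split('.').collect::<Vec<&str>>();
    if parts.len() >= 2 {
        parts.pop(); // remove the tld
    }
    parts[parts.len() - 1].to_string()
}

fn main() {
    let base = Url::parse("https://choosealicense.com").unwrap();
    let sub = Url::parse("https://docs.choosealicense.com/page").unwrap();
    let other_tld = Url::parse("https://choosealicense.org").unwrap();
    let external = Url::parse("https://example.com").unwrap();

    assert_eq!(domain_name(&base), "choosealicense");
    assert_eq!(domain_name(&sub), "choosealicense");        // sub-domain matches
    assert_eq!(domain_name(&other_tld), "choosealicense");  // alternate tld matches
    assert_ne!(domain_name(&external), domain_name(&base)); // unrelated host is filtered
}
```

This is also why `get_page_selectors` builds the extra `a[href^=...]` selectors when `subdomains` is enabled, and why `links` re-checks each absolute URL's domain name against the base after resolution.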
32 changes: 24 additions & 8 deletions spider/src/website.rs
@@ -16,10 +16,10 @@ use tokio::time::sleep;
/// Represents a website to crawl and gather all links.
/// ```rust
/// use spider::website::Website;
/// let mut localhost = Website::new("http://example.com");
/// localhost.crawl();
/// let mut website = Website::new("http://example.com");
/// website.crawl();
/// // `Website` will be filled with `Pages` when crawled. To get them, just use
/// for page in localhost.get_pages() {
/// for page in website.get_pages() {
/// // do something
/// }
/// ```
@@ -142,9 +142,10 @@ impl<'a> Website<'a> {
fn crawl_concurrent(&mut self, client: &Client) {
let pool = self.create_thread_pool();
let delay = self.configuration.delay;
let subdomains = self.configuration.subdomains;
let delay_enabled = delay > 0;
let on_link_find_callback = self.on_link_find_callback;

// crawl while links exists
while !self.links.is_empty() {
let (tx, rx): (Sender<Message>, Receiver<Message>) = channel();
@@ -167,7 +168,7 @@ impl<'a> Website<'a> {
}
let link_result = on_link_find_callback(link);
let page = Page::new(&link_result, &cx);
let links = page.links();
let links = page.links(subdomains);

tx.send(links).unwrap();
});
@@ -188,9 +189,10 @@ impl<'a> Website<'a> {
/// Start to crawl website sequential
fn crawl_sequential(&mut self, client: &Client) {
let delay = self.configuration.delay;
let subdomains = self.configuration.subdomains;
let delay_enabled = delay > 0;
let on_link_find_callback = self.on_link_find_callback;

// crawl while links exists
while !self.links.is_empty() {
let mut new_links: HashSet<String> = HashSet::new();
@@ -209,7 +211,7 @@ impl<'a> Website<'a> {
let cx = client.clone();
let link_result = on_link_find_callback(link);
let page = Page::new(&link_result, &cx);
let links = page.links();
let links = page.links(subdomains);

new_links.extend(links);
}
@@ -257,7 +259,7 @@ impl<'a> Website<'a> {
let mut new_links: HashSet<String> = HashSet::new();

rx.into_iter().for_each(|page| {
let links = page.links();
let links = page.links(self.configuration.subdomains);
new_links.extend(links);
self.pages.push(page);
});
@@ -432,6 +434,20 @@ fn test_respect_robots_txt() {
assert_eq!(website_third.configuration.delay, 10000); // should equal 10 seconds in ms
}

#[test]
fn test_crawl_subdomains() {
let mut website: Website = Website::new("https://choosealicense.com");
website.configuration.subdomains = true;
website.crawl();
assert!(
website
.links_visited
.contains(&"https://choosealicense.com/licenses/".to_string()),
"{:?}",
website.links_visited
);
}

#[test]
fn test_link_duplicates() {
fn has_unique_elements<T>(iter: T) -> bool
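A side note on how the flag is threaded through `crawl_concurrent`: `subdomains` is a plain `Copy` bool, so it is read out of `self.configuration` once before the loop and captured by value in each spawned job, avoiding any borrow of `self` inside the closures. A simplified sketch of that dispatch pattern (std threads and made-up link sets stand in for the crate's thread pool and real pages):

```rust
use std::collections::HashSet;
use std::sync::mpsc::channel;
use std::thread;

fn main() {
    // Read plain-`Copy` settings once, before spawning, as the crate does.
    let subdomains = true;
    let links: HashSet<String> = HashSet::from(["https://example.com/".to_string()]);

    let (tx, rx) = channel();
    for link in links {
        let tx = tx.clone();
        // `subdomains` is copied into the closure; no borrow of the config.
        thread::spawn(move || {
            // Stand-in for `Page::new(&link, &client).links(subdomains)`.
            let mut found: HashSet<String> = HashSet::from([format!("{link}about")]);
            if subdomains {
                found.insert("https://docs.example.com/".to_string());
            }
            tx.send(found).unwrap();
        });
    }
    drop(tx); // close the channel so the receiver loop below terminates

    let new_links: HashSet<String> = rx.into_iter().flatten().collect();
    println!("{new_links:?}");
}
```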
4 changes: 2 additions & 2 deletions spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "1.8.3"
version = "1.9.0"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider"
@@ -25,7 +25,7 @@ quote = "1.0.18"
failure_derive = "0.1.8"

[dependencies.spider]
version = "1.8.3"
version = "1.9.0"
path = "../spider"
default-features = false

2 changes: 1 addition & 1 deletion spider_cli/README.md
@@ -34,7 +34,7 @@ spider --domain https://choosealicense.com crawl -o > spider_choosealicense.json
```

```sh
spider_cli 1.8.3
spider_cli 1.9.0
madeindjs <contact@rousseau-alexandre.fr>, j-mendez <jeff@a11ywatch.com>
Multithreaded web crawler written in Rust.

3 changes: 2 additions & 1 deletion spider_cli/src/main.rs
@@ -31,6 +31,7 @@ fn main() {
website.configuration.respect_robots_txt = cli.respect_robots_txt;
website.configuration.delay = delay;
website.configuration.concurrency = concurrency;
website.configuration.subdomains = cli.subdomains;

if !blacklist_url.is_empty() {
let blacklist_url: Vec<String> = blacklist_url.split(",").map(|l| l.to_string()).collect();
@@ -67,7 +68,7 @@ fn main() {
let mut html: &String = &String::new();

if *output_links {
let page_links = page.links();
let page_links = page.links(cli.subdomains);
links.extend(page_links);
}

3 changes: 3 additions & 0 deletions spider_cli/src/options/args.rs
@@ -14,6 +14,9 @@ pub struct Cli {
/// Respect robots.txt file
#[clap(short, long)]
pub respect_robots_txt: bool,
/// Allow sub-domain crawling
#[clap(short, long)]
pub subdomains: bool,
/// Print page visited on standard output
#[clap(short, long)]
pub verbose: bool,