diff --git a/examples/Cargo.toml b/examples/Cargo.toml
index 8b10583b8..364bd5b20 100644
--- a/examples/Cargo.toml
+++ b/examples/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_examples"
-version = "1.8.3"
+version = "1.9.0"
 authors = ["madeindjs ", "j-mendez "]
 description = "Multithreaded web crawler written in Rust."
 repository = "https://github.com/madeindjs/spider"
@@ -15,7 +15,7 @@
 publish = false
 
 maintenance = { status = "as-is" }
 [dependencies.spider]
-version = "1.8.3"
+version = "1.9.0"
 path = "../spider"
 default-features = false
diff --git a/examples/example.rs b/examples/example.rs
index 7f01411d0..7245e2961 100644
--- a/examples/example.rs
+++ b/examples/example.rs
@@ -6,6 +6,7 @@ fn main() {
     let mut website: Website = Website::new("https://rsseau.fr");
     website.configuration.blacklist_url.push("https://rsseau.fr/resume".to_string());
     website.configuration.respect_robots_txt = true;
+    website.configuration.subdomains = false;
     website.configuration.delay = 15; // Defaults to 250 ms
     website.configuration.concurrency = 10; // Defaults to number of cpus available
     website.configuration.user_agent = "SpiderBot".into(); // Defaults to spider/x.y.z, where x.y.z is the library version
diff --git a/spider/Cargo.toml b/spider/Cargo.toml
index 6130902d7..1e770620d 100644
--- a/spider/Cargo.toml
+++ b/spider/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider"
-version = "1.8.3"
+version = "1.9.0"
 authors = ["madeindjs ", "j-mendez "]
 description = "Multithreaded web crawler written in Rust."
 repository = "https://github.com/madeindjs/spider"
diff --git a/spider/README.md b/spider/README.md
index 5ad47c8ab..30f67a7e4 100644
--- a/spider/README.md
+++ b/spider/README.md
@@ -16,7 +16,7 @@ This is a basic blocking example crawling a web page, add spider to your `Cargo
 
 ```toml
 [dependencies]
-spider = "1.8.3"
+spider = "1.9.0"
 ```
 
 And then the code:
@@ -43,6 +43,7 @@ You can use `Configuration` object to configure your crawler:
 let mut website: Website = Website::new("https://choosealicense.com");
 website.configuration.blacklist_url.push("https://choosealicense.com/licenses/".to_string());
 website.configuration.respect_robots_txt = true;
+website.configuration.subdomains = true;
 website.configuration.delay = 2000; // Defaults to 250 ms
 website.configuration.concurrency = 10; // Defaults to number of cpus available * 4
 website.configuration.user_agent = "myapp/version".to_string(); // Defaults to spider/x.y.z, where x.y.z is the library version
@@ -57,7 +58,7 @@ There is an optional "regex" crate that can be enabled:
 
 ```toml
 [dependencies]
-spider = { version = "1.8.3", features = ["regex"] }
+spider = { version = "1.9.0", features = ["regex"] }
 ```
 
 ```rust,no_run
diff --git a/spider/src/configuration.rs b/spider/src/configuration.rs
index 73cba3859..549e51ce5 100644
--- a/spider/src/configuration.rs
+++ b/spider/src/configuration.rs
@@ -7,12 +7,15 @@ use std::env;
 /// let mut website: Website = Website::new("https://choosealicense.com");
 /// website.configuration.blacklist_url.push("https://choosealicense.com/licenses/".to_string());
 /// website.configuration.respect_robots_txt = true;
+/// website.configuration.subdomains = true;
 /// website.crawl();
 /// ```
 #[derive(Debug, Default)]
 pub struct Configuration {
     /// Respect robots.txt file and not scrape not allowed files.
     pub respect_robots_txt: bool,
+    /// Allow sub-domains and tld crawling.
+    pub subdomains: bool,
     /// List of pages to not crawl. [optional: regex pattern matching]
     pub blacklist_url: Vec<String>,
     /// User-Agent
diff --git a/spider/src/page.rs b/spider/src/page.rs
index 12b1c720b..f9220c64c 100644
--- a/spider/src/page.rs
+++ b/spider/src/page.rs
@@ -1,5 +1,5 @@
 use scraper::{Html, Selector};
-use url::Url;
+use url::{Url};
 use crate::utils::{fetch_page_html};
 use reqwest::blocking::{Client};
 use hashbrown::HashSet;
@@ -11,7 +11,7 @@ pub struct Page {
     url: String,
     /// HTML parsed with [scraper](https://crates.io/crates/scraper) lib. The html is not stored and only used to parse links.
     html: String,
-    /// Base absolute url for domain.
+    /// Base absolute url for page.
     base: Url
 }
 
@@ -38,7 +38,7 @@ impl Page {
     /// Instantiate a new page and start to scrape it.
     pub fn new(url: &str, client: &Client) -> Self {
         let html = fetch_page_html(&url, &client); // TODO: remove heavy cpu / network from new
-        
+
         Page::build(url, &html)
     }
 
@@ -71,14 +71,38 @@ impl Page {
         self.html.clear();
     }
 
+    /// get the host name for url without tld
+    fn domain_name(&self, domain: &Url) -> String {
+        let b = domain.host_str().unwrap_or("").to_string();
+        let mut b = b.split(".").collect::<Vec<&str>>();
+        if b.len() >= 2 {
+            b.pop(); // remove the tld
+        }
+        let b = b[b.len() - 1];
+
+        b.to_string()
+    }
+
     /// html selector for valid web pages for domain.
-    pub fn get_page_selectors(&self, domain: &str) -> Selector {
+    pub fn get_page_selectors(&self, url: &str, subdomains: bool) -> Selector {
         // select all absolute links
-        let absolute_selector = &format!(
-            r#"a[href^="{}"]{}"#,
-            domain,
-            *MEDIA_IGNORE_SELECTOR,
-        );
+        let absolute_selector = &if subdomains {
+            let dname = self.domain_name(&self.base);
+            let scheme = self.base.scheme();
+
+            format!(
+                r#"a[href^="{url}"]{}, a[href^="https://{dname}"]{}, a[href^="http://{dname}"]{}, a[href^="{scheme}"][href*=".{dname}."]{}"#,
+                *MEDIA_IGNORE_SELECTOR,
+                *MEDIA_IGNORE_SELECTOR,
+                *MEDIA_IGNORE_SELECTOR,
+                *MEDIA_IGNORE_SELECTOR,
+            )
+        } else {
+            format!(
+                r#"a[href^="{url}"]{}"#,
+                *MEDIA_IGNORE_SELECTOR,
+            )
+        };
         // allow relative and absolute .html files
         let static_html_selector = &format!(
             r#"{} {}, {} {}"#,
@@ -98,13 +122,27 @@
     }
 
     /// Find all href links and return them using CSS selectors.
-    pub fn links(&self) -> HashSet<String> {
-        let selector = self.get_page_selectors(&self.url);
+    pub fn links(&self, subdomains: bool) -> HashSet<String> {
+        let selector = self.get_page_selectors(&self.url, subdomains);
         let html = self.parse_html();
-
-        html.select(&selector)
-            .map(|a| self.abs_path(a.value().attr("href").unwrap_or_default()).to_string())
-            .collect()
+        let anchors = html.select(&selector);
+
+        if subdomains {
+            let base_domain = self.domain_name(&self.base);
+
+            anchors.filter_map(|a| {
+                let abs = self.abs_path(a.value().attr("href").unwrap_or_default()).to_string();
+                let url_domain = self.domain_name(&Url::parse(&abs).unwrap());
+
+                if base_domain == url_domain {
+                    Some(abs)
+                } else {
+                    None
+                }
+            }).collect()
+        } else {
+            anchors.map(|a| self.abs_path(a.value().attr("href").unwrap_or("")).to_string()).collect()
+        }
     }
 
     /// Convert a URL to its absolute path without any fragments or params.
@@ -124,8 +162,8 @@ fn parse_links() {
         .unwrap();
 
     let link_result = "https://choosealicense.com/";
-    let page: Page = Page::new(&link_result, &client);
-    let links = page.links();
+    let page: Page = Page::new(&link_result, &client, &link_result);
+    let links = page.links(false);
 
     assert!(
         links
@@ -143,7 +181,7 @@ fn test_abs_path() {
         .build()
         .unwrap();
     let link_result = "https://choosealicense.com/";
-    let page: Page = Page::new(&link_result, &client);
+    let page: Page = Page::new(&link_result, &client, &link_result);
 
     assert_eq!(
         page.abs_path("/page"),
diff --git a/spider/src/website.rs b/spider/src/website.rs
index c644909ba..d2ac1b582 100644
--- a/spider/src/website.rs
+++ b/spider/src/website.rs
@@ -16,10 +16,10 @@ use tokio::time::sleep;
 /// Represents a website to crawl and gather all links.
 /// ```rust
 /// use spider::website::Website;
-/// let mut localhost = Website::new("http://example.com");
-/// localhost.crawl();
+/// let mut website = Website::new("http://example.com");
+/// website.crawl();
 /// // `Website` will be filled with `Pages` when crawled. To get them, just use
-/// for page in localhost.get_pages() {
+/// for page in website.get_pages() {
 ///     // do something
 /// }
 /// ```
@@ -142,9 +142,10 @@ impl<'a> Website<'a> {
     fn crawl_concurrent(&mut self, client: &Client) {
         let pool = self.create_thread_pool();
         let delay = self.configuration.delay;
+        let subdomains = self.configuration.subdomains;
         let delay_enabled = delay > 0;
        let on_link_find_callback = self.on_link_find_callback;
-        
+
         // crawl while links exists
         while !self.links.is_empty() {
             let (tx, rx): (Sender<HashSet<String>>, Receiver<HashSet<String>>) = channel();
@@ -167,7 +168,7 @@
                 }
                 let link_result = on_link_find_callback(link);
                 let page = Page::new(&link_result, &cx);
-                let links = page.links();
+                let links = page.links(subdomains);
 
                 tx.send(links).unwrap();
             });
@@ -188,9 +189,10 @@
     /// Start to crawl website sequential
     fn crawl_sequential(&mut self, client: &Client) {
         let delay = self.configuration.delay;
+        let subdomains = self.configuration.subdomains;
         let delay_enabled = delay > 0;
         let on_link_find_callback = self.on_link_find_callback;
-        
+
         // crawl while links exists
         while !self.links.is_empty() {
             let mut new_links: HashSet<String> = HashSet::new();
@@ -209,7 +211,7 @@
                 let cx = client.clone();
                 let link_result = on_link_find_callback(link);
                 let page = Page::new(&link_result, &cx);
-                let links = page.links();
+                let links = page.links(subdomains);
 
                 new_links.extend(links);
             }
@@ -257,7 +259,7 @@
         let mut new_links: HashSet<String> = HashSet::new();
 
         rx.into_iter().for_each(|page| {
-            let links = page.links();
+            let links = page.links(self.configuration.subdomains);
             new_links.extend(links);
             self.pages.push(page);
         });
@@ -432,6 +434,20 @@ fn test_respect_robots_txt() {
     assert_eq!(website_third.configuration.delay, 10000); // should equal 10 seconds in ms
 }
 
+#[test]
+fn test_crawl_subdomains() {
+    let mut website: Website = Website::new("https://choosealicense.com");
+    website.configuration.subdomains = true;
+    website.crawl();
+    assert!(
+        website
+            .links_visited
+            .contains(&"https://choosealicense.com/licenses/".to_string()),
+        "{:?}",
+        website.links_visited
+    );
+}
+
 #[test]
 fn test_link_duplicates() {
     fn has_unique_elements<T>(iter: T) -> bool
diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml
index f34fad474..1a1b59906 100644
--- a/spider_cli/Cargo.toml
+++ b/spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "spider_cli"
-version = "1.8.3"
+version = "1.9.0"
authors = ["madeindjs ", "j-mendez "] description = "Multithreaded web crawler written in Rust." repository = "https://github.com/madeindjs/spider" @@ -25,7 +25,7 @@ quote = "1.0.18" failure_derive = "0.1.8" [dependencies.spider] -version = "1.8.3" +version = "1.9.0" path = "../spider" default-features = false diff --git a/spider_cli/README.md b/spider_cli/README.md index 1c2c8628a..ec13d9a63 100644 --- a/spider_cli/README.md +++ b/spider_cli/README.md @@ -34,7 +34,7 @@ spider --domain https://choosealicense.com crawl -o > spider_choosealicense.json ``` ```sh -spider_cli 1.8.3 +spider_cli 1.9.0 madeindjs , j-mendez Multithreaded web crawler written in Rust. diff --git a/spider_cli/src/main.rs b/spider_cli/src/main.rs index a3d5067a0..9a192d214 100644 --- a/spider_cli/src/main.rs +++ b/spider_cli/src/main.rs @@ -31,6 +31,7 @@ fn main() { website.configuration.respect_robots_txt = cli.respect_robots_txt; website.configuration.delay = delay; website.configuration.concurrency = concurrency; + website.configuration.subdomains = cli.subdomains; if !blacklist_url.is_empty() { let blacklist_url: Vec = blacklist_url.split(",").map(|l| l.to_string()).collect(); @@ -67,7 +68,7 @@ fn main() { let mut html: &String = &String::new(); if *output_links { - let page_links = page.links(); + let page_links = page.links(cli.subdomains); links.extend(page_links); } diff --git a/spider_cli/src/options/args.rs b/spider_cli/src/options/args.rs index 5d255d449..4787ee52d 100644 --- a/spider_cli/src/options/args.rs +++ b/spider_cli/src/options/args.rs @@ -14,6 +14,9 @@ pub struct Cli { /// Respect robots.txt file #[clap(short, long)] pub respect_robots_txt: bool, + /// Allow sub-domain crawling + #[clap(short, long)] + pub subdomains: bool, /// Print page visited on standard output #[clap(short, long)] pub verbose: bool,