feat(crawl): add subdomain crawling with tld ignore
j-mendez committed Jun 24, 2022
1 parent 7b0524f commit 8e79647
Showing 11 changed files with 98 additions and 35 deletions.
4 changes: 2 additions & 2 deletions examples/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "1.8.3"
version = "1.9.0"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider"
@@ -15,7 +15,7 @@ publish = false
maintenance = { status = "as-is" }

[dependencies.spider]
version = "1.8.3"
version = "1.9.0"
path = "../spider"
default-features = false

1 change: 1 addition & 0 deletions examples/example.rs
@@ -6,6 +6,7 @@ fn main() {
let mut website: Website = Website::new("https://rsseau.fr");
website.configuration.blacklist_url.push("https://rsseau.fr/resume".to_string());
website.configuration.respect_robots_txt = true;
website.configuration.subdomains = false;
website.configuration.delay = 15; // Defaults to 250 ms
website.configuration.concurrency = 10; // Defaults to number of cpus available
website.configuration.user_agent = "SpiderBot".into(); // Defaults to spider/x.y.z, where x.y.z is the library version
2 changes: 1 addition & 1 deletion spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "1.8.3"
version = "1.9.0"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider"
5 changes: 3 additions & 2 deletions spider/README.md
@@ -16,7 +16,7 @@ This is a basic blocking example crawling a web page, add spider to your `Cargo.

```toml
[dependencies]
spider = "1.8.3"
spider = "1.9.0"
```

And then the code:
@@ -43,6 +43,7 @@ You can use `Configuration` object to configure your crawler:
let mut website: Website = Website::new("https://choosealicense.com");
website.configuration.blacklist_url.push("https://choosealicense.com/licenses/".to_string());
website.configuration.respect_robots_txt = true;
website.configuration.subdomains = true;
website.configuration.delay = 2000; // Defaults to 250 ms
website.configuration.concurrency = 10; // Defaults to number of cpus available * 4
website.configuration.user_agent = "myapp/version".to_string(); // Defaults to spider/x.y.z, where x.y.z is the library version
@@ -57,7 +58,7 @@ There is an optional "regex" crate that can be enabled:

```toml
[dependencies]
spider = { version = "1.8.3", features = ["regex"] }
spider = { version = "1.9.0", features = ["regex"] }
```

```rust,no_run
3 changes: 3 additions & 0 deletions spider/src/configuration.rs
@@ -7,12 +7,15 @@ use std::env;
/// let mut website: Website = Website::new("https://choosealicense.com");
/// website.configuration.blacklist_url.push("https://choosealicense.com/licenses/".to_string());
/// website.configuration.respect_robots_txt = true;
/// website.configuration.subdomains = true;
/// website.crawl();
/// ```
#[derive(Debug, Default)]
pub struct Configuration {
/// Respect robots.txt file and not scrape not allowed files.
pub respect_robots_txt: bool,
/// Allow sub-domains and tld crawling.
pub subdomains: bool,
/// List of pages to not crawl. [optional: regex pattern matching]
pub blacklist_url: Vec<String>,
/// User-Agent
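For reference, a minimal usage sketch assembled from the doc comment above: the new `subdomains` field defaults to `false` through `#[derive(Default)]`, so opting in is a single assignment before `crawl()`.

```rust
use spider::website::Website;

fn main() {
    let mut website: Website = Website::new("https://choosealicense.com");
    website.configuration.respect_robots_txt = true;
    website.configuration.subdomains = true; // new in this release; off by default

    website.crawl();

    // `Website` is filled with `Pages` once crawled.
    for _page in website.get_pages() {
        // do something
    }
}
```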
74 changes: 56 additions & 18 deletions spider/src/page.rs
@@ -1,5 +1,5 @@
use scraper::{Html, Selector};
use url::Url;
use url::{Url};
use crate::utils::{fetch_page_html};
use reqwest::blocking::{Client};
use hashbrown::HashSet;
@@ -11,7 +11,7 @@ pub struct Page {
url: String,
/// HTML parsed with [scraper](https://crates.io/crates/scraper) lib. The html is not stored and only used to parse links.
html: String,
/// Base absolute url for domain.
/// Base absolute url for page.
base: Url
}

@@ -38,7 +38,7 @@ impl Page {
/// Instantiate a new page and start to scrape it.
pub fn new(url: &str, client: &Client) -> Self {
let html = fetch_page_html(&url, &client); // TODO: remove heavy cpu / network from new

Page::build(url, &html)
}

@@ -71,14 +71,38 @@ impl Page {
self.html.clear();
}

/// get the host name for url without tld
fn domain_name(&self, domain: &Url) -> String {
let b = domain.host_str().unwrap_or("").to_string();
let mut b = b.split(".").collect::<Vec<&str>>();
if b.len() >= 2 {
b.pop(); // remove the tld
}
let b = b[b.len() - 1];

b.to_string()
}

/// html selector for valid web pages for domain.
pub fn get_page_selectors(&self, domain: &str) -> Selector {
pub fn get_page_selectors(&self, url: &str, subdomains: bool) -> Selector {
// select all absolute links
let absolute_selector = &format!(
r#"a[href^="{}"]{}"#,
domain,
*MEDIA_IGNORE_SELECTOR,
);
let absolute_selector = &if subdomains {
let dname = self.domain_name(&self.base);
let scheme = self.base.scheme();

format!(
r#"a[href^="{url}"]{}, a[href^="https://{dname}"]{}, a[href^="http://{dname}"]{}, a[href^="{scheme}"][href*=".{dname}."]{}"#,
*MEDIA_IGNORE_SELECTOR,
*MEDIA_IGNORE_SELECTOR,
*MEDIA_IGNORE_SELECTOR,
*MEDIA_IGNORE_SELECTOR,
)
} else {
format!(
r#"a[href^="{url}"]{}"#,
*MEDIA_IGNORE_SELECTOR,
)
};
// allow relative and absolute .html files
let static_html_selector = &format!(
r#"{} {}, {} {}"#,
@@ -98,13 +122,27 @@ impl Page {
}

/// Find all href links and return them using CSS selectors.
pub fn links(&self) -> HashSet<String> {
let selector = self.get_page_selectors(&self.url);
pub fn links(&self, subdomains: bool) -> HashSet<String> {
let selector = self.get_page_selectors(&self.url, subdomains);
let html = self.parse_html();

html.select(&selector)
.map(|a| self.abs_path(a.value().attr("href").unwrap_or_default()).to_string())
.collect()
let anchors = html.select(&selector);

if subdomains {
let base_domain = self.domain_name(&self.base);

anchors.filter_map(|a| {
let abs = self.abs_path(a.value().attr("href").unwrap_or_default()).to_string();
let url_domain = self.domain_name(&Url::parse(&abs).unwrap());

if base_domain == url_domain {
Some(abs)
} else {
None
}
}).collect()
} else {
anchors.map(|a| self.abs_path(a.value().attr("href").unwrap_or("")).to_string()).collect()
}
}

/// Convert a URL to its absolute path without any fragments or params.
@@ -124,8 +162,8 @@ fn parse_links() {
.unwrap();

let link_result = "https://choosealicense.com/";
let page: Page = Page::new(&link_result, &client);
let links = page.links();
let page: Page = Page::new(&link_result, &client, &link_result);
let links = page.links(false);

assert!(
links
@@ -143,7 +181,7 @@ fn test_abs_path() {
.build()
.unwrap();
let link_result = "https://choosealicense.com/";
let page: Page = Page::new(&link_result, &client);
let page: Page = Page::new(&link_result, &client, &link_result);

assert_eq!(
page.abs_path("/page"),
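The new `domain_name` helper is the piece that implements the "tld ignore" part of this commit: it reduces a host to the label just before its TLD, so sub-domains and alternate TLDs of the start URL compare equal while unrelated hosts do not. Below is a standalone sketch of that logic as a hypothetical free function (illustration only; in the crate it is a private method on `Page` operating on `self.base`, and the `docs.` sub-domain host here is made up).

```rust
use url::Url;

/// Hypothetical free-function copy of `Page::domain_name`:
/// drop the trailing TLD label and keep the label right before it.
fn domain_name(domain: &Url) -> String {
    let host = domain.host_str().unwrap_or("").to_string();
    let mut parts = host.split('.').collect::<Vec<&str>>();
    if parts.len() >= 2 {
        parts.pop(); // remove the tld
    }
    parts[parts.len() - 1].to_string()
}

fn main() {
    let base = Url::parse("https://choosealicense.com").unwrap();
    let sub = Url::parse("https://docs.choosealicense.com/page").unwrap();
    let other_tld = Url::parse("https://choosealicense.org").unwrap();
    let external = Url::parse("https://example.com").unwrap();

    assert_eq!(domain_name(&base), "choosealicense");
    assert_eq!(domain_name(&sub), "choosealicense");        // sub-domain matches
    assert_eq!(domain_name(&other_tld), "choosealicense");  // alternate tld matches
    assert_ne!(domain_name(&external), domain_name(&base)); // unrelated host is filtered
}
```

This is also why `get_page_selectors` builds the extra `a[href^=...]` selectors when `subdomains` is enabled, and why `links` re-checks each absolute URL's domain name against the base after resolution.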
32 changes: 24 additions & 8 deletions spider/src/website.rs
@@ -16,10 +16,10 @@ use tokio::time::sleep;
/// Represents a website to crawl and gather all links.
/// ```rust
/// use spider::website::Website;
/// let mut localhost = Website::new("http://example.com");
/// localhost.crawl();
/// let mut website = Website::new("http://example.com");
/// website.crawl();
/// // `Website` will be filled with `Pages` when crawled. To get them, just use
/// for page in localhost.get_pages() {
/// for page in website.get_pages() {
/// // do something
/// }
/// ```
@@ -142,9 +142,10 @@ impl<'a> Website<'a> {
fn crawl_concurrent(&mut self, client: &Client) {
let pool = self.create_thread_pool();
let delay = self.configuration.delay;
let subdomains = self.configuration.subdomains;
let delay_enabled = delay > 0;
let on_link_find_callback = self.on_link_find_callback;

// crawl while links exists
while !self.links.is_empty() {
let (tx, rx): (Sender<Message>, Receiver<Message>) = channel();
@@ -167,7 +168,7 @@ impl<'a> Website<'a> {
}
let link_result = on_link_find_callback(link);
let page = Page::new(&link_result, &cx);
let links = page.links();
let links = page.links(subdomains);

tx.send(links).unwrap();
});
@@ -188,9 +189,10 @@ impl<'a> Website<'a> {
/// Start to crawl website sequential
fn crawl_sequential(&mut self, client: &Client) {
let delay = self.configuration.delay;
let subdomains = self.configuration.subdomains;
let delay_enabled = delay > 0;
let on_link_find_callback = self.on_link_find_callback;

// crawl while links exists
while !self.links.is_empty() {
let mut new_links: HashSet<String> = HashSet::new();
@@ -209,7 +211,7 @@ impl<'a> Website<'a> {
let cx = client.clone();
let link_result = on_link_find_callback(link);
let page = Page::new(&link_result, &cx);
let links = page.links();
let links = page.links(subdomains);

new_links.extend(links);
}
@@ -257,7 +259,7 @@ impl<'a> Website<'a> {
let mut new_links: HashSet<String> = HashSet::new();

rx.into_iter().for_each(|page| {
let links = page.links();
let links = page.links(self.configuration.subdomains);
new_links.extend(links);
self.pages.push(page);
});
@@ -432,6 +434,20 @@ fn test_respect_robots_txt() {
assert_eq!(website_third.configuration.delay, 10000); // should equal 10 seconds in ms
}

#[test]
fn test_crawl_subdomains() {
let mut website: Website = Website::new("https://choosealicense.com");
website.configuration.subdomains = true;
website.crawl();
assert!(
website
.links_visited
.contains(&"https://choosealicense.com/licenses/".to_string()),
"{:?}",
website.links_visited
);
}

#[test]
fn test_link_duplicates() {
fn has_unique_elements<T>(iter: T) -> bool
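A side note on how the flag is threaded through `crawl_concurrent`: `subdomains` is a plain `Copy` bool, so it is read out of `self.configuration` once before the loop and captured by value in each spawned job, avoiding any borrow of `self` inside the closures. A simplified sketch of that dispatch pattern (std threads and made-up link sets stand in for the crate's thread pool and real pages):

```rust
use std::collections::HashSet;
use std::sync::mpsc::channel;
use std::thread;

fn main() {
    // Read plain-`Copy` settings once, before spawning, as the crate does.
    let subdomains = true;
    let links: HashSet<String> = HashSet::from(["https://example.com/".to_string()]);

    let (tx, rx) = channel();
    for link in links {
        let tx = tx.clone();
        // `subdomains` is copied into the closure; no borrow of the config.
        thread::spawn(move || {
            // Stand-in for `Page::new(&link, &client).links(subdomains)`.
            let mut found: HashSet<String> = HashSet::from([format!("{link}about")]);
            if subdomains {
                found.insert("https://docs.example.com/".to_string());
            }
            tx.send(found).unwrap();
        });
    }
    drop(tx); // close the channel so the receiver loop below terminates

    let new_links: HashSet<String> = rx.into_iter().flatten().collect();
    println!("{new_links:?}");
}
```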
4 changes: 2 additions & 2 deletions spider_cli/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "1.8.3"
version = "1.9.0"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/madeindjs/spider"
@@ -25,7 +25,7 @@ quote = "1.0.18"
failure_derive = "0.1.8"

[dependencies.spider]
version = "1.8.3"
version = "1.9.0"
path = "../spider"
default-features = false

2 changes: 1 addition & 1 deletion spider_cli/README.md
@@ -34,7 +34,7 @@ spider --domain https://choosealicense.com crawl -o > spider_choosealicense.json
```

```sh
spider_cli 1.8.3
spider_cli 1.9.0
madeindjs <contact@rousseau-alexandre.fr>, j-mendez <jeff@a11ywatch.com>
Multithreaded web crawler written in Rust.

3 changes: 2 additions & 1 deletion spider_cli/src/main.rs
@@ -31,6 +31,7 @@ fn main() {
website.configuration.respect_robots_txt = cli.respect_robots_txt;
website.configuration.delay = delay;
website.configuration.concurrency = concurrency;
website.configuration.subdomains = cli.subdomains;

if !blacklist_url.is_empty() {
let blacklist_url: Vec<String> = blacklist_url.split(",").map(|l| l.to_string()).collect();
@@ -67,7 +68,7 @@ fn main() {
let mut html: &String = &String::new();

if *output_links {
let page_links = page.links();
let page_links = page.links(cli.subdomains);
links.extend(page_links);
}

3 changes: 3 additions & 0 deletions spider_cli/src/options/args.rs
@@ -14,6 +14,9 @@ pub struct Cli {
/// Respect robots.txt file
#[clap(short, long)]
pub respect_robots_txt: bool,
/// Allow sub-domain crawling
#[clap(short, long)]
pub subdomains: bool,
/// Print page visited on standard output
#[clap(short, long)]
pub verbose: bool,