feat(page): add status code and error message page response (#148)
j-mendez authored Nov 13, 2023
1 parent c14cd6c commit 17f1cd0
Showing 11 changed files with 146 additions and 76 deletions.
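The commit surfaces two new public fields on `Page`: `status_code` and `error_status`. A minimal sketch of how a caller might read them, assuming `tokio` and `reqwest` as direct dependencies and using the `Page::new_page` constructor visible in the `spider/src/page.rs` diff below (the URL is illustrative):

```rust
use reqwest::Client;
use spider::page::Page;

#[tokio::main]
async fn main() {
    let client = Client::new();
    // Fetch a single page without crawling the whole site.
    let page = Page::new_page("https://example.com", &client).await;

    // New in this commit: the HTTP status code of the page request...
    println!("status: {}", page.status_code);

    // ...and the error of the request, if any, as a readable string.
    if let Some(reason) = &page.error_status {
        eprintln!("request failed: {}", reason);
    }
}
```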
8 changes: 4 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions examples/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "1.46.5"
version = "1.48.0"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
@@ -22,7 +22,7 @@ htr = "0.5.27"
flexbuffers = "2.0.0"

[dependencies.spider]
version = "1.46.5"
version = "1.48.0"
path = "../spider"
features = ["serde"]

2 changes: 1 addition & 1 deletion spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "1.46.5"
version = "1.48.0"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "The fastest web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
12 changes: 6 additions & 6 deletions spider/README.md
@@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom

```toml
[dependencies]
spider = "1.46.5"
spider = "1.48.0"
```

And then the code:
@@ -87,7 +87,7 @@ We have a couple optional feature flags. Regex blacklisting, jemalloc backend, gl

```toml
[dependencies]
spider = { version = "1.46.5", features = ["regex", "ua_generator"] }
spider = { version = "1.48.0", features = ["regex", "ua_generator"] }
```

1. `ua_generator`: Enables auto generating a random real User-Agent.
@@ -116,7 +116,7 @@ Move processing to a worker, drastically increases performance even if worker is

```toml
[dependencies]
spider = { version = "1.46.5", features = ["decentralized"] }
spider = { version = "1.48.0", features = ["decentralized"] }
```

```sh
@@ -136,7 +136,7 @@ Use the subscribe method to get a broadcast channel.

```toml
[dependencies]
spider = { version = "1.46.5", features = ["sync"] }
spider = { version = "1.48.0", features = ["sync"] }
```

```rust,no_run
@@ -166,7 +166,7 @@ Allow regex for blacklisting routes

```toml
[dependencies]
spider = { version = "1.46.5", features = ["regex"] }
spider = { version = "1.48.0", features = ["regex"] }
```

```rust,no_run
@@ -193,7 +193,7 @@ If you are performing large workloads you may need to control the crawler by ena

```toml
[dependencies]
spider = { version = "1.46.5", features = ["control"] }
spider = { version = "1.48.0", features = ["control"] }
```

```rust
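Because subscribers receive full `Page` values, the fields this commit adds also flow through the `sync` broadcast channel described above. A hedged sketch, assuming the `Website::subscribe` API this README section refers to (channel capacity and URL are illustrative):

```rust
use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website = Website::new("https://example.com");
    // `subscribe` is gated behind the `sync` feature flag.
    let mut rx = website.subscribe(16).unwrap();

    tokio::spawn(async move {
        while let Ok(page) = rx.recv().await {
            // The new fields ride along with every received page.
            println!("{} -> {}", page.get_url(), page.status_code);
        }
    });

    website.crawl().await;
}
```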
18 changes: 16 additions & 2 deletions spider/src/packages/scraper/element_ref/mod.rs
@@ -128,8 +128,22 @@ impl<'a> Iterator for Text<'a> {
fn next(&mut self) -> Option<&'a str> {
    for edge in &mut self.inner {
        if let Edge::Open(node) = edge {
-           if let Node::Text(ref text) = node.value() {
-               return Some(&**text);
-           }
+           let node_value = node.value();
+
+           match node_value.as_element() {
+               Some(v) => {
+                   let n = v.name();
+                   // Yield text only when it is not inside <script> or <style>.
+                   if n != "script" && n != "style" {
+                       if let Node::Text(ref text) = node_value {
+                           return Some(&**text);
+                       }
+                   }
+               }
+               _ => {
+                   if let Node::Text(ref text) = node_value {
+                       return Some(&**text);
+                   }
+               }
+           }
        }
    }
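The iterator change above is meant to stop yielding text that sits inside `<script>` or `<style>` elements. The tag check reduces to a conjunction of two inequalities; a standalone sketch with hypothetical names (a disjunction would hold for every tag, since no tag equals both strings at once):

```rust
/// Returns true when text content under `tag` should be yielded.
fn yields_text(tag: &str) -> bool {
    // Both inequalities must hold at once to exclude both tags;
    // `tag != "script" || tag != "style"` would be true for every tag.
    tag != "script" && tag != "style"
}

fn main() {
    assert!(yields_text("p"));
    assert!(!yields_text("script"));
    assert!(!yields_text("style"));
}
```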
72 changes: 48 additions & 24 deletions spider/src/page.rs
@@ -1,10 +1,11 @@
#[cfg(not(feature = "decentralized"))]
use crate::packages::scraper::Html;
+use crate::utils::PageResponse;
use crate::CaseInsensitiveString;
use bytes::Bytes;
use compact_str::CompactString;
use hashbrown::HashSet;
-use reqwest::Client;
+use reqwest::{Client, StatusCode};
use smallvec::SmallVec;

#[cfg(all(feature = "time", not(feature = "decentralized")))]
@@ -31,13 +32,17 @@ pub struct Page {
base: Url,
/// The raw url for the page. Useful since Url::parse adds a trailing slash.
url: String,
#[cfg(feature = "time")]
/// The duration from start of parsing to end of gathering links.
duration: Instant,
/// The status code of the page request.
pub status_code: StatusCode,
/// The error of the request if any.
pub error_status: Option<String>,
/// The external urls to group with the domain
pub external_domains_caseless: Box<HashSet<CaseInsensitiveString>>,
/// The final destination of the page if redirects were performed [Not implemented in the chrome feature].
pub final_redirect_destination: Option<String>,
#[cfg(feature = "time")]
/// The duration from start of parsing to end of gathering links.
duration: Instant,
}

/// Represent a page visited. This page contains HTML scraped with [scraper](https://crates.io/crates/scraper).
@@ -46,6 +51,10 @@ pub struct Page {
pub struct Page {
/// The bytes of the resource.
html: Option<Bytes>,
+/// The status code of the page request.
+pub status_code: StatusCode,
+/// The error of the request if any.
+pub error_status: Option<String>,
/// The current links for the page.
pub links: HashSet<CaseInsensitiveString>,
/// The external urls to group with the domain.
@@ -135,26 +144,50 @@ pub fn get_page_selectors(

/// Instantiate a new page without scraping it (used for testing purposes).
#[cfg(not(feature = "decentralized"))]
-pub fn build(url: &str, html: Option<bytes::Bytes>) -> Page {
+pub fn build(url: &str, res: PageResponse) -> Page {
Page {
-html: if html.is_some() { html } else { None },
+html: if res.content.is_some() {
+    res.content
+} else {
+    None
+},
base: Url::parse(&url).expect("Invalid page URL"),
url: url.into(),
#[cfg(feature = "time")]
duration: Instant::now(),
external_domains_caseless: Default::default(),
-final_redirect_destination: Default::default(),
+final_redirect_destination: res.final_url,
+status_code: res.status_code,
+error_status: match res.error_for_status {
+    Some(e) => match e {
+        Ok(_) => None,
+        Err(er) => Some(er.to_string()),
+    },
+    _ => None,
+},
}
}

/// Instantiate a new page without scraping it (used for testing purposes).
#[cfg(feature = "decentralized")]
-pub fn build(_: &str, html: Option<bytes::Bytes>) -> Page {
+pub fn build(_: &str, res: PageResponse) -> Page {
Page {
-html: if html.is_some() { html } else { None },
+html: if res.content.is_some() {
+    res.content
+} else {
+    None
+},
links: Default::default(),
external_domains_caseless: Default::default(),
-final_redirect_destination: Default::default(),
+final_redirect_destination: res.final_url,
+status_code: res.status_code,
+error_status: match res.error_for_status {
+    Some(e) => match e {
+        Ok(_) => None,
+        Err(er) => Some(er.to_string()),
+    },
+    _ => None,
+},
}
}
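Both `build` constructors now consume a `PageResponse` from `crate::utils` instead of a bare `Option<bytes::Bytes>` plus a separate redirect setter. The diff does not include `utils.rs`, but the field accesses above imply roughly the following shape; field names mirror the accesses, while the types are inferred assumptions:

```rust
use bytes::Bytes;
use reqwest::{Response, StatusCode};

/// Rough shape inferred from how `build` reads it; the real
/// definition lives in `crate::utils`, which this excerpt omits.
pub struct PageResponse {
    /// Raw body bytes, when the fetch produced any (`res.content`).
    pub content: Option<Bytes>,
    /// HTTP status of the request (`res.status_code`).
    pub status_code: StatusCode,
    /// Result of a `Response::error_for_status`-style check, kept so the
    /// error text can be surfaced as `Page::error_status`.
    pub error_for_status: Option<Result<Response, reqwest::Error>>,
    /// Final URL after any redirects (`res.final_url`).
    pub final_url: Option<String>,
}
```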

@@ -163,27 +196,21 @@ impl Page {
/// Instantiate a new page and gather the html.
pub async fn new(url: &str, client: &Client, page: &chromiumoxide_fork::Page) -> Self {
let page_resource = crate::utils::fetch_page_html(&url, &client, &page).await;
-let mut page = build(url, page_resource.0);
-page.set_final_redirect(page_resource.1);
-page
+build(url, page_resource)
}

#[cfg(not(feature = "decentralized"))]
/// Instantiate a new page and gather the html repro of standard fetch_page_html.
pub async fn new_page(url: &str, client: &Client) -> Self {
let page_resource = crate::utils::fetch_page_html_raw(&url, &client).await;
-let mut page = build(url, page_resource.0);
-page.set_final_redirect(page_resource.1);
-page
+build(url, page_resource)
}

/// Instantiate a new page and gather the html.
#[cfg(all(not(feature = "decentralized"), not(feature = "chrome")))]
pub async fn new(url: &str, client: &Client) -> Self {
let page_resource = crate::utils::fetch_page_html(&url, &client).await;
-let mut page = build(url, page_resource.0);
-page.set_final_redirect(page_resource.1);
-page
+build(url, page_resource)
}

/// Instantiate a new page and gather the links.
@@ -207,6 +234,8 @@ impl Page {
links,
external_domains_caseless: Default::default(),
final_redirect_destination: Default::default(),
+status_code: Default::default(),
+error_status: Default::default(),
}
}

@@ -235,11 +264,6 @@ impl Page {
self.external_domains_caseless = external_domains_caseless;
}

-/// Set final redirect destination
-pub fn set_final_redirect(&mut self, final_redirect_destination: Option<String>) {
-    self.final_redirect_destination = final_redirect_destination;
-}

/// Parsed URL getter for page.
#[cfg(not(feature = "decentralized"))]
pub fn get_url_parsed(&self) -> &Url {