feat(page): add status code and error message page response (#148)
j-mendez authored Nov 13, 2023
1 parent c14cd6c commit 17f1cd0
Showing 11 changed files with 146 additions and 76 deletions.
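The commit surfaces two new public fields on `Page`: `status_code` and `error_status`. A minimal sketch of how a caller might read them, assuming `tokio` and `reqwest` as direct dependencies and using the `Page::new_page` constructor visible in the `spider/src/page.rs` diff below (the URL is illustrative):

```rust
use reqwest::Client;
use spider::page::Page;

#[tokio::main]
async fn main() {
    let client = Client::new();
    // Fetch a single page without crawling the whole site.
    let page = Page::new_page("https://example.com", &client).await;

    // New in this commit: the HTTP status code of the page request...
    println!("status: {}", page.status_code);

    // ...and the error of the request, if any, as a readable string.
    if let Some(reason) = &page.error_status {
        eprintln!("request failed: {}", reason);
    }
}
```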
8 changes: 4 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions examples/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "1.46.5"
version = "1.48.0"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
@@ -22,7 +22,7 @@ htr = "0.5.27"
flexbuffers = "2.0.0"

[dependencies.spider]
version = "1.46.5"
version = "1.48.0"
path = "../spider"
features = ["serde"]

2 changes: 1 addition & 1 deletion spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "1.46.5"
version = "1.48.0"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "The fastest web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
12 changes: 6 additions & 6 deletions spider/README.md
@@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom

```toml
[dependencies]
spider = "1.46.5"
spider = "1.48.0"
```

And then the code:
@@ -87,7 +87,7 @@ We have a couple optional feature flags. Regex blacklisting, jemalloc backend, gl

```toml
[dependencies]
spider = { version = "1.46.5", features = ["regex", "ua_generator"] }
spider = { version = "1.48.0", features = ["regex", "ua_generator"] }
```

1. `ua_generator`: Enables auto generating a random real User-Agent.
@@ -116,7 +116,7 @@ Move processing to a worker, drastically increases performance even if worker is

```toml
[dependencies]
spider = { version = "1.46.5", features = ["decentralized"] }
spider = { version = "1.48.0", features = ["decentralized"] }
```

```sh
@@ -136,7 +136,7 @@ Use the subscribe method to get a broadcast channel.

```toml
[dependencies]
spider = { version = "1.46.5", features = ["sync"] }
spider = { version = "1.48.0", features = ["sync"] }
```

```rust,no_run
@@ -166,7 +166,7 @@ Allow regex for blacklisting routes

```toml
[dependencies]
spider = { version = "1.46.5", features = ["regex"] }
spider = { version = "1.48.0", features = ["regex"] }
```

```rust,no_run
@@ -193,7 +193,7 @@ If you are performing large workloads you may need to control the crawler by ena

```toml
[dependencies]
spider = { version = "1.46.5", features = ["control"] }
spider = { version = "1.48.0", features = ["control"] }
```

```rust
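Because subscribers receive full `Page` values, the fields this commit adds also flow through the `sync` broadcast channel described above. A hedged sketch, assuming the `Website::subscribe` API this README section refers to (channel capacity and URL are illustrative):

```rust
use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website = Website::new("https://example.com");
    // `subscribe` is gated behind the `sync` feature flag.
    let mut rx = website.subscribe(16).unwrap();

    tokio::spawn(async move {
        while let Ok(page) = rx.recv().await {
            // The new fields ride along with every received page.
            println!("{} -> {}", page.get_url(), page.status_code);
        }
    });

    website.crawl().await;
}
```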
18 changes: 16 additions & 2 deletions spider/src/packages/scraper/element_ref/mod.rs
@@ -128,8 +128,22 @@ impl<'a> Iterator for Text<'a> {
fn next(&mut self) -> Option<&'a str> {
    for edge in &mut self.inner {
        if let Edge::Open(node) = edge {
-           if let Node::Text(ref text) = node.value() {
-               return Some(&**text);
-           }
+           let node_value = node.value();
+
+           match node_value.as_element() {
+               Some(v) => {
+                   let n = v.name();
+                   // Yield text only when it is not inside <script> or <style>.
+                   if n != "script" && n != "style" {
+                       if let Node::Text(ref text) = node_value {
+                           return Some(&**text);
+                       }
+                   }
+               }
+               _ => {
+                   if let Node::Text(ref text) = node_value {
+                       return Some(&**text);
+                   }
+               }
+           }
        }
    }
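The iterator change above is meant to stop yielding text that sits inside `<script>` or `<style>` elements. The tag check reduces to a conjunction of two inequalities; a standalone sketch with hypothetical names (a disjunction would hold for every tag, since no tag equals both strings at once):

```rust
/// Returns true when text content under `tag` should be yielded.
fn yields_text(tag: &str) -> bool {
    // Both inequalities must hold at once to exclude both tags;
    // `tag != "script" || tag != "style"` would be true for every tag.
    tag != "script" && tag != "style"
}

fn main() {
    assert!(yields_text("p"));
    assert!(!yields_text("script"));
    assert!(!yields_text("style"));
}
```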
72 changes: 48 additions & 24 deletions spider/src/page.rs
@@ -1,10 +1,11 @@
#[cfg(not(feature = "decentralized"))]
use crate::packages::scraper::Html;
+use crate::utils::PageResponse;
use crate::CaseInsensitiveString;
use bytes::Bytes;
use compact_str::CompactString;
use hashbrown::HashSet;
-use reqwest::Client;
+use reqwest::{Client, StatusCode};
use smallvec::SmallVec;

#[cfg(all(feature = "time", not(feature = "decentralized")))]
@@ -31,13 +32,17 @@ pub struct Page {
base: Url,
/// The raw url for the page. Useful since Url::parse adds a trailing slash.
url: String,
#[cfg(feature = "time")]
/// The duration from start of parsing to end of gathering links.
duration: Instant,
/// The status code of the page request.
pub status_code: StatusCode,
/// The error of the request if any.
pub error_status: Option<String>,
/// The external urls to group with the domain
pub external_domains_caseless: Box<HashSet<CaseInsensitiveString>>,
/// The final destination of the page if redirects were performed [Not implemented in the chrome feature].
pub final_redirect_destination: Option<String>,
#[cfg(feature = "time")]
/// The duration from start of parsing to end of gathering links.
duration: Instant,
}

/// Represent a page visited. This page contains HTML scraped with [scraper](https://crates.io/crates/scraper).
@@ -46,6 +51,10 @@ pub struct Page {
pub struct Page {
/// The bytes of the resource.
html: Option<Bytes>,
+/// The status code of the page request.
+pub status_code: StatusCode,
+/// The error of the request if any.
+pub error_status: Option<String>,
/// The current links for the page.
pub links: HashSet<CaseInsensitiveString>,
/// The external urls to group with the domain.
@@ -135,26 +144,50 @@ pub fn get_page_selectors(

/// Instantiate a new page without scraping it (used for testing purposes).
#[cfg(not(feature = "decentralized"))]
-pub fn build(url: &str, html: Option<bytes::Bytes>) -> Page {
+pub fn build(url: &str, res: PageResponse) -> Page {
Page {
-html: if html.is_some() { html } else { None },
+html: if res.content.is_some() {
+    res.content
+} else {
+    None
+},
base: Url::parse(&url).expect("Invalid page URL"),
url: url.into(),
#[cfg(feature = "time")]
duration: Instant::now(),
external_domains_caseless: Default::default(),
-final_redirect_destination: Default::default(),
+final_redirect_destination: res.final_url,
+status_code: res.status_code,
+error_status: match res.error_for_status {
+    Some(e) => match e {
+        Ok(_) => None,
+        Err(er) => Some(er.to_string()),
+    },
+    _ => None,
+},
}
}

/// Instantiate a new page without scraping it (used for testing purposes).
#[cfg(feature = "decentralized")]
-pub fn build(_: &str, html: Option<bytes::Bytes>) -> Page {
+pub fn build(_: &str, res: PageResponse) -> Page {
Page {
-html: if html.is_some() { html } else { None },
+html: if res.content.is_some() {
+    res.content
+} else {
+    None
+},
links: Default::default(),
external_domains_caseless: Default::default(),
-final_redirect_destination: Default::default(),
+final_redirect_destination: res.final_url,
+status_code: res.status_code,
+error_status: match res.error_for_status {
+    Some(e) => match e {
+        Ok(_) => None,
+        Err(er) => Some(er.to_string()),
+    },
+    _ => None,
+},
}
}
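Both `build` constructors now consume a `PageResponse` from `crate::utils` instead of a bare `Option<bytes::Bytes>` plus a separate redirect setter. The diff does not include `utils.rs`, but the field accesses above imply roughly the following shape; field names mirror the accesses, while the types are inferred assumptions:

```rust
use bytes::Bytes;
use reqwest::{Response, StatusCode};

/// Rough shape inferred from how `build` reads it; the real
/// definition lives in `crate::utils`, which this excerpt omits.
pub struct PageResponse {
    /// Raw body bytes, when the fetch produced any (`res.content`).
    pub content: Option<Bytes>,
    /// HTTP status of the request (`res.status_code`).
    pub status_code: StatusCode,
    /// Result of a `Response::error_for_status`-style check, kept so the
    /// error text can be surfaced as `Page::error_status`.
    pub error_for_status: Option<Result<Response, reqwest::Error>>,
    /// Final URL after any redirects (`res.final_url`).
    pub final_url: Option<String>,
}
```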

@@ -163,27 +196,21 @@ impl Page {
/// Instantiate a new page and gather the html.
pub async fn new(url: &str, client: &Client, page: &chromiumoxide_fork::Page) -> Self {
let page_resource = crate::utils::fetch_page_html(&url, &client, &page).await;
-let mut page = build(url, page_resource.0);
-page.set_final_redirect(page_resource.1);
-page
+build(url, page_resource)
}

#[cfg(not(feature = "decentralized"))]
/// Instantiate a new page and gather the html repro of standard fetch_page_html.
pub async fn new_page(url: &str, client: &Client) -> Self {
let page_resource = crate::utils::fetch_page_html_raw(&url, &client).await;
-let mut page = build(url, page_resource.0);
-page.set_final_redirect(page_resource.1);
-page
+build(url, page_resource)
}

/// Instantiate a new page and gather the html.
#[cfg(all(not(feature = "decentralized"), not(feature = "chrome")))]
pub async fn new(url: &str, client: &Client) -> Self {
let page_resource = crate::utils::fetch_page_html(&url, &client).await;
-let mut page = build(url, page_resource.0);
-page.set_final_redirect(page_resource.1);
-page
+build(url, page_resource)
}

/// Instantiate a new page and gather the links.
@@ -207,6 +234,8 @@ impl Page {
links,
external_domains_caseless: Default::default(),
final_redirect_destination: Default::default(),
+status_code: Default::default(),
+error_status: Default::default(),
}
}

@@ -235,11 +264,6 @@ impl Page {
self.external_domains_caseless = external_domains_caseless;
}

-/// Set final redirect destination
-pub fn set_final_redirect(&mut self, final_redirect_destination: Option<String>) {
-    self.final_redirect_destination = final_redirect_destination;
-}

/// Parsed URL getter for page.
#[cfg(not(feature = "decentralized"))]
pub fn get_url_parsed(&self) -> &Url {