chore(website): add crawl_concurrent_raw method (#152)
j-mendez authored Nov 21, 2023
1 parent 5a497bb commit 8103694
Showing 8 changed files with 248 additions and 120 deletions.
8 changes: 4 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default.

4 changes: 2 additions & 2 deletions examples/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "1.49.6"
version = "1.49.7"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
Expand All @@ -22,7 +22,7 @@ htr = "0.5.27"
flexbuffers = "2.0.0"

[dependencies.spider]
version = "1.49.6"
version = "1.49.7"
path = "../spider"
features = ["serde"]

2 changes: 1 addition & 1 deletion spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "1.49.6"
version = "1.49.7"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "The fastest web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
21 changes: 15 additions & 6 deletions spider/README.md
@@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.toml`

```toml
[dependencies]
spider = "1.49.6"
spider = "1.49.7"
```

And then the code:
@@ -87,7 +87,7 @@ We have a couple optional feature flags. Regex blacklisting, jemalloc backend, gl

```toml
[dependencies]
spider = { version = "1.49.6", features = ["regex", "ua_generator"] }
spider = { version = "1.49.7", features = ["regex", "ua_generator"] }
```

1. `ua_generator`: Enables auto generating a random real User-Agent.
@@ -116,7 +116,7 @@ Move processing to a worker, drastically increases performance even if worker is

```toml
[dependencies]
spider = { version = "1.49.6", features = ["decentralized"] }
spider = { version = "1.49.7", features = ["decentralized"] }
```

```sh
Expand All @@ -136,7 +136,7 @@ Use the subscribe method to get a broadcast channel.

```toml
[dependencies]
spider = { version = "1.49.6", features = ["sync"] }
spider = { version = "1.49.7", features = ["sync"] }
```
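The receiver loop that belongs to this snippet is collapsed in the diff above. As a stand-in, here is a minimal sketch of the subscribe pattern, assuming `subscribe` takes a broadcast-channel capacity and emits pages as they are crawled, and that the crate re-exports `tokio` (check the generated docs for the exact signatures at this version):

```rust
extern crate spider;

use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    // The target URL is arbitrary, purely for illustration.
    let mut website: Website = Website::new("https://choosealicense.com");

    // Subscribe before crawling; 16 is an assumed channel capacity.
    let mut rx = website.subscribe(16).unwrap();

    tokio::spawn(async move {
        // Receive each page as the crawler broadcasts it.
        while let Ok(page) = rx.recv().await {
            println!("{}", page.get_url());
        }
    });

    website.crawl().await;
}
```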

```rust,no_run
@@ -166,7 +166,7 @@ Allow regex for blacklisting routes

```toml
[dependencies]
spider = { version = "1.49.6", features = ["regex"] }
spider = { version = "1.49.7", features = ["regex"] }
```
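The example that follows this snippet is likewise collapsed. A hedged sketch of what regex blacklisting might look like, assuming routes are filtered through `configuration.blacklist_url` (the exact container type behind this field has shifted across versions):

```rust
extern crate spider;

use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website: Website = Website::new("https://choosealicense.com");

    // With the `regex` feature enabled, blacklist entries are treated as
    // patterns; this skips any route matching "/licenses/".
    website
        .configuration
        .blacklist_url
        .insert(Default::default())
        .push("/licenses/".into());

    website.crawl().await;
}
```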

```rust,no_run
Expand All @@ -193,7 +193,7 @@ If you are performing large workloads you may need to control the crawler by ena

```toml
[dependencies]
spider = { version = "1.49.6", features = ["control"] }
spider = { version = "1.49.7", features = ["control"] }
```

```rust
@@ -257,6 +257,15 @@ async fn main() {
}
```

+### Chrome
+
+```toml
+[dependencies]
+spider = { version = "1.49.7", features = ["chrome"] }
+```
+
+You can use `website.crawl_concurrent_raw` to perform a crawl without chromium when needed. Use the feature flag `chrome_headed` to enable headful browser usage if you need to debug.

### Blocking

If you need a blocking sync implementation use a version prior to `v1.12.0`.
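Since the new Chrome section above is the point of this commit, a usage sketch may help. It assumes `crawl_concurrent_raw` shares `crawl`'s no-argument signature and simply falls back to the plain HTTP client when chromium is not wanted; consult the generated docs for the exact form:

```rust
extern crate spider;

use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    let mut website: Website = Website::new("https://choosealicense.com");

    // Crawl without chromium even though the `chrome` feature is
    // compiled in (signature assumed to mirror `crawl`).
    website.crawl_concurrent_raw().await;

    for link in website.get_links() {
        println!("- {}", link.as_ref());
    }
}
```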
15 changes: 7 additions & 8 deletions spider/src/page.rs
@@ -197,14 +197,6 @@ pub fn build(_: &str, res: PageResponse) -> Page {
}

impl Page {
-    #[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
-    /// Instantiate a new page and gather the html.
-    pub async fn new(url: &str, client: &Client, page: &chromiumoxide::Page) -> Self {
-        let page_resource = crate::utils::fetch_page_html(&url, &client, &page).await;
-        build(url, page_resource)
-    }

    #[cfg(not(feature = "decentralized"))]
    /// Instantiate a new page and gather the html repro of standard fetch_page_html.
    pub async fn new_page(url: &str, client: &Client) -> Self {
        let page_resource = crate::utils::fetch_page_html_raw(&url, &client).await;
Expand All @@ -218,6 +210,13 @@ impl Page {
        build(url, page_resource)
    }

+    #[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
+    /// Instantiate a new page and gather the html.
+    pub async fn new(url: &str, client: &Client, page: &chromiumoxide::Page) -> Self {
+        let page_resource = crate::utils::fetch_page_html(&url, &client, &page).await;
+        build(url, page_resource)
+    }

    /// Instantiate a new page and gather the links.
    #[cfg(feature = "decentralized")]
    pub async fn new(url: &str, client: &Client) -> Self {
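The relocated constructors above pair `Page::new` (chromium fetch) with `Page::new_page` (plain HTTP fetch), the latter presumably being what `crawl_concurrent_raw` relies on per page. Here is a minimal sketch of calling the raw constructor directly, assuming `Client` is the `reqwest` client spider uses internally (depend on a matching `reqwest` directly if the crate does not re-export it at this version):

```rust
extern crate reqwest;
extern crate spider;

use spider::page::Page;
use spider::tokio;

#[tokio::main]
async fn main() {
    // The same HTTP client type `new_page` expects per the diff above.
    let client = reqwest::Client::new();

    // Plain HTTP fetch of a single page; chromium is never involved.
    let page = Page::new_page("https://choosealicense.com", &client).await;

    println!("fetched {} bytes of html", page.get_html().len());
}
```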
