feat(cron): add cron feature flag (#153)
j-mendez committed Nov 25, 2023
1 parent 16c796a commit dfef12f
Showing 14 changed files with 631 additions and 33 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,10 @@

## Unreleased

## v1.50.1

1. feat(cron): add cron feature flag [#153]

## v1.36.0

1. feat(sync): subscribe to page updates to perform async handling of data
22 changes: 18 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default.

10 changes: 10 additions & 0 deletions README.md
@@ -7,6 +7,16 @@

The fastest web crawler and indexer. Foundational building blocks for data curation workloads.

- Concurrent
- Streaming
- Decentralization
- Headless Chrome Rendering
- HTTP Proxies
- Cron Jobs
- Subscriptions
- Blacklisting and Budgeting Depth
- [Changelog](CHANGELOG.md)

## Getting Started

The simplest way to get started is to use [Spider Cloud](https://spiderwebai.xyz) for a pain-free hosted service. View the [spider](./spider/README.md) or [spider_cli](./spider_cli/README.md) directory for local installations.
11 changes: 8 additions & 3 deletions examples/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "1.49.13"
version = "1.50.1"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
@@ -22,7 +22,7 @@ htr = "0.5.27"
flexbuffers = "2.0.0"

[dependencies.spider]
version = "1.49.13"
version = "1.50.1"
path = "../spider"
features = ["serde"]

@@ -70,4 +70,9 @@ path = "configuration.rs"
[[example]]
name = "budget"
path = "budget.rs"
required-features = ["spider/budget", "spider/sync"]
required-features = ["spider/budget", "spider/sync"]

[[example]]
name = "cron"
path = "cron.rs"
required-features = ["spider/sync", "spider/cron"]
4 changes: 4 additions & 0 deletions examples/README.md
@@ -39,3 +39,7 @@ Crawl the page and output the links via [Serde](./serde.rs).
Crawl links with a budget limiting the amount of pages allowed [Budget](./budget.rs).

- `cargo run --example budget`

Crawl links at a given cron time [Cron](./cron.rs).

- `cargo run --example cron`
25 changes: 25 additions & 0 deletions examples/cron.rs
@@ -0,0 +1,25 @@
//! `cargo run --example cron`
extern crate spider;

use spider::tokio;
use spider::website::{Website, run_cron};

#[tokio::main]
async fn main() {
let mut website: Website = Website::new("https://rsseau.fr");
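// "1/5 * * * * *" is a seconds-first cron expression as used by the `cron` crate
// (sec min hour day month day-of-week): fire every 5 seconds, starting at second 1.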
website.cron_str = "1/5 * * * * *".into();

let mut rx2 = website.subscribe(16).unwrap();

let join_handle = tokio::spawn(async move {
while let Ok(res) = rx2.recv().await {
println!("{:?}", res.get_url());
}
});

let runner = run_cron(website).await;

println!("Starting the Runner for 20 seconds");
tokio::time::sleep(tokio::time::Duration::from_secs(20)).await;
let _ = tokio::join!(runner.stop(), join_handle);
}
10 changes: 7 additions & 3 deletions spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "1.49.13"
version = "1.50.1"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "The fastest web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
@@ -43,12 +43,15 @@ case_insensitive_string = { version = "0.1.7", features = [ "compact", "serde" ]
jsdom = { version = "0.0.11-alpha.1", optional = true, features = [ "hashbrown", "tokio" ] }
chromiumoxide = { version = "0.5.6", optional = true, features = ["tokio-runtime", "bytes"], default-features = false }
sitemap = { version = "0.4.1", optional = true }
chrono = { version = "0.4.31", optional = true }
cron = { version = "0.12.0", optional = true }
async-trait = { version = "0.1.74", optional = true }

[target.'cfg(all(not(windows), not(target_os = "android"), not(target_env = "musl")))'.dependencies]
tikv-jemallocator = { version = "0.5.0", optional = true }

[features]
default = ["sync"]
default = ["sync", "cron"]
regex = ["dep:regex"]
glob = ["dep:regex", "dep:itertools"]
ua_generator = ["dep:ua_generator"]
@@ -70,4 +73,5 @@ chrome = ["dep:chromiumoxide"]
chrome_headed = ["chrome"]
chrome_cpu = ["chrome"]
chrome_stealth = ["chrome"]
cookies = ["reqwest/cookies"]
cookies = ["reqwest/cookies"]
cron = ["dep:chrono", "dep:cron", "dep:async-trait"]
77 changes: 60 additions & 17 deletions spider/README.md
@@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom

```toml
[dependencies]
spider = "1.49.13"
spider = "1.50.1"
```

And then the code:
@@ -30,7 +30,7 @@ use spider::tokio;
#[tokio::main]
async fn main() {
let url = "https://choosealicense.com";
let mut website: Website = Website::new(&url);
let mut website = Website::new(&url);
website.crawl().await;
for link in website.get_links() {
@@ -43,7 +43,7 @@ You can use the `Configuration` object to configure your crawler:

```rust
// ..
let mut website: Website = Website::new("https://choosealicense.com");
let mut website = Website::new("https://choosealicense.com");

website.configuration.respect_robots_txt = true;
website.configuration.subdomains = true;
@@ -52,10 +52,12 @@ website.configuration.delay = 0; // Defaults to 0 ms due to concurrency handling
website.configuration.request_timeout = None; // Defaults to 15000 ms
website.configuration.http2_prior_knowledge = false; // Enable if you know the webserver supports http2
website.configuration.user_agent = Some("myapp/version".into()); // Defaults to using a random agent
website.on_link_find_callback = Some(|s, html| { println!("link target: {}", s); (s, html)}); // Callback to run on each link find
website.on_link_find_callback = Some(|s, html| { println!("link target: {}", s); (s, html)}); // Callback to run on each link found - useful for mutating the URL, e.g. converting the top-level domain from `.fr` to `.es`.
website.configuration.blacklist_url.get_or_insert(Default::default()).push("https://choosealicense.com/licenses/".into());
website.configuration.proxies.get_or_insert(Default::default()).push("socks5://10.1.1.1:12345".into()); // Defaults to None - proxy list.
website.budget = Some(spider::hashbrown::HashMap::from([(spider::CaseInsensitiveString::new("*"), 300), (spider::CaseInsensitiveString::new("/licenses"), 10)])); // Defaults to None - Requires the `budget` feature flag
website.cron_str = "1/5 * * * * *".into(); // Defaults to empty string - Requires the `cron` feature flag
website.cron_type = spider::website::CronType::Crawl; // Defaults to CronType::Crawl - Requires the `cron` feature flag

website.crawl().await;
```
@@ -78,6 +80,8 @@ website
.with_external_domains(Some(Vec::from(["https://creativecommons.org/licenses/by/3.0/"].map(|d| d.to_string())).into_iter()))
.with_headers(None)
.with_blacklist_url(Some(Vec::from(["https://choosealicense.com/licenses/".into()])))
// requires the `cron` feature flag
.with_cron("1/5 * * * * *", Default::default())
.with_proxies(None);
```

@@ -87,7 +91,7 @@ We have a couple optional feature flags. Regex blacklisting, jemalloc backend, gl

```toml
[dependencies]
spider = { version = "1.49.13", features = ["regex", "ua_generator"] }
spider = { version = "1.50.1", features = ["regex", "ua_generator"] }
```

1. `ua_generator`: Enables auto generating a random real User-Agent.
@@ -97,27 +101,28 @@ spider = { version = "1.49.13", features = ["regex", "ua_generator"] }
1. `sync`: Subscribe to changes for async Page data processing. [Enabled by default]
1. `budget`: Allows setting a crawl budget per path with depth.
1. `control`: Enables the ability to pause, start, and shutdown crawls on demand.
1. `full_resources`: Enables gathering all content that relates to the domain like css,jss, and etc.
1. `full_resources`: Enables gathering all content that relates to the domain, like CSS, JS, etc.
1. `serde`: Enables serde serialization support.
1. `socks`: Enables socks5 proxy support.
1. `glob`: Enables [url glob](https://everything.curl.dev/cmdline/globbing) support.
1. `fs`: Enables storing resources to disk for parsing (may greatly increase performance at the cost of temp storage). [Enabled by default]
1. `js`: Enables parsing links created with JavaScript using the alpha [jsdom](https://github.com/a11ywatch/jsdom) crate.
1. `sitemap`: Include sitemap pages in results.
1. `time`: Enables duration tracking per page.
1. `chrome`: Enables chrome headless rendering, use the env var `CHROME_URL` to connect remotely [experimental].
1. `chrome`: Enables chrome headless rendering; use the env var `CHROME_URL` to connect remotely.
1. `chrome_headed`: Enables headful chrome rendering [experimental].
1. `chrome_cpu`: Disables gpu usage for the chrome browser.
1. `chrome_stealth`: Enables stealth mode to make it harder to be detected as a bot.
1. `cookies`: Enables storing and setting cookies to use for requests.
1. `cron`: Enables the ability to start cron jobs for the website (flags can be combined, as shown below).
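
For example, a sketch of a `Cargo.toml` entry that enables regex blacklisting, crawl budgets, and cron scheduling together (any of the flags above can be mixed this way):

```toml
[dependencies]
spider = { version = "1.50.1", features = ["regex", "budget", "cron"] }
```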

### Decentralization

Move processing to a worker; this drastically increases performance even if the worker is on the same machine, since the runtime efficiently splits the IO work.

```toml
[dependencies]
spider = { version = "1.49.13", features = ["decentralized"] }
spider = { version = "1.50.1", features = ["decentralized"] }
```

```sh
@@ -137,7 +142,7 @@ Use the subscribe method to get a broadcast channel.

```toml
[dependencies]
spider = { version = "1.49.13", features = ["sync"] }
spider = { version = "1.50.1", features = ["sync"] }
```

```rust,no_run
@@ -148,7 +153,7 @@ use spider::tokio;
#[tokio::main]
async fn main() {
let mut website: Website = Website::new("https://choosealicense.com");
let mut website = Website::new("https://choosealicense.com");
let mut rx2 = website.subscribe(16).unwrap();
let join_handle = tokio::spawn(async move {
@@ -167,7 +172,7 @@ Allow regex for blacklisting routes

```toml
[dependencies]
spider = { version = "1.49.13", features = ["regex"] }
spider = { version = "1.50.1", features = ["regex"] }
```

```rust,no_run
Expand All @@ -178,7 +183,7 @@ use spider::tokio;
#[tokio::main]
async fn main() {
let mut website: Website = Website::new("https://choosealicense.com");
let mut website = Website::new("https://choosealicense.com");
website.configuration.blacklist_url.push("/licenses/".into());
website.crawl().await;
Expand All @@ -194,7 +199,7 @@ If you are performing large workloads you may need to control the crawler by ena

```toml
[dependencies]
spider = { version = "1.49.13", features = ["control"] }
spider = { version = "1.50.1", features = ["control"] }
```

```rust
@@ -211,10 +216,10 @@ async fn main() {

tokio::spawn(async move {
pause(url).await;
sleep(Duration::from_millis(5000)).await;
sleep(tokio::time::Duration::from_millis(5000)).await;
resume(url).await;
// perform shutdown if crawl takes longer than 15s
sleep(Duration::from_millis(15000)).await;
sleep(tokio::time::Duration::from_millis(15000)).await;
// you could also abort the task to shutdown crawls if using website.crawl in another thread.
shutdown(url).await;
});
@@ -236,7 +241,7 @@ async fn main() {
use std::io::{Write, stdout};

let url = "https://choosealicense.com/";
let mut website: Website = Website::new(&url);
let mut website = Website::new(&url);

website.scrape().await;

@@ -258,11 +263,49 @@ async fn main() {
}
```

### Cron Jobs

Use cron jobs to run crawls continuously at any time.

```toml
[dependencies]
spider = { version = "1.50.1", features = ["sync", "cron"] }
```

```rust,no_run
extern crate spider;
use spider::website::{Website, run_cron};
use spider::tokio;
#[tokio::main]
async fn main() {
let mut website = Website::new("https://choosealicense.com");
// set the cron to run or use the builder pattern `website.with_cron`.
website.cron_str = "1/5 * * * * *".into();
let mut rx2 = website.subscribe(16).unwrap();
let join_handle = tokio::spawn(async move {
while let Ok(res) = rx2.recv().await {
println!("{:?}", res.get_url());
}
});
// take ownership of the website. You can also use website.run_cron, but then you must abort the created handles manually.
let runner = run_cron(website).await;
println!("Starting the Runner for 10 seconds");
tokio::time::sleep(tokio::time::Duration::from_secs(10)).await;
let _ = tokio::join!(runner.stop(), join_handle);
}
```

### Chrome

```toml
[dependencies]
spider = { version = "1.49.13", features = ["chrome"] }
spider = { version = "1.50.1", features = ["chrome"] }
```

You can use `website.crawl_concurrent_raw` to perform a crawl without chromium when needed. Use the feature flag `chrome_headed` to enable headful browser usage if you need to debug.
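
As a minimal sketch (assuming the `chrome` feature above is enabled), the crawl API stays the same; pages are rendered through headless Chrome, or through the remote instance at `CHROME_URL` when that variable is set:

```rust
extern crate spider;

use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    // Built with `--features chrome`, this crawl renders each page in headless Chrome.
    let mut website = Website::new("https://choosealicense.com");
    website.crawl().await;

    for link in website.get_links() {
        println!("{:?}", link);
    }
}
```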