Skip to content

Commit

Permalink
feat(cron): add cron feature flag (#153)
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Nov 25, 2023
1 parent 16c796a commit cd5cec5
Show file tree
Hide file tree
Showing 10 changed files with 573 additions and 25 deletions.
22 changes: 18 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions examples/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "1.49.13"
version = "1.50.0"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
Expand All @@ -22,7 +22,7 @@ htr = "0.5.27"
flexbuffers = "2.0.0"

[dependencies.spider]
version = "1.49.13"
version = "1.50.0"
path = "../spider"
features = ["serde"]

Expand Down
10 changes: 7 additions & 3 deletions spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "1.49.13"
version = "1.50.0"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "The fastest web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
Expand Down Expand Up @@ -43,12 +43,15 @@ case_insensitive_string = { version = "0.1.7", features = [ "compact", "serde" ]
jsdom = { version = "0.0.11-alpha.1", optional = true, features = [ "hashbrown", "tokio" ] }
chromiumoxide = { version = "0.5.6", optional = true, features = ["tokio-runtime", "bytes"], default-features = false }
sitemap = { version = "0.4.1", optional = true }
chrono = "0.4.31"
cron = "0.12.0"
async-trait = "0.1.74"

[target.'cfg(all(not(windows), not(target_os = "android"), not(target_env = "musl")))'.dependencies]
tikv-jemallocator = { version = "0.5.0", optional = true }

[features]
default = ["sync"]
default = ["sync", "cron"]
regex = ["dep:regex"]
glob = ["dep:regex", "dep:itertools"]
ua_generator = ["dep:ua_generator"]
Expand All @@ -70,4 +73,5 @@ chrome = ["dep:chromiumoxide"]
chrome_headed = ["chrome"]
chrome_cpu = ["chrome"]
chrome_stealth = ["chrome"]
cookies = ["reqwest/cookies"]
cookies = ["reqwest/cookies"]
cron = []
66 changes: 54 additions & 12 deletions spider/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom

```toml
[dependencies]
spider = "1.49.13"
spider = "1.50.0"
```

And then the code:
Expand All @@ -30,7 +30,7 @@ use spider::tokio;
#[tokio::main]
async fn main() {
let url = "https://choosealicense.com";
let mut website: Website = Website::new(&url);
let mut website = Website::new(&url);
website.crawl().await;
for link in website.get_links() {
Expand All @@ -43,7 +43,7 @@ You can use `Configuration` object to configure your crawler:

```rust
// ..
let mut website: Website = Website::new("https://choosealicense.com");
let mut website = Website::new("https://choosealicense.com");

website.configuration.respect_robots_txt = true;
website.configuration.subdomains = true;
Expand All @@ -56,6 +56,8 @@ website.on_link_find_callback = Some(|s, html| { println!("link target: {}", s);
website.configuration.blacklist_url.get_or_insert(Default::default()).push("https://choosealicense.com/licenses/".into());
website.configuration.proxies.get_or_insert(Default::default()).push("socks5://10.1.1.1:12345".into()); // Defaults to None - proxy list.
website.budget = Some(spider::hashbrown::HashMap::from([(spider::CaseInsensitiveString::new("*"), 300), (spider::CaseInsensitiveString::new("/licenses"), 10)])); // Defaults to None - Requires the `budget` feature flag
website.cron_str = "1/5 * * * * *".into(); // Defaults to empty string - Requires the `cron` feature flag
website.cron_type = spider::website::CronType::Crawl; // Defaults to CronType::Crawl - Requires the `cron` feature flag

website.crawl().await;
```
Expand All @@ -78,6 +80,8 @@ website
.with_external_domains(Some(Vec::from(["https://creativecommons.org/licenses/by/3.0/"].map(|d| d.to_string())).into_iter()))
.with_headers(None)
.with_blacklist_url(Some(Vec::from(["https://choosealicense.com/licenses/".into()])))
// requires the `cron` feature flag
.with_cron("1/5 * * * * *", Default::default())
.with_proxies(None);
```

Expand All @@ -87,7 +91,7 @@ We have a couple optional feature flags. Regex blacklisting, jemaloc backend, gl

```toml
[dependencies]
spider = { version = "1.49.13", features = ["regex", "ua_generator"] }
spider = { version = "1.50.0", features = ["regex", "ua_generator"] }
```

1. `ua_generator`: Enables auto generating a random real User-Agent.
Expand Down Expand Up @@ -117,7 +121,7 @@ Move processing to a worker, drastically increases performance even if worker is

```toml
[dependencies]
spider = { version = "1.49.13", features = ["decentralized"] }
spider = { version = "1.50.0", features = ["decentralized"] }
```

```sh
Expand All @@ -137,7 +141,7 @@ Use the subscribe method to get a broadcast channel.

```toml
[dependencies]
spider = { version = "1.49.13", features = ["sync"] }
spider = { version = "1.50.0", features = ["sync"] }
```

```rust,no_run
Expand All @@ -148,7 +152,7 @@ use spider::tokio;
#[tokio::main]
async fn main() {
let mut website: Website = Website::new("https://choosealicense.com");
let mut website = Website::new("https://choosealicense.com");
let mut rx2 = website.subscribe(16).unwrap();
let join_handle = tokio::spawn(async move {
Expand All @@ -167,7 +171,7 @@ Allow regex for blacklisting routes

```toml
[dependencies]
spider = { version = "1.49.13", features = ["regex"] }
spider = { version = "1.50.0", features = ["regex"] }
```

```rust,no_run
Expand All @@ -178,7 +182,7 @@ use spider::tokio;
#[tokio::main]
async fn main() {
let mut website: Website = Website::new("https://choosealicense.com");
let mut website = Website::new("https://choosealicense.com");
website.configuration.blacklist_url.push("/licenses/".into());
website.crawl().await;
Expand All @@ -194,7 +198,7 @@ If you are performing large workloads you may need to control the crawler by ena

```toml
[dependencies]
spider = { version = "1.49.13", features = ["control"] }
spider = { version = "1.50.0", features = ["control"] }
```

```rust
Expand Down Expand Up @@ -236,7 +240,7 @@ async fn main() {
use std::io::{Write, stdout};

let url = "https://choosealicense.com/";
let mut website: Website = Website::new(&url);
let mut website = Website::new(&url);

website.scrape().await;

Expand All @@ -258,11 +262,49 @@ async fn main() {
}
```

### Cron Jobs

Use cron jobs to run crawls continuously at any time.

```toml
[dependencies]
spider = { version = "1.50.0", features = ["sync", "cron"] }
```

```rust,no_run
extern crate spider;
use spider::website::{Website, run_cron};
use spider::tokio;
use std::time::Duration;
#[tokio::main]
async fn main() {
let mut website = Website::new("https://choosealicense.com");
// set the cron to run or use the builder pattern `website.with_cron`.
website.cron_str = "1/5 * * * * *".into();
let mut rx2 = website.subscribe(16).unwrap();
let join_handle = tokio::spawn(async move {
while let Ok(res) = rx2.recv().await {
println!("{:?}", res.get_url());
}
});
// take ownership of the website. You can also use `website.run_cron`, but then you must abort the spawned handles manually.
let runner = run_cron(website).await;
println!("Starting the Runner for 10 seconds");
tokio::time::sleep(Duration::from_secs(10)).await;
let _ = tokio::join!(runner.stop(), join_handle);
}
```

### Chrome

```toml
[dependencies]
spider = { version = "1.49.13", features = ["chrome"] }
spider = { version = "1.50.0", features = ["chrome"] }
```

You can use `website.crawl_concurrent_raw` to perform a crawl without chromium when needed. Use the feature flag `chrome_headed` to enable headful browser usage if needed to debug.
Expand Down
Loading

0 comments on commit cd5cec5

Please sign in to comment.