Skip to content

Commit

Permalink
feat(cron): add cron feature flag (#153)
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Nov 25, 2023
1 parent 16c796a commit cd5cec5
Show file tree
Hide file tree
Showing 10 changed files with 573 additions and 25 deletions.
22 changes: 18 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions examples/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "1.49.13"
version = "1.50.0"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
Expand All @@ -22,7 +22,7 @@ htr = "0.5.27"
flexbuffers = "2.0.0"

[dependencies.spider]
version = "1.49.13"
version = "1.50.0"
path = "../spider"
features = ["serde"]

Expand Down
10 changes: 7 additions & 3 deletions spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "1.49.13"
version = "1.50.0"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "The fastest web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
Expand Down Expand Up @@ -43,12 +43,15 @@ case_insensitive_string = { version = "0.1.7", features = [ "compact", "serde" ]
jsdom = { version = "0.0.11-alpha.1", optional = true, features = [ "hashbrown", "tokio" ] }
chromiumoxide = { version = "0.5.6", optional = true, features = ["tokio-runtime", "bytes"], default-features = false }
sitemap = { version = "0.4.1", optional = true }
chrono = "0.4.31"
cron = "0.12.0"
async-trait = "0.1.74"

[target.'cfg(all(not(windows), not(target_os = "android"), not(target_env = "musl")))'.dependencies]
tikv-jemallocator = { version = "0.5.0", optional = true }

[features]
default = ["sync"]
default = ["sync", "cron"]
regex = ["dep:regex"]
glob = ["dep:regex", "dep:itertools"]
ua_generator = ["dep:ua_generator"]
Expand All @@ -70,4 +73,5 @@ chrome = ["dep:chromiumoxide"]
chrome_headed = ["chrome"]
chrome_cpu = ["chrome"]
chrome_stealth = ["chrome"]
cookies = ["reqwest/cookies"]
cookies = ["reqwest/cookies"]
cron = []
66 changes: 54 additions & 12 deletions spider/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom

```toml
[dependencies]
spider = "1.49.13"
spider = "1.50.0"
```

And then the code:
Expand All @@ -30,7 +30,7 @@ use spider::tokio;
#[tokio::main]
async fn main() {
let url = "https://choosealicense.com";
let mut website: Website = Website::new(&url);
let mut website = Website::new(&url);
website.crawl().await;
for link in website.get_links() {
Expand All @@ -43,7 +43,7 @@ You can use `Configuration` object to configure your crawler:

```rust
// ..
let mut website: Website = Website::new("https://choosealicense.com");
let mut website = Website::new("https://choosealicense.com");

website.configuration.respect_robots_txt = true;
website.configuration.subdomains = true;
Expand All @@ -56,6 +56,8 @@ website.on_link_find_callback = Some(|s, html| { println!("link target: {}", s);
website.configuration.blacklist_url.get_or_insert(Default::default()).push("https://choosealicense.com/licenses/".into());
website.configuration.proxies.get_or_insert(Default::default()).push("socks5://10.1.1.1:12345".into()); // Defaults to None - proxy list.
website.budget = Some(spider::hashbrown::HashMap::from([(spider::CaseInsensitiveString::new("*"), 300), (spider::CaseInsensitiveString::new("/licenses"), 10)])); // Defaults to None - Requires the `budget` feature flag
website.cron_str = "1/5 * * * * *".into(); // Defaults to empty string - Requires the `cron` feature flag
website.cron_type = spider::website::CronType::Crawl; // Defaults to CronType::Crawl - Requires the `cron` feature flag

website.crawl().await;
```
Expand All @@ -78,6 +80,8 @@ website
.with_external_domains(Some(Vec::from(["https://creativecommons.org/licenses/by/3.0/"].map(|d| d.to_string())).into_iter()))
.with_headers(None)
.with_blacklist_url(Some(Vec::from(["https://choosealicense.com/licenses/".into()])))
// requires the `cron` feature flag
.with_cron("1/5 * * * * *", Default::default())
.with_proxies(None);
```

Expand All @@ -87,7 +91,7 @@ We have a couple optional feature flags. Regex blacklisting, jemaloc backend, gl

```toml
[dependencies]
spider = { version = "1.49.13", features = ["regex", "ua_generator"] }
spider = { version = "1.50.0", features = ["regex", "ua_generator"] }
```

1. `ua_generator`: Enables auto generating a random real User-Agent.
Expand Down Expand Up @@ -117,7 +121,7 @@ Move processing to a worker, drastically increases performance even if worker is

```toml
[dependencies]
spider = { version = "1.49.13", features = ["decentralized"] }
spider = { version = "1.50.0", features = ["decentralized"] }
```

```sh
Expand All @@ -137,7 +141,7 @@ Use the subscribe method to get a broadcast channel.

```toml
[dependencies]
spider = { version = "1.49.13", features = ["sync"] }
spider = { version = "1.50.0", features = ["sync"] }
```

```rust,no_run
Expand All @@ -148,7 +152,7 @@ use spider::tokio;
#[tokio::main]
async fn main() {
let mut website: Website = Website::new("https://choosealicense.com");
let mut website = Website::new("https://choosealicense.com");
let mut rx2 = website.subscribe(16).unwrap();
let join_handle = tokio::spawn(async move {
Expand All @@ -167,7 +171,7 @@ Allow regex for blacklisting routes

```toml
[dependencies]
spider = { version = "1.49.13", features = ["regex"] }
spider = { version = "1.50.0", features = ["regex"] }
```

```rust,no_run
Expand All @@ -178,7 +182,7 @@ use spider::tokio;
#[tokio::main]
async fn main() {
let mut website: Website = Website::new("https://choosealicense.com");
let mut website = Website::new("https://choosealicense.com");
website.configuration.blacklist_url.push("/licenses/".into());
website.crawl().await;
Expand All @@ -194,7 +198,7 @@ If you are performing large workloads you may need to control the crawler by ena

```toml
[dependencies]
spider = { version = "1.49.13", features = ["control"] }
spider = { version = "1.50.0", features = ["control"] }
```

```rust
Expand Down Expand Up @@ -236,7 +240,7 @@ async fn main() {
use std::io::{Write, stdout};

let url = "https://choosealicense.com/";
let mut website: Website = Website::new(&url);
let mut website = Website::new(&url);

website.scrape().await;

Expand All @@ -258,11 +262,49 @@ async fn main() {
}
```

### Cron Jobs

Use cron jobs to run crawls continuously at any time.

```toml
[dependencies]
spider = { version = "1.50.0", features = ["sync", "cron"] }
```

```rust,no_run
extern crate spider;
use spider::website::{Website, run_cron};
use spider::tokio;
use std::time::Duration;
#[tokio::main]
async fn main() {
let mut website = Website::new("https://choosealicense.com");
// set the cron to run or use the builder pattern `website.with_cron`.
website.cron_str = "1/5 * * * * *".into();
let mut rx2 = website.subscribe(16).unwrap();
let join_handle = tokio::spawn(async move {
while let Ok(res) = rx2.recv().await {
println!("{:?}", res.get_url());
}
});
// take ownership of the website. You can also use `website.run_cron`, but then you must abort the spawned handles manually.
let runner = run_cron(website).await;
println!("Starting the Runner for 10 seconds");
tokio::time::sleep(Duration::from_secs(10)).await;
let _ = tokio::join!(runner.stop(), join_handle);
}
```

### Chrome

```toml
[dependencies]
spider = { version = "1.49.13", features = ["chrome"] }
spider = { version = "1.50.0", features = ["chrome"] }
```

You can use `website.crawl_concurrent_raw` to perform a crawl without chromium when needed. Use the feature flag `chrome_headed` to enable headful browser usage if needed to debug.
Expand Down
Loading

0 comments on commit cd5cec5

Please sign in to comment.