feat(cron): add cron feature flag (#153)
j-mendez committed Nov 25, 2023
1 parent 16c796a commit dfef12f
Showing 14 changed files with 631 additions and 33 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,10 @@

## Unreleased

## v1.50.1

1. feat(cron): add cron feature flag [#153]

## v1.36.0

1. feat(sync): subscribe to page updates to perform async handling of data
22 changes: 18 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default.

10 changes: 10 additions & 0 deletions README.md
@@ -7,6 +7,16 @@

The fastest web crawler and indexer. Foundational building blocks for data curation workloads.

- Concurrent
- Streaming
- Decentralization
- Headless Chrome Rendering
- HTTP Proxies
- Cron Jobs
- Subscriptions
- Blacklisting and Budgeting Depth
- [Changelog](CHANGELOG.md)

## Getting Started

The simplest way to get started is to use [Spider Cloud](https://spiderwebai.xyz) for a pain-free hosted service. View the [spider](./spider/README.md) or [spider_cli](./spider_cli/README.md) directory for local installations.
11 changes: 8 additions & 3 deletions examples/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "1.49.13"
version = "1.50.1"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
@@ -22,7 +22,7 @@ htr = "0.5.27"
flexbuffers = "2.0.0"

[dependencies.spider]
version = "1.49.13"
version = "1.50.1"
path = "../spider"
features = ["serde"]

@@ -70,4 +70,9 @@ path = "configuration.rs"
[[example]]
name = "budget"
path = "budget.rs"
required-features = ["spider/budget", "spider/sync"]
required-features = ["spider/budget", "spider/sync"]

[[example]]
name = "cron"
path = "cron.rs"
required-features = ["spider/sync", "spider/cron"]
4 changes: 4 additions & 0 deletions examples/README.md
@@ -39,3 +39,7 @@ Crawl the page and output the links via [Serde](./serde.rs).
Crawl links with a budget limiting the amount of pages allowed [Budget](./budget.rs).

- `cargo run --example budget`

Crawl links at a given cron time [Cron](./cron.rs).

- `cargo run --example cron`
25 changes: 25 additions & 0 deletions examples/cron.rs
@@ -0,0 +1,25 @@
//! `cargo run --example cron`
extern crate spider;

use spider::tokio;
use spider::website::{Website, run_cron};

#[tokio::main]
async fn main() {
let mut website: Website = Website::new("https://rsseau.fr");
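// "1/5 * * * * *" is a seconds-first cron expression as used by the `cron` crate
// (sec min hour day month day-of-week): fire every 5 seconds, starting at second 1.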
website.cron_str = "1/5 * * * * *".into();

let mut rx2 = website.subscribe(16).unwrap();

let join_handle = tokio::spawn(async move {
while let Ok(res) = rx2.recv().await {
println!("{:?}", res.get_url());
}
});

let runner = run_cron(website).await;

println!("Starting the Runner for 20 seconds");
tokio::time::sleep(tokio::time::Duration::from_secs(20)).await;
let _ = tokio::join!(runner.stop(), join_handle);
}
10 changes: 7 additions & 3 deletions spider/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "1.49.13"
version = "1.50.1"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "The fastest web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
@@ -43,12 +43,15 @@ case_insensitive_string = { version = "0.1.7", features = [ "compact", "serde" ]
jsdom = { version = "0.0.11-alpha.1", optional = true, features = [ "hashbrown", "tokio" ] }
chromiumoxide = { version = "0.5.6", optional = true, features = ["tokio-runtime", "bytes"], default-features = false }
sitemap = { version = "0.4.1", optional = true }
chrono = { version = "0.4.31", optional = true }
cron = { version = "0.12.0", optional = true }
async-trait = { version = "0.1.74", optional = true }

[target.'cfg(all(not(windows), not(target_os = "android"), not(target_env = "musl")))'.dependencies]
tikv-jemallocator = { version = "0.5.0", optional = true }

[features]
default = ["sync"]
default = ["sync", "cron"]
regex = ["dep:regex"]
glob = ["dep:regex", "dep:itertools"]
ua_generator = ["dep:ua_generator"]
@@ -70,4 +73,5 @@ chrome = ["dep:chromiumoxide"]
chrome_headed = ["chrome"]
chrome_cpu = ["chrome"]
chrome_stealth = ["chrome"]
cookies = ["reqwest/cookies"]
cookies = ["reqwest/cookies"]
cron = ["dep:chrono", "dep:cron", "dep:async-trait"]
77 changes: 60 additions & 17 deletions spider/README.md
@@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom

```toml
[dependencies]
spider = "1.49.13"
spider = "1.50.1"
```

And then the code:
@@ -30,7 +30,7 @@ use spider::tokio;
#[tokio::main]
async fn main() {
let url = "https://choosealicense.com";
let mut website: Website = Website::new(&url);
let mut website = Website::new(&url);
website.crawl().await;
for link in website.get_links() {
@@ -43,7 +43,7 @@ You can use the `Configuration` object to configure your crawler:

```rust
// ..
let mut website: Website = Website::new("https://choosealicense.com");
let mut website = Website::new("https://choosealicense.com");

website.configuration.respect_robots_txt = true;
website.configuration.subdomains = true;
@@ -52,10 +52,12 @@ website.configuration.delay = 0; // Defaults to 0 ms due to concurrency handling
website.configuration.request_timeout = None; // Defaults to 15000 ms
website.configuration.http2_prior_knowledge = false; // Enable if you know the webserver supports http2
website.configuration.user_agent = Some("myapp/version".into()); // Defaults to using a random agent
website.on_link_find_callback = Some(|s, html| { println!("link target: {}", s); (s, html)}); // Callback to run on each link find
website.on_link_find_callback = Some(|s, html| { println!("link target: {}", s); (s, html)}); // Callback to run on each link found - useful for mutating the URL, e.g. converting the top-level domain from `.fr` to `.es`.
website.configuration.blacklist_url.get_or_insert(Default::default()).push("https://choosealicense.com/licenses/".into());
website.configuration.proxies.get_or_insert(Default::default()).push("socks5://10.1.1.1:12345".into()); // Defaults to None - proxy list.
website.budget = Some(spider::hashbrown::HashMap::from([(spider::CaseInsensitiveString::new("*"), 300), (spider::CaseInsensitiveString::new("/licenses"), 10)])); // Defaults to None - Requires the `budget` feature flag
website.cron_str = "1/5 * * * * *".into(); // Defaults to empty string - Requires the `cron` feature flag
website.cron_type = spider::website::CronType::Crawl; // Defaults to CronType::Crawl - Requires the `cron` feature flag

website.crawl().await;
```
@@ -78,6 +80,8 @@ website
.with_external_domains(Some(Vec::from(["https://creativecommons.org/licenses/by/3.0/"].map(|d| d.to_string())).into_iter()))
.with_headers(None)
.with_blacklist_url(Some(Vec::from(["https://choosealicense.com/licenses/".into()])))
// requires the `cron` feature flag
.with_cron("1/5 * * * * *", Default::default())
.with_proxies(None);
```

@@ -87,7 +91,7 @@ We have a couple optional feature flags. Regex blacklisting, jemalloc backend, gl

```toml
[dependencies]
spider = { version = "1.49.13", features = ["regex", "ua_generator"] }
spider = { version = "1.50.1", features = ["regex", "ua_generator"] }
```

1. `ua_generator`: Enables auto generating a random real User-Agent.
@@ -97,27 +101,28 @@ spider = { version = "1.49.13", features = ["regex", "ua_generator"] }
1. `sync`: Subscribe to changes for async Page data processing. [Enabled by default]
1. `budget`: Allows setting a crawl budget per path with depth.
1. `control`: Enables the ability to pause, start, and shutdown crawls on demand.
1. `full_resources`: Enables gathering all content that relates to the domain like css,jss, and etc.
1. `full_resources`: Enables gathering all content that relates to the domain, like CSS, JS, etc.
1. `serde`: Enables serde serialization support.
1. `socks`: Enables socks5 proxy support.
1. `glob`: Enables [url glob](https://everything.curl.dev/cmdline/globbing) support.
1. `fs`: Enables storing resources to disk for parsing (may greatly increase performance at the cost of temp storage). [Enabled by default]
1. `js`: Enables parsing links created with JavaScript using the alpha [jsdom](https://github.com/a11ywatch/jsdom) crate.
1. `sitemap`: Include sitemap pages in results.
1. `time`: Enables duration tracking per page.
1. `chrome`: Enables chrome headless rendering, use the env var `CHROME_URL` to connect remotely [experimental].
1. `chrome`: Enables chrome headless rendering; use the env var `CHROME_URL` to connect remotely.
1. `chrome_headed`: Enables headful chrome rendering [experimental].
1. `chrome_cpu`: Disables gpu usage for the chrome browser.
1. `chrome_stealth`: Enables stealth mode to make it harder to be detected as a bot.
1. `cookies`: Enables storing and setting cookies to use for requests.
1. `cron`: Enables the ability to start cron jobs for the website (flags can be combined, as shown below).
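
For example, a sketch of a `Cargo.toml` entry that enables regex blacklisting, crawl budgets, and cron scheduling together (any of the flags above can be mixed this way):

```toml
[dependencies]
spider = { version = "1.50.1", features = ["regex", "budget", "cron"] }
```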

### Decentralization

Move processing to a worker; this drastically increases performance even if the worker is on the same machine, since the runtime efficiently splits the IO work.

```toml
[dependencies]
spider = { version = "1.49.13", features = ["decentralized"] }
spider = { version = "1.50.1", features = ["decentralized"] }
```

```sh
@@ -137,7 +142,7 @@ Use the subscribe method to get a broadcast channel.

```toml
[dependencies]
spider = { version = "1.49.13", features = ["sync"] }
spider = { version = "1.50.1", features = ["sync"] }
```

```rust,no_run
@@ -148,7 +153,7 @@ use spider::tokio;
#[tokio::main]
async fn main() {
let mut website: Website = Website::new("https://choosealicense.com");
let mut website = Website::new("https://choosealicense.com");
let mut rx2 = website.subscribe(16).unwrap();
let join_handle = tokio::spawn(async move {
@@ -167,7 +172,7 @@ Allow regex for blacklisting routes

```toml
[dependencies]
spider = { version = "1.49.13", features = ["regex"] }
spider = { version = "1.50.1", features = ["regex"] }
```

```rust,no_run
Expand All @@ -178,7 +183,7 @@ use spider::tokio;
#[tokio::main]
async fn main() {
let mut website: Website = Website::new("https://choosealicense.com");
let mut website = Website::new("https://choosealicense.com");
website.configuration.blacklist_url.push("/licenses/".into());
website.crawl().await;
Expand All @@ -194,7 +199,7 @@ If you are performing large workloads you may need to control the crawler by ena

```toml
[dependencies]
spider = { version = "1.49.13", features = ["control"] }
spider = { version = "1.50.1", features = ["control"] }
```

```rust
@@ -211,10 +216,10 @@ async fn main() {

tokio::spawn(async move {
pause(url).await;
sleep(Duration::from_millis(5000)).await;
sleep(tokio::time::Duration::from_millis(5000)).await;
resume(url).await;
// perform shutdown if crawl takes longer than 15s
sleep(Duration::from_millis(15000)).await;
sleep(tokio::time::Duration::from_millis(15000)).await;
// you could also abort the task to shutdown crawls if using website.crawl in another thread.
shutdown(url).await;
});
@@ -236,7 +241,7 @@ async fn main() {
use std::io::{Write, stdout};

let url = "https://choosealicense.com/";
let mut website: Website = Website::new(&url);
let mut website = Website::new(&url);

website.scrape().await;

@@ -258,11 +263,49 @@ async fn main() {
}
```

### Cron Jobs

Use cron jobs to run crawls continuously at any time.

```toml
[dependencies]
spider = { version = "1.50.1", features = ["sync", "cron"] }
```

```rust,no_run
extern crate spider;
use spider::website::{Website, run_cron};
use spider::tokio;
#[tokio::main]
async fn main() {
let mut website = Website::new("https://choosealicense.com");
// set the cron to run or use the builder pattern `website.with_cron`.
website.cron_str = "1/5 * * * * *".into();
let mut rx2 = website.subscribe(16).unwrap();
let join_handle = tokio::spawn(async move {
while let Ok(res) = rx2.recv().await {
println!("{:?}", res.get_url());
}
});
// take ownership of the website. You can also use website.run_cron, but then you must abort the created handles manually.
let runner = run_cron(website).await;
println!("Starting the Runner for 10 seconds");
tokio::time::sleep(tokio::time::Duration::from_secs(10)).await;
let _ = tokio::join!(runner.stop(), join_handle);
}
```

### Chrome

```toml
[dependencies]
spider = { version = "1.49.13", features = ["chrome"] }
spider = { version = "1.50.1", features = ["chrome"] }
```

You can use `website.crawl_concurrent_raw` to perform a crawl without chromium when needed. Use the feature flag `chrome_headed` to enable headful browser usage if you need to debug.
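
As a minimal sketch (assuming the `chrome` feature above is enabled), the crawl API stays the same; pages are rendered through headless Chrome, or through the remote instance at `CHROME_URL` when that variable is set:

```rust
extern crate spider;

use spider::tokio;
use spider::website::Website;

#[tokio::main]
async fn main() {
    // Built with `--features chrome`, this crawl renders each page in headless Chrome.
    let mut website = Website::new("https://choosealicense.com");
    website.crawl().await;

    for link in website.get_links() {
        println!("{:?}", link);
    }
}
```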