Skip to content

Commit

Permalink
feat(cron): add cron feature flag (#153)
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Nov 25, 2023
1 parent 16c796a commit bb76651
Show file tree
Hide file tree
Showing 14 changed files with 632 additions and 34 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

## Unreleased

## v1.50.1

1. feat(cron): add cron feature flag [#153]

## v1.36.0

1. feat(sync): subscribe to page updates to perform async handling of data
Expand All @@ -12,7 +16,7 @@

## v1.30.5

1. "feat(worker): add tls support"
1. feat(worker): add tls support

## v1.30.3

Expand Down
22 changes: 18 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,16 @@

The fastest web crawler and indexer. Foundational building blocks for data curation workloads.

- Concurrent
- Streaming
- Decentralization
- Headless Chrome Rendering
- HTTP Proxies
- Cron Jobs
- Subscriptions
- Blacklisting and Budgeting Depth
- [Changelog](CHANGELOG.md)

## Getting Started

The simplest way to get started is to use [Spider Cloud](https://spiderwebai.xyz), a pain-free hosted service. View the [spider](./spider/README.md) or [spider_cli](./spider_cli/README.md) directories for local installations.
Expand Down
11 changes: 8 additions & 3 deletions examples/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_examples"
version = "1.49.13"
version = "1.50.1"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "Multithreaded web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
Expand All @@ -22,7 +22,7 @@ htr = "0.5.27"
flexbuffers = "2.0.0"

[dependencies.spider]
version = "1.49.13"
version = "1.50.1"
path = "../spider"
features = ["serde"]

Expand Down Expand Up @@ -70,4 +70,9 @@ path = "configuration.rs"
[[example]]
name = "budget"
path = "budget.rs"
required-features = ["spider/budget", "spider/sync"]
required-features = ["spider/budget", "spider/sync"]

[[example]]
name = "cron"
path = "cron.rs"
required-features = ["spider/sync", "spider/cron"]
4 changes: 4 additions & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,7 @@ Crawl the page and output the links via [Serde](./serde.rs).
Crawl links with a budget limiting the number of pages allowed [Budget](./budget.rs).

- `cargo run --example budget`

Crawl links on a given cron schedule [Cron](./cron.rs).

- `cargo run --example cron`
25 changes: 25 additions & 0 deletions examples/cron.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
//! `cargo run --example cron`
extern crate spider;

use spider::tokio;
use spider::website::{Website, run_cron};

/// Example entry point: configures a `Website` with a cron expression,
/// prints the URL of every page received on a subscription, lets the cron
/// runner operate for 20 seconds, and then stops it.
#[tokio::main]
async fn main() {
    let mut site: Website = Website::new("https://rsseau.fr");
    // Six-field cron expression. NOTE(review): presumably seconds-first syntax
    // (fires every 5 seconds starting at second 1) — confirm against the cron
    // parser used by the `cron` feature.
    site.cron_str = "1/5 * * * * *".into();

    // Subscribe before the website is moved into the cron runner below.
    // NOTE(review): 16 looks like the channel capacity — confirm.
    let mut page_rx = site.subscribe(16).unwrap();

    // Background task: print each received page's URL until `recv` returns an error.
    let printer = tokio::spawn(async move {
        while let Ok(page) = page_rx.recv().await {
            println!("{:?}", page.get_url());
        }
    });

    // `run_cron` takes ownership of the website and yields a handle exposing `stop()`.
    let cron_runner = run_cron(site).await;

    println!("Starting the Runner for 20 seconds");
    let run_window = tokio::time::Duration::from_secs(20);
    tokio::time::sleep(run_window).await;

    // Stop the runner and wait for the printing task; both results are ignored.
    let _ = tokio::join!(cron_runner.stop(), printer);
}
10 changes: 7 additions & 3 deletions spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "1.49.13"
version = "1.50.1"
authors = ["madeindjs <contact@rousseau-alexandre.fr>", "j-mendez <jeff@a11ywatch.com>"]
description = "The fastest web crawler written in Rust."
repository = "https://github.com/spider-rs/spider"
Expand Down Expand Up @@ -43,12 +43,15 @@ case_insensitive_string = { version = "0.1.7", features = [ "compact", "serde" ]
jsdom = { version = "0.0.11-alpha.1", optional = true, features = [ "hashbrown", "tokio" ] }
chromiumoxide = { version = "0.5.6", optional = true, features = ["tokio-runtime", "bytes"], default-features = false }
sitemap = { version = "0.4.1", optional = true }
chrono = { version = "0.4.31", optional = true }
cron = { version = "0.12.0", optional = true }
async-trait = { version = "0.1.74", optional = true }

[target.'cfg(all(not(windows), not(target_os = "android"), not(target_env = "musl")))'.dependencies]
tikv-jemallocator = { version = "0.5.0", optional = true }

[features]
default = ["sync"]
default = ["sync", "cron"]
regex = ["dep:regex"]
glob = ["dep:regex", "dep:itertools"]
ua_generator = ["dep:ua_generator"]
Expand All @@ -70,4 +73,5 @@ chrome = ["dep:chromiumoxide"]
chrome_headed = ["chrome"]
chrome_cpu = ["chrome"]
chrome_stealth = ["chrome"]
cookies = ["reqwest/cookies"]
cookies = ["reqwest/cookies"]
cron = ["dep:chrono", "dep:cron", "dep:async-trait"]
Loading

0 comments on commit bb76651

Please sign in to comment.