diff --git a/.gitignore b/.gitignore index 690514cbc..88bdce6d7 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ target .env .DS_Store _temp_spider_downloads -storage \ No newline at end of file +storage +http-cacache \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 3d0d041df..96c603665 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -75,7 +75,7 @@ dependencies = [ "once_cell", "parking_lot", "paste", - "windows", + "windows 0.42.0", ] [[package]] @@ -684,6 +684,12 @@ version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" +[[package]] +name = "bytecount" +version = "0.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1e5f035d16fc623ae5f74981db80a439803888314e3a555fd6f04acd51a3205" + [[package]] name = "bytemuck" version = "1.14.0" @@ -719,6 +725,33 @@ dependencies = [ "serde", ] +[[package]] +name = "cacache" +version = "12.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "142316461ed3a3dfcba10417317472da5bfd0461e4d276bf7c07b330766d9490" +dependencies = [ + "digest", + "either", + "futures", + "hex", + "libc", + "memmap2", + "miette", + "reflink-copy", + "serde", + "serde_derive", + "serde_json", + "sha1", + "sha2", + "ssri", + "tempfile", + "thiserror", + "tokio", + "tokio-stream", + "walkdir", +] + [[package]] name = "calloop" version = "0.10.6" @@ -733,6 +766,37 @@ dependencies = [ "vec_map", ] +[[package]] +name = "camino" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c" +dependencies = [ + "serde", +] + +[[package]] +name = "cargo-platform" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12024c4645c97566567129c204f65d5815a8c9aecf30fcbe682b2fe034996d36" +dependencies = [ + "serde", +] + +[[package]] +name = "cargo_metadata" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4acbb09d9ee8e23699b9634375c72795d095bf268439da88562cf9b501f181fa" +dependencies = [ + "camino", + "cargo-platform", + "semver", + "serde", + "serde_json", +] + [[package]] name = "case_insensitive_string" version = "0.1.7" @@ -1136,6 +1200,16 @@ dependencies = [ "once_cell", ] +[[package]] +name = "crossbeam-channel" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + [[package]] name = "crossbeam-deque" version = "0.8.3" @@ -1509,6 +1583,15 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "error-chain" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d2f06b9cac1506ece98fe3231e3cc9c4410ec3d5b1f24ae1c8946f0742cdefc" +dependencies = [ + "version_check", +] + [[package]] name = "error-code" version = "2.3.1" @@ -1864,6 +1947,12 @@ dependencies = [ "xml-rs", ] +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + [[package]] name = "glow" version = "0.12.3" @@ -2117,6 +2206,63 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-cache" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "57bae69908277d47122334bf6009e514cd3d24e65d0bf2b56f1b45db6ad8864d" +dependencies = [ + "async-trait", + "bincode", + "cacache", + "http", + "http-cache-semantics", + "httpdate", + "moka", + "serde", + "url", +] + +[[package]] +name = "http-cache-reqwest" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee5a40915f9da8f3feef6f942f9aace74393803c47bce6870f19cd2b5123f27a" +dependencies = [ + "anyhow", + "async-trait", + "http", + "http-cache", + "http-cache-semantics", + "reqwest", + "reqwest-middleware", + "serde", + "task-local-extensions", + "url", +] + +[[package]] +name = "http-cache-semantics" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7aec9f678bca3f4a15194b980f20ed9bfe0dd38e8d298c65c559a93dfbd6380a" +dependencies = [ + "http", + "http-serde", + "serde", + "time", +] + +[[package]] +name = "http-serde" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f560b665ad9f1572cfcaf034f7fb84338a7ce945216d64a90fd81f046a3caee" +dependencies = [ + "http", + "serde", +] + [[package]] name = "httparse" version = "1.8.0" @@ -2197,7 +2343,7 @@ dependencies = [ "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "windows-core", + "windows-core 0.51.1", ] [[package]] @@ -2461,6 +2607,15 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" +[[package]] +name = "mach2" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d0d1830bcd151a6fc4aea1369af235b36c1528fe976b8ff678683c9995eade8" +dependencies = [ + "libc", +] + [[package]] name = "malloc_buf" version = "0.0.6" @@ -2518,6 +2673,29 @@ dependencies = [ "autocfg", ] +[[package]] +name = "miette" +version = "5.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59bb584eaeeab6bd0226ccf3509a69d7936d148cf3d036ad350abe35e8c6856e" +dependencies = [ + "miette-derive", + "once_cell", + "thiserror", + "unicode-width", +] + +[[package]] +name = "miette-derive" +version = "5.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49e7bc1560b95a3c4a25d03de42fe76ca718ab92d1a22a55b9b4cf67b3ae635c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.40", +] + [[package]] name = "mime" version = "0.3.17" @@ -2562,6 +2740,30 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "moka" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8017ec3548ffe7d4cef7ac0e12b044c01164a74c0f3119420faeaf13490ad8b" +dependencies = [ + "async-lock 2.8.0", + "async-trait", + "crossbeam-channel", + "crossbeam-epoch", + "crossbeam-utils", + "futures-util", + "once_cell", + "parking_lot", + "quanta", + "rustc_version", + "skeptic", + "smallvec", + "tagptr", + "thiserror", + "triomphe", + "uuid", +] + [[package]] name = "multer" version = "2.1.0" @@ -3287,6 +3489,33 @@ dependencies = [ "vec1", ] +[[package]] +name = "pulldown-cmark" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a1a2f1f0a7ecff9c31abbe177637be0e97a0aef46cf8738ece09327985d998" +dependencies = [ + "bitflags 1.3.2", + "memchr", + "unicase", +] + +[[package]] +name = "quanta" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a17e662a7a8291a865152364c20c7abc5e60486ab2001e8ec10b24862de0b9ab" +dependencies 
= [ + "crossbeam-utils", + "libc", + "mach2", + "once_cell", + "raw-cpuid", + "wasi", + "web-sys", + "winapi", +] + [[package]] name = "quinn" version = "0.10.2" @@ -3373,6 +3602,15 @@ dependencies = [ "getrandom", ] +[[package]] +name = "raw-cpuid" +version = "10.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c297679cb867470fa8c9f67dbba74a78d78e3e98d7cf2b08d6d71540f797332" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "raw-window-handle" version = "0.5.2" @@ -3417,6 +3655,17 @@ dependencies = [ "bitflags 1.3.2", ] +[[package]] +name = "reflink-copy" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "767be24c0da52e7448d495b8d162506a9aa125426651d547d545d6c2b4b65b62" +dependencies = [ + "cfg-if", + "rustix 0.38.28", + "windows 0.52.0", +] + [[package]] name = "regex" version = "1.10.2" @@ -3473,6 +3722,7 @@ dependencies = [ "js-sys", "log", "mime", + "mime_guess", "native-tls", "once_cell", "percent-encoding", @@ -3499,6 +3749,21 @@ dependencies = [ "winreg 0.50.0", ] +[[package]] +name = "reqwest-middleware" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88a3e86aa6053e59030e7ce2d2a3b258dd08fc2d337d52f73f6cb480f5858690" +dependencies = [ + "anyhow", + "async-trait", + "http", + "reqwest", + "serde", + "task-local-extensions", + "thiserror", +] + [[package]] name = "res-regex" version = "0.1.4" @@ -3580,6 +3845,15 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "0.37.27" @@ -3756,6 +4030,15 @@ dependencies = [ "smallvec", ] +[[package]] +name = "semver" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "836fa6a3e1e547f9a2c4040802ec865b5d85f4014efe00555d7090a3dcaa1090" +dependencies = [ + "serde", +] + [[package]] name = "serde" version = "1.0.193" @@ -3841,6 +4124,17 @@ dependencies = [ "stable_deref_trait", ] +[[package]] +name = "sha-1" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5058ada175748e33390e40e872bd0fe59a19f265d0158daa551c5a88a76009c" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "sha1" version = "0.10.6" @@ -3852,6 +4146,17 @@ dependencies = [ "digest", ] +[[package]] +name = "sha2" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "signal-hook-registry" version = "1.4.1" @@ -3885,6 +4190,21 @@ dependencies = [ "xml-rs", ] +[[package]] +name = "skeptic" +version = "0.13.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16d23b015676c90a0f01c197bfdc786c20342c73a0afdda9025adb0bc42940a8" +dependencies = [ + "bytecount", + "cargo_metadata", + "error-chain", + "glob", + "pulldown-cmark", + "tempfile", + "walkdir", +] + [[package]] name = "slab" version = "0.4.9" @@ -3960,7 +4280,7 @@ dependencies = [ [[package]] name = "spider" -version = "1.70.5" +version = "1.80.1" dependencies = [ 
"ahash", "async-trait", @@ -3976,6 +4296,7 @@ dependencies = [ "fast_html5ever", "flexbuffers", "hashbrown 0.14.3", + "http-cache-reqwest", "itertools 0.12.0", "jsdom", "lazy_static", @@ -3985,6 +4306,7 @@ dependencies = [ "percent-encoding", "regex", "reqwest", + "reqwest-middleware", "selectors", "serde", "sitemap", @@ -4001,7 +4323,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "1.70.5" +version = "1.80.1" dependencies = [ "clap 4.4.11", "env_logger 0.9.3", @@ -4013,7 +4335,7 @@ dependencies = [ [[package]] name = "spider_examples" -version = "1.70.5" +version = "1.80.1" dependencies = [ "convert_case 0.5.0", "env_logger 0.9.3", @@ -4034,7 +4356,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "1.70.5" +version = "1.80.1" dependencies = [ "env_logger 0.10.1", "lazy_static", @@ -4054,6 +4376,23 @@ version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +[[package]] +name = "ssri" +version = "9.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da7a2b3c2bc9693bcb40870c4e9b5bf0d79f9cb46273321bf855ec513e919082" +dependencies = [ + "base64", + "digest", + "hex", + "miette", + "serde", + "sha-1", + "sha2", + "thiserror", + "xxhash-rust", +] + [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -4193,6 +4532,21 @@ dependencies = [ "libc", ] +[[package]] +name = "tagptr" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" + +[[package]] +name = "task-local-extensions" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba323866e5d033818e3240feeb9f7db2c4296674e4d9e16b97b7bf8f490434e8" +dependencies = [ + "pin-utils", +] + [[package]] name = "tempfile" version = "3.8.1" @@ -4508,6 +4862,12 @@ dependencies = [ "once_cell", ] +[[package]] +name = "triomphe" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1ee9bd9239c339d714d657fac840c6d2a4f9c45f4f9ec7b0975113458be78db" + [[package]] name = "try-lock" version = "0.2.5" @@ -4706,6 +5066,7 @@ dependencies = [ "form_urlencoded", "idna 0.5.0", "percent-encoding", + "serde", ] [[package]] @@ -4720,6 +5081,15 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" +[[package]] +name = "uuid" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e395fcf16a7a3d8127ec99782007af141946b4795001f876d54fb0d55978560" +dependencies = [ + "getrandom", +] + [[package]] name = "vcpkg" version = "0.2.15" @@ -5072,6 +5442,16 @@ dependencies = [ "windows_x86_64_msvc 0.42.2", ] +[[package]] +name = "windows" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" +dependencies = [ + "windows-core 0.52.0", + "windows-targets 0.52.0", +] + [[package]] name = "windows-core" version = "0.51.1" @@ -5081,6 +5461,15 @@ dependencies = [ "windows-targets 0.48.5", ] +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets 0.52.0", +] + [[package]] name = 
"windows-implement" version = "0.42.0" @@ -5409,6 +5798,12 @@ version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fcb9cbac069e033553e8bb871be2fbdffcab578eb25bd0f7c508cedc6dcd75a" +[[package]] +name = "xxhash-rust" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9828b178da53440fa9c766a3d2f73f7cf5d0ac1fe3980c1e5018d899fd19e07b" + [[package]] name = "zbus" version = "3.14.1" diff --git a/examples/Cargo.toml b/examples/Cargo.toml index 06a8a433a..d80024f95 100644 --- a/examples/Cargo.toml +++ b/examples/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_examples" -version = "1.70.5" +version = "1.80.1" authors = ["madeindjs ", "j-mendez "] description = "Multithreaded web crawler written in Rust." repository = "https://github.com/spider-rs/spider" @@ -22,7 +22,7 @@ htr = "0.5.27" flexbuffers = "2.0.0" [dependencies.spider] -version = "1.70.5" +version = "1.80.1" path = "../spider" features = ["serde"] diff --git a/spider/Cargo.toml b/spider/Cargo.toml index 37f5c6345..0649b33a0 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "1.70.5" +version = "1.80.1" authors = ["madeindjs ", "j-mendez "] description = "The fastest web crawler written in Rust." repository = "https://github.com/spider-rs/spider" @@ -49,6 +49,8 @@ async-trait = { version = "0.1.74", optional = true } napi = { version = "2", features = ["async", "tokio_rt", "napi4"], optional = true } strum = { version = "0.25", features = ["derive"] } async_job = { version = "0.1.0", optional = true } +reqwest-middleware = { version = "0.2.4", optional = true } +http-cache-reqwest = { version = "0.12.0", optional = true } [target.'cfg(all(not(windows), not(target_os = "android"), not(target_env = "musl")))'.dependencies] tikv-jemallocator = { version = "0.5.0", optional = true } @@ -72,6 +74,8 @@ reqwest_json = ["reqwest/json"] sitemap = ["dep:sitemap"] js = ["dep:jsdom"] budget = [] +cache = ["dep:reqwest-middleware", "dep:http-cache-reqwest"] +cache_mem = ["cache", "http-cache-reqwest/manager-moka"] chrome = ["dep:chromiumoxide"] chrome_headed = ["chrome"] chrome_cpu = ["chrome"] @@ -84,4 +88,4 @@ cron = ["dep:async_job", "dep:chrono", "dep:cron", "dep:async-trait"] napi = ["dep:napi"] napi_rustls_tls = ["napi", "reqwest/rustls-tls"] http3 = ["reqwest/http3"] -smart = ["chrome", "dep:regex"] \ No newline at end of file +smart = ["chrome", "dep:regex"] diff --git a/spider/README.md b/spider/README.md index 384f1dd0b..9d00ba349 100644 --- a/spider/README.md +++ b/spider/README.md @@ -16,7 +16,7 @@ This is a basic async example crawling a web page, add spider to your `Cargo.tom ```toml [dependencies] -spider = "1.70.5" +spider = "1.80.1" ``` And then the code: @@ -91,7 +91,7 @@ We have a couple optional feature flags. Regex blacklisting, jemaloc backend, gl ```toml [dependencies] -spider = { version = "1.70.5", features = ["regex", "ua_generator"] } +spider = { version = "1.80.1", features = ["regex", "ua_generator"] } ``` 1. `ua_generator`: Enables auto generating a random real User-Agent. @@ -109,6 +109,8 @@ spider = { version = "1.70.5", features = ["regex", "ua_generator"] } 1. `js`: Enables javascript parsing links created with the alpha [jsdom](https://github.com/a11ywatch/jsdom) crate. 1. `sitemap`: Include sitemap pages in results. 1. `time`: Enables duration tracking per page. +1. `cache`: Enables HTTP caching request to disk. +1. 
`cache_mem`: Enables HTTP caching request to persist in memory. 1. `chrome`: Enables chrome headless rendering, use the env var `CHROME_URL` to connect remotely. 1. `chrome_store_page`: Store the page object to perform other actions like taking screenshots conditionally. 1. `chrome_screenshot`: Enables storing a screenshot of each page on crawl. Defaults the screenshots to the ./storage/ directory. Use the env variable `SCREENSHOT_DIRECTORY` to adjust the directory. To save the background set the env var `SCREENSHOT_OMIT_BACKGROUND` to false. @@ -126,7 +128,7 @@ Move processing to a worker, drastically increases performance even if worker is ```toml [dependencies] -spider = { version = "1.70.5", features = ["decentralized"] } +spider = { version = "1.80.1", features = ["decentralized"] } ``` ```sh @@ -146,7 +148,7 @@ Use the subscribe method to get a broadcast channel. ```toml [dependencies] -spider = { version = "1.70.5", features = ["sync"] } +spider = { version = "1.80.1", features = ["sync"] } ``` ```rust,no_run @@ -176,7 +178,7 @@ Allow regex for blacklisting routes ```toml [dependencies] -spider = { version = "1.70.5", features = ["regex"] } +spider = { version = "1.80.1", features = ["regex"] } ``` ```rust,no_run @@ -203,7 +205,7 @@ If you are performing large workloads you may need to control the crawler by ena ```toml [dependencies] -spider = { version = "1.70.5", features = ["control"] } +spider = { version = "1.80.1", features = ["control"] } ``` ```rust @@ -273,7 +275,7 @@ Use cron jobs to run crawls continuously at anytime. ```toml [dependencies] -spider = { version = "1.70.5", features = ["sync", "cron"] } +spider = { version = "1.80.1", features = ["sync", "cron"] } ``` ```rust,no_run @@ -312,7 +314,7 @@ the feature flag [`chrome_intercept`] to possibly speed up request using Network ```toml [dependencies] -spider = { version = "1.70.5", features = ["chrome", "chrome_intercept"] } +spider = { version = "1.80.1", features = ["chrome", "chrome_intercept"] } ``` You can use `website.crawl_concurrent_raw` to perform a crawl without chromium when needed. Use the feature flag `chrome_headed` to enable headful browser usage if needed to debug. @@ -337,13 +339,45 @@ async fn main() { } ``` + +### Caching + +Enabling HTTP cache can be done with the feature flag [`cache`] or [`cache_mem`]. + +```toml +[dependencies] +spider = { version = "1.80.1", features = ["cache"] } +``` + +You need to set `website.cache` to true to enable as well. + +```rust +extern crate spider; + +use spider::tokio; +use spider::website::Website; + +#[tokio::main] +async fn main() { + let mut website: Website = Website::new("https://rsseau.fr") + .with_caching(true) + .build() + .unwrap(); + + website.crawl().await; + + println!("Links found {:?}", website.get_links().len()); + /// next run to website.crawl().await; will be faster since content is stored on disk. +} +``` + ### Smart Mode Intelligently run crawls using HTTP and JavaScript Rendering when needed. The best of both worlds to maintain speed and extract every page. This requires a chrome connection or browser installed on the system. ```toml [dependencies] -spider = { version = "1.70.5", features = ["smart"] } +spider = { version = "1.80.1", features = ["smart"] } ``` ```rust,no_run diff --git a/spider/src/lib.rs b/spider/src/lib.rs index 67fade9bb..370773c50 100644 --- a/spider/src/lib.rs +++ b/spider/src/lib.rs @@ -59,6 +59,8 @@ //! - `sitemap`: Include sitemap pages in results. //! 
- `js`: Enables javascript parsing links created with the alpha [jsdom](https://github.com/a11ywatch/jsdom) crate. //! - `time`: Enables duration tracking per page. +//! - `cache`: Enables HTTP caching request to disk. +//! - `cache_mem`: Enables HTTP caching request to persist in memory. //! - `chrome`: Enables chrome headless rendering, use the env var `CHROME_URL` to connect remotely [experimental]. //! - `chrome_headed`: Enables chrome rendering headful rendering [experimental]. //! - `chrome_cpu`: Disable gpu usage for chrome browser. @@ -95,6 +97,11 @@ pub extern crate case_insensitive_string; pub extern crate smallvec; pub extern crate url; +#[cfg(feature = "cache")] +pub extern crate http_cache_reqwest; +#[cfg(feature = "cache")] +pub extern crate reqwest_middleware; + #[macro_use] pub extern crate string_concat; #[macro_use] @@ -150,3 +157,17 @@ pub mod black_list { blacklist_url.contains(&link) } } + +/// The asynchronous Client to make requests with. +#[cfg(not(feature = "cache"))] +pub type Client = reqwest::Client; +#[cfg(not(feature = "cache"))] +/// The asynchronous Client Builder. +pub type ClientBuilder = reqwest::ClientBuilder; + +/// The asynchronous Client to make requests with HTTP Cache. +#[cfg(feature = "cache")] +pub type Client = reqwest_middleware::ClientWithMiddleware; +#[cfg(feature = "cache")] +/// The asynchronous Client Builder. +pub type ClientBuilder = reqwest_middleware::ClientBuilder; diff --git a/spider/src/packages/robotparser/parser.rs b/spider/src/packages/robotparser/parser.rs index 8b9f51e3b..0f58e1f36 100644 --- a/spider/src/packages/robotparser/parser.rs +++ b/spider/src/packages/robotparser/parser.rs @@ -26,7 +26,7 @@ //! ``` use compact_str::CompactString; -use reqwest::Client; +use crate::Client; use reqwest::Response; use reqwest::StatusCode; use std::time::{Duration, SystemTime, UNIX_EPOCH}; diff --git a/spider/src/page.rs b/spider/src/page.rs index 6240312a3..2073ade47 100644 --- a/spider/src/page.rs +++ b/spider/src/page.rs @@ -3,10 +3,11 @@ use crate::packages::scraper::Html; use crate::utils::log; use crate::utils::PageResponse; use crate::CaseInsensitiveString; +use crate::Client; use bytes::Bytes; use compact_str::CompactString; use hashbrown::HashSet; -use reqwest::{Client, StatusCode}; +use reqwest::StatusCode; use smallvec::SmallVec; #[cfg(all(feature = "time", not(feature = "decentralized")))] @@ -266,7 +267,7 @@ impl Page { build(url, page_resource) } - #[cfg(all(not(feature = "decentralized"), feature = "chrome"))] + #[cfg(all(not(feature = "decentralized"), feature = "chrome",))] /// Instantiate a new page and gather the html. pub async fn new(url: &str, client: &Client, page: &chromiumoxide::Page) -> Self { let page_resource = crate::utils::fetch_page_html(&url, &client, &page).await; @@ -280,6 +281,29 @@ impl Page { p } + /// Instantiate a new page and gather the links. + #[cfg(feature = "decentralized")] + pub async fn new(url: &str, client: &Client) -> Self { + use crate::serde::Deserialize; + use bytes::Buf; + let links = match crate::utils::fetch_page(&url, &client).await { + Some(b) => match flexbuffers::Reader::get_root(b.chunk()) { + Ok(buf) => match HashSet::::deserialize(buf) { + Ok(link) => link, + _ => Default::default(), + }, + _ => Default::default(), + }, + _ => Default::default(), + }; + + Page { + html: None, + links, + ..Default::default() + } + } + #[cfg(all(not(feature = "decentralized"), feature = "chrome"))] /// Take a screenshot of the page. The feature flag [chrome_store_page] is required. 
pub async fn screenshot(&self, full_page: bool, omit_background: bool) { @@ -312,29 +336,6 @@ impl Page { } } - /// Instantiate a new page and gather the links. - #[cfg(feature = "decentralized")] - pub async fn new(url: &str, client: &Client) -> Self { - use crate::serde::Deserialize; - use bytes::Buf; - let links = match crate::utils::fetch_page(&url, &client).await { - Some(b) => match flexbuffers::Reader::get_root(b.chunk()) { - Ok(buf) => match HashSet::::deserialize(buf) { - Ok(link) => link, - _ => Default::default(), - }, - _ => Default::default(), - }, - _ => Default::default(), - }; - - Page { - html: None, - links, - ..Default::default() - } - } - /// Page request fulfilled. pub fn is_empty(&self) -> bool { self.html.is_none() @@ -966,7 +967,11 @@ impl Page { } } -#[cfg(all(not(feature = "decentralized"), not(feature = "chrome")))] +#[cfg(all( + not(feature = "decentralized"), + not(feature = "chrome"), + not(feature = "cache") +))] #[tokio::test] async fn parse_links() { let client = Client::builder() @@ -988,7 +993,11 @@ async fn parse_links() { ); } -#[cfg(all(not(feature = "decentralized"), not(feature = "chrome")))] +#[cfg(all( + not(feature = "decentralized"), + not(feature = "chrome"), + not(feature = "cache") +))] #[tokio::test] async fn test_status_code() { let client = Client::builder() @@ -1001,7 +1010,11 @@ async fn test_status_code() { assert_eq!(page.status_code.as_u16(), 404); } -#[cfg(all(not(feature = "decentralized"), not(feature = "chrome")))] +#[cfg(all( + not(feature = "decentralized"), + not(feature = "chrome"), + not(feature = "cache") +))] #[tokio::test] async fn test_abs_path() { let client = Client::builder() diff --git a/spider/src/utils.rs b/spider/src/utils.rs index 51f298de6..d76c8b4ad 100644 --- a/spider/src/utils.rs +++ b/spider/src/utils.rs @@ -1,5 +1,6 @@ +use crate::Client; use log::{info, log_enabled, Level}; -use reqwest::{Client, Error, Response, StatusCode}; +use reqwest::{Error, Response, StatusCode}; /// The response of a web page. #[derive(Debug, Default)] diff --git a/spider/src/website.rs b/spider/src/website.rs index 2b15cb0da..bf2191b6b 100644 --- a/spider/src/website.rs +++ b/spider/src/website.rs @@ -4,6 +4,7 @@ use crate::packages::robotparser::parser::RobotFileParser; use crate::page::{build, get_page_selectors, Page}; use crate::utils::log; use crate::CaseInsensitiveString; +use crate::Client; #[cfg(feature = "cron")] use async_job::{async_trait, Job, Runner}; @@ -14,7 +15,6 @@ use compact_str::CompactString; use hashbrown::HashMap; use hashbrown::HashSet; -use reqwest::Client; #[cfg(not(feature = "napi"))] use std::io::{Error, ErrorKind}; use std::sync::atomic::{AtomicI8, Ordering}; @@ -195,6 +195,9 @@ pub struct Website { /// Block all images from rendering in Chrome. #[cfg(feature = "chrome_intercept")] pub chrome_intercept_block_images: bool, + /// Cache the page following HTTP Caching rules. 
+ #[cfg(feature = "cache")] + pub cache: bool, } impl Website { @@ -533,8 +536,8 @@ impl Website { } /// build the http client - #[cfg(not(feature = "decentralized"))] - fn configure_http_client_builder(&mut self) -> reqwest::ClientBuilder { + #[cfg(all(not(feature = "decentralized"), not(feature = "cache")))] + fn configure_http_client_builder(&mut self) -> crate::ClientBuilder { use crate::page::domain_name; use reqwest::redirect::Attempt; use std::sync::atomic::AtomicU8; @@ -626,24 +629,128 @@ impl Website { _ => client, }; + let client = self.configure_http_client_cookies(client); + client } - /// configure http client - #[cfg(all(not(feature = "decentralized"), not(feature = "cookies")))] - pub fn configure_http_client(&mut self) -> Client { - let client = self.configure_http_client_builder(); + /// build the http client with caching enabled. + #[cfg(all(not(feature = "decentralized"), feature = "cache"))] + fn configure_http_client_builder(&mut self) -> crate::ClientBuilder { + use crate::page::domain_name; + use http_cache_reqwest::{CACacheManager, Cache, CacheMode, HttpCache, HttpCacheOptions}; + use reqwest::redirect::Attempt; + use reqwest_middleware::ClientBuilder; + use std::sync::atomic::AtomicU8; - // should unwrap using native-tls-alpn - unsafe { client.build().unwrap_unchecked() } + let host_str = self.domain_parsed.as_deref().cloned(); + let default_policy = reqwest::redirect::Policy::default(); + + let policy = match host_str { + Some(host_s) => { + let initial_redirect = Arc::new(AtomicU8::new(0)); + let initial_redirect_limit = if self.configuration.respect_robots_txt { + 2 + } else { + 1 + }; + let subdomains = self.configuration.subdomains; + let tld = self.configuration.tld; + let host_domain_name = if tld { + domain_name(&host_s).to_string() + } else { + Default::default() + }; + + let custom_policy = { + move |attempt: Attempt| { + if tld && domain_name(attempt.url()) == host_domain_name + || subdomains + && attempt + .url() + .host_str() + .unwrap_or_default() + .ends_with(host_s.host_str().unwrap_or_default()) + || attempt.url().host() == host_s.host() + { + default_policy.redirect(attempt) + } else if attempt.previous().len() > 7 { + attempt.error("too many redirects") + } else if attempt.status().is_redirection() + && (0..initial_redirect_limit) + .contains(&initial_redirect.load(Ordering::Relaxed)) + { + initial_redirect.fetch_add(1, Ordering::Relaxed); + default_policy.redirect(attempt) + } else { + attempt.stop() + } + } + }; + reqwest::redirect::Policy::custom(custom_policy) + } + _ => default_policy, + }; + + let client = reqwest::Client::builder() + .user_agent(match &self.configuration.user_agent { + Some(ua) => ua.as_str(), + _ => &get_ua(), + }) + .redirect(policy) + .tcp_keepalive(Duration::from_millis(500)) + .pool_idle_timeout(None); + + let client = if self.configuration.http2_prior_knowledge { + client.http2_prior_knowledge() + } else { + client + }; + + let client = match &self.configuration.headers { + Some(headers) => client.default_headers(*headers.to_owned()), + _ => client, + }; + + let mut client = match &self.configuration.request_timeout { + Some(t) => client.timeout(**t), + _ => client, + }; + + let client = match &self.configuration.proxies { + Some(proxies) => { + for proxie in proxies.iter() { + match reqwest::Proxy::all(proxie) { + Ok(proxy) => client = client.proxy(proxy), + _ => (), + } + } + client + } + _ => client, + }; + + let client = self.configure_http_client_cookies(client); + let client = ClientBuilder::new(unsafe { 
client.build().unwrap_unchecked() }); + + if self.cache { + client.with(Cache(HttpCache { + mode: CacheMode::Default, + manager: CACacheManager::default(), + options: HttpCacheOptions::default(), + })) + } else { + client + } } - /// build the client with cookie configurations + /// build the client with cookie configurations. #[cfg(all(not(feature = "decentralized"), feature = "cookies"))] - pub fn configure_http_client(&mut self) -> Client { - let client = self.configure_http_client_builder(); + fn configure_http_client_cookies( + &mut self, + client: reqwest::ClientBuilder, + ) -> reqwest::ClientBuilder { let client = client.cookie_store(true); - let client = if !self.cookie_str.is_empty() && self.domain_parsed.is_some() { match self.domain_parsed.clone() { Some(p) => { @@ -656,13 +763,35 @@ impl Website { } else { client }; + client + } + /// build the client with cookie configurations. This does nothing with [cookies] flag enabled. + #[cfg(all(not(feature = "decentralized"), not(feature = "cookies")))] + fn configure_http_client_cookies( + &mut self, + client: reqwest::ClientBuilder, + ) -> reqwest::ClientBuilder { + client + } + + /// configure http client + #[cfg(all(not(feature = "decentralized"), not(feature = "cache")))] + pub fn configure_http_client(&mut self) -> Client { + let client = self.configure_http_client_builder(); // should unwrap using native-tls-alpn unsafe { client.build().unwrap_unchecked() } } + /// configure http client + #[cfg(all(not(feature = "decentralized"), feature = "cache"))] + pub fn configure_http_client(&mut self) -> Client { + let client = self.configure_http_client_builder(); + client.build() + } + /// configure http client for decentralization - #[cfg(feature = "decentralized")] + #[cfg(all(feature = "decentralized", not(feature = "cache")))] pub fn configure_http_client(&mut self) -> Client { use reqwest::header::HeaderMap; use reqwest::header::HeaderValue; @@ -750,6 +879,103 @@ impl Website { } } + /// configure http client for decentralization + #[cfg(all(feature = "decentralized", feature = "cache"))] + pub fn configure_http_client(&mut self) -> Client { + use http_cache_reqwest::{CACacheManager, Cache, CacheMode, HttpCache, HttpCacheOptions}; + use reqwest::header::HeaderMap; + use reqwest::header::HeaderValue; + use reqwest_middleware::ClientBuilder; + + let mut headers = HeaderMap::new(); + + let host_str = self.domain_parsed.take(); + let default_policy = reqwest::redirect::Policy::default(); + let policy = match host_str { + Some(host_s) => reqwest::redirect::Policy::custom(move |attempt| { + if attempt.url().host_str() != host_s.host_str() { + attempt.stop() + } else { + default_policy.redirect(attempt) + } + }), + _ => default_policy, + }; + + let mut client = reqwest::Client::builder() + .user_agent(match &self.configuration.user_agent { + Some(ua) => ua.as_str(), + _ => &get_ua(), + }) + .redirect(policy) + .tcp_keepalive(Duration::from_millis(500)) + .pool_idle_timeout(None); + + let referer = if self.configuration.tld && self.configuration.subdomains { + 2 + } else if self.configuration.tld { + 2 + } else if self.configuration.subdomains { + 1 + } else { + 0 + }; + + if referer > 0 { + // use expected http headers for providers that drop invalid headers + headers.insert(reqwest::header::REFERER, HeaderValue::from(referer)); + } + + match &self.configuration.headers { + Some(h) => headers.extend(*h.to_owned()), + _ => (), + }; + + match self.get_absolute_path(None) { + Some(domain_url) => { + let domain_url = domain_url.as_str(); + 
let domain_host = if domain_url.ends_with("/") { + &domain_url[0..domain_url.len() - 1] + } else { + domain_url + }; + match HeaderValue::from_str(domain_host) { + Ok(value) => { + headers.insert(reqwest::header::HOST, value); + } + _ => (), + } + } + _ => (), + } + + for worker in WORKERS.iter() { + match reqwest::Proxy::all(worker) { + Ok(worker) => { + client = client.proxy(worker); + } + _ => (), + } + } + + let client = ClientBuilder::new(unsafe { + match &self.configuration.request_timeout { + Some(t) => client.timeout(**t), + _ => client, + } + .default_headers(headers) + .build() + .unwrap_unchecked() + }) + .with(Cache(HttpCache { + mode: CacheMode::Default, + manager: CACacheManager::default(), + options: HttpCacheOptions::default(), + })); + + client.build() + } + /// setup atomic controller #[cfg(feature = "control")] fn configure_handler(&self) -> (Arc, tokio::task::JoinHandle<()>) { @@ -781,7 +1007,79 @@ impl Website { } }); - (handle, join_handle) + (handle, join_handle) + } + + /// Setup interception for chrome request + #[cfg(all(feature = "chrome", feature = "chrome_intercept"))] + async fn setup_chrome_interception( + &self, + chrome_page: &Arc, + ) -> Option> { + if self.chrome_intercept { + use chromiumoxide::cdp::browser_protocol::network::ResourceType; + + match chrome_page + .event_listener::() + .await + { + Ok(mut rp) => { + let host_name = self.domain.inner().to_string(); + let intercept_page = chrome_page.clone(); + let ignore_images = self.chrome_intercept_block_images; + + let ih = task::spawn(async move { + while let Some(event) = rp.next().await { + let u = &event.request.url; + if ignore_images && ResourceType::Image == event.resource_type || !u.starts_with(&host_name) && !crate::page::JS_FRAMEWORK_ALLOW.contains(&u.as_str()) { + match chromiumoxide::cdp::browser_protocol::fetch::FulfillRequestParams::builder() + .request_id(event.request_id.clone()) + .response_code(200) + .build() { + Ok(c) => { + if let Err(e) = intercept_page.execute(c).await + { + log("Failed to fullfill request: ", e.to_string()); + } + } + _ => { + log("Failed to get request handle ", &host_name); + } + } + } else if let Err(e) = intercept_page + .execute(chromiumoxide::cdp::browser_protocol::fetch::ContinueRequestParams::new(event.request_id.clone())) + .await + { + log("Failed to continue request: ", e.to_string()); + } + } + }); + + Some(ih) + } + _ => None, + } + } else { + None + } + } + + /// Setup interception for chrome request + #[cfg(all(feature = "chrome", not(feature = "chrome_intercept")))] + async fn setup_chrome_interception( + &self, + _chrome_page: &Arc, + ) -> Option> { + None + } + + /// setup selectors for handling link targets + fn setup_selectors(&self) -> Option<(CompactString, smallvec::SmallVec<[CompactString; 2]>)> { + get_page_selectors( + &self.domain.inner(), + self.configuration.subdomains, + self.configuration.tld, + ) } /// setup config for crawl @@ -819,15 +1117,6 @@ impl Website { (self.configure_robots_parser(client).await, None) } - /// setup selectors for handling link targets - fn setup_selectors(&self) -> Option<(CompactString, smallvec::SmallVec<[CompactString; 2]>)> { - get_page_selectors( - &self.domain.inner(), - self.configuration.subdomains, - self.configuration.tld, - ) - } - /// setup shared concurrent configs fn setup_crawl( &mut self, @@ -842,18 +1131,6 @@ impl Website { (interval, throttle) } - /// get base link for crawl establishing - #[cfg(feature = "regex")] - fn get_base_link(&self) -> &CaseInsensitiveString { - &self.domain - } 
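The cache-enabled `configure_http_client_builder` above builds a plain `reqwest::Client` and then hands it to `reqwest_middleware::ClientBuilder`, attaching an `http_cache_reqwest::Cache` layer backed by `CACacheManager` (the on-disk store behind the new `http-cacache` entry in `.gitignore`). A minimal standalone sketch of that wrapping pattern follows; the function name, user agent, and example URL are illustrative only and not part of this change.

```rust
use http_cache_reqwest::{CACacheManager, Cache, CacheMode, HttpCache, HttpCacheOptions};
use reqwest_middleware::{ClientBuilder, ClientWithMiddleware};

/// Wrap a plain reqwest client with HTTP-caching middleware (disk-backed).
fn build_cached_client() -> ClientWithMiddleware {
    let inner = reqwest::Client::builder()
        .user_agent("cache-example")
        .build()
        .expect("reqwest client should build");

    ClientBuilder::new(inner)
        .with(Cache(HttpCache {
            // Follow standard HTTP caching semantics (Cache-Control, ETag, ...).
            mode: CacheMode::Default,
            // CACacheManager persists entries on disk, by default under ./http-cacache.
            manager: CACacheManager::default(),
            options: HttpCacheOptions::default(),
        }))
        .build()
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let client = build_cached_client();
    // A repeated request for the same URL can be answered from the cache.
    let body = client.get("https://example.com/").send().await?.text().await?;
    println!("fetched {} bytes", body.len());
    Ok(())
}
```

The same layering is what lets `website.cache` act as a runtime toggle in this diff: when it is false the middleware is simply never attached and the wrapped client behaves like plain reqwest.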
- - /// get base link for crawl establishing - #[cfg(not(feature = "regex"))] - fn get_base_link(&self) -> &CompactString { - self.domain.inner() - } - /// expand links for crawl async fn _crawl_establish( &mut self, @@ -1547,268 +1824,42 @@ impl Website { let page_links = page.links(&*selectors).await; - (link, page, page_links) - }); - } - - task::yield_now().await; - - if links.capacity() >= 1500 { - links.shrink_to_fit(); - } - - while let Some(res) = set.join_next().await { - match res { - Ok(msg) => { - let page = msg.1; - links.extend(&msg.2 - &self.links_visited); - task::yield_now().await; - match self.pages.as_mut() { - Some(p) => p.push(page), - _ => (), - }; - } - _ => (), - }; - } - - task::yield_now().await; - if links.is_empty() { - break; - } - } - } - } - - /// Setup interception for chrome request - #[cfg(all(feature = "chrome", feature = "chrome_intercept"))] - async fn setup_chrome_interception( - &self, - chrome_page: &Arc, - ) -> Option> { - if self.chrome_intercept { - use chromiumoxide::cdp::browser_protocol::network::ResourceType; - - match chrome_page - .event_listener::() - .await - { - Ok(mut rp) => { - let host_name = self.domain.inner().to_string(); - let intercept_page = chrome_page.clone(); - let ignore_images = self.chrome_intercept_block_images; - - let ih = task::spawn(async move { - while let Some(event) = rp.next().await { - let u = &event.request.url; - if ignore_images && ResourceType::Image == event.resource_type || !u.starts_with(&host_name) && !crate::page::JS_FRAMEWORK_ALLOW.contains(&u.as_str()) { - match chromiumoxide::cdp::browser_protocol::fetch::FulfillRequestParams::builder() - .request_id(event.request_id.clone()) - .response_code(200) - .build() { - Ok(c) => { - if let Err(e) = intercept_page.execute(c).await - { - log("Failed to fullfill request: ", e.to_string()); - } - } - _ => { - log("Failed to get request handle ", &host_name); - } - } - } else if let Err(e) = intercept_page - .execute(chromiumoxide::cdp::browser_protocol::fetch::ContinueRequestParams::new(event.request_id.clone())) - .await - { - log("Failed to continue request: ", e.to_string()); - } - } - }); - - Some(ih) - } - _ => None, - } - } else { - None - } - } - - /// Setup interception for chrome request - #[cfg(all(feature = "chrome", not(feature = "chrome_intercept")))] - async fn setup_chrome_interception( - &self, - _chrome_page: &Arc, - ) -> Option> { - None - } - - /// Start to crawl website concurrently - #[cfg(all(not(feature = "decentralized"), feature = "chrome"))] - async fn crawl_concurrent(&mut self, client: &Client, handle: &Option>) { - self.start(); - let selectors = self.setup_selectors(); - - // crawl if valid selector - if selectors.is_some() { - let (mut interval, throttle) = self.setup_crawl(); - let blacklist_url = self.configuration.get_blacklist(); - - let on_link_find_callback = self.on_link_find_callback; - - match launch_browser(&self.configuration).await { - Some((mut browser, browser_handle)) => { - match browser.new_page("about:blank").await { - Ok(new_page) => { - if cfg!(feature = "chrome_stealth") || self.stealth_mode { - let _ = new_page.enable_stealth_mode_with_agent(&if self - .configuration - .user_agent - .is_some() - { - &self.configuration.user_agent.as_ref().unwrap().as_str() - } else { - "" - }); - } - - let mut selectors = unsafe { selectors.unwrap_unchecked() }; - - let chrome_page = Arc::new(new_page.clone()); - - let intercept_handle = - self.setup_chrome_interception(&chrome_page).await; - - let mut links: HashSet = self - 
.crawl_establish(&client, &mut selectors, false, &chrome_page) - .await; - - let shared = Arc::new(( - client.to_owned(), - selectors, - self.channel.clone(), - chrome_page, - self.external_domains_caseless.clone(), - )); - - let add_external = shared.4.len() > 0; - - if !links.is_empty() { - let mut set: JoinSet> = - JoinSet::new(); - let chandle = Handle::current(); - - // crawl while links exists - loop { - let stream = - tokio_stream::iter::>( - links.drain().collect(), - ) - .throttle(*throttle); - tokio::pin!(stream); - - loop { - match stream.next().await { - Some(link) => { - match handle.as_ref() { - Some(handle) => { - while handle.load(Ordering::Relaxed) == 1 { - interval.tick().await; - } - if handle.load(Ordering::Relaxed) == 2 - || self.shutdown - { - set.shutdown().await; - break; - } - } - None => (), - } - - if !self.is_allowed(&link, &blacklist_url) { - continue; - } - - log("fetch", &link); - self.links_visited.insert(link.clone()); - let permit = SEM.acquire().await.unwrap(); - let shared = shared.clone(); - task::yield_now().await; - - set.spawn_on( - async move { - let link_result = - match on_link_find_callback { - Some(cb) => cb(link, None), - _ => (link, None), - }; - let mut page = Page::new( - &link_result.0.as_ref(), - &shared.0, - &shared.3, - ) - .await; - - if add_external { - page.set_external(shared.4.clone()); - } - - let page_links = - page.links(&shared.1).await; - - channel_send_page(&shared.2, page); - - drop(permit); - - page_links - }, - &chandle, - ); - } - _ => break, - } - } + (link, page, page_links) + }); + } - while let Some(res) = set.join_next().await { - match res { - Ok(msg) => links.extend(&msg - &self.links_visited), - _ => (), - }; - } + task::yield_now().await; - if links.is_empty() { - break; - } - } - } + if links.capacity() >= 1500 { + links.shrink_to_fit(); + } - if !std::env::var("CHROME_URL").is_ok() { - let _ = browser.close().await; - let _ = browser_handle.await; - } else { - let _ = new_page.close().await; - if !browser_handle.is_finished() { - browser_handle.abort(); - } - } - match intercept_handle { - Some(intercept_handle) => { - let _ = intercept_handle.await; - } + while let Some(res) = set.join_next().await { + match res { + Ok(msg) => { + let page = msg.1; + links.extend(&msg.2 - &self.links_visited); + task::yield_now().await; + match self.pages.as_mut() { + Some(p) => p.push(page), _ => (), - } + }; } - _ => log("", "Chrome failed to open page."), - } + _ => (), + }; + } + + task::yield_now().await; + if links.is_empty() { + break; } - _ => log("", "Chrome failed to start."), } } } /// Start to crawl website concurrently - #[cfg(all(not(feature = "decentralized"), feature = "smart"))] - async fn crawl_concurrent_smart(&mut self, client: &Client, handle: &Option>) { + #[cfg(all(not(feature = "decentralized"), feature = "chrome",))] + async fn crawl_concurrent(&mut self, client: &Client, handle: &Option>) { self.start(); let selectors = self.setup_selectors(); @@ -1916,9 +1967,8 @@ impl Website { page.set_external(shared.4.clone()); } - let page_links = page - .smart_links(&shared.1, &shared.3) - .await; + let page_links = + page.links(&shared.1).await; channel_send_page(&shared.2, page); @@ -1955,7 +2005,6 @@ impl Website { browser_handle.abort(); } } - match intercept_handle { Some(intercept_handle) => { let _ = intercept_handle.await; @@ -1972,7 +2021,7 @@ impl Website { } /// Start to crawl website concurrently - #[cfg(all(not(feature = "decentralized"), not(feature = "chrome")))] + #[cfg(all(not(feature = 
"decentralized"), not(feature = "chrome"),))] async fn crawl_concurrent(&mut self, client: &Client, handle: &Option>) { self.start(); // crawl if valid selector @@ -2189,6 +2238,171 @@ impl Website { } } + /// Start to crawl website concurrently + #[cfg(all(not(feature = "decentralized"), feature = "smart"))] + async fn crawl_concurrent_smart(&mut self, client: &Client, handle: &Option>) { + self.start(); + let selectors = self.setup_selectors(); + + // crawl if valid selector + if selectors.is_some() { + let (mut interval, throttle) = self.setup_crawl(); + let blacklist_url = self.configuration.get_blacklist(); + + let on_link_find_callback = self.on_link_find_callback; + + match launch_browser(&self.configuration).await { + Some((mut browser, browser_handle)) => { + match browser.new_page("about:blank").await { + Ok(new_page) => { + if cfg!(feature = "chrome_stealth") || self.stealth_mode { + let _ = new_page.enable_stealth_mode_with_agent(&if self + .configuration + .user_agent + .is_some() + { + &self.configuration.user_agent.as_ref().unwrap().as_str() + } else { + "" + }); + } + + let mut selectors = unsafe { selectors.unwrap_unchecked() }; + + let chrome_page = Arc::new(new_page.clone()); + + let intercept_handle = + self.setup_chrome_interception(&chrome_page).await; + + let mut links: HashSet = self + .crawl_establish(&client, &mut selectors, false, &chrome_page) + .await; + + let shared = Arc::new(( + client.to_owned(), + selectors, + self.channel.clone(), + chrome_page, + self.external_domains_caseless.clone(), + )); + + let add_external = shared.4.len() > 0; + + if !links.is_empty() { + let mut set: JoinSet> = + JoinSet::new(); + let chandle = Handle::current(); + + // crawl while links exists + loop { + let stream = + tokio_stream::iter::>( + links.drain().collect(), + ) + .throttle(*throttle); + tokio::pin!(stream); + + loop { + match stream.next().await { + Some(link) => { + match handle.as_ref() { + Some(handle) => { + while handle.load(Ordering::Relaxed) == 1 { + interval.tick().await; + } + if handle.load(Ordering::Relaxed) == 2 + || self.shutdown + { + set.shutdown().await; + break; + } + } + None => (), + } + + if !self.is_allowed(&link, &blacklist_url) { + continue; + } + + log("fetch", &link); + self.links_visited.insert(link.clone()); + let permit = SEM.acquire().await.unwrap(); + let shared = shared.clone(); + task::yield_now().await; + + set.spawn_on( + async move { + let link_result = + match on_link_find_callback { + Some(cb) => cb(link, None), + _ => (link, None), + }; + let mut page = Page::new( + &link_result.0.as_ref(), + &shared.0, + &shared.3, + ) + .await; + + if add_external { + page.set_external(shared.4.clone()); + } + + let page_links = page + .smart_links(&shared.1, &shared.3) + .await; + + channel_send_page(&shared.2, page); + + drop(permit); + + page_links + }, + &chandle, + ); + } + _ => break, + } + } + + while let Some(res) = set.join_next().await { + match res { + Ok(msg) => links.extend(&msg - &self.links_visited), + _ => (), + }; + } + + if links.is_empty() { + break; + } + } + } + + if !std::env::var("CHROME_URL").is_ok() { + let _ = browser.close().await; + let _ = browser_handle.await; + } else { + let _ = new_page.close().await; + if !browser_handle.is_finished() { + browser_handle.abort(); + } + } + + match intercept_handle { + Some(intercept_handle) => { + let _ = intercept_handle.await; + } + _ => (), + } + } + _ => log("", "Chrome failed to open page."), + } + } + _ => log("", "Chrome failed to start."), + } + } + } + #[cfg(not(feature 
= "chrome"))] /// Start to scape website concurrently and store resources async fn scrape_concurrent(&mut self, client: &Client, handle: &Option>) { @@ -2602,6 +2816,18 @@ impl Website { } } + /// get base link for crawl establishing + #[cfg(feature = "regex")] + fn get_base_link(&self) -> &CaseInsensitiveString { + &self.domain + } + + /// get base link for crawl establishing + #[cfg(not(feature = "regex"))] + fn get_base_link(&self) -> &CompactString { + self.domain.inner() + } + /// Respect robots.txt file. pub fn with_respect_robots_txt(&mut self, respect_robots_txt: bool) -> &mut Self { self.configuration @@ -2756,6 +2982,19 @@ impl Website { self } + #[cfg(feature = "cache")] + /// Cache the page following HTTP rules. This method does nothing if the [cache] feature is not enabled. + pub fn with_caching(&mut self, cache: bool) -> &mut Self { + self.cache = cache; + self + } + + #[cfg(not(feature = "cache"))] + /// Cache the page following HTTP rules. This method does nothing if the [cache] feature is not enabled. + pub fn with_caching(&mut self, _cache: bool) -> &mut Self { + self + } + #[cfg(feature = "chrome_intercept")] /// Use request intercept for the request to only allow content that matches the host. If the content is from a 3rd party it needs to be part of our include list. This method does nothing if the [chrome_intercept] is not enabled. pub fn with_chrome_intercept( @@ -3304,3 +3543,26 @@ async fn test_crawl_shutdown() { assert_eq!(website.links_visited.len(), 1); } + +#[tokio::test] +#[cfg(all(feature = "cache", not(feature = "decentralized")))] +async fn test_cache() { + let domain = "https://choosealicense.com/"; + let mut website: Website = Website::new(&domain); + website.cache = true; + + let fresh_start = tokio::time::Instant::now(); + website.crawl().await; + let fresh_duration = fresh_start.elapsed(); + + let cached_start = tokio::time::Instant::now(); + website.crawl().await; + let cached_duration = cached_start.elapsed(); + + // cache should be faster at least 5x. + assert!( + fresh_duration.as_millis() > cached_duration.as_millis() * 5, + "{:?}", + cached_duration + ); +} diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index 900f9b050..1c758a2e7 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "1.70.5" +version = "1.80.1" authors = ["madeindjs ", "j-mendez "] description = "The fastest web crawler CLI written in Rust." repository = "https://github.com/spider-rs/spider" @@ -26,7 +26,7 @@ quote = "1.0.18" failure_derive = "0.1.8" [dependencies.spider] -version = "1.70.5" +version = "1.80.1" path = "../spider" [[bin]] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 4ac3a603f..88cbc3385 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "1.70.5" +version = "1.80.1" authors = ["madeindjs ", "j-mendez "] description = "The fastest web crawler as a worker or proxy." 
repository = "https://github.com/spider-rs/spider" @@ -22,7 +22,7 @@ lazy_static = "1.4.0" env_logger = "0.10.0" [dependencies.spider] -version = "1.70.5" +version = "1.80.1" path = "../spider" features = ["serde", "flexbuffers"] @@ -31,3 +31,4 @@ default = [] scrape = [] tls = ["warp/tls"] full_resources = ["spider/full_resources"] +cache = ["spider/cache"] diff --git a/spider_worker/src/main.rs b/spider_worker/src/main.rs index 47a73b79d..6d9b482e7 100644 --- a/spider_worker/src/main.rs +++ b/spider_worker/src/main.rs @@ -8,7 +8,7 @@ extern crate lazy_static; lazy_static! { /// top level request client to re-use - static ref CLIENT: spider::reqwest::Client = { + static ref CLIENT: spider::Client = { let mut proxy_website = Website::new("proxy"); let client = proxy_website.configure_http_client();