From cc11d7ac1f0662c0563d21576923a9d142d759e9 Mon Sep 17 00:00:00 2001 From: SIGSTACKFAULT Date: Fri, 24 May 2024 13:29:24 -0400 Subject: [PATCH 01/23] inital "just barely works" Fuse.js support --- Cargo.lock | 161 +++++++++++++------------ components/config/src/config/search.rs | 1 + components/search/src/lib.rs | 2 +- components/site/Cargo.toml | 1 + components/site/src/lib.rs | 51 ++++++-- 5 files changed, 130 insertions(+), 86 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fc1cfce7e5..ad80379c59 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -150,9 +150,9 @@ checksum = "70033777eb8b5124a81a1889416543dddef2de240019b674c81285a2635a7e1e" [[package]] name = "anyhow" -version = "1.0.83" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25bdb32cbbdce2b519a9cd7df3a678443100e265d5e25ca763b7572a5104f5f3" +checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da" [[package]] name = "arbitrary" @@ -168,7 +168,7 @@ checksum = "0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea" dependencies = [ "proc-macro2", "quote", - "syn 2.0.61", + "syn 2.0.65", ] [[package]] @@ -402,9 +402,9 @@ checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce" [[package]] name = "bytemuck" -version = "1.15.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d6d68c57235a3a081186990eca2867354726650f42f7516ca50c28d6281fd15" +checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5" [[package]] name = "byteorder" @@ -436,9 +436,9 @@ checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" [[package]] name = "camino" -version = "1.1.6" +version = "1.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c" +checksum = "e0ec6b951b160caa93cc0c7b209e5a3bff7aae9062213451ac99493cd844c239" dependencies = [ "serde", ] 
@@ -467,9 +467,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.0.97" +version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "099a5357d84c4c61eb35fc8eafa9a79a902c2f76911e5747ced4e032edd8d9b4" +checksum = "41c270e7540d725e65ac7f1b212ac8ce349719624d7bcff99f8e2e488e8cf03f" dependencies = [ "jobserver", "libc", @@ -618,7 +618,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.61", + "syn 2.0.65", ] [[package]] @@ -765,9 +765,9 @@ dependencies = [ [[package]] name = "crc32fast" -version = "1.4.0" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3855a8a784b474f333699ef2bbca9db2c4a1f6d9088a90a2d25b1eb53111eaa" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" dependencies = [ "cfg-if 1.0.0", ] @@ -802,9 +802,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.19" +version = "0.8.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" [[package]] name = "crunchy" @@ -851,7 +851,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" dependencies = [ "quote", - "syn 2.0.61", + "syn 2.0.65", ] [[package]] @@ -924,9 +924,9 @@ dependencies = [ [[package]] name = "deunicode" -version = "1.4.4" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "322ef0094744e63628e6f0eb2295517f79276a5b342a4c2ff3042566ca181d4e" +checksum = "339544cc9e2c4dc3fc7149fd630c5f22263a4fdf18a98afd0075784968b5cf00" [[package]] name = "digest" @@ -955,7 +955,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.61", + "syn 2.0.65", ] [[package]] @@ -981,9 
+981,9 @@ dependencies = [ [[package]] name = "either" -version = "1.11.0" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a47c1c47d2f5964e29c61246e81db715514cd532db6b5116a25ea3c03d6780a2" +checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b" [[package]] name = "elasticlunr-rs" @@ -1367,8 +1367,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if 1.0.0", + "js-sys", "libc", "wasi 0.11.0+wasi-snapshot-preview1", + "wasm-bindgen", ] [[package]] @@ -1429,18 +1431,19 @@ dependencies = [ [[package]] name = "grass" -version = "0.13.2" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b89786a806d5b192cf4e573f9831c847a455a142d000c922bdfc1e5edad14303" +checksum = "a46def7216d331efa51a6aa796ef777bfdfe9605378382827a553344b7e5eefc" dependencies = [ + "getrandom 0.2.15", "grass_compiler", ] [[package]] name = "grass_compiler" -version = "0.13.2" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cf7d155dd7cef20195016d01005033a5521aad307033f0f8e8bf0a02f5f7554" +checksum = "f39216c1843182f78541276fec96f88406861f16aa19cc9f8add70f8e67b7577" dependencies = [ "codemap", "indexmap 2.2.6", @@ -1536,7 +1539,7 @@ dependencies = [ "markup5ever", "proc-macro2", "quote", - "syn 2.0.61", + "syn 2.0.65", ] [[package]] @@ -1782,9 +1785,9 @@ dependencies = [ [[package]] name = "insta" -version = "1.38.0" +version = "1.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3eab73f58e59ca6526037208f0e98851159ec1633cf17b6cd2e1f2c3fd5d53cc" +checksum = "810ae6042d48e2c9e9215043563a58a80b877bc863228a74cf10c49d4620a6f5" dependencies = [ "console 0.15.8", "lazy_static", @@ -1800,7 +1803,7 @@ checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60" dependencies = [ 
"proc-macro2", "quote", - "syn 2.0.61", + "syn 2.0.65", ] [[package]] @@ -1983,9 +1986,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.154" +version = "0.2.155" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae743338b92ff9146ce83992f766a31066a91a8c84a45e0e9f21e7cf6de6d346" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" [[package]] name = "libfuzzer-sys" @@ -2042,7 +2045,7 @@ dependencies = [ "tera", "termcolor", "time", - "toml 0.8.12", + "toml 0.8.13", "unic-langid", "unicode-segmentation", "url", @@ -2062,9 +2065,9 @@ dependencies = [ [[package]] name = "lightningcss" -version = "1.0.0-alpha.55" +version = "1.0.0-alpha.56" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bd5bed3814fb631bfc1e24c2be6f7e86a9837c660909acab79a38374dcb8798" +checksum = "668e9f1774a4dda9e2233ad0f78c6987878bcf4201d2085bc3517a7f84d0ee92" dependencies = [ "ahash 0.8.11", "bitflags 2.5.0", @@ -2074,6 +2077,7 @@ dependencies = [ "dashmap", "data-encoding", "getrandom 0.2.15", + "indexmap 2.2.6", "itertools 0.10.5", "lazy_static", "parcel_selectors", @@ -2267,9 +2271,9 @@ checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" [[package]] name = "linux-raw-sys" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "lock_api" @@ -2443,9 +2447,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" +checksum = "87dfd01fe195c66b572b37921ad8803d010623c0aca821bea2302239d155cdae" dependencies = [ "adler", 
"simd-adler32", @@ -2694,7 +2698,7 @@ checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" dependencies = [ "proc-macro2", "quote", - "syn 2.0.61", + "syn 2.0.65", ] [[package]] @@ -2790,9 +2794,9 @@ checksum = "2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c" [[package]] name = "open" -version = "5.1.2" +version = "5.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "449f0ff855d85ddbf1edd5b646d65249ead3f5e422aaa86b7d2d0b049b103e32" +checksum = "2eb49fbd5616580e9974662cb96a3463da4476e649a7e4b258df0de065db0657" dependencies = [ "is-wsl", "libc", @@ -2822,7 +2826,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.61", + "syn 2.0.65", ] [[package]] @@ -2857,9 +2861,9 @@ checksum = "7f222829ae9293e33a9f5e9f440c6760a3d450a64affe1846486b140db81c1f4" [[package]] name = "parcel_selectors" -version = "0.26.4" +version = "0.26.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05d74befe2d076330d9a58bf9ca2da424568724ab278adf15fb5718253133887" +checksum = "ce9c47a67c66fee4a5a42756f9784d92941bd0ab2b653539a9e90521a44b66f0" dependencies = [ "bitflags 2.5.0", "cssparser", @@ -2985,7 +2989,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn 2.0.61", + "syn 2.0.65", ] [[package]] @@ -3068,7 +3072,7 @@ dependencies = [ "phf_shared 0.11.2", "proc-macro2", "quote", - "syn 2.0.61", + "syn 2.0.65", ] [[package]] @@ -3178,9 +3182,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.82" +version = "1.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ad3d49ab951a01fbaafe34f2ec74122942fe18a3f9814c3268f1bb72042131b" +checksum = "0b33eb56c327dec362a9e55b3ad14f9d2f0904fb5a5b03b513ab5465399e9f43" dependencies = [ "unicode-ident", ] @@ -3201,7 +3205,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"8021cf59c8ec9c432cfc2526ac6b8aa508ecaf29cd415f271b8406c1b851c3fd" dependencies = [ "quote", - "syn 2.0.61", + "syn 2.0.65", ] [[package]] @@ -3785,22 +3789,22 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.201" +version = "1.0.202" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "780f1cebed1629e4753a1a38a3c72d30b97ec044f0aef68cb26650a3c5cf363c" +checksum = "226b61a0d411b2ba5ff6d7f73a476ac4f8bb900373459cd00fab8512828ba395" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.201" +version = "1.0.202" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5e405930b9796f1c00bee880d03fc7e0bb4b9a11afc776885ffe84320da2865" +checksum = "6048858004bcff69094cd972ed40a32500f153bd3be9f716b2eed2e8217c4838" dependencies = [ "proc-macro2", "quote", - "syn 2.0.61", + "syn 2.0.65", ] [[package]] @@ -3817,9 +3821,9 @@ dependencies = [ [[package]] name = "serde_spanned" -version = "0.6.5" +version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb3622f419d1296904700073ea6cc23ad690adbd66f13ea683df73298736f0c1" +checksum = "79e674e01f999af37c49f70a6ede167a8a60b2503e56c5599532a65baa5969a0" dependencies = [ "serde", ] @@ -3928,6 +3932,7 @@ dependencies = [ "path-slash", "search", "serde", + "serde_json", "tempfile", "templates", "utils", @@ -4056,9 +4061,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.61" +version = "2.0.65" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c993ed8ccba56ae856363b1845da7266a7cb78e1d146c8a32d54b45a8b831fc9" +checksum = "d2863d96a84c6439701d7a38f9de935ec562c8832cc55d1dde0f513b52fad106" dependencies = [ "proc-macro2", "quote", @@ -4123,7 +4128,7 @@ dependencies = [ "cfg-expr", "heck 0.5.0", "pkg-config", - "toml 0.8.12", + "toml 0.8.13", "version-compare", ] @@ -4237,7 +4242,7 @@ dependencies = [ "cfg-if 1.0.0", "proc-macro2", "quote", - "syn 2.0.61", + "syn 2.0.65", ] 
[[package]] @@ -4248,7 +4253,7 @@ checksum = "5c89e72a01ed4c579669add59014b9a524d609c0c88c6a585ce37485879f6ffb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.61", + "syn 2.0.65", "test-case-core", ] @@ -4260,22 +4265,22 @@ checksum = "23d434d3f8967a09480fb04132ebe0a3e088c173e6d0ee7897abbdf4eab0f8b9" [[package]] name = "thiserror" -version = "1.0.60" +version = "1.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "579e9083ca58dd9dcf91a9923bb9054071b9ebbd800b342194c9feb0ee89fc18" +checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.60" +version = "1.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2470041c06ec3ac1ab38d0356a6119054dedaea53e12fbefc0de730a1c08524" +checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" dependencies = [ "proc-macro2", "quote", - "syn 2.0.61", + "syn 2.0.65", ] [[package]] @@ -4405,9 +4410,9 @@ dependencies = [ [[package]] name = "toml" -version = "0.8.12" +version = "0.8.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9dd1545e8208b4a5af1aa9bbd0b4cf7e9ea08fabc5d0a5c67fcaafa17433aa3" +checksum = "a4e43f8cc456c9704c851ae29c67e17ef65d2c30017c17a9765b89c382dc8bba" dependencies = [ "serde", "serde_spanned", @@ -4417,18 +4422,18 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "0.6.5" +version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3550f4e9685620ac18a50ed434eb3aec30db8ba93b0287467bca5826ea25baf1" +checksum = "4badfd56924ae69bcc9039335b2e017639ce3f9b001c393c1b2d1ef846ce2cbf" dependencies = [ "serde", ] [[package]] name = "toml_edit" -version = "0.22.12" +version = "0.22.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3328d4f68a705b2a4498da1d580585d39a6510f98318a2cec3018a7ec61ddef" +checksum = 
"c127785850e8c20836d49732ae6abfa47616e60bf9d9f57c43c250361a9db96c" dependencies = [ "indexmap 2.2.6", "serde", @@ -4503,18 +4508,18 @@ checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc" [[package]] name = "unic-langid" -version = "0.9.4" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "238722e6d794ed130f91f4ea33e01fcff4f188d92337a21297892521c72df516" +checksum = "23dd9d1e72a73b25e07123a80776aae3e7b0ec461ef94f9151eed6ec88005a44" dependencies = [ "unic-langid-impl", ] [[package]] name = "unic-langid-impl" -version = "0.9.4" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bd55a2063fdea4ef1f8633243a7b0524cbeef1905ae04c31a1c9b9775c55bc6" +checksum = "0a5422c1f65949306c99240b81de9f3f15929f5a8bfe05bb44b034cc8bf593e5" dependencies = [ "tinystr", ] @@ -4722,7 +4727,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.61", + "syn 2.0.65", "wasm-bindgen-shared", ] @@ -4756,7 +4761,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.61", + "syn 2.0.65", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -5104,7 +5109,7 @@ checksum = "15e934569e47891f7d9411f1a451d947a60e000ab3bd24fbb970f000387d1b3b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.61", + "syn 2.0.65", ] [[package]] diff --git a/components/config/src/config/search.rs b/components/config/src/config/search.rs index 3ce6878cd2..6d0a337362 100644 --- a/components/config/src/config/search.rs +++ b/components/config/src/config/search.rs @@ -7,6 +7,7 @@ pub enum IndexFormat { ElasticlunrJson, #[default] ElasticlunrJavascript, + FuseJson, } #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] diff --git a/components/search/src/lib.rs b/components/search/src/lib.rs index 51d25579d2..0022f21a53 100644 --- a/components/search/src/lib.rs +++ b/components/search/src/lib.rs @@ -12,7 +12,7 @@ 
use errors::{bail, Result}; pub const ELASTICLUNR_JS: &str = include_str!("elasticlunr.min.js"); -static AMMONIA: Lazy> = Lazy::new(|| { +pub static AMMONIA: Lazy> = Lazy::new(|| { let mut clean_content = HashSet::new(); clean_content.insert("script"); clean_content.insert("style"); diff --git a/components/site/Cargo.toml b/components/site/Cargo.toml index 3388f2dc88..b26deb6512 100644 --- a/components/site/Cargo.toml +++ b/components/site/Cargo.toml @@ -18,6 +18,7 @@ imageproc = { path = "../imageproc" } link_checker = { path = "../link_checker" } libs = { path = "../libs" } content = { path = "../content" } +serde_json = "1.0.117" [dev-dependencies] tempfile = "3" diff --git a/components/site/src/lib.rs b/components/site/src/lib.rs index a84ca526fd..c23a44047e 100644 --- a/components/site/src/lib.rs +++ b/components/site/src/lib.rs @@ -799,15 +799,52 @@ impl Site { } fn index_for_lang(&self, lang: &str) -> Result<()> { - let index_json = search::build_index(lang, &self.library.read().unwrap(), &self.config)?; let (path, content) = match &self.config.search.index_format { - IndexFormat::ElasticlunrJson => { - let path = self.output_path.join(format!("search_index.{}.json", lang)); - (path, index_json) + format @ IndexFormat::ElasticlunrJavascript | format @ IndexFormat::ElasticlunrJson => { + let index_json = + search::build_index(lang, &self.library.read().unwrap(), &self.config)?; + if *format == IndexFormat::ElasticlunrJson { + let path = self.output_path.join(format!("search_index.{}.json", lang)); + (path, index_json) + } else { + let path = self.output_path.join(format!("search_index.{}.js", lang)); + let content = format!("window.searchIndex = {};", index_json); + (path, content) + } } - IndexFormat::ElasticlunrJavascript => { - let path = self.output_path.join(format!("search_index.{}.js", lang)); - let content = format!("window.searchIndex = {};", index_json); + IndexFormat::FuseJson => { + #[derive(serde::Serialize)] + struct Item { + title: String, + 
body: String, + url: String, + } + let path = self.output_path.join(format!("search_index.{}.json", lang)); + let mut items: Vec = Vec::new(); + let library = self.library.read().unwrap(); + for (_, section) in &library.sections { + if section.lang == lang + && section.meta.redirect_to.is_none() + && section.meta.in_search_index + { + items.push(Item { + title: section.meta.title.clone().unwrap_or_default(), + body: search::AMMONIA.clean(§ion.content).to_string(), + url: section.permalink.clone(), + }); + for page in §ion.pages { + let page = &library.pages[page]; + if page.meta.in_search_index { + items.push(Item { + title: page.meta.title.clone().unwrap_or_default(), + body: search::AMMONIA.clean(&page.content).to_string(), + url: page.permalink.clone(), + }) + } + } + } + } + let content = serde_json::to_string(&items)?; (path, content) } }; From 39a1e71e37645e92454337f9532fec07ab164e57 Mon Sep 17 00:00:00 2001 From: SIGSTACKFAULT Date: Fri, 24 May 2024 13:46:37 -0400 Subject: [PATCH 02/23] implement FuseJavascript; refactor index_for_lang --- components/config/src/config/search.rs | 1 + components/site/src/lib.rs | 36 ++++++++++++++------------ components/utils/src/fs.rs | 5 ++-- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/components/config/src/config/search.rs b/components/config/src/config/search.rs index 6d0a337362..d35a61d482 100644 --- a/components/config/src/config/search.rs +++ b/components/config/src/config/search.rs @@ -8,6 +8,7 @@ pub enum IndexFormat { #[default] ElasticlunrJavascript, FuseJson, + FuseJavascript, } #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] diff --git a/components/site/src/lib.rs b/components/site/src/lib.rs index c23a44047e..5db284cb18 100644 --- a/components/site/src/lib.rs +++ b/components/site/src/lib.rs @@ -799,27 +799,22 @@ impl Site { } fn index_for_lang(&self, lang: &str) -> Result<()> { - let (path, content) = match &self.config.search.index_format { - format @ 
IndexFormat::ElasticlunrJavascript | format @ IndexFormat::ElasticlunrJson => { - let index_json = - search::build_index(lang, &self.library.read().unwrap(), &self.config)?; - if *format == IndexFormat::ElasticlunrJson { - let path = self.output_path.join(format!("search_index.{}.json", lang)); - (path, index_json) - } else { - let path = self.output_path.join(format!("search_index.{}.js", lang)); - let content = format!("window.searchIndex = {};", index_json); - (path, content) - } + let extension = match &self.config.search.index_format { + IndexFormat::ElasticlunrJavascript | IndexFormat::FuseJavascript => "js", + IndexFormat::ElasticlunrJson | IndexFormat::FuseJson => "json", + }; + let path = &self.output_path.join(format!("search_index.{}.{}", lang, extension)); + let content = match &self.config.search.index_format { + IndexFormat::ElasticlunrJavascript | IndexFormat::ElasticlunrJson => { + search::build_index(lang, &self.library.read().unwrap(), &self.config)? } - IndexFormat::FuseJson => { + IndexFormat::FuseJson | IndexFormat::FuseJavascript => { #[derive(serde::Serialize)] struct Item { title: String, body: String, url: String, } - let path = self.output_path.join(format!("search_index.{}.json", lang)); let mut items: Vec = Vec::new(); let library = self.library.read().unwrap(); for (_, section) in &library.sections { @@ -844,11 +839,18 @@ impl Site { } } } - let content = serde_json::to_string(&items)?; - (path, content) + serde_json::to_string(&items)? 
} }; - create_file(&path, &content) + create_file( + path, + match self.config.search.index_format { + IndexFormat::ElasticlunrJson | IndexFormat::FuseJson => content, + IndexFormat::ElasticlunrJavascript | IndexFormat::FuseJavascript => { + format!("window.searchIndex = {}", content) + } + }, + ) } pub fn build_search_index(&self) -> Result<()> { diff --git a/components/utils/src/fs.rs b/components/utils/src/fs.rs index 2b1557068a..4aa13994b8 100644 --- a/components/utils/src/fs.rs +++ b/components/utils/src/fs.rs @@ -28,11 +28,12 @@ fn create_parent(path: &Path) -> Result<()> { } /// Create a file with the content given -pub fn create_file(path: &Path, content: &str) -> Result<()> { +/// `content`` can be `&str`, `String`, or `&String` (and probably others) +pub fn create_file(path: &Path, content: impl AsRef) -> Result<()> { create_parent(path)?; let mut file = File::create(path).with_context(|| format!("Failed to create file {}", path.display()))?; - file.write_all(content.as_bytes())?; + file.write_all(content.as_ref().as_bytes())?; Ok(()) } From e291942df064fc349e03c100df48e96e2c22fd81 Mon Sep 17 00:00:00 2001 From: SIGSTACKFAULT Date: Fri, 24 May 2024 13:54:41 -0400 Subject: [PATCH 03/23] support search config --- components/site/src/lib.rs | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/components/site/src/lib.rs b/components/site/src/lib.rs index 5db284cb18..29fb718f25 100644 --- a/components/site/src/lib.rs +++ b/components/site/src/lib.rs @@ -811,29 +811,46 @@ impl Site { IndexFormat::FuseJson | IndexFormat::FuseJavascript => { #[derive(serde::Serialize)] struct Item { - title: String, - body: String, - url: String, + title: Option, + body: Option, + path: String, } let mut items: Vec = Vec::new(); let library = self.library.read().unwrap(); + let config = &self.config.search; for (_, section) in &library.sections { if section.lang == lang && section.meta.redirect_to.is_none() && 
section.meta.in_search_index { items.push(Item { - title: section.meta.title.clone().unwrap_or_default(), - body: search::AMMONIA.clean(§ion.content).to_string(), - url: section.permalink.clone(), + path: section.path.clone(), + title: if config.include_title { + Some(section.meta.title.clone().unwrap_or_default()) + } else { + None + }, + body: if config.include_content { + Some(search::AMMONIA.clean(§ion.content).to_string()) + } else { + None + }, }); for page in §ion.pages { let page = &library.pages[page]; if page.meta.in_search_index { items.push(Item { - title: page.meta.title.clone().unwrap_or_default(), - body: search::AMMONIA.clean(&page.content).to_string(), - url: page.permalink.clone(), + title: if config.include_title { + Some(page.meta.title.clone().unwrap_or_default()) + } else { + None + }, + body: if config.include_content { + Some(search::AMMONIA.clean(&page.content).to_string()) + } else { + None + }, + path: page.path.clone(), }) } } From 5c310c3585ae911bbf975ab9a78631cc7a60b5e6 Mon Sep 17 00:00:00 2001 From: SIGSTACKFAULT Date: Fri, 24 May 2024 14:15:56 -0400 Subject: [PATCH 04/23] move fuse index building to it's own file --- Cargo.lock | 3 +- components/search/Cargo.toml | 2 ++ components/search/src/fuse.rs | 52 ++++++++++++++++++++++++++++++++++ components/search/src/lib.rs | 2 ++ components/site/Cargo.toml | 1 - components/site/src/lib.rs | 53 +++-------------------------------- 6 files changed, 62 insertions(+), 51 deletions(-) create mode 100644 components/search/src/fuse.rs diff --git a/Cargo.lock b/Cargo.lock index ad80379c59..eea2100310 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3753,6 +3753,8 @@ dependencies = [ "content", "errors", "libs", + "serde", + "serde_json", ] [[package]] @@ -3932,7 +3934,6 @@ dependencies = [ "path-slash", "search", "serde", - "serde_json", "tempfile", "templates", "utils", diff --git a/components/search/Cargo.toml b/components/search/Cargo.toml index d2b26b528f..4b3848de86 100644 --- 
a/components/search/Cargo.toml +++ b/components/search/Cargo.toml @@ -8,3 +8,5 @@ errors = { path = "../errors" } content = { path = "../content" } config = { path = "../config" } libs = { path = "../libs" } +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" diff --git a/components/search/src/fuse.rs b/components/search/src/fuse.rs new file mode 100644 index 0000000000..6d7309cebc --- /dev/null +++ b/components/search/src/fuse.rs @@ -0,0 +1,52 @@ +use config::Search; +use content::Library; + +/// build index in Fuse.js format. +pub fn build_index(lang: &str, library: &Library, config: &Search) -> super::Result { + #[derive(serde::Serialize)] + struct Item { + title: Option, + body: Option, + path: String, + } + let mut items: Vec = Vec::new(); + for (_, section) in &library.sections { + if section.lang == lang + && section.meta.redirect_to.is_none() + && section.meta.in_search_index + { + items.push(Item { + path: section.path.clone(), + title: if config.include_title { + Some(section.meta.title.clone().unwrap_or_default()) + } else { + None + }, + body: if config.include_content { + Some(super::AMMONIA.clean(§ion.content).to_string()) + } else { + None + }, + }); + for page in §ion.pages { + let page = &library.pages[page]; + if page.meta.in_search_index { + items.push(Item { + title: if config.include_title { + Some(page.meta.title.clone().unwrap_or_default()) + } else { + None + }, + body: if config.include_content { + Some(super::AMMONIA.clean(&page.content).to_string()) + } else { + None + }, + path: page.path.clone(), + }) + } + } + } + } + Ok(serde_json::to_string(&items)?) 
+} diff --git a/components/search/src/lib.rs b/components/search/src/lib.rs index 0022f21a53..32a6e20932 100644 --- a/components/search/src/lib.rs +++ b/components/search/src/lib.rs @@ -1,3 +1,5 @@ +pub mod fuse; + use std::collections::{HashMap, HashSet}; use libs::ammonia; diff --git a/components/site/Cargo.toml b/components/site/Cargo.toml index b26deb6512..3388f2dc88 100644 --- a/components/site/Cargo.toml +++ b/components/site/Cargo.toml @@ -18,7 +18,6 @@ imageproc = { path = "../imageproc" } link_checker = { path = "../link_checker" } libs = { path = "../libs" } content = { path = "../content" } -serde_json = "1.0.117" [dev-dependencies] tempfile = "3" diff --git a/components/site/src/lib.rs b/components/site/src/lib.rs index 29fb718f25..2a9ee694e1 100644 --- a/components/site/src/lib.rs +++ b/components/site/src/lib.rs @@ -804,61 +804,16 @@ impl Site { IndexFormat::ElasticlunrJson | IndexFormat::FuseJson => "json", }; let path = &self.output_path.join(format!("search_index.{}.{}", lang, extension)); + let library = self.library.read().unwrap(); let content = match &self.config.search.index_format { IndexFormat::ElasticlunrJavascript | IndexFormat::ElasticlunrJson => { - search::build_index(lang, &self.library.read().unwrap(), &self.config)? + search::build_index(lang, &library, &self.config)? 
} IndexFormat::FuseJson | IndexFormat::FuseJavascript => { - #[derive(serde::Serialize)] - struct Item { - title: Option, - body: Option, - path: String, - } - let mut items: Vec = Vec::new(); - let library = self.library.read().unwrap(); - let config = &self.config.search; - for (_, section) in &library.sections { - if section.lang == lang - && section.meta.redirect_to.is_none() - && section.meta.in_search_index - { - items.push(Item { - path: section.path.clone(), - title: if config.include_title { - Some(section.meta.title.clone().unwrap_or_default()) - } else { - None - }, - body: if config.include_content { - Some(search::AMMONIA.clean(§ion.content).to_string()) - } else { - None - }, - }); - for page in §ion.pages { - let page = &library.pages[page]; - if page.meta.in_search_index { - items.push(Item { - title: if config.include_title { - Some(page.meta.title.clone().unwrap_or_default()) - } else { - None - }, - body: if config.include_content { - Some(search::AMMONIA.clean(&page.content).to_string()) - } else { - None - }, - path: page.path.clone(), - }) - } - } - } - } - serde_json::to_string(&items)? + search::fuse::build_index(lang, &library, &self.config.search)? } }; + drop(library); // no need to hold on to this guard while writing create_file( path, match self.config.search.index_format { From 19392771ebe21a678e8ce9389691cefba9768b33 Mon Sep 17 00:00:00 2001 From: SIGSTACKFAULT Date: Fri, 24 May 2024 14:17:03 -0400 Subject: [PATCH 05/23] update doc of Search.index_format --- components/config/src/config/search.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/config/src/config/search.rs b/components/config/src/config/search.rs index d35a61d482..d30977428a 100644 --- a/components/config/src/config/search.rs +++ b/components/config/src/config/search.rs @@ -28,7 +28,7 @@ pub struct Search { pub include_date: bool, /// Include the path of the page in the search index. `false` by default. 
pub include_path: bool, - /// Foramt of the search index to be produced. Javascript by default + /// Foramt of the search index to be produced. 'elasticlunr_javascript' by default. pub index_format: IndexFormat, } From 35bd4e0f6eb643f23da3e7b161cb27b862e00094 Mon Sep 17 00:00:00 2001 From: SIGSTACKFAULT Date: Fri, 24 May 2024 14:42:18 -0400 Subject: [PATCH 06/23] update config docs --- docs/content/documentation/getting-started/configuration.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/content/documentation/getting-started/configuration.md b/docs/content/documentation/getting-started/configuration.md index dce920ce8d..f94b83f9cc 100644 --- a/docs/content/documentation/getting-started/configuration.md +++ b/docs/content/documentation/getting-started/configuration.md @@ -183,7 +183,9 @@ include_content = true # truncate_content_length = 100 # Wether to produce the search index as a javascript file or as a JSON file -# Accepted value "elasticlunr_javascript" or "elasticlunr_json" +# Accepted values: +# - "elasticlunr_javascript", "elasticlunr_json" +# - "fuse_javascript", "fuse_json" index_format = "elasticlunr_javascript" # Optional translation object for the default language From 4cc9a3085de9667b9026196e8384e9eaf843e771 Mon Sep 17 00:00:00 2001 From: SIGSTACKFAULT Date: Fri, 24 May 2024 15:10:28 -0400 Subject: [PATCH 07/23] update search documentation --- docs/content/documentation/content/search.md | 41 ++++++++++++++------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/docs/content/documentation/content/search.md b/docs/content/documentation/content/search.md index c57c1fc114..c36309fba8 100644 --- a/docs/content/documentation/content/search.md +++ b/docs/content/documentation/content/search.md @@ -4,7 +4,7 @@ weight = 100 +++ Zola can build a search index from the sections and pages content to -be used by a JavaScript library such as [elasticlunr](http://elasticlunr.com/). 
+be used by a JavaScript library such as [elasticlunr](http://elasticlunr.com/) or [fuse](https://www.fusejs.io). To enable it, you only need to set `build_search_index = true` in your `config.toml` and Zola will generate an index for the `default_language` set for all pages not excluded from the search index. @@ -12,21 +12,38 @@ generate an index for the `default_language` set for all pages not excluded from It is very important to set the `default_language` in your `config.toml` if you are writing a site not in English; the index building pipelines are very different depending on the language. -After `zola build` or `zola serve`, you should see two files in your public directory: - -- `search_index.${default_language}.js`: so `search_index.en.js` for a default setup -- `elasticlunr.min.js` - -If you set `index_format = "elasticlunr_json"` in your `config.toml`, a `search_index.${default_language}.json` is generated -instead of the default `search_index.${default_language}.js`. - As each site will be different, Zola makes no assumptions about your search function and doesn't provide the JavaScript/CSS code to do an actual search and display results. You can look at how this site -implements it to get an idea: [search.js](https://github.com/getzola/zola/tree/master/docs/static/search.js). +implements it (using elasticlunr) to get an idea: [search.js](https://github.com/getzola/zola/tree/master/docs/static/search.js). -If you are using a language other than English, you will also need to include the corresponding JavaScript stemmer file. -See for details. ## Configuring the search index In some cases, the default indexing strategy is not suitable. You can customize which fields to include and whether to truncate the content in the [search configuration](@/documentation/getting-started/configuration.md). + +## Index Formats + +### Elasticlunr + +Compatible with [elasticlunr](http://elasticlunr.com/). Also produces `elasticlunr.min.js`. 
+ +```toml +# config.toml +[search] +index_format = "elasticlunr_javascript" # or "elasticlunr_json" +``` + +If you are using a language other than English, you will also need to include the corresponding JavaScript stemmer file. +See for details. + +### Fuse + +Compatible with [fuse.js](https://www.fusejs.io/), [tinysearch](https://github.com/tinysearch/tinysearch), and almost +compatible with [stork](https://stork-search.net/) + +```toml +# config.toml +[search] +index_format = "fuse_javascript" # or "fuse_json" +``` + From 1dbddce6435e86065d58ebb7f8cdadee12f4c6d7 Mon Sep 17 00:00:00 2001 From: SIGSTACKFAULT Date: Fri, 24 May 2024 17:56:22 -0400 Subject: [PATCH 08/23] use &str where possible --- components/search/src/fuse.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/components/search/src/fuse.rs b/components/search/src/fuse.rs index 6d7309cebc..e11f1273c1 100644 --- a/components/search/src/fuse.rs +++ b/components/search/src/fuse.rs @@ -4,10 +4,10 @@ use content::Library; /// build index in Fuse.js format. 
pub fn build_index(lang: &str, library: &Library, config: &Search) -> super::Result<String> { #[derive(serde::Serialize)] - struct Item { - title: Option<String>, - body: Option<String>, - path: String, + struct Item<'a> { + path: &'a str, + title: Option<&'a str>, + body: Option<String>, // AMMONIA.clean has to allocate anyway } let mut items: Vec<Item> = Vec::new(); for (_, section) in &library.sections { @@ -16,9 +16,9 @@ pub fn build_index(lang: &str, library: &Library, config: &Search) -> super::Res && section.meta.in_search_index { items.push(Item { - path: section.path.clone(), + path: &section.path, title: if config.include_title { - Some(section.meta.title.clone().unwrap_or_default()) + Some(&section.meta.title.as_deref().unwrap_or_default()) } else { None }, @@ -33,7 +33,7 @@ pub fn build_index(lang: &str, library: &Library, config: &Search) -> super::Res if page.meta.in_search_index { items.push(Item { title: if config.include_title { - Some(page.meta.title.clone().unwrap_or_default()) + Some(&page.meta.title.as_deref().unwrap_or_default()) } else { None }, @@ -42,7 +42,7 @@ pub fn build_index(lang: &str, library: &Library, config: &Search) -> super::Res } else { None }, - path: page.path.clone(), + path: &page.path, }) } } From b6765e34b0e9619a8832dc003def4aa4ff11f80a Mon Sep 17 00:00:00 2001 From: SIGSTACKFAULT Date: Fri, 24 May 2024 17:59:14 -0400 Subject: [PATCH 09/23] use libs::serde_json remmeber to commit Cargo.lock --- Cargo.lock | 1 - components/search/Cargo.toml | 1 - components/search/src/fuse.rs | 1 + 3 files changed, 1 insertion(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index eea2100310..44eb1db4a7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3754,7 +3754,6 @@ dependencies = [ "errors", "libs", "serde", - "serde_json", ] [[package]] diff --git a/components/search/Cargo.toml b/components/search/Cargo.toml index 4b3848de86..eb12f7e533 100644 --- a/components/search/Cargo.toml +++ b/components/search/Cargo.toml @@ -9,4 +9,3 @@ content = { path = "../content" } config = { path 
= "../config" } libs = { path = "../libs" } serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0" diff --git a/components/search/src/fuse.rs b/components/search/src/fuse.rs index e11f1273c1..8fc4557348 100644 --- a/components/search/src/fuse.rs +++ b/components/search/src/fuse.rs @@ -1,5 +1,6 @@ use config::Search; use content::Library; +use libs::serde_json; /// build index in Fuse.js format. pub fn build_index(lang: &str, library: &Library, config: &Search) -> super::Result { From 7247d869b7a73547b9aaa50f9d0f226bec7cf30c Mon Sep 17 00:00:00 2001 From: SIGSTACKFAULT Date: Fri, 24 May 2024 18:07:23 -0400 Subject: [PATCH 10/23] move extension logic to IndexFormat --- components/config/src/config/search.rs | 10 ++++++++++ components/site/src/lib.rs | 5 +---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/components/config/src/config/search.rs b/components/config/src/config/search.rs index d30977428a..9036c24f7a 100644 --- a/components/config/src/config/search.rs +++ b/components/config/src/config/search.rs @@ -11,6 +11,16 @@ pub enum IndexFormat { FuseJavascript, } +impl IndexFormat { + /// file extension which ought to be used for this index format. 
+ pub fn extension(&self) -> &'static str { + match *self { + IndexFormat::ElasticlunrJavascript | IndexFormat::FuseJavascript => "js", + IndexFormat::ElasticlunrJson | IndexFormat::FuseJson => "json", + } + } +} + #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] #[serde(default)] pub struct Search { diff --git a/components/site/src/lib.rs b/components/site/src/lib.rs index 2a9ee694e1..94d4b26df4 100644 --- a/components/site/src/lib.rs +++ b/components/site/src/lib.rs @@ -799,10 +799,7 @@ impl Site { } fn index_for_lang(&self, lang: &str) -> Result<()> { - let extension = match &self.config.search.index_format { - IndexFormat::ElasticlunrJavascript | IndexFormat::FuseJavascript => "js", - IndexFormat::ElasticlunrJson | IndexFormat::FuseJson => "json", - }; + let extension = self.config.search.index_format.extension(); let path = &self.output_path.join(format!("search_index.{}.{}", lang, extension)); let library = self.library.read().unwrap(); let content = match &self.config.search.index_format { From 0c29dc8b43272d9581f9de83c4e093f2df1235ca Mon Sep 17 00:00:00 2001 From: SIGSTACKFAULT Date: Fri, 24 May 2024 18:19:40 -0400 Subject: [PATCH 11/23] move the entire filename logic inside IndexFormat --- components/config/src/config/search.rs | 7 ++++++- components/site/src/lib.rs | 3 +-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/components/config/src/config/search.rs b/components/config/src/config/search.rs index 9036c24f7a..301d53d56f 100644 --- a/components/config/src/config/search.rs +++ b/components/config/src/config/search.rs @@ -13,12 +13,17 @@ pub enum IndexFormat { impl IndexFormat { /// file extension which ought to be used for this index format. 
- pub fn extension(&self) -> &'static str { + fn extension(&self) -> &'static str { match *self { IndexFormat::ElasticlunrJavascript | IndexFormat::FuseJavascript => "js", IndexFormat::ElasticlunrJson | IndexFormat::FuseJson => "json", } } + + /// the filename which ought to be used for this format and language `lang` + pub fn filename(&self, lang: &str) -> String { + format!("search_index.{}.{}", lang, self.extension()) + } } #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] diff --git a/components/site/src/lib.rs b/components/site/src/lib.rs index 94d4b26df4..49acde79ed 100644 --- a/components/site/src/lib.rs +++ b/components/site/src/lib.rs @@ -799,8 +799,7 @@ impl Site { } fn index_for_lang(&self, lang: &str) -> Result<()> { - let extension = self.config.search.index_format.extension(); - let path = &self.output_path.join(format!("search_index.{}.{}", lang, extension)); + let path = &self.output_path.join(self.config.search.index_format.filename(lang)); let library = self.library.read().unwrap(); let content = match &self.config.search.index_format { IndexFormat::ElasticlunrJavascript | IndexFormat::ElasticlunrJson => { From c1c6eb4ed5563dc9d5e8051875577547bb19ade2 Mon Sep 17 00:00:00 2001 From: SIGSTACKFAULT Date: Fri, 24 May 2024 18:33:07 -0400 Subject: [PATCH 12/23] move elasticlunr to it's own module --- components/search/src/elasticlunr.rs | 242 ++++++++++++++++++++++++++ components/search/src/fuse.rs | 3 +- components/search/src/lib.rs | 247 +-------------------------- components/site/src/lib.rs | 2 +- 4 files changed, 247 insertions(+), 247 deletions(-) create mode 100644 components/search/src/elasticlunr.rs diff --git a/components/search/src/elasticlunr.rs b/components/search/src/elasticlunr.rs new file mode 100644 index 0000000000..07b970cfef --- /dev/null +++ b/components/search/src/elasticlunr.rs @@ -0,0 +1,242 @@ +use config::{Config, Search}; +use content::{Library, Section}; +use errors::{bail, Result}; +use libs::elasticlunr::{lang, 
Index, IndexBuilder}; +use libs::time::format_description::well_known::Rfc3339; +use libs::time::OffsetDateTime; + +fn build_fields(search_config: &Search, mut index: IndexBuilder) -> IndexBuilder { + if search_config.include_title { + index = index.add_field("title"); + } + + if search_config.include_description { + index = index.add_field("description"); + } + + if search_config.include_date { + index = index.add_field("date") + } + + if search_config.include_path { + index = index.add_field_with_tokenizer("path", Box::new(path_tokenizer)); + } + + if search_config.include_content { + index = index.add_field("body") + } + + index +} + +fn path_tokenizer(text: &str) -> Vec<String> { + text.split(|c: char| c.is_whitespace() || c == '-' || c == '/') + .filter(|s| !s.is_empty()) + .map(|s| s.trim().to_lowercase()) + .collect() +} + +fn fill_index( + search_config: &Search, + title: &Option<String>, + description: &Option<String>, + datetime: &Option<OffsetDateTime>, + path: &str, + content: &str, +) -> Vec<String> { + let mut row = vec![]; + + if search_config.include_title { + row.push(title.clone().unwrap_or_default()); + } + + if search_config.include_description { + row.push(description.clone().unwrap_or_default()); + } + + if search_config.include_date { + if let Some(date) = datetime { + if let Ok(d) = date.format(&Rfc3339) { + row.push(d); + } + } + } + + if search_config.include_path { + row.push(path.to_string()); + } + + if search_config.include_content { + let body = super::AMMONIA.clean(content).to_string(); + if let Some(truncate_len) = search_config.truncate_content_length { + // Not great for unicode + // TODO: fix it like the truncate in Tera + match body.char_indices().nth(truncate_len) { + None => row.push(body), + Some((idx, _)) => row.push((body[..idx]).to_string()), + }; + } else { + row.push(body); + }; + } + row +} + +/// Returns the generated JSON index with all the documents of the site added using +/// the language given +/// Errors if the language given is not available in Elasticlunr 
+/// TODO: is making `in_search_index` apply to subsections of a `false` section useful? +pub fn build_index(lang: &str, library: &Library, config: &Config) -> Result<String> { + let language = match lang::from_code(lang) { + Some(l) => l, + None => { + bail!("Tried to build search index for language {} which is not supported", lang); + } + }; + let language_options = &config.languages[lang]; + let mut index = IndexBuilder::with_language(language); + index = build_fields(&language_options.search, index); + let mut index = index.build(); + + for (_, section) in &library.sections { + if section.lang == lang { + add_section_to_index(&mut index, section, library, &language_options.search); + } + } + + Ok(index.to_json()) +} + +fn add_section_to_index( + index: &mut Index, + section: &Section, + library: &Library, + search_config: &Search, +) { + if !section.meta.in_search_index { + return; + } + + // Don't index redirecting sections + if section.meta.redirect_to.is_none() { + index.add_doc( + &section.permalink, + &fill_index( + search_config, + &section.meta.title, + &section.meta.description, + &None, + &section.path, + &section.content, + ), + ); + } + + for key in &section.pages { + let page = &library.pages[key]; + if !page.meta.in_search_index { + continue; + } + + index.add_doc( + &page.permalink, + &fill_index( + search_config, + &page.meta.title, + &page.meta.description, + &page.meta.datetime, + &page.path, + &page.content, + ), + ); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use config::Config; + use libs::elasticlunr::IndexBuilder; + + #[test] + fn can_build_fields() { + let mut config = Config::default(); + let index = build_fields(&config.search, IndexBuilder::new()).build(); + assert_eq!(index.get_fields(), vec!["title", "body"]); + + config.search.include_content = false; + config.search.include_description = true; + let index = build_fields(&config.search, IndexBuilder::new()).build(); + assert_eq!(index.get_fields(), vec!["title", "description"]); + + 
config.search.include_content = true; + let index = build_fields(&config.search, IndexBuilder::new()).build(); + assert_eq!(index.get_fields(), vec!["title", "description", "body"]); + + config.search.include_title = false; + let index = build_fields(&config.search, IndexBuilder::new()).build(); + assert_eq!(index.get_fields(), vec!["description", "body"]); + } + + #[test] + fn can_fill_index_default() { + let config = Config::default(); + let title = Some("A title".to_string()); + let description = Some("A description".to_string()); + let path = "/a/page/".to_string(); + let content = "Some content".to_string(); + + let res = fill_index(&config.search, &title, &description, &None, &path, &content); + assert_eq!(res.len(), 2); + assert_eq!(res[0], title.unwrap()); + assert_eq!(res[1], content); + } + + #[test] + fn can_fill_index_description() { + let mut config = Config::default(); + config.search.include_description = true; + let title = Some("A title".to_string()); + let description = Some("A description".to_string()); + let path = "/a/page/".to_string(); + let content = "Some content".to_string(); + + let res = fill_index(&config.search, &title, &description, &None, &path, &content); + assert_eq!(res.len(), 3); + assert_eq!(res[0], title.unwrap()); + assert_eq!(res[1], description.unwrap()); + assert_eq!(res[2], content); + } + + #[test] + fn can_fill_index_truncated_content() { + let mut config = Config::default(); + config.search.truncate_content_length = Some(5); + let title = Some("A title".to_string()); + let description = Some("A description".to_string()); + let path = "/a/page/".to_string(); + let content = "Some content".to_string(); + + let res = fill_index(&config.search, &title, &description, &None, &path, &content); + assert_eq!(res.len(), 2); + assert_eq!(res[0], title.unwrap()); + assert_eq!(res[1], content[..5]); + } + + #[test] + fn can_fill_index_date() { + let mut config = Config::default(); + config.search.include_date = true; + let title = 
Some("A title".to_string()); + let description = Some("A description".to_string()); + let path = "/a/page/".to_string(); + let content = "Some content".to_string(); + let datetime = Some(OffsetDateTime::parse("2023-01-31T00:00:00Z", &Rfc3339).unwrap()); + + let res = fill_index(&config.search, &title, &description, &datetime, &path, &content); + assert_eq!(res.len(), 3); + assert_eq!(res[0], title.unwrap()); + assert_eq!(res[1], "2023-01-31T00:00:00Z"); + assert_eq!(res[2], content); + } +} diff --git a/components/search/src/fuse.rs b/components/search/src/fuse.rs index 8fc4557348..b636792003 100644 --- a/components/search/src/fuse.rs +++ b/components/search/src/fuse.rs @@ -1,9 +1,10 @@ use config::Search; use content::Library; +use errors::Result; use libs::serde_json; /// build index in Fuse.js format. -pub fn build_index(lang: &str, library: &Library, config: &Search) -> super::Result { +pub fn build_index(lang: &str, library: &Library, config: &Search) -> Result { #[derive(serde::Serialize)] struct Item<'a> { path: &'a str, diff --git a/components/search/src/lib.rs b/components/search/src/lib.rs index 32a6e20932..6a97e20874 100644 --- a/components/search/src/lib.rs +++ b/components/search/src/lib.rs @@ -1,16 +1,9 @@ +pub mod elasticlunr; pub mod fuse; -use std::collections::{HashMap, HashSet}; - use libs::ammonia; -use libs::elasticlunr::{lang, Index, IndexBuilder}; use libs::once_cell::sync::Lazy; -use libs::time::format_description::well_known::Rfc3339; -use libs::time::OffsetDateTime; - -use config::{Config, Search}; -use content::{Library, Section}; -use errors::{bail, Result}; +use std::collections::{HashMap, HashSet}; pub const ELASTICLUNR_JS: &str = include_str!("elasticlunr.min.js"); @@ -29,239 +22,3 @@ pub static AMMONIA: Lazy> = Lazy::new(|| { .clean_content_tags(clean_content); builder }); - -fn build_fields(search_config: &Search, mut index: IndexBuilder) -> IndexBuilder { - if search_config.include_title { - index = index.add_field("title"); - } - 
- if search_config.include_description { - index = index.add_field("description"); - } - - if search_config.include_date { - index = index.add_field("date") - } - - if search_config.include_path { - index = index.add_field_with_tokenizer("path", Box::new(path_tokenizer)); - } - - if search_config.include_content { - index = index.add_field("body") - } - - index -} - -fn path_tokenizer(text: &str) -> Vec<String> { - text.split(|c: char| c.is_whitespace() || c == '-' || c == '/') - .filter(|s| !s.is_empty()) - .map(|s| s.trim().to_lowercase()) - .collect() -} - -fn fill_index( - search_config: &Search, - title: &Option<String>, - description: &Option<String>, - datetime: &Option<OffsetDateTime>, - path: &str, - content: &str, -) -> Vec<String> { - let mut row = vec![]; - - if search_config.include_title { - row.push(title.clone().unwrap_or_default()); - } - - if search_config.include_description { - row.push(description.clone().unwrap_or_default()); - } - - if search_config.include_date { - if let Some(date) = datetime { - if let Ok(d) = date.format(&Rfc3339) { - row.push(d); - } - } - } - - if search_config.include_path { - row.push(path.to_string()); - } - - if search_config.include_content { - let body = AMMONIA.clean(content).to_string(); - if let Some(truncate_len) = search_config.truncate_content_length { - // Not great for unicode - // TODO: fix it like the truncate in Tera - match body.char_indices().nth(truncate_len) { - None => row.push(body), - Some((idx, _)) => row.push((body[..idx]).to_string()), - }; - } else { - row.push(body); - }; - } - row -} - -/// Returns the generated JSON index with all the documents of the site added using -/// the language given -/// Errors if the language given is not available in Elasticlunr -/// TODO: is making `in_search_index` apply to subsections of a `false` section useful? 
-pub fn build_index(lang: &str, library: &Library, config: &Config) -> Result<String> { - let language = match lang::from_code(lang) { - Some(l) => l, - None => { - bail!("Tried to build search index for language {} which is not supported", lang); - } - }; - let language_options = &config.languages[lang]; - let mut index = IndexBuilder::with_language(language); - index = build_fields(&language_options.search, index); - let mut index = index.build(); - - for (_, section) in &library.sections { - if section.lang == lang { - add_section_to_index(&mut index, section, library, &language_options.search); - } - } - - Ok(index.to_json()) -} - -fn add_section_to_index( - index: &mut Index, - section: &Section, - library: &Library, - search_config: &Search, -) { - if !section.meta.in_search_index { - return; - } - - // Don't index redirecting sections - if section.meta.redirect_to.is_none() { - index.add_doc( - &section.permalink, - &fill_index( - search_config, - &section.meta.title, - &section.meta.description, - &None, - &section.path, - &section.content, - ), - ); - } - - for key in &section.pages { - let page = &library.pages[key]; - if !page.meta.in_search_index { - continue; - } - - index.add_doc( - &page.permalink, - &fill_index( - search_config, - &page.meta.title, - &page.meta.description, - &page.meta.datetime, - &page.path, - &page.content, - ), - ); - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use config::Config; - - #[test] - fn can_build_fields() { - let mut config = Config::default(); - let index = build_fields(&config.search, IndexBuilder::new()).build(); - assert_eq!(index.get_fields(), vec!["title", "body"]); - - config.search.include_content = false; - config.search.include_description = true; - let index = build_fields(&config.search, IndexBuilder::new()).build(); - assert_eq!(index.get_fields(), vec!["title", "description"]); - - config.search.include_content = true; - let index = build_fields(&config.search, IndexBuilder::new()).build(); - 
assert_eq!(index.get_fields(), 
vec!["title", "description", "body"]); - - config.search.include_title = false; - let index = build_fields(&config.search, IndexBuilder::new()).build(); - assert_eq!(index.get_fields(), vec!["description", "body"]); - } - - #[test] - fn can_fill_index_default() { - let config = Config::default(); - let title = Some("A title".to_string()); - let description = Some("A description".to_string()); - let path = "/a/page/".to_string(); - let content = "Some content".to_string(); - - let res = fill_index(&config.search, &title, &description, &None, &path, &content); - assert_eq!(res.len(), 2); - assert_eq!(res[0], title.unwrap()); - assert_eq!(res[1], content); - } - - #[test] - fn can_fill_index_description() { - let mut config = Config::default(); - config.search.include_description = true; - let title = Some("A title".to_string()); - let description = Some("A description".to_string()); - let path = "/a/page/".to_string(); - let content = "Some content".to_string(); - - let res = fill_index(&config.search, &title, &description, &None, &path, &content); - assert_eq!(res.len(), 3); - assert_eq!(res[0], title.unwrap()); - assert_eq!(res[1], description.unwrap()); - assert_eq!(res[2], content); - } - - #[test] - fn can_fill_index_truncated_content() { - let mut config = Config::default(); - config.search.truncate_content_length = Some(5); - let title = Some("A title".to_string()); - let description = Some("A description".to_string()); - let path = "/a/page/".to_string(); - let content = "Some content".to_string(); - - let res = fill_index(&config.search, &title, &description, &None, &path, &content); - assert_eq!(res.len(), 2); - assert_eq!(res[0], title.unwrap()); - assert_eq!(res[1], content[..5]); - } - - #[test] - fn can_fill_index_date() { - let mut config = Config::default(); - config.search.include_date = true; - let title = Some("A title".to_string()); - let description = Some("A description".to_string()); - let path = "/a/page/".to_string(); - let content = "Some 
content".to_string(); - let datetime = Some(OffsetDateTime::parse("2023-01-31T00:00:00Z", &Rfc3339).unwrap()); - - let res = fill_index(&config.search, &title, &description, &datetime, &path, &content); - assert_eq!(res.len(), 3); - assert_eq!(res[0], title.unwrap()); - assert_eq!(res[1], "2023-01-31T00:00:00Z"); - assert_eq!(res[2], content); - } -} diff --git a/components/site/src/lib.rs b/components/site/src/lib.rs index 49acde79ed..4b8848724c 100644 --- a/components/site/src/lib.rs +++ b/components/site/src/lib.rs @@ -803,7 +803,7 @@ impl Site { let library = self.library.read().unwrap(); let content = match &self.config.search.index_format { IndexFormat::ElasticlunrJavascript | IndexFormat::ElasticlunrJson => { - search::build_index(lang, &library, &self.config)? + search::elasticlunr::build_index(lang, &library, &self.config)? } IndexFormat::FuseJson | IndexFormat::FuseJavascript => { search::fuse::build_index(lang, &library, &self.config.search)? From d26b612af729278348b5b18caacc45dd3f31e4bf Mon Sep 17 00:00:00 2001 From: SIGSTACKFAULT Date: Fri, 24 May 2024 19:07:15 -0400 Subject: [PATCH 13/23] only create elasticlunr.min.js if we're actually using elasticlunr --- components/site/src/lib.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/components/site/src/lib.rs b/components/site/src/lib.rs index 4b8848724c..87c2d59121 100644 --- a/components/site/src/lib.rs +++ b/components/site/src/lib.rs @@ -834,8 +834,13 @@ impl Site { } } - // then elasticlunr.min.js - create_file(&self.output_path.join("elasticlunr.min.js"), search::ELASTICLUNR_JS)?; + match self.config.search.index_format { + IndexFormat::ElasticlunrJavascript | IndexFormat::ElasticlunrJson => { + // then elasticlunr.min.js + create_file(&self.output_path.join("elasticlunr.min.js"), search::ELASTICLUNR_JS)?; + } + _ => {} + } Ok(()) } From df6faaba7f06dd52e082a20dfa8cb6b64de250bc Mon Sep 17 00:00:00 2001 From: SIGSTACKFAULT Date: Sun, 26 May 2024 21:02:49 -0400 Subject: 
[PATCH 14/23] move ELASTICLUNR_JS to elasticlunr.js --- components/search/src/elasticlunr.rs | 2 ++ components/search/src/lib.rs | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/components/search/src/elasticlunr.rs b/components/search/src/elasticlunr.rs index 07b970cfef..f37458019c 100644 --- a/components/search/src/elasticlunr.rs +++ b/components/search/src/elasticlunr.rs @@ -5,6 +5,8 @@ use libs::elasticlunr::{lang, Index, IndexBuilder}; use libs::time::format_description::well_known::Rfc3339; use libs::time::OffsetDateTime; +pub const ELASTICLUNR_JS: &str = include_str!("elasticlunr.min.js"); + fn build_fields(search_config: &Search, mut index: IndexBuilder) -> IndexBuilder { if search_config.include_title { index = index.add_field("title"); diff --git a/components/search/src/lib.rs b/components/search/src/lib.rs index 6a97e20874..5795be300a 100644 --- a/components/search/src/lib.rs +++ b/components/search/src/lib.rs @@ -5,7 +5,7 @@ use libs::ammonia; use libs::once_cell::sync::Lazy; use std::collections::{HashMap, HashSet}; -pub const ELASTICLUNR_JS: &str = include_str!("elasticlunr.min.js"); +pub use elasticlunr::ELASTICLUNR_JS; pub static AMMONIA: Lazy> = Lazy::new(|| { let mut clean_content = HashSet::new(); From 3ddb39468c89b34b8e8d78a4d9c46a351cac247e Mon Sep 17 00:00:00 2001 From: SIGSTACKFAULT Date: Sun, 26 May 2024 21:03:33 -0400 Subject: [PATCH 15/23] hide the details of search's submodules --- components/search/src/lib.rs | 7 ++++--- components/site/src/lib.rs | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/components/search/src/lib.rs b/components/search/src/lib.rs index 5795be300a..ac02b4224a 100644 --- a/components/search/src/lib.rs +++ b/components/search/src/lib.rs @@ -1,11 +1,12 @@ -pub mod elasticlunr; -pub mod fuse; +mod elasticlunr; +mod fuse; use libs::ammonia; use libs::once_cell::sync::Lazy; use std::collections::{HashMap, HashSet}; -pub use elasticlunr::ELASTICLUNR_JS; +pub use 
elasticlunr::{build_index as build_elasticlunr, ELASTICLUNR_JS}; +pub use fuse::build_index as build_fuse; pub static AMMONIA: Lazy> = Lazy::new(|| { let mut clean_content = HashSet::new(); diff --git a/components/site/src/lib.rs b/components/site/src/lib.rs index 87c2d59121..2d3d578c21 100644 --- a/components/site/src/lib.rs +++ b/components/site/src/lib.rs @@ -803,10 +803,10 @@ impl Site { let library = self.library.read().unwrap(); let content = match &self.config.search.index_format { IndexFormat::ElasticlunrJavascript | IndexFormat::ElasticlunrJson => { - search::elasticlunr::build_index(lang, &library, &self.config)? + search::build_elasticlunr(lang, &library, &self.config)? } IndexFormat::FuseJson | IndexFormat::FuseJavascript => { - search::fuse::build_index(lang, &library, &self.config.search)? + search::build_fuse(lang, &library, &self.config.search)? } }; drop(library); // no need to hold on to this guard while writing From 2b9c1e3be03b798f0447b272a98eb4cb22477c38 Mon Sep 17 00:00:00 2001 From: SIGSTACKFAULT Date: Tue, 28 May 2024 09:37:10 -0400 Subject: [PATCH 16/23] optionally include path --- components/search/src/fuse.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/components/search/src/fuse.rs b/components/search/src/fuse.rs index b636792003..50f66cf1dc 100644 --- a/components/search/src/fuse.rs +++ b/components/search/src/fuse.rs @@ -7,9 +7,10 @@ use libs::serde_json; pub fn build_index(lang: &str, library: &Library, config: &Search) -> Result { #[derive(serde::Serialize)] struct Item<'a> { - path: &'a str, + url: &'a str, title: Option<&'a str>, body: Option, // AMMONIA.clean has to allocate anyway + path: Option<&'a str>, } let mut items: Vec = Vec::new(); for (_, section) in &library.sections { @@ -18,7 +19,7 @@ pub fn build_index(lang: &str, library: &Library, config: &Search) -> Result Result Result Date: Tue, 28 May 2024 09:38:23 -0400 Subject: [PATCH 17/23] explain include_path better --- 
docs/content/documentation/getting-started/configuration.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/content/documentation/getting-started/configuration.md b/docs/content/documentation/getting-started/configuration.md index f94b83f9cc..f799d62863 100644 --- a/docs/content/documentation/getting-started/configuration.md +++ b/docs/content/documentation/getting-started/configuration.md @@ -174,7 +174,7 @@ include_title = true include_description = false # Whether to include the RFC3339 datetime of the page in the search index include_date = false -# Whether to include the path of the page/section in the index +# Whether to include the path of the page/section in the index (the full url is always included) include_path = false # Whether to include the rendered content of the page/section in the index include_content = true From b4b2c5258e0bc97f7cc48e5f8f7124149b00ce14 Mon Sep 17 00:00:00 2001 From: SIGSTACKFAULT Date: Tue, 28 May 2024 09:42:07 -0400 Subject: [PATCH 18/23] remove references to stork --- docs/content/documentation/content/search.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/content/documentation/content/search.md b/docs/content/documentation/content/search.md index c36309fba8..b66aa06f7a 100644 --- a/docs/content/documentation/content/search.md +++ b/docs/content/documentation/content/search.md @@ -38,12 +38,10 @@ See for details. ### Fuse -Compatible with [fuse.js](https://www.fusejs.io/), [tinysearch](https://github.com/tinysearch/tinysearch), and almost -compatible with [stork](https://stork-search.net/) +Compatible with [fuse.js](https://www.fusejs.io/) and [tinysearch](https://github.com/tinysearch/tinysearch). 
```toml # config.toml [search] index_format = "fuse_javascript" # or "fuse_json" ``` - From 2ecb379768518ef344dc9373700cbe7fbd558737 Mon Sep 17 00:00:00 2001 From: SIGSTACKFAULT Date: Tue, 28 May 2024 09:48:25 -0400 Subject: [PATCH 19/23] replace if with match --- components/search/src/fuse.rs | 38 ++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/components/search/src/fuse.rs b/components/search/src/fuse.rs index 50f66cf1dc..eb9430eb7a 100644 --- a/components/search/src/fuse.rs +++ b/components/search/src/fuse.rs @@ -20,34 +20,36 @@ pub fn build_index(lang: &str, library: &Library, config: &Search) -> Result Some(§ion.meta.title.as_deref().unwrap_or_default()), + false => None, }, - body: if config.include_content { - Some(super::AMMONIA.clean(§ion.content).to_string()) - } else { - None + body: match config.include_content { + true => Some(super::AMMONIA.clean(§ion.content).to_string()), + false => None, + }, + path: match config.include_path { + true => Some(§ion.path), + false => None, }, - path: if config.include_path { Some(§ion.path) } else { None }, }); for page in §ion.pages { let page = &library.pages[page]; if page.meta.in_search_index { items.push(Item { url: &page.permalink, - title: if config.include_title { - Some(&page.meta.title.as_deref().unwrap_or_default()) - } else { - None + title: match config.include_title { + true => Some(&page.meta.title.as_deref().unwrap_or_default()), + false => None, + }, + body: match config.include_content { + true => Some(super::AMMONIA.clean(&page.content).to_string()), + false => None, }, - body: if config.include_content { - Some(super::AMMONIA.clean(&page.content).to_string()) - } else { - None + path: match config.include_path { + true => Some(&page.path), + false => None, }, - path: if config.include_path { Some(&page.path) } else { None }, }) } } From 16ea90a9af7482454c425821927910c3c6862ed0 Mon Sep 17 00:00:00 2001 From: SIGSTACKFAULT Date: Tue, 28 May 2024 
09:53:19 -0400 Subject: [PATCH 20/23] support include_description --- components/search/src/fuse.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/components/search/src/fuse.rs b/components/search/src/fuse.rs index eb9430eb7a..c122c0a896 100644 --- a/components/search/src/fuse.rs +++ b/components/search/src/fuse.rs @@ -9,6 +9,7 @@ pub fn build_index(lang: &str, library: &Library, config: &Search) -> Result<Str struct Item<'a> { url: &'a str, title: Option<&'a str>, + description: Option<&'a str>, body: Option<String>, // AMMONIA.clean has to allocate anyway path: Option<&'a str>, } @@ -24,6 +25,10 @@ pub fn build_index(lang: &str, library: &Library, config: &Search) -> Result<Str true => Some(&section.meta.title.as_deref().unwrap_or_default()), false => None, }, + description: match config.include_description { + true => Some(&section.meta.description.as_deref().unwrap_or_default()), + false => None, + }, body: match config.include_content { true => Some(super::AMMONIA.clean(&section.content).to_string()), false => None, @@ -42,6 +47,10 @@ pub fn build_index(lang: &str, library: &Library, config: &Search) -> Result<Str true => Some(&page.meta.title.as_deref().unwrap_or_default()), false => None, }, + description: match config.include_description { + true => Some(&page.meta.description.as_deref().unwrap_or_default()), + false => None, + }, body: match config.include_content { true => Some(super::AMMONIA.clean(&page.content).to_string()), false => None, From 8c3e79bc5a0cbd36dc23ff41112cb9eb65cd7e7e Mon Sep 17 00:00:00 2001 From: SIGSTACKFAULT Date: Tue, 28 May 2024 09:54:27 -0400 Subject: [PATCH 21/23] specify "permalink" --- docs/content/documentation/getting-started/configuration.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/content/documentation/getting-started/configuration.md b/docs/content/documentation/getting-started/configuration.md index f799d62863..9ca57c85bc 100644 --- a/docs/content/documentation/getting-started/configuration.md +++ 
b/docs/content/documentation/getting-started/configuration.md @@ -174,7 +174,7 @@ include_title = true include_description = false # Whether to include the RFC3339 datetime of the page in the search index include_date = false -# Whether to include the path of the page/section in the index (the full url is always included) +# Whether to include the path of the page/section in the index (the permalink is always included) include_path = false # Whether to include the rendered content of the page/section in the index include_content = true From c0fb3b127428796acda2fea99b4ff082fb33d7c9 Mon Sep 17 00:00:00 2001 From: SIGSTACKFAULT Date: Tue, 28 May 2024 10:24:29 -0400 Subject: [PATCH 22/23] move body cleaning and truncation to a function --- components/search/src/elasticlunr.rs | 14 +++----------- components/search/src/fuse.rs | 12 ++++++++++-- components/search/src/lib.rs | 25 ++++++++++++++++++++++++- 3 files changed, 37 insertions(+), 14 deletions(-) diff --git a/components/search/src/elasticlunr.rs b/components/search/src/elasticlunr.rs index f37458019c..af7583ef8a 100644 --- a/components/search/src/elasticlunr.rs +++ b/components/search/src/elasticlunr.rs @@ -5,6 +5,8 @@ use libs::elasticlunr::{lang, Index, IndexBuilder}; use libs::time::format_description::well_known::Rfc3339; use libs::time::OffsetDateTime; +use crate::clean_and_truncate_body; + pub const ELASTICLUNR_JS: &str = include_str!("elasticlunr.min.js"); fn build_fields(search_config: &Search, mut index: IndexBuilder) -> IndexBuilder { @@ -69,17 +71,7 @@ fn fill_index( } if search_config.include_content { - let body = super::AMMONIA.clean(content).to_string(); - if let Some(truncate_len) = search_config.truncate_content_length { - // Not great for unicode - // TODO: fix it like the truncate in Tera - match body.char_indices().nth(truncate_len) { - None => row.push(body), - Some((idx, _)) => row.push((body[..idx]).to_string()), - }; - } else { - row.push(body); - }; + 
row.push(clean_and_truncate_body(search_config.truncate_content_length, content)); } row } diff --git a/components/search/src/fuse.rs b/components/search/src/fuse.rs index c122c0a896..604a0d9fcd 100644 --- a/components/search/src/fuse.rs +++ b/components/search/src/fuse.rs @@ -3,6 +3,8 @@ use content::Library; use errors::Result; use libs::serde_json; +use crate::clean_and_truncate_body; + /// build index in Fuse.js format. pub fn build_index(lang: &str, library: &Library, config: &Search) -> Result<String> { #[derive(serde::Serialize)] @@ -30,7 +32,10 @@ pub fn build_index(lang: &str, library: &Library, config: &Search) -> Result<Str false => None, }, body: match config.include_content { - true => Some(super::AMMONIA.clean(&section.content).to_string()), + true => Some(clean_and_truncate_body( + config.truncate_content_length, + &section.content, + )), false => None, }, path: match config.include_path { @@ -52,7 +57,10 @@ pub fn build_index(lang: &str, library: &Library, config: &Search) -> Result<Str false => None, }, body: match config.include_content { - true => Some(super::AMMONIA.clean(&page.content).to_string()), + true => Some(super::clean_and_truncate_body( + config.truncate_content_length, + &page.content, + )), false => None, }, path: match config.include_path { diff --git a/components/search/src/lib.rs b/components/search/src/lib.rs index ac02b4224a..cf2908bbdb 100644 --- a/components/search/src/lib.rs +++ b/components/search/src/lib.rs @@ -8,7 +8,7 @@ use std::collections::{HashMap, HashSet}; pub use elasticlunr::{build_index as build_elasticlunr, ELASTICLUNR_JS}; pub use fuse::build_index as build_fuse; -pub static AMMONIA: Lazy<ammonia::Builder<'static>> = Lazy::new(|| { +static AMMONIA: Lazy<ammonia::Builder<'static>> = Lazy::new(|| { let mut clean_content = HashSet::new(); clean_content.insert("script"); clean_content.insert("style"); @@ -23,3 +23,26 @@ pub static AMMONIA: Lazy<ammonia::Builder<'static>> = Lazy::new(|| { .clean_content_tags(clean_content); builder }); + +/// uses ammonia to clean the body, and truncates it to `truncate_content_length` +pub fn 
clean_and_truncate_body(truncate_content_length: Option<usize>, body: &str) -> String { + let mut clean = AMMONIA.clean(body).to_string(); + if let Some(new_len) = truncate_content_length { + clean.truncate(clean.char_indices().nth(new_len).map(|(i, _)| i).unwrap_or(clean.len())) + } + clean +} + +#[cfg(test)] +#[test] +fn clean_and_truncate_body_test() { + assert_eq!(clean_and_truncate_body(None, "hello world"), "hello world"); + assert_eq!( + clean_and_truncate_body(None, "hello world"), + "hello world" + ); + assert_eq!(clean_and_truncate_body(Some(100), "hello"), "hello"); + assert_eq!(clean_and_truncate_body(Some(2), "hello"), "he"); + assert_eq!(clean_and_truncate_body(Some(6), "hello \u{202E} world"), "hello "); + assert_eq!(clean_and_truncate_body(Some(7), "hello \u{202E} world"), "hello \u{202e}"); +} From 1535965e07f49514e688ffd348e4ac995c60b031 Mon Sep 17 00:00:00 2001 From: SIGSTACKFAULT Date: Tue, 28 May 2024 10:27:19 -0400 Subject: [PATCH 23/23] update truncate_content_length docs to specify *code points* --- components/config/src/config/search.rs | 2 +- docs/content/documentation/getting-started/configuration.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/components/config/src/config/search.rs b/components/config/src/config/search.rs index 301d53d56f..e259b57bf9 100644 --- a/components/config/src/config/search.rs +++ b/components/config/src/config/search.rs @@ -34,7 +34,7 @@ pub struct Search { /// Includes the whole content in the search index. Ok for small sites but becomes /// too big on large sites. `true` by default. pub include_content: bool, - /// Optionally truncate the content down to `n` chars. This might cut content in a word + /// Optionally truncate the content down to `n` code points. This might cut content in a word pub truncate_content_length: Option<usize>, /// Includes the description in the search index. When the site becomes too large, you can switch /// to that instead. 
`false` by default diff --git a/docs/content/documentation/getting-started/configuration.md b/docs/content/documentation/getting-started/configuration.md index 9ca57c85bc..1130cecbfe 100644 --- a/docs/content/documentation/getting-started/configuration.md +++ b/docs/content/documentation/getting-started/configuration.md @@ -178,7 +178,7 @@ include_date = false include_path = false # Whether to include the rendered content of the page/section in the index include_content = true -# At which character to truncate the content to. Useful if you have a lot of pages and the index would +# At which code point to truncate the content to. Useful if you have a lot of pages and the index would # become too big to load on the site. Defaults to not being set. # truncate_content_length = 100