Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for Fuse.js search format #2507

Merged
merged 23 commits into from
May 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
cc11d7a
initial "just barely works" Fuse.js support
SIGSTACKFAULT May 24, 2024
39a1e71
implement FuseJavascript; refactor index_for_lang
SIGSTACKFAULT May 24, 2024
e291942
support search config
SIGSTACKFAULT May 24, 2024
5c310c3
move fuse index building to its own file
SIGSTACKFAULT May 24, 2024
1939277
update doc of Search.index_format
SIGSTACKFAULT May 24, 2024
35bd4e0
update config docs
SIGSTACKFAULT May 24, 2024
4cc9a30
update search documentation
SIGSTACKFAULT May 24, 2024
1dbddce
use &str where possible
SIGSTACKFAULT May 24, 2024
b6765e3
use libs::serde_json
SIGSTACKFAULT May 24, 2024
7247d86
move extension logic to IndexFormat
SIGSTACKFAULT May 24, 2024
0c29dc8
move the entire filename logic inside IndexFormat
SIGSTACKFAULT May 24, 2024
c1c6eb4
move elasticlunr to its own module
SIGSTACKFAULT May 24, 2024
d26b612
only create elasticlunr.min.js if we're actually using elasticlunr
SIGSTACKFAULT May 24, 2024
df6faab
move ELASTICLUNR_JS to elasticlunr.js
SIGSTACKFAULT May 27, 2024
3ddb394
hide the details of search's submodules
SIGSTACKFAULT May 27, 2024
2b9c1e3
optionally include path
SIGSTACKFAULT May 28, 2024
2298a29
explain include_path better
SIGSTACKFAULT May 28, 2024
b4b2c52
remove references to stork
SIGSTACKFAULT May 28, 2024
2ecb379
replace if with match
SIGSTACKFAULT May 28, 2024
16ea90a
support include_description
SIGSTACKFAULT May 28, 2024
8c3e79b
specify "permalink"
SIGSTACKFAULT May 28, 2024
c0fb3b1
move body cleaning and truncation to a function
SIGSTACKFAULT May 28, 2024
1535965
update truncate_content_length docs to specify *code points*
SIGSTACKFAULT May 28, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 83 additions & 78 deletions Cargo.lock

Large diffs are not rendered by default.

21 changes: 19 additions & 2 deletions components/config/src/config/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,23 @@ pub enum IndexFormat {
ElasticlunrJson,
#[default]
ElasticlunrJavascript,
FuseJson,
FuseJavascript,
}

impl IndexFormat {
/// file extension which ought to be used for this index format.
fn extension(&self) -> &'static str {
match *self {
IndexFormat::ElasticlunrJavascript | IndexFormat::FuseJavascript => "js",
IndexFormat::ElasticlunrJson | IndexFormat::FuseJson => "json",
}
}

/// the filename which ought to be used for this format and language `lang`
pub fn filename(&self, lang: &str) -> String {
format!("search_index.{}.{}", lang, self.extension())
}
}

#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
Expand All @@ -17,7 +34,7 @@ pub struct Search {
/// Includes the whole content in the search index. Ok for small sites but becomes
/// too big on large sites. `true` by default.
pub include_content: bool,
/// Optionally truncate the content down to `n` chars. This might cut content in a word
/// Optionally truncate the content down to `n` code points. This might cut content in a word
pub truncate_content_length: Option<usize>,
/// Includes the description in the search index. When the site becomes too large, you can switch
/// to that instead. `false` by default
Expand All @@ -26,7 +43,7 @@ pub struct Search {
pub include_date: bool,
/// Include the path of the page in the search index. `false` by default.
pub include_path: bool,
/// Foramt of the search index to be produced. Javascript by default
/// Format of the search index to be produced. 'elasticlunr_javascript' by default.
pub index_format: IndexFormat,
}

Expand Down
1 change: 1 addition & 0 deletions components/search/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ errors = { path = "../errors" }
content = { path = "../content" }
config = { path = "../config" }
libs = { path = "../libs" }
serde = { version = "1.0", features = ["derive"] }
236 changes: 236 additions & 0 deletions components/search/src/elasticlunr.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
use config::{Config, Search};
use content::{Library, Section};
use errors::{bail, Result};
use libs::elasticlunr::{lang, Index, IndexBuilder};
use libs::time::format_description::well_known::Rfc3339;
use libs::time::OffsetDateTime;

use crate::clean_and_truncate_body;

/// Contents of `elasticlunr.min.js` (resolved relative to this source file),
/// embedded into the binary at compile time by `include_str!`.
pub const ELASTICLUNR_JS: &str = include_str!("elasticlunr.min.js");

/// Registers on `index` every field enabled in `search_config` and returns the builder.
///
/// Fields are added in a fixed order: `title`, `description`, `date`, `path`, `body`.
/// The `path` field is the only one registered with a custom tokenizer
/// (`path_tokenizer`).
fn build_fields(search_config: &Search, mut index: IndexBuilder) -> IndexBuilder {
    // Plain fields that come before `path`, paired with the flag that enables them.
    let leading_fields = [
        (search_config.include_title, "title"),
        (search_config.include_description, "description"),
        (search_config.include_date, "date"),
    ];
    for (enabled, name) in leading_fields {
        if enabled {
            index = index.add_field(name);
        }
    }

    if search_config.include_path {
        index = index.add_field_with_tokenizer("path", Box::new(path_tokenizer));
    }

    if search_config.include_content {
        index = index.add_field("body");
    }

    index
}

/// Splits a path into lowercase tokens.
///
/// Whitespace, `-` and `/` all act as separators; empty pieces (produced by
/// leading, trailing or consecutive separators) are dropped.
fn path_tokenizer(text: &str) -> Vec<String> {
    let is_separator = |c: char| matches!(c, '-' | '/') || c.is_whitespace();

    let mut tokens = Vec::new();
    for piece in text.split(is_separator) {
        if piece.is_empty() {
            continue;
        }
        tokens.push(piece.trim().to_lowercase());
    }
    tokens
}

/// Builds the row of field values for one document.
///
/// Values are emitted in the same order in which `build_fields` registers the
/// fields (title, description, date, path, body), and only for the fields
/// enabled in `search_config`. Every enabled field contributes exactly one
/// entry so that the row stays positionally aligned with the index fields:
/// a missing title, description or date becomes an empty string.
///
/// * `datetime` is formatted as RFC 3339; if it is `None` or cannot be
///   formatted, an empty string is used.
/// * `content` is passed through `clean_and_truncate_body` together with
///   `search_config.truncate_content_length`.
fn fill_index(
    search_config: &Search,
    title: &Option<String>,
    description: &Option<String>,
    datetime: &Option<OffsetDateTime>,
    path: &str,
    content: &str,
) -> Vec<String> {
    let mut row = vec![];

    if search_config.include_title {
        row.push(title.clone().unwrap_or_default());
    }

    if search_config.include_description {
        row.push(description.clone().unwrap_or_default());
    }

    if search_config.include_date {
        // Always push a value here. Sections never carry a date, and skipping the
        // entry would shift the path/body values into the wrong field columns.
        let formatted =
            datetime.as_ref().and_then(|date| date.format(&Rfc3339).ok()).unwrap_or_default();
        row.push(formatted);
    }

    if search_config.include_path {
        row.push(path.to_string());
    }

    if search_config.include_content {
        row.push(clean_and_truncate_body(search_config.truncate_content_length, content));
    }

    row
}

/// Builds the Elasticlunr index for every section (and its pages) written in
/// `lang` and returns it serialized as JSON.
///
/// The fields of the index are taken from the search settings of `lang` in
/// `config.languages`.
///
/// Errors if `lang` is not a language available in Elasticlunr.
// TODO: is making `in_search_index` apply to subsections of a `false` section useful?
pub fn build_index(lang: &str, library: &Library, config: &Config) -> Result<String> {
    let language = match lang::from_code(lang) {
        Some(language) => language,
        None => bail!("Tried to build search index for language {} which is not supported", lang),
    };
    let search_config = &config.languages[lang].search;

    let builder = build_fields(search_config, IndexBuilder::with_language(language));
    let mut index = builder.build();

    let sections_in_lang = library.sections.values().filter(|section| section.lang == lang);
    for section in sections_in_lang {
        add_section_to_index(&mut index, section, library, search_config);
    }

    Ok(index.to_json())
}

/// Adds `section` and its pages to `index`, each keyed by its permalink.
///
/// Nothing is added when the section opts out via `in_search_index`. A section
/// that redirects elsewhere is not indexed itself, but its pages still are.
/// Pages that opt out via their own `in_search_index` are skipped.
fn add_section_to_index(
    index: &mut Index,
    section: &Section,
    library: &Library,
    search_config: &Search,
) {
    if !section.meta.in_search_index {
        return;
    }

    // Don't index redirecting sections
    if section.meta.redirect_to.is_none() {
        // Sections have no date of their own.
        let row = fill_index(
            search_config,
            &section.meta.title,
            &section.meta.description,
            &None,
            &section.path,
            &section.content,
        );
        index.add_doc(&section.permalink, &row);
    }

    let indexed_pages = section
        .pages
        .iter()
        .map(|key| &library.pages[key])
        .filter(|page| page.meta.in_search_index);

    for page in indexed_pages {
        let row = fill_index(
            search_config,
            &page.meta.title,
            &page.meta.description,
            &page.meta.datetime,
            &page.path,
            &page.content,
        );
        index.add_doc(&page.permalink, &row);
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use config::Config;
    use libs::elasticlunr::IndexBuilder;

    /// Title, description, path and content shared by the `fill_index` tests.
    fn sample_doc() -> (Option<String>, Option<String>, String, String) {
        (
            Some("A title".to_string()),
            Some("A description".to_string()),
            "/a/page/".to_string(),
            "Some content".to_string(),
        )
    }

    #[test]
    fn can_build_fields() {
        // A fresh index is built for every configuration under test.
        let index_for = |search: &Search| build_fields(search, IndexBuilder::new()).build();

        let mut config = Config::default();
        assert_eq!(index_for(&config.search).get_fields(), vec!["title", "body"]);

        config.search.include_content = false;
        config.search.include_description = true;
        assert_eq!(index_for(&config.search).get_fields(), vec!["title", "description"]);

        config.search.include_content = true;
        assert_eq!(index_for(&config.search).get_fields(), vec!["title", "description", "body"]);

        config.search.include_title = false;
        assert_eq!(index_for(&config.search).get_fields(), vec!["description", "body"]);
    }

    #[test]
    fn can_fill_index_default() {
        let config = Config::default();
        let (title, description, path, content) = sample_doc();

        let row = fill_index(&config.search, &title, &description, &None, &path, &content);

        // Default settings: title and body only.
        assert_eq!(row.len(), 2);
        assert_eq!(row[0], title.unwrap());
        assert_eq!(row[1], content);
    }

    #[test]
    fn can_fill_index_description() {
        let mut config = Config::default();
        config.search.include_description = true;
        let (title, description, path, content) = sample_doc();

        let row = fill_index(&config.search, &title, &description, &None, &path, &content);

        assert_eq!(row.len(), 3);
        assert_eq!(row[0], title.unwrap());
        assert_eq!(row[1], description.unwrap());
        assert_eq!(row[2], content);
    }

    #[test]
    fn can_fill_index_truncated_content() {
        let mut config = Config::default();
        config.search.truncate_content_length = Some(5);
        let (title, description, path, content) = sample_doc();

        let row = fill_index(&config.search, &title, &description, &None, &path, &content);

        assert_eq!(row.len(), 2);
        assert_eq!(row[0], title.unwrap());
        // Only the first five characters of the body are kept.
        assert_eq!(row[1], content[..5]);
    }

    #[test]
    fn can_fill_index_date() {
        let mut config = Config::default();
        config.search.include_date = true;
        let (title, description, path, content) = sample_doc();
        let datetime = Some(OffsetDateTime::parse("2023-01-31T00:00:00Z", &Rfc3339).unwrap());

        let row = fill_index(&config.search, &title, &description, &datetime, &path, &content);

        // The date sits between the title and the body.
        assert_eq!(row.len(), 3);
        assert_eq!(row[0], title.unwrap());
        assert_eq!(row[1], "2023-01-31T00:00:00Z");
        assert_eq!(row[2], content);
    }
}
76 changes: 76 additions & 0 deletions components/search/src/fuse.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
use config::Search;
use content::Library;
use errors::Result;
use libs::serde_json;

use crate::clean_and_truncate_body;

/// Builds the search index for `lang` in Fuse.js format.
///
/// The result is a JSON array with one object per indexed section or page.
/// Each object has the keys `url` (the permalink), `title`, `description`,
/// `body` and `path`; a key whose field is disabled in `config` is serialized
/// as `null`. `config.include_date` is not consulted here.
///
/// Which documents are indexed:
/// * only sections whose language is `lang` and which have `in_search_index` set;
/// * a section that redirects elsewhere is not indexed itself, but its pages
///   still are, matching the elasticlunr index builder;
/// * pages that have `in_search_index` unset are skipped.
///
/// Errors if serializing the items to JSON fails.
pub fn build_index(lang: &str, library: &Library, config: &Search) -> Result<String> {
    #[derive(serde::Serialize)]
    struct Item<'a> {
        url: &'a str,
        title: Option<&'a str>,
        description: Option<&'a str>,
        body: Option<String>, // AMMONIA.clean has to allocate anyway
        path: Option<&'a str>,
    }

    impl<'a> Item<'a> {
        /// Builds one entry, leaving every field disabled in `config` as `None`.
        /// A missing title or description becomes an empty string.
        fn new(
            config: &Search,
            url: &'a str,
            title: &'a Option<String>,
            description: &'a Option<String>,
            content: &str,
            path: &'a str,
        ) -> Self {
            Item {
                url,
                title: config.include_title.then(|| title.as_deref().unwrap_or_default()),
                description: config
                    .include_description
                    .then(|| description.as_deref().unwrap_or_default()),
                body: config
                    .include_content
                    .then(|| clean_and_truncate_body(config.truncate_content_length, content)),
                path: config.include_path.then(|| path),
            }
        }
    }

    let mut items: Vec<Item> = Vec::new();
    for section in library.sections.values() {
        if section.lang != lang || !section.meta.in_search_index {
            continue;
        }

        // Don't index redirecting sections themselves; their pages are still indexed below.
        if section.meta.redirect_to.is_none() {
            items.push(Item::new(
                config,
                &section.permalink,
                &section.meta.title,
                &section.meta.description,
                &section.content,
                &section.path,
            ));
        }

        for key in &section.pages {
            let page = &library.pages[key];
            if page.meta.in_search_index {
                items.push(Item::new(
                    config,
                    &page.permalink,
                    &page.meta.title,
                    &page.meta.description,
                    &page.content,
                    &page.path,
                ));
            }
        }
    }

    Ok(serde_json::to_string(&items)?)
}
Loading