diff --git a/README.md b/README.md index da12641..37f399f 100644 --- a/README.md +++ b/README.md @@ -16,12 +16,11 @@ Search engine written in Rust, based on an inverted index on disk. **Index construction** - [x] In-memory datasets index construction; - [x] Proper vocabulary and paths on disk; -- [ ] Spelling correction index. +- [ ] Spelling correction index: in progress. **Queries** - [x] Tf-idf ranked retrieval; - [x] Window computation; -- [ ] FIle content retrieval. **Evaluation** - [ ] Query speed; @@ -30,13 +29,7 @@ Search engine written in Rust, based on an inverted index on disk. **Client** - [x] CLI; -- [ ] Web interface. - -## Crates in use -- [stemmer-rs](https://github.com/lise-henry/stemmer-rs) -- [tokenizers](https://github.com/huggingface/tokenizers) -- [indicatif](https://github.com/console-rs/indicatif) -- [fxhash](https://github.com/cbreeden/fxhash) +- [x] Web interface. ## References [Introduction to Information Retrieval](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) - Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze diff --git a/makefile b/makefile new file mode 100644 index 0000000..15f5363 --- /dev/null +++ b/makefile @@ -0,0 +1,8 @@ +web: + cargo run --release --bin server $(index_name) + +cli: + cargo run --release --bin search $(index_name) ${action} + +test: + cargo test --release diff --git a/search/Cargo.toml b/search/Cargo.toml index 7a086f6..fa90ac5 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -6,9 +6,9 @@ edition = "2021" [dependencies] rand = "0.8" -tokenizers = { version = "0.15.0", features = ["http"] } rust-stemmers = "1.2.0" rayon = "1.8.0" -indicatif = {version = "0.17.0", features = ["rayon", "improved_unicode"]} +indicatif = { version = "0.17.0", features = ["rayon", "improved_unicode"] } fxhash = "0.2.1" tempdir = "0.3.7" +regex = "1" diff --git a/search/src/index/builder.rs b/search/src/index/builder.rs index 447fbed..01d70d3 100644 --- a/search/src/index/builder.rs +++ b/search/src/index/builder.rs @@ -1,19 +1,17 @@ use super::{ documents::{Document, Documents}, postings::{PostingEntry, PostingList, Postings}, - text, + preprocessor::Preprocessor, vocabulary::Vocabulary, InMemoryIndex, }; use indicatif::{ParallelProgressIterator, ProgressStyle}; use rayon::prelude::*; -use rust_stemmers::Stemmer; use std::{ collections::{BTreeMap, HashMap}, fs, sync::Mutex, }; -use tokenizers::Tokenizer; const PROGRESS_STYLE: &str = "Documents per second: {per_sec:<3}\n\n[{elapsed_precise}] [{bar:50}] {pos}/{len} [{eta_precise}]"; @@ -21,14 +19,14 @@ const PROGRESS_CHARS: &str = "=> "; const CUTOFF_THRESHOLD: f64 = 0.8; -pub fn build_index(input_dir: &str, output_path: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) { - let index: InMemoryIndex = build_in_memory(input_dir, tokenizer, stemmer); +pub fn build_index(input_dir: &str, output_path: &str, preprocessor: &Preprocessor) { + let index: InMemoryIndex = build_in_memory(input_dir, preprocessor); Postings::write_postings(&index, output_path); Vocabulary::write_vocabulary(&index, output_path); Documents::write_documents(&index.documents, output_path); } -fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) -> InMemoryIndex { +fn build_in_memory(input_dir: &str, preprocessor: &Preprocessor) -> InMemoryIndex { let files: Vec = fs::read_dir(input_dir) .expect("error while retrieving input directory content") .map(|p| p.unwrap()) @@ -54,7 +52,7 @@ fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) -> ) .for_each(|d| { 
let file_content = fs::read_to_string(d.path()).expect("error while reading file"); - let tokens = text::tokenize_and_stem(tokenizer, stemmer, &file_content); + let tokens = preprocessor.tokenize_and_stem(&file_content); let mut doc_id = doc_id_mutex.lock().unwrap(); diff --git a/search/src/index/documents.rs b/search/src/index/documents.rs index ebec59c..66264ec 100644 --- a/search/src/index/documents.rs +++ b/search/src/index/documents.rs @@ -65,3 +65,61 @@ impl Documents { self.docs[doc_id as usize].path.clone() } } + +#[cfg(test)] +mod tests { + use crate::test_utils::utils::create_temporary_file_path; + + use super::*; + + #[test] + fn test_write_and_load() { + let dir = create_temporary_file_path("docs_unit"); + + let documents = vec![ + Document { + path: "document1.txt".to_string(), + lenght: 100, + }, + Document { + path: "document2.txt".to_string(), + lenght: 150, + }, + ]; + + Documents::write_documents(&documents, &dir); + let loaded_documents = Documents::load_documents(&dir); + + assert_eq!(loaded_documents.get_num_documents(), documents.len() as u32); + + for i in 0..documents.len() { + assert_eq!(loaded_documents.get_doc_path(i as u32), documents[i].path); + assert_eq!(loaded_documents.get_doc_len(i as u32), documents[i].lenght); + } + } + + #[test] + fn test_methods() { + let documents = vec![ + Document { + path: "document1.txt".to_string(), + lenght: 100, + }, + Document { + path: "document2.txt".to_string(), + lenght: 150, + }, + ]; + + let doc_collection = Documents { + docs: documents.clone(), + }; + + assert_eq!(doc_collection.get_num_documents(), documents.len() as u32); + + for i in 0..documents.len() { + assert_eq!(doc_collection.get_doc_path(i as u32), documents[i].path); + assert_eq!(doc_collection.get_doc_len(i as u32), documents[i].lenght); + } + } +} diff --git a/search/src/index/mod.rs b/search/src/index/mod.rs index 7895d69..ac2c3cc 100644 --- a/search/src/index/mod.rs +++ b/search/src/index/mod.rs @@ -1,17 +1,15 @@ mod builder; mod documents; mod postings; -mod text; +mod preprocessor; mod utils; mod vocabulary; -use rust_stemmers::Stemmer; -use std::collections::BTreeMap; -use tokenizers::Tokenizer; - use self::documents::{Document, Documents}; use self::postings::{PostingList, Postings}; +use self::preprocessor::Preprocessor; use self::vocabulary::Vocabulary; +use std::collections::BTreeMap; pub const POSTINGS_EXTENSION: &str = ".postings"; pub const OFFSETS_EXTENSION: &str = ".offsets"; @@ -22,8 +20,7 @@ pub struct Index { vocabulary: Vocabulary, postings: Postings, documents: Documents, - tokenizer: Tokenizer, - stemmer: Stemmer, + preprocessor: Preprocessor, } pub struct InMemoryIndex { @@ -33,19 +30,16 @@ pub struct InMemoryIndex { } impl Index { - pub fn build_index(input_path: &str, output_path: &str, tokenizer_path: &str) { - let tokenizer = text::load_tokenizer(tokenizer_path, false); - let stemmer = text::load_stemmer(); - builder::build_index(input_path, output_path, &tokenizer, &stemmer); + pub fn build_index(input_path: &str, output_path: &str) { + builder::build_index(input_path, output_path, &Preprocessor::new()); } - pub fn load_index(input_path: &str, tokenizer_path: &str) -> Index { + pub fn load_index(input_path: &str) -> Index { Index { vocabulary: Vocabulary::load_vocabulary(input_path), postings: Postings::load_postings_reader(input_path), documents: Documents::load_documents(input_path), - tokenizer: text::load_tokenizer(tokenizer_path, false), - stemmer: text::load_stemmer(), + preprocessor: Preprocessor::new(), } } @@ -55,8 +49,8 
@@ impl Index { .map(|i| self.postings.load_postings_list(i)) } - pub fn tokenize_and_stem_query(&self, query: &str) -> Vec { - text::tokenize_and_stem(&self.tokenizer, &self.stemmer, query) + pub fn get_query_tokens(&self, query: &str) -> Vec { + self.preprocessor.tokenize_and_stem(query) } pub fn get_num_documents(&self) -> u32 { @@ -81,9 +75,9 @@ mod test { fn test_build() { let index_path = &create_temporary_dir_path(); - Index::build_index("test_data/docs", index_path, "test_data/test_tokenizer"); + Index::build_index("test_data/docs", index_path); - let mut idx = Index::load_index(index_path, "test_data/test_tokenizer"); + let mut idx = Index::load_index(index_path); for ele in ["hello", "man", "world"] { assert!(idx.vocabulary.get_term_index(ele).is_some()); diff --git a/search/src/index/preprocessor.rs b/search/src/index/preprocessor.rs new file mode 100644 index 0000000..8d050ac --- /dev/null +++ b/search/src/index/preprocessor.rs @@ -0,0 +1,42 @@ +use regex::Regex; +use rust_stemmers::{Algorithm, Stemmer}; + +pub struct Preprocessor { + stemmer: Stemmer, + regex: Regex, +} + +impl Preprocessor { + pub fn new() -> Preprocessor { + Preprocessor { + stemmer: Stemmer::create(Algorithm::English), + regex: Regex::new(r"[^a-zA-Z0-9\s]+").expect("error while building regex"), + } + } + + pub fn tokenize_and_stem(&self, text: &str) -> Vec { + self.regex + .replace_all(text, " ") + .split_whitespace() + .map(|t| t.to_lowercase()) + .map(|t| self.stemmer.stem(&t).to_string()) + .collect() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_tokenize_and_stem() { + let preprocessor = Preprocessor::new(); + + let text1 = "The quick brown, fox jumps over the lazy dog!!!"; + let result1 = preprocessor.tokenize_and_stem(text1); + assert_eq!( + result1, + vec!["the", "quick", "brown", "fox", "jump", "over", "the", "lazi", "dog"] + ); + } +} diff --git a/search/src/index/text.rs b/search/src/index/text.rs deleted file mode 100644 index 2f1bed7..0000000 --- a/search/src/index/text.rs +++ /dev/null @@ -1,37 +0,0 @@ -use rust_stemmers::{Algorithm, Stemmer}; -use std::{fs::create_dir_all, path::Path}; -use tokenizers::Tokenizer; - -pub fn load_tokenizer(filename: &str, force_download: bool) -> Tokenizer { - let path = Path::new(filename); - - if !path.exists() || force_download { - path.parent().map(create_dir_all); - - let identifier = path.file_name().unwrap().to_str().unwrap(); - - Tokenizer::from_pretrained(identifier, None) - .expect("error while retrieving tokenizer from the web") - .save(filename, false) - .expect("error while saving tokenizer to file"); - } - - Tokenizer::from_file(filename).expect("error while loading tokenizer from file") -} - -pub fn load_stemmer() -> Stemmer { - Stemmer::create(Algorithm::English) -} - -pub fn tokenize_and_stem(tokenizer: &Tokenizer, stemmer: &Stemmer, text: &str) -> Vec { - let tokenized_text = tokenizer - .encode(text, false) - .expect("error while tokenizing text"); - - tokenized_text - .get_tokens() - .iter() - .map(|t| t.to_lowercase()) - .map(|t| stemmer.stem(&t).to_string()) - .collect() -} diff --git a/search/src/index/vocabulary.rs b/search/src/index/vocabulary.rs index 2a83500..135bfba 100644 --- a/search/src/index/vocabulary.rs +++ b/search/src/index/vocabulary.rs @@ -2,6 +2,7 @@ use super::{utils, InMemoryIndex, VOCABULARY_ALPHA_EXTENSION}; use crate::disk::{bits_reader::BitsReader, bits_writer::BitsWriter}; use fxhash::FxHashMap; +#[allow(dead_code)] pub struct Vocabulary { term_to_index: FxHashMap, frequencies: Vec, @@ 
-67,8 +68,12 @@ impl Vocabulary { // build trigram index let mut trigram_index = FxHashMap::default(); + for (index, term) in index_to_term.iter().enumerate() { let term_chars: Vec = term.chars().collect(); + if term_chars.len() < 3 { + continue; + } for i in 0..term_chars.len() - 2 { let trigram = &term_chars[i..i + 3]; @@ -92,4 +97,72 @@ impl Vocabulary { pub fn get_term_index(&self, term: &str) -> Option { self.term_to_index.get(term).map(|i| *i) } + + #[allow(dead_code)] + + pub fn get_term_index_spellcheck(&self, term: &str) -> Option { + self.get_term_index(term) + .or_else(|| self.get_closest_index(term)) + } + #[allow(dead_code)] + + fn get_closest_index(&self, term: &str) -> Option { + let candidates = (0..term.len() - 2) + .map(|i| term[i..i + 3].to_string()) + .flat_map(|t| self.trigram_index.get(&t)) + .flat_map(|v| v.into_iter()); + + candidates + .min_by_key(|i| Self::distance(term, &self.index_to_term[**i])) + .map(|i| *i) + } + + #[allow(unused_variables)] + fn distance(s1: &str, s2: &str) -> u32 { + todo!() + } +} + +#[cfg(test)] +mod tests { + use std::collections::BTreeMap; + + use crate::{index::postings::PostingList, test_utils::utils::create_temporary_file_path}; + + use super::*; + + #[test] + fn test_write_and_load() { + let dir = create_temporary_file_path("vocab_unit"); + + let mut map = BTreeMap::new(); + map.insert("hello".to_string(), 0); + map.insert("world".to_string(), 0); + + let mut postings = Vec::new(); + postings.push(PostingList { + collection_frequency: 1, + documents: Vec::new(), + }); + postings.push(PostingList { + collection_frequency: 2, + documents: Vec::new(), + }); + + let index = InMemoryIndex { + term_index_map: map, + postings: postings, + documents: Vec::new(), + }; + + Vocabulary::write_vocabulary(&index, &dir); + let loaded_vocabulary = Vocabulary::load_vocabulary(&dir); + + assert_eq!(loaded_vocabulary.index_to_term, ["hello", "world"]); + assert_eq!(loaded_vocabulary.frequencies, [1, 2]); + + assert_eq!(*loaded_vocabulary.trigram_index.get("hel").unwrap(), [0]); + assert_eq!(*loaded_vocabulary.trigram_index.get("ell").unwrap(), [0]); + assert_eq!(*loaded_vocabulary.trigram_index.get("rld").unwrap(), [1]); + } } diff --git a/search/src/main.rs b/search/src/main.rs index 87539be..9935aee 100644 --- a/search/src/main.rs +++ b/search/src/main.rs @@ -1,6 +1,6 @@ use indicatif::HumanDuration; use search::index::Index; -use search::query::{DocumentResult, QueryProcessor}; +use search::query::{QueryProcessor, QueryResult}; use std::cmp::min; use std::env; use std::io::{self, Write}; @@ -10,22 +10,27 @@ use std::time::{Duration, Instant}; const NUM_TOP_RESULTS: usize = 10; const NUM_RESULTS: usize = 1_000_000; -fn print_results(results: &[DocumentResult], elapsed_time: Duration) { - if results.is_empty() { +fn print_results(result: QueryResult) { + println!("Search tokens: {:?}", result.tokens); + + if result.documents.is_empty() { println!("\nNo documents found\n"); return; } - println!("\nTop {} results:\n", min(results.len(), NUM_TOP_RESULTS)); + println!( + "\nTop {} results:\n", + min(result.documents.len(), NUM_TOP_RESULTS) + ); - for (i, doc) in results.iter().take(NUM_TOP_RESULTS).enumerate() { + for (i, doc) in result.documents.iter().take(NUM_TOP_RESULTS).enumerate() { println!("{:2}. 
score: {:>5.3}, path: {}", i + 1, doc.score, doc.path); } println!( "\nFetched {} documents in {} ms\n", - results.len(), - elapsed_time.as_millis() + result.documents.len(), + result.time_ms, ); } @@ -61,8 +66,7 @@ fn main() { let action = &args[2]; let build_index = action == "build"; - let index_path = format!("{}/index/index", base_path); - let tokenizer_path = format!("{}/tokenizer/roberta-large", base_path); + let index_path = format!("{}/index/idx", base_path); let docs_path = format!("{}/docs", base_path); if build_index { @@ -81,7 +85,7 @@ fn main() { let start_time = Instant::now(); - Index::build_index(&docs_path, &index_path, &tokenizer_path); + Index::build_index(&docs_path, &index_path); let elapsed_time = start_time.elapsed(); println!( "Index built in {}.\n\nLoad options:\n- CLI: cargo run --release --bin search {} load", @@ -92,7 +96,7 @@ fn main() { exit(0); } - let mut q = QueryProcessor::build_query_processor(&index_path, &tokenizer_path); + let mut q = QueryProcessor::build_query_processor(&index_path); println!( "Loaded search engine for directory: [{}]\n\nWrite a query and press enter.\n", @@ -102,10 +106,8 @@ fn main() { loop { let query = read_line("> "); - let start_time = Instant::now(); - let results = q.query(&query, NUM_RESULTS); - let elapsed_time = start_time.elapsed(); + let result = q.query(&query, NUM_RESULTS); - print_results(&results, elapsed_time); + print_results(result); } } diff --git a/search/src/query/mod.rs b/search/src/query/mod.rs index d80d2f9..968fa6b 100644 --- a/search/src/query/mod.rs +++ b/search/src/query/mod.rs @@ -1,4 +1,4 @@ -use std::{cmp::min, collections::HashMap}; +use std::{cmp::min, collections::HashMap, time::Instant}; use crate::index::Index; @@ -13,6 +13,12 @@ pub struct QueryProcessor { num_documents: u32, } +pub struct QueryResult { + pub tokens: Vec, + pub documents: Vec, + pub time_ms: u128, +} + pub struct DocumentResult { pub id: u32, pub path: String, @@ -26,11 +32,8 @@ struct DocumentScore { } impl QueryProcessor { - pub fn build_query_processor( - index_input_path: &str, - index_tokenizer_path: &str, - ) -> QueryProcessor { - let index = Index::load_index(index_input_path, index_tokenizer_path); + pub fn build_query_processor(index_input_path: &str) -> QueryProcessor { + let index = Index::load_index(index_input_path); let num_documents = index.get_num_documents(); QueryProcessor { @@ -39,26 +42,37 @@ impl QueryProcessor { } } - pub fn query(&mut self, query: &str, num_results: usize) -> Vec { - self.get_sorted_document_entries(query, num_results) + pub fn query(&mut self, query: &str, num_results: usize) -> QueryResult { + let start_time = Instant::now(); + + let tokens = self.index.get_query_tokens(query); + + let documents = self + .get_sorted_document_entries(tokens.clone(), num_results) .iter() .map(|e| DocumentResult { id: e.id, score: e.score, path: self.index.get_document_path(e.id), }) - .collect() + .collect(); + + let time_ms = start_time.elapsed().as_millis(); + + QueryResult { + tokens, + documents, + time_ms, + } } fn get_sorted_document_entries( &mut self, - query: &str, + tokens: Vec, num_results: usize, ) -> Vec { let mut scores: HashMap = HashMap::new(); - let tokens = self.index.tokenize_and_stem_query(query); - for (id, token) in tokens.iter().enumerate() { if let Some(postings) = self.index.get_term_postings(token) { let idf = (self.num_documents as f32 / postings.collection_frequency as f32).log2(); diff --git a/server/Cargo.toml b/server/Cargo.toml index 9d3a532..03017f3 100644 --- 
a/server/Cargo.toml +++ b/server/Cargo.toml @@ -6,10 +6,10 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +axum = { version = "0.7.4", features = ["macros"] } askama = "0.12.1" -axum = "0.7.4" env_logger = "0.11.0" log = "0.4.20" search = { path = "../search" } -serde = "1.0.195" +serde = { version = "1.0.195", features = ["derive"] } tokio = { version = "1.35.1", features = ["macros", "rt-multi-thread"] } diff --git a/server/src/main.rs b/server/src/main.rs index 14df684..78c3a91 100644 --- a/server/src/main.rs +++ b/server/src/main.rs @@ -1,5 +1,6 @@ use askama::Template; use axum::{ + debug_handler, extract::{Json, State}, http::StatusCode, response::{Html, IntoResponse, Response}, @@ -13,12 +14,11 @@ use std::{ env, fs::read_to_string, sync::{Arc, Mutex}, - time::Instant, }; struct AppState { - query_processor: Mutex, index_path: String, + query_processor: Mutex, } #[tokio::main] @@ -35,15 +35,11 @@ async fn main() { } let base_path = &args[1]; - let index_path = format!("{}/index/index", base_path); - let tokenizer_path = format!("{}/tokenizer/roberta-large", base_path); + let index_path = format!("{}/index/idx", base_path); let state = Arc::new(AppState { - query_processor: Mutex::new(QueryProcessor::build_query_processor( - &index_path, - &tokenizer_path, - )), index_path: base_path.clone(), + query_processor: Mutex::new(QueryProcessor::build_query_processor(&index_path)), }); let app = Router::new() @@ -98,11 +94,12 @@ struct QueryRequest { #[derive(Template)] #[template(path = "query.html")] struct QueryResponse { + tokens: Vec, time_ms: u128, documents: Vec, } -#[derive(Serialize, Deserialize)] +#[derive(Deserialize, Serialize)] struct Document { id: u32, score: f32, @@ -110,6 +107,7 @@ struct Document { content: String, } +#[debug_handler] async fn post_query( State(state): State>, Json(payload): Json, @@ -118,11 +116,10 @@ async fn post_query( let mut q = state.query_processor.lock().unwrap(); - let start_time = Instant::now(); let query_result = q.query(&payload.query, 100); - let time_ms = start_time.elapsed().as_millis(); let documents = query_result + .documents .iter() .map(|r| Document { id: r.id, @@ -132,7 +129,11 @@ async fn post_query( }) .collect(); - HtmlTemplate(QueryResponse { time_ms, documents }) + HtmlTemplate(QueryResponse { + tokens: query_result.tokens, + documents, + time_ms: query_result.time_ms, + }) } fn read_file_content(path: String) -> String { diff --git a/server/templates/query.html b/server/templates/query.html index 8364c00..6a7a5b1 100644 --- a/server/templates/query.html +++ b/server/templates/query.html @@ -1,10 +1,25 @@
-
+ {% if documents.len() == 0 %}
+

No documents found

+ {% else %}

Search results

-

Query time: {{ time_ms }}ms

+ {% endif %}
+
+

+ Query time: {{ time_ms }}ms +

+

+ Searched tokens:
+ {% for t in tokens %}
+ {{t}}
+ {% if !loop.last %}
+ -
+ {% endif %}
+ {% endfor %}

+
-
{% for doc in documents %}
@@ -13,11 +28,14 @@

{{ doc.path }}

- {{ doc.content|truncate(300) }}
+
+ {{ doc.content }}

{% endfor %}
+
+
\ No newline at end of file
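
The spellcheck path added to `search/src/index/vocabulary.rs` collects candidate terms through the trigram index and ranks them with `Vocabulary::distance`, which the diff leaves as `todo!()`. A minimal sketch of one way to fill that in, assuming a plain dynamic-programming Levenshtein edit distance over characters (the eventual spelling-correction work may well use something else, such as Damerau-Levenshtein):

```rust
/// Hypothetical stand-in for `Vocabulary::distance`: classic Levenshtein
/// edit distance computed with a single rolling row of the DP table.
fn distance(s1: &str, s2: &str) -> u32 {
    let a: Vec<char> = s1.chars().collect();
    let b: Vec<char> = s2.chars().collect();

    // Previous DP row: cost of turning the empty prefix of `a` into each prefix of `b`.
    let mut prev: Vec<u32> = (0..=b.len() as u32).collect();

    for (i, &ca) in a.iter().enumerate() {
        // First cell of the current row: deleting the first i + 1 chars of `a`.
        let mut curr = vec![i as u32 + 1];
        for (j, &cb) in b.iter().enumerate() {
            let substitution = prev[j] + if ca == cb { 0 } else { 1 };
            let insertion = curr[j] + 1;
            let deletion = prev[j + 1] + 1;
            curr.push(substitution.min(insertion).min(deletion));
        }
        prev = curr;
    }

    *prev.last().unwrap()
}

fn main() {
    // "wrold" -> "world" is one transposition, i.e. two single-char edits here.
    assert_eq!(distance("wrold", "world"), 2);
    assert_eq!(distance("hello", "hello"), 0);
    assert_eq!(distance("lazi", "lazy"), 1);
    println!("levenshtein sketch ok");
}
```

Since `get_closest_index` only scores terms that share at least one trigram with the query term, the quadratic distance computation runs over a small candidate set; a fuller implementation would likely also reject candidates whose best distance is still large instead of always returning the closest index.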
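
On the lookup side, `get_closest_index` slices `term[i..i + 3]` over `0..term.len() - 2`, while index construction skips terms shorter than three characters; a single-character query term would therefore underflow `term.len() - 2`, and the slicing is by bytes rather than characters. A hedged helper sketching the same trigram extraction with the guard applied on the query side too (the helper name is illustrative and not part of the diff):

```rust
/// Character trigrams of `term`, or nothing for terms shorter than three
/// characters, avoiding the `term.len() - 2` underflow on short inputs.
fn trigrams(term: &str) -> Vec<String> {
    let chars: Vec<char> = term.chars().collect();
    if chars.len() < 3 {
        return Vec::new();
    }
    (0..chars.len() - 2)
        .map(|i| chars[i..i + 3].iter().collect())
        .collect()
}

fn main() {
    assert_eq!(trigrams("hello"), ["hel", "ell", "llo"]);
    assert!(trigrams("hi").is_empty());
    println!("trigram sketch ok");
}
```

With the current regex preprocessor only ASCII alphanumerics survive, so byte slicing happens to be safe today, but guarding short terms still avoids a panic on one-character queries.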