diff --git a/.gitignore b/.gitignore index 3192777..7bbb396 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,4 @@ Cargo.lock # TODO todo.md -/misc - .DS_Store \ No newline at end of file diff --git a/README.md b/README.md index 4f917bc..333c87c 100644 --- a/README.md +++ b/README.md @@ -2,29 +2,53 @@ Search engine written in Rust, based on an inverted index on disk. -## Implementation status +## Commands -**IO** -- [x] Classes for writing and reading bit-streams; -- [x] Proper strings writer and reader. +**Index a new document collection** -**Text preprocessing** -- [x] Tokenization; -- [x] Stemming; +``` +make cli folder=path/to/folder action=build min_f=1 max_p=0.99 +``` -**Index construction** -- [x] In-memory datasets index construction; -- [x] Proper vocabulary and paths on disk; -- [x] Spelling correction index; -- [x] Min and max frequency cutoffs. +The `min_f` param filters out terms whose frequency is below `min_f`, while `max_p` filters out terms appearing in more than +a `max_p` fraction of the documents. -**Queries** -- [x] BM25 scoring and query window; -- [ ] Boolean queries: in progress +The folder param is a path to a folder with the following structure: +``` +├── docs +│ ├── 1.txt +│ ├── 2.txt +│ └── 3.txt +└── index + ├── idx.alphas + ├── idx.docs + ├── idx.offsets + └── idx.postings +``` -**Client** -- [x] CLI; -- [x] Web interface. +The index folder will be created after the build command. + +**Load a document collection** + +You can load a pre-built index by running: + +``` +make web folder=path/to/folder +``` + +You can then visit `http://0.0.0.0:3000` to find a web interface to enter free text and boolean queries. + +![web.png](misc%2Fweb.png) + +**Query Syntax** + +You can perform Google-like free text queries; results will +be ranked via [BM25](https://en.wikipedia.org/wiki/Okapi_BM25) scoring. 
+ +You can also specify boolean queries with `"b: "` prefix such as: +``` +b: hello AND there OR NOT man +``` ## References [Introduction to Information Retrieval](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) - Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze diff --git a/makefile b/makefile index f9d463d..af94106 100644 --- a/makefile +++ b/makefile @@ -1,8 +1,8 @@ web: - cargo run --release --bin server $(index_name) + cargo run --release --bin server ${folder} cli: - cargo run --release --bin search $(index_name) ${action} + cargo run --release --bin search ${folder} ${action} ${min_f} ${max_p} test: cargo test --release diff --git a/misc/web.png b/misc/web.png new file mode 100644 index 0000000..cfaa888 Binary files /dev/null and b/misc/web.png differ diff --git a/search/Cargo.toml b/search/Cargo.toml index bd4693e..4b0d297 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -13,3 +13,4 @@ fxhash = "0.2.1" tempdir = "0.3.7" regex = "1" argparse = "0.2.2" +phf = { version = "0.11.2", features = ["macros"] } diff --git a/search/src/engine/mod.rs b/search/src/engine/mod.rs index 1734454..0e8b248 100644 --- a/search/src/engine/mod.rs +++ b/search/src/engine/mod.rs @@ -11,6 +11,7 @@ use self::heap::FixedMinHeap; use self::postings::{DocumentIdsList, Postings, PostingsList}; use self::preprocessor::Preprocessor; use self::vocabulary::Vocabulary; +use phf::phf_map; use std::cmp::min; use std::collections::{BTreeMap, HashMap}; use std::time::Instant; @@ -26,6 +27,14 @@ const BM25_SCORE_MULTIPLIER: f64 = 1.0; const BM25_KL: f64 = 1.2; const BM25_B: f64 = 0.75; +static BOOLEAN_PRECEDENCE: phf::Map<&'static str, &u8> = phf_map! 
{ + "NOT" => &3, + "AND" => &2, + "OR" => &1, + "(" => &0, + ")" => &0, +}; + pub struct Engine { vocabulary: Vocabulary, postings: Postings, @@ -39,19 +48,13 @@ pub struct InMemory { documents: Vec, } -pub struct BooleanQueryResult { - pub postfix_query: Vec, - pub documents_ids: DocumentIdsList, - pub time_ms: u128, -} - -pub struct RankedQueryResult { - pub tokens: Vec, - pub documents: Vec, +pub struct QueryResult { + pub query: Vec, + pub documents: Vec, pub time_ms: u128, } -pub struct RankedDocumentResult { +pub struct DocumentResult { pub id: u32, pub path: String, pub score: f64, @@ -88,15 +91,16 @@ impl Engine { } } - pub fn boolean_query(&mut self, postfix_expression: Vec<&str>) -> BooleanQueryResult { + pub fn boolean_query(&mut self, query: &str) -> QueryResult { let start_time = Instant::now(); let mut stack = Vec::new(); let mut intermediate_result; let num_docs = self.documents.get_num_documents(); - for p in postfix_expression.clone() { - match p { + let query = Self::infix_to_postfix_boolean(query); + for p in query.clone() { + match p.as_str() { "AND" => { intermediate_result = Postings::and_operator(stack.pop().unwrap(), stack.pop().unwrap()); @@ -111,7 +115,7 @@ impl Engine { _ => { intermediate_result = self .vocabulary - .spellcheck_term(p) + .spellcheck_term(&p) .and_then(|t| self.get_term_doc_ids(&t)) .unwrap_or_default(); } @@ -120,16 +124,27 @@ impl Engine { stack.push(intermediate_result); } + let documents = stack + .pop() + .unwrap() + .iter() + .map(|i| DocumentResult { + id: *i, + path: self.documents.get_doc_path(*i), + score: 1.0, + }) + .collect(); + let time_ms = start_time.elapsed().as_millis(); - BooleanQueryResult { - postfix_query: postfix_expression.iter().map(|s| s.to_string()).collect(), - documents_ids: stack.pop().unwrap(), + QueryResult { + query, + documents, time_ms, } } - pub fn free_query(&mut self, query: &str, num_results: usize) -> RankedQueryResult { + pub fn free_query(&mut self, query: &str, num_results: 
usize) -> QueryResult { let start_time = Instant::now(); let tokens: Vec = self @@ -184,7 +199,7 @@ impl Engine { let documents = selector .get_sorted_id_priority_pairs() .iter() - .map(|(id, score)| RankedDocumentResult { + .map(|(id, score)| DocumentResult { id: *id, score: *score, path: self.documents.get_doc_path(*id), @@ -193,23 +208,57 @@ impl Engine { let time_ms = start_time.elapsed().as_millis(); - RankedQueryResult { - tokens, + QueryResult { + query: tokens, documents, time_ms, } } - fn get_term_postings(&mut self, term: &str) -> Option { + fn get_term_doc_ids(&mut self, term: &str) -> Option { self.vocabulary .get_term_index(term) - .map(|i| self.postings.load_postings_list(i)) + .map(|i| self.postings.load_doc_ids_list(i)) } - fn get_term_doc_ids(&mut self, term: &str) -> Option { + fn infix_to_postfix_boolean(query: &str) -> Vec { + let mut res = Vec::new(); + let mut stack = Vec::new(); + + let sanitized_query = query.replace('(', " ( ").replace(')', " ) "); + + for t in sanitized_query.split_ascii_whitespace() { + if t == "(" { + stack.push(t); + } else if t == ")" { + let mut last = stack.pop().unwrap(); + while last != "(" { + res.push(last); + last = stack.pop().unwrap(); + } + } else if let Some(current_precedence) = BOOLEAN_PRECEDENCE.get(t) { + while !stack.is_empty() { + let last = stack.last().unwrap(); + if BOOLEAN_PRECEDENCE.get(last).unwrap() > current_precedence { + res.push(stack.pop().unwrap()); + } else { + break; + } + } + stack.push(t); + } else { + res.push(t); + } + } + + stack.iter().rev().for_each(|e| res.push(e)); + res.iter().map(|s| (*s).to_string()).collect() + } + + fn get_term_postings(&mut self, term: &str) -> Option { self.vocabulary .get_term_index(term) - .map(|i| self.postings.load_doc_ids_list(i)) + .map(|i| self.postings.load_postings_list(i)) } fn compute_score(document_score: &DocumentScore, num_tokens: usize) -> f64 { @@ -254,37 +303,59 @@ mod test { #[test] fn test_build() { let index_path = 
&create_temporary_dir_path(); - Engine::build_engine("test_data/docs", index_path, 1.0, 0); - let mut idx = Engine::load_index(index_path); for ele in ["hello", "man", "world"] { assert!(idx.vocabulary.get_term_index(ele).is_some()); } - let mut query: Vec = idx + let mut free_query: Vec = idx .free_query("hello", 10) .documents .iter() .map(|d| d.path.clone()) .collect(); + free_query.sort(); + + assert_eq!(free_query, ["test_data/docs/1.txt", "test_data/docs/2.txt"]); + + let mut boolean_query: Vec = idx + .boolean_query("hello AND NOT world") + .documents + .iter() + .map(|d| d.path.clone()) + .collect(); + boolean_query.sort(); - query.sort(); - - assert_eq!(query, ["test_data/docs/1.txt", "test_data/docs/2.txt"]); - - // println!( - // "{:?}", - // idx.boolean_query(vec!["hello", "man", "OR"]).documents_ids - // ); - // println!( - // "{:?}", - // idx.boolean_query(vec!["hello", "man", "AND"]).documents_ids - // ); - // println!( - // "{:?}", - // idx.boolean_query(vec!["man", "NOT"]).documents_ids[0] - // ); + assert_eq!(boolean_query, ["test_data/docs/2.txt"]); + } + + #[test] + fn test_infix_postfix() { + assert_eq!( + Engine::infix_to_postfix_boolean("a AND (b OR NOT c)"), + ["a", "b", "c", "NOT", "OR", "AND"] + ); + + assert_eq!( + Engine::infix_to_postfix_boolean("a AND b OR NOT c"), + ["a", "b", "AND", "c", "NOT", "OR"] + ); + + assert_eq!( + Engine::infix_to_postfix_boolean("NOT (a AND b) OR NOT (c OR d)"), + ["a", "b", "AND", "NOT", "c", "d", "OR", "NOT", "OR"] + ); + + assert_eq!( + Engine::infix_to_postfix_boolean("a AND b AND c OR d OR e"), + ["a", "b", "c", "AND", "AND", "d", "e", "OR", "OR"] + ); + + assert_eq!( + Engine::infix_to_postfix_boolean("a AND (b OR c)"), + ["a", "b", "c", "OR", "AND"] + ); } } diff --git a/search/src/engine/postings.rs b/search/src/engine/postings.rs index b238025..8402869 100644 --- a/search/src/engine/postings.rs +++ b/search/src/engine/postings.rs @@ -241,5 +241,8 @@ mod tests { let result_empty = 
Postings::not_operator(vec![], n); assert_eq!(result_empty, (1..=n).collect::>()); + + let result_full = Postings::not_operator(vec![0, 1, 2], 3); + assert_eq!(result_full, []); } } diff --git a/search/src/main.rs b/search/src/main.rs index 7255302..450e53a 100644 --- a/search/src/main.rs +++ b/search/src/main.rs @@ -1,5 +1,5 @@ use indicatif::HumanDuration; -use search::engine::{Engine, RankedQueryResult}; +use search::engine::{Engine, QueryResult}; use std::cmp::min; use std::env; use std::io::{self, Write}; @@ -9,8 +9,8 @@ use std::time::{Duration, Instant}; const NUM_TOP_RESULTS: usize = 10; const NUM_RESULTS: usize = 100; -fn print_results(result: &RankedQueryResult) { - println!("Search tokens: {:?}", result.tokens); +fn print_results(result: &QueryResult) { + println!("Search tokens: {:?}", result.query); if result.documents.is_empty() { println!("\nNo documents found\n"); @@ -107,7 +107,11 @@ fn main() { loop { let query = read_line("> "); - let result = e.free_query(&query, NUM_RESULTS); + let result = if query.starts_with("b: ") { + e.boolean_query(&query.replace("b: ", "")) + } else { + e.free_query(&query, NUM_RESULTS) + }; print_results(&result); } diff --git a/server/src/main.rs b/server/src/main.rs index 3869123..c5efcd7 100644 --- a/server/src/main.rs +++ b/server/src/main.rs @@ -129,7 +129,11 @@ async fn post_query( let mut engine = state.engine.lock().unwrap(); - let query_result = engine.free_query(&payload.query, 100); + let query_result = if payload.query.starts_with("b: ") { + engine.boolean_query(&payload.query.replace("b: ", "")) + } else { + engine.free_query(&payload.query, 100) + }; let documents = query_result .documents @@ -143,7 +147,7 @@ async fn post_query( .collect(); let response = QueryResponse { - tokens: query_result.tokens, + tokens: query_result.query, documents, time_ms: query_result.time_ms, }; diff --git a/server/templates/index.html b/server/templates/index.html index f71bd64..48058db 100644 --- 
a/server/templates/index.html +++ b/server/templates/index.html @@ -62,12 +62,12 @@ search-rs - + -