From f7ea00ca68a94917f1415152a7ce89d782054b5d Mon Sep 17 00:00:00 2001 From: Francesco Date: Mon, 29 Jan 2024 00:03:00 +0100 Subject: [PATCH] min and max freq cutoff --- README.md | 6 ++--- search/Cargo.toml | 1 + search/src/engine/builder.rs | 32 +++++++++++++++++++------ search/src/engine/mod.rs | 19 +++++++++++---- search/src/main.rs | 45 +++++++++++++++++++++--------------- 5 files changed, 71 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index efa76ea..5d52d4c 100644 --- a/README.md +++ b/README.md @@ -11,16 +11,16 @@ Search engine written in Rust, based on an inverted index on disk. **Text preprocessing** - [x] Tokenization; - [x] Stemming; -- [ ] Parametrization at build time. **Index construction** - [x] In-memory datasets index construction; - [x] Proper vocabulary and paths on disk; -- [x] Spelling correction index;. +- [x] Spelling correction index; +- [x] Min and max frequency cutoffs. **Queries** - [x] BM25 scoring; -- [x] Window computation; +- [x] Query window **Evaluation** - [ ] Query speed; diff --git a/search/Cargo.toml b/search/Cargo.toml index fa90ac5..bd4693e 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -12,3 +12,4 @@ indicatif = { version = "0.17.0", features = ["rayon", "improved_unicode"] } fxhash = "0.2.1" tempdir = "0.3.7" regex = "1" +argparse = "0.2.2" diff --git a/search/src/engine/builder.rs b/search/src/engine/builder.rs index 971922f..5f6deb1 100644 --- a/search/src/engine/builder.rs +++ b/search/src/engine/builder.rs @@ -17,16 +17,30 @@ const PROGRESS_STYLE: &str = "Documents per second: {per_sec:<3}\n\n[{elapsed_precise}] [{bar:50}] {pos}/{len} [{eta_precise}]"; const PROGRESS_CHARS: &str = "=> "; -const CUTOFF_THRESHOLD: f64 = 0.8; - -pub fn build_engine(input_dir: &str, output_path: &str, preprocessor: &Preprocessor) { - let index: InMemory = build_in_memory(input_dir, preprocessor); +pub fn build_engine( + input_path: &str, + output_path: &str, + preprocessor: &Preprocessor, + max_freq_percentage_threshold: f64, + min_freq_threshold: u32, +) { + let index: InMemory = build_in_memory( + input_path, + preprocessor, + max_freq_percentage_threshold, + min_freq_threshold, + ); Postings::write_postings(&index, output_path); Vocabulary::write_vocabulary(&index, output_path); Documents::write_documents(&index.documents, output_path); } -fn build_in_memory(input_dir: &str, preprocessor: &Preprocessor) -> InMemory { +fn build_in_memory( + input_dir: &str, + preprocessor: &Preprocessor, + max_freq_percentage_threshold: f64, + min_freq_threshold: u32, +) -> InMemory { let files: Vec = fs::read_dir(input_dir) .expect("error while retrieving input directory content") .map(std::result::Result::unwrap) @@ -98,13 +112,17 @@ fn build_in_memory(input_dir: &str, preprocessor: &Preprocessor) -> InMemory { let final_postings = postings.into_inner().unwrap(); - let frequency_threshold = (doc_id_mutex.into_inner().unwrap() as f64 * CUTOFF_THRESHOLD) as u32; + let frequency_threshold = + (doc_id_mutex.into_inner().unwrap() as f64 * max_freq_percentage_threshold) as u32; let sorted_term_index_map: BTreeMap = term_index_map .into_inner() .unwrap() .into_iter() - .filter(|(_, v)| final_postings[*v].collection_frequency <= frequency_threshold) + .filter(|(_, v)| { + let f = final_postings[*v].collection_frequency; + f <= frequency_threshold && f > min_freq_threshold + }) .collect(); InMemory { diff --git a/search/src/engine/mod.rs b/search/src/engine/mod.rs index c4d6584..b224f91 100644 --- a/search/src/engine/mod.rs +++ b/search/src/engine/mod.rs @@ -20,7 +20,7 @@ pub const OFFSETS_EXTENSION: &str = ".offsets"; pub const DOCUMENTS_EXTENSION: &str = ".docs"; pub const VOCABULARY_ALPHA_EXTENSION: &str = ".alphas"; -const WINDOW_SCORE_MULTIPLIER: f64 = 0.5; +const WINDOW_SCORE_MULTIPLIER: f64 = 0.0; const BM25_SCORE_MULTIPLIER: f64 = 1.0; const BM25_KL: f64 = 1.2; @@ -58,8 +58,19 @@ struct DocumentScore { } impl Engine { - pub fn build_engine(input_path: &str, output_path: &str) { - builder::build_engine(input_path, output_path, &Preprocessor::new()); + pub fn build_engine( + input_path: &str, + output_path: &str, + max_freq_percentage_threshold: f64, + min_freq_threshold: u32, + ) { + builder::build_engine( + input_path, + output_path, + &Preprocessor::new(), + max_freq_percentage_threshold, + min_freq_threshold, + ); } pub fn load_index(input_path: &str) -> Engine { @@ -191,7 +202,7 @@ mod test { fn test_build() { let index_path = &create_temporary_dir_path(); - Engine::build_engine("test_data/docs", index_path); + Engine::build_engine("test_data/docs", index_path, 1.0, 0); let mut idx = Engine::load_index(index_path); diff --git a/search/src/main.rs b/search/src/main.rs index 2624685..31df4b0 100644 --- a/search/src/main.rs +++ b/search/src/main.rs @@ -7,7 +7,7 @@ use std::process::{exit, Command}; use std::time::{Duration, Instant}; const NUM_TOP_RESULTS: usize = 10; -const NUM_RESULTS: usize = 1_000_000; +const NUM_RESULTS: usize = 100; fn print_results(result: &QueryResult) { println!("Search tokens: {:?}", result.tokens); @@ -57,7 +57,10 @@ fn main() { let args: Vec = env::args().collect(); if args.len() < 3 || args.len() > 5 { - println!("Usage: cargo run --bin search [build_num_threads]"); + println!("Usage: cargo run -r + \nExample: + \n\t- cargo run -r path/to/docs build 10 0.90 + \n\t- cargo run -r path/to/docs load"); return; } @@ -69,27 +72,33 @@ fn main() { let docs_path = format!("{base_path}/docs"); if build_index { - println!("Start build on directory [{docs_path}]\n"); - - let num_threads = args.get(3).map_or(0, |s| s.parse().unwrap_or(0)); - - if num_threads != 0 { - println!("Setting thread number to {num_threads}"); + let min_freq: Result = args[3].parse(); + let min_freq = match min_freq { + Ok(value) => value, + Err(_) => { + println!("Error: min_freq must be an integer."); + return; + } + }; + + let max_frequency_perc: Result = args[4].parse(); + let max_frequency_perc = match max_frequency_perc { + Ok(value) => value, + Err(_) => { + println!("Error: max_frequency_perc must be a float."); + return; + } + }; - rayon::ThreadPoolBuilder::new() - .num_threads(num_threads) - .build_global() - .unwrap(); - } + println!("Start build on directory [{docs_path}]\n"); let start_time = Instant::now(); - - Engine::build_engine(&docs_path, &index_path); + Engine::build_engine(&docs_path, &index_path, max_frequency_perc, min_freq); let elapsed_time = start_time.elapsed(); + println!( - "Index built in {}.\n\nLoad options:\n- CLI: cargo run --release --bin search {} load", - HumanDuration(Duration::from_secs(elapsed_time.as_secs())), - base_path + "Index built in {}", + HumanDuration(Duration::from_secs(elapsed_time.as_secs())) ); exit(0);