diff --git a/README.md b/README.md index d103a77..ae257c9 100644 --- a/README.md +++ b/README.md @@ -101,20 +101,27 @@ make cli folder=path/to/folder action=build min_f=1 max_p=0.99 The `min_f` param filters terms appearing less that it, while `max_p` filters terms appearing more than in `max_p` percentage of the documents. -The folder param is a path to a folder with the following structure: +The folder param is a path to a folder containing the documents to index. +The index files will be placed inside a subfolder, `.index`. + +Here is an example of such a structure: ``` -├── docs -│ ├── 1.txt -│ ├── 2.txt -│ └── 3.txt -└── index - ├── idx.alphas - ├── idx.docs - ├── idx.offsets - └── idx.postings +example +├── .index +│ ├── idx.alphas +│ ├── idx.docs +│ ├── idx.offsets +│ └── idx.postings +├── 1.txt +├── 2.txt +├── 3.txt +└── subfolder + ├── 1.txt + ├── 2.txt + └── 3.txt ``` -The index folder will be created after the build command. +The builder will walk recursively down the input folder, skipping hidden ones. **Load a document collection** @@ -124,8 +131,9 @@ You can load a pre-build index by running: make web folder=path/to/folder ``` -You can then visit `http://0.0.0.0:3000` to find a web interface to enter free text and boolean queries. +This will load the index inside `path/to/folder/.index`. +You can then visit `http://0.0.0.0:3000` to find a web interface to enter free text and boolean queries. **Query Syntax** @@ -139,6 +147,4 @@ b: hello AND there OR NOT man ## References [Introduction to Information Retrieval](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) - Christopher D. 
Manning, Prabhakar Raghavan and Hinrich Schütze ---- - *Feel free to get in touch to discuss the project!* \ No newline at end of file diff --git a/search/Cargo.toml b/search/Cargo.toml index 4b0d297..d2a5798 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -14,3 +14,4 @@ tempdir = "0.3.7" regex = "1" argparse = "0.2.2" phf = { version = "0.11.2", features = ["macros"] } +walkdir = "2.4.0" diff --git a/search/src/disk/file_utils.rs b/search/src/disk/file_utils.rs index 566a1d4..ff2ec27 100644 --- a/search/src/disk/file_utils.rs +++ b/search/src/disk/file_utils.rs @@ -2,6 +2,7 @@ use std::{ fs::{create_dir_all, File}, path::Path, }; +use walkdir::{DirEntry, WalkDir}; pub fn create_and_open_file(file_path: &str) -> File { let path = Path::new(file_path); @@ -9,3 +10,21 @@ pub fn create_and_open_file(file_path: &str) -> File { File::create(path).expect("error while creating file") } + +pub fn walk_dir(input_dir: &str) -> Vec<DirEntry> { + WalkDir::new(input_dir) + .sort_by_file_name() + .into_iter() + .filter_entry(|e| !is_hidden(e)) + .filter_map(|e| e.ok()) + .filter(|e| !e.path().is_dir()) + .collect() +} + +fn is_hidden(entry: &DirEntry) -> bool { + entry + .file_name() + .to_str() + .map(|s| s.starts_with('.')) + .unwrap_or(false) +} diff --git a/search/src/engine/builder.rs b/search/src/engine/builder.rs index 90a1e8a..9dda0eb 100644 --- a/search/src/engine/builder.rs +++ b/search/src/engine/builder.rs @@ -1,3 +1,5 @@ +use crate::disk::file_utils::walk_dir; + use super::{ documents::{Document, Documents}, postings::{Posting, Postings, PostingsList}, @@ -41,11 +43,7 @@ fn build_in_memory( max_freq_percentage_threshold: f64, min_freq_threshold: u32, ) -> InMemory { - let files: Vec = fs::read_dir(input_dir) - .expect("error while retrieving input directory content") - .map(std::result::Result::unwrap) - .collect(); - + let files = walk_dir(input_dir); // document counter let doc_id_mutex = Mutex::new(0); // postings list @@ -65,7 +63,8 @@ fn build_in_memory( 
.progress_chars(PROGRESS_CHARS), ) .for_each(|d| { - let file_content = fs::read_to_string(d.path()).expect("error while reading file"); + let file_content: String = + fs::read_to_string(d.path()).expect("error while reading file"); let tokens = preprocessor.tokenize_and_stem(&file_content); let mut doc_id = doc_id_mutex.lock().unwrap(); diff --git a/search/src/main.rs b/search/src/main.rs index 450e53a..3ed689c 100644 --- a/search/src/main.rs +++ b/search/src/main.rs @@ -68,8 +68,7 @@ fn main() { let action = &args[2]; let build_index = action == "build"; - let index_path = format!("{base_path}/index/idx"); - let docs_path = format!("{base_path}/docs"); + let index_path = format!("{base_path}/.index/idx"); if build_index { let min_freq: Result = args[3].parse(); @@ -84,10 +83,10 @@ fn main() { return; }; - println!("Start build on directory [{docs_path}]\n"); + println!("Start build on directory [{base_path}]\n"); let start_time = Instant::now(); - Engine::build_engine(&docs_path, &index_path, max_frequency_perc, min_freq); + Engine::build_engine(base_path, &index_path, max_frequency_perc, min_freq); let elapsed_time = start_time.elapsed(); println!( diff --git a/server/src/main.rs b/server/src/main.rs index c5efcd7..ad53c30 100644 --- a/server/src/main.rs +++ b/server/src/main.rs @@ -40,7 +40,7 @@ async fn main() { } let base_path = &args[1]; - let index_path = format!("{base_path}/index/idx"); + let index_path = format!("{base_path}/.index/idx"); let state = Arc::new(AppState { index_path: base_path.clone(),