diff --git a/.gitignore b/.gitignore
index a3659c7..ef7ce68 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,9 @@ Cargo.lock
 .idea
 
 # Added by cargo
-
 /target
-/data/
+
+# Data files
+/data/wiki-data
+/data/index_unit_test/index
+/data/test
\ No newline at end of file
diff --git a/data/index_unit_test/docs/1.txt b/data/index_unit_test/docs/1.txt
new file mode 100644
index 0000000..95d09f2
--- /dev/null
+++ b/data/index_unit_test/docs/1.txt
@@ -0,0 +1 @@
+hello world
\ No newline at end of file
diff --git a/data/index_unit_test/docs/2.txt b/data/index_unit_test/docs/2.txt
new file mode 100644
index 0000000..ff70386
--- /dev/null
+++ b/data/index_unit_test/docs/2.txt
@@ -0,0 +1 @@
+hello man
\ No newline at end of file
diff --git a/src/disk/bits_reader.rs b/src/disk/bits_reader.rs
index 070b9ea..916b119 100644
--- a/src/disk/bits_reader.rs
+++ b/src/disk/bits_reader.rs
@@ -112,12 +112,9 @@ mod test {
     use super::*;
     use crate::disk::bits_writer::BitsWriter;
 
-    use std::fs::create_dir_all;
 
     #[test]
     fn test_read() {
-        create_dir_all("data/test/").expect("error while creating test dir");
-
         let mut w = BitsWriter::new("data/test/writer_unit.bin");
 
         (1..100).for_each(|i| {
@@ -138,8 +135,6 @@ mod test {
 
     #[test]
     fn test_seek() {
-        create_dir_all("data/test/").expect("error while creating test dir");
-
         let mut w = BitsWriter::new("data/test/writer_seek.bin");
 
         let offset = (0..1000).map(|i| w.write_gamma(i)).sum();
diff --git a/src/disk/bits_writer.rs b/src/disk/bits_writer.rs
index 7b27721..030c799 100644
--- a/src/disk/bits_writer.rs
+++ b/src/disk/bits_writer.rs
@@ -3,6 +3,8 @@ use std::{
     io::{BufWriter, Write},
 };
 
+use super::file_utils;
+
 pub struct BitsWriter {
     file: BufWriter<File>,
     buffer: u128,
@@ -10,9 +12,9 @@ pub struct BitsWriter {
 }
 
 impl BitsWriter {
-    pub fn new(filename: &str) -> BitsWriter {
+    pub fn new(path: &str) -> BitsWriter {
         BitsWriter {
-            file: BufWriter::new(File::create(filename).expect("Can not create output file")),
+            file: BufWriter::new(file_utils::create_and_open_file(path)),
             buffer: 0,
             written: 0,
         }
@@ -94,7 +96,6 @@ impl BitsWriter {
 mod test {
     use super::*;
 
-    use std::fs::create_dir_all;
 
     #[test]
     fn test_gamma_coding() {
@@ -120,8 +121,6 @@ mod test {
 
     #[test]
     fn test_buffer_overflow() {
-        create_dir_all("data/test/").expect("error while creating test dir");
-
         let word = (1 << 10) - 1;
         let len = 10;
 
diff --git a/src/disk/file_utils.rs b/src/disk/file_utils.rs
new file mode 100644
index 0000000..c549812
--- /dev/null
+++ b/src/disk/file_utils.rs
@@ -0,0 +1,11 @@
+use std::{
+    fs::{create_dir_all, File},
+    path::Path,
+};
+
+pub fn create_and_open_file(filename: &str) -> File {
+    let path = Path::new(filename);
+    path.parent().map(create_dir_all);
+
+    File::create(path).expect("error while creating file")
+}
diff --git a/src/disk/mod.rs b/src/disk/mod.rs
index 3e7bb2a..dfcdc96 100644
--- a/src/disk/mod.rs
+++ b/src/disk/mod.rs
@@ -1,4 +1,5 @@
 pub mod bits_reader;
 pub mod bits_writer;
+mod file_utils;
 pub mod terms_reader;
 pub mod terms_writer;
diff --git a/src/disk/terms_writer.rs b/src/disk/terms_writer.rs
index e25db6e..f993f04 100644
--- a/src/disk/terms_writer.rs
+++ b/src/disk/terms_writer.rs
@@ -3,14 +3,16 @@ use std::{
     io::{BufWriter, Write},
 };
 
+use super::file_utils;
+
 pub struct TermsWriter {
     file: BufWriter<File>,
 }
 
 impl TermsWriter {
-    pub fn new(filename: &str) -> TermsWriter {
+    pub fn new(path: &str) -> TermsWriter {
         TermsWriter {
-            file: BufWriter::new(File::create(filename).expect("Can not create output file")),
+            file: BufWriter::new(file_utils::create_and_open_file(path)),
         }
     }
 
diff --git a/src/indexer/disk_utils.rs b/src/indexer/disk_utils.rs
index 6d9512d..91896f7 100644
--- a/src/indexer/disk_utils.rs
+++ b/src/indexer/disk_utils.rs
@@ -19,9 +19,7 @@ pub fn build_in_memory_postings(
     let documents =
         fs::read_dir(input_dir).expect("error while retrieving input directory content");
 
-    println!("{:?}", documents);
     let tokens_regex = tokens::build_tokenization_regex();
-
     let tokenized_docs_iter = documents
         .into_iter()
         .map(|p| p.unwrap())
@@ -61,9 +59,9 @@ pub fn write_postings(
     output_path: &str,
 ) {
     let postings_path = output_path.to_string() + POSTINGS_EXTENSION;
-    let offsets_path = output_path.to_string() + OFFSETS_EXTENSION;
-
     let mut postings_writer = BitsWriter::new(&postings_path);
+
+    let offsets_path = output_path.to_string() + OFFSETS_EXTENSION;
     let mut offsets_writer = BitsWriter::new(&offsets_path);
 
     let mut offset: u64 = 0;
@@ -91,11 +89,11 @@ pub fn write_postings(
 }
 
 pub fn write_vocabulary(vocab: &BTreeMap, output_path: &str) {
-    let alphas_path = output_path.to_string() + VOCABULARY_ALPHA_EXTENSION;
-    let lenghts_path = output_path.to_string() + VOCABULARY_LENGHTS_EXTENSION;
+    let terms_path = output_path.to_string() + VOCABULARY_ALPHA_EXTENSION;
+    let mut terms_writer = TermsWriter::new(&terms_path);
 
+    let lenghts_path = output_path.to_string() + VOCABULARY_LENGHTS_EXTENSION;
     let mut lenghts_writer = BitsWriter::new(&lenghts_path);
-    let mut terms_writer = TermsWriter::new(&alphas_path);
 
     for term in vocab.keys() {
         lenghts_writer.write_gamma(term.len() as u32);
@@ -107,32 +105,34 @@ pub fn write_vocabulary(vocab: &BTreeMap, output_path: &str) {
 }
 
 pub fn read_terms_to_offsets_map(input_path: &str) -> BTreeMap<String, u64> {
-    let alphas_path: String = input_path.to_string() + VOCABULARY_ALPHA_EXTENSION;
+    let terms_path: String = input_path.to_string() + VOCABULARY_ALPHA_EXTENSION;
+    let terms_buffer = TermsReader::new(&terms_path).read_to_string();
+
     let lenghts_path = input_path.to_string() + VOCABULARY_LENGHTS_EXTENSION;
-    let offsets_path = input_path.to_string() + OFFSETS_EXTENSION;
+    let mut lenghts_reader = BitsReader::new(&lenghts_path);
 
+    let offsets_path = input_path.to_string() + OFFSETS_EXTENSION;
     let mut offsets_reader = BitsReader::new(&offsets_path);
-    let terms_buffer = TermsReader::new(&alphas_path).read_to_string();
-    let mut lenghts_reader = BitsReader::new(&lenghts_path);
 
-    let n = offsets_reader.read_vbyte();
+    let num_terms: u32 = offsets_reader.read_vbyte();
 
     let mut start_term_offset: usize = 0;
     let mut postings_offset = 0;
 
     let mut res: BTreeMap<String, u64> = BTreeMap::new();
 
-    for _ in 0..n {
-        let terms_delta = lenghts_reader.read_gamma() as usize;
-        let x = offsets_reader.read_gamma() as u64;
-        postings_offset += x;
+    for _ in 0..num_terms {
+        let term_length = lenghts_reader.read_gamma() as usize;
+
+        let postings_offset_delta = offsets_reader.read_gamma() as u64;
+        postings_offset += postings_offset_delta;
 
         res.insert(
-            terms_buffer[start_term_offset..start_term_offset + terms_delta].to_string(),
+            terms_buffer[start_term_offset..start_term_offset + term_length].to_string(),
             postings_offset,
         );
 
-        start_term_offset += terms_delta;
+        start_term_offset += term_length;
     }
 
     res
diff --git a/src/indexer/index.rs b/src/indexer/index.rs
index 6d85b07..8289fe2 100644
--- a/src/indexer/index.rs
+++ b/src/indexer/index.rs
@@ -43,13 +43,19 @@ impl Index {
 mod test {
     use super::*;
 
-    // #[test]
+    #[test]
     fn test_build() {
-        Index::build_index("data/dummy/docs", "data/dummy/index/dum");
+        Index::build_index(
+            "data/index_unit_test/docs",
+            "data/index_unit_test/index/test",
+        );
 
-        let mut idx = Index::load_index("data/dummy/index/dum");
+        let mut idx = Index::load_index("data/index_unit_test/index/test");
 
-        println!("{:?}", idx.terms_to_offsets);
-        println!("{:?}", idx.get_postings("my"));
+        for ele in ["hello", "man", "world"] {
+            assert!(idx.terms_to_offsets.contains_key(ele));
+        }
+
+        assert_eq!(idx.get_postings("hello").unwrap(), [0, 1]);
     }
 }
diff --git a/src/text/tokens.rs b/src/text/tokens.rs
index d7b666c..028daa9 100644
--- a/src/text/tokens.rs
+++ b/src/text/tokens.rs
@@ -1,6 +1,6 @@
 use regex::Regex;
 
-pub fn tokenize(s: &String, re: &Regex) -> Vec<String> {
+pub fn tokenize(s: &str, re: &Regex) -> Vec<String> {
     let vec: Vec<String> = re
         .replace_all(s, "")
         .to_lowercase()
@@ -23,7 +23,7 @@ mod test {
     #[test]
     fn test_tokenization() {
         let r = build_tokenization_regex();
-        let mut t = tokenize(&"123#Hello, __World!".to_string(), &r);
+        let mut t = tokenize("123#Hello, __World!", &r);
         t.sort();
 
         assert_eq!(t, ["hello", "world"]);
diff --git a/tests/read_write_integration_test.rs b/tests/read_write_integration_test.rs
index 9889136..0ba4d32 100644
--- a/tests/read_write_integration_test.rs
+++ b/tests/read_write_integration_test.rs
@@ -1,11 +1,8 @@
 use rand::Rng;
 use search::disk::{bits_reader::BitsReader, bits_writer::BitsWriter};
-use std::fs::create_dir_all;
 
 #[test]
 fn test_read_write() {
-    create_dir_all("data/test/").expect("error while creating test dir");
-
     let path = "data/test/writer_io_integration.bin";
 
     let n = 100_000;
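Note on the new helper: both writers now obtain their file handle through `file_utils::create_and_open_file`, which creates any missing parent directories before calling `File::create`, which is why the `create_dir_all("data/test/")` calls could be dropped from the tests above. A minimal sketch of how the helper could be exercised on its own is below; the test module, test name, and nested path are illustrative assumptions, not part of this diff.

    // Hypothetical unit test for src/disk/file_utils.rs (not in the change);
    // it relies only on create_and_open_file(filename: &str) -> File as added above.
    #[cfg(test)]
    mod test {
        use super::*;
        use std::path::Path;

        #[test]
        fn test_create_and_open_file_creates_parent_dirs() {
            // No create_dir_all() call is needed here: the helper creates the
            // missing parent directories before opening the file for writing.
            let path = "data/test/nested/file_utils_unit.bin";
            let _file = create_and_open_file(path);

            assert!(Path::new(path).exists());
        }
    }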
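For context on the renamed variables in `read_terms_to_offsets_map`: with the new fixture (documents "hello world" and "hello man"), the sorted vocabulary is hello, man, world, so the terms file stores the concatenation "hellomanworld", the lengths file stores the gamma-coded term lengths 5, 3, 5, and the offsets file stores a vbyte-coded term count followed by one gamma-coded delta per term. The loop slices each term out of the shared buffer by its length and maps it to the running sum of the deltas, which is what the `term_length` and `postings_offset_delta` names now make explicit (previously `terms_delta` and `x`).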