diff --git a/src/index/builder.rs b/src/index/builder.rs
new file mode 100644
index 0000000..2c43af8
--- /dev/null
+++ b/src/index/builder.rs
@@ -0,0 +1,128 @@
+use std::{collections::BTreeMap, fs};
+use tokenizers::Tokenizer;
+
+use crate::disk::{bits_writer::BitsWriter, terms_writer::TermsWriter};
+
+use super::{
+    index::{
+        DOCUMENT_LENGHTS_EXTENSION, OFFSETS_EXTENSION, POSTINGS_EXTENSION,
+        VOCABULARY_ALPHA_EXTENSION, VOCABULARY_LENGHTS_EXTENSION,
+    },
+    text_utils,
+};
+
+struct InMemoryIndex {
+    term_index_map: BTreeMap<String, usize>,
+    postings: Vec<BTreeMap<u32, u32>>,
+    document_lenghts: Vec<u32>,
+}
+
+pub fn build_index(input_dir: &str, output_path: &str, tokenizer: &Tokenizer) {
+    let index = build_in_memory(input_dir, tokenizer);
+    write_postings(&index, output_path);
+    write_vocabulary(&index.term_index_map, output_path);
+    write_doc_lentghts(&index.document_lenghts, output_path);
+}
+
+fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer) -> InMemoryIndex {
+    let documents =
+        fs::read_dir(input_dir).expect("error while retrieving input directory content");
+
+    let tokenized_docs_iter = documents
+        .into_iter()
+        .map(|p| p.unwrap())
+        .map(|p| fs::read_to_string(p.path()).expect("error while reading file"))
+        .map(|s| text_utils::tokenize(tokenizer, &s));
+
+    let mut term_index_map: BTreeMap<String, usize> = BTreeMap::new();
+    let mut postings: Vec<BTreeMap<u32, u32>> = Vec::new();
+    let mut document_lenghts: Vec<u32> = Vec::new();
+
+    for (doc_id, tokens) in tokenized_docs_iter.enumerate() {
+        document_lenghts.push(tokens.len() as u32);
+
+        for t in tokens.iter() {
+            let value: Option<&usize> = term_index_map.get(t);
+
+            let postings_counter = match value {
+                Some(idx) => &mut postings[*idx],
+                None => {
+                    let idx = term_index_map.len();
+                    term_index_map.insert(t.clone(), idx);
+                    postings.push(BTreeMap::new());
+                    &mut postings[idx]
+                }
+            };
+            let key = doc_id as u32;
+            postings_counter
+                .entry(key)
+                .and_modify(|count| *count += 1)
+                .or_insert(1);
+        }
+    }
+
+    InMemoryIndex {
+        term_index_map,
+        postings,
+        document_lenghts,
+    }
+}
+
+fn write_postings(index: &InMemoryIndex, output_path: &str) {
+    let postings_path = output_path.to_string() + POSTINGS_EXTENSION;
+    let mut postings_writer = BitsWriter::new(&postings_path);
+
+    let offsets_path = output_path.to_string() + OFFSETS_EXTENSION;
+    let mut offsets_writer = BitsWriter::new(&offsets_path);
+
+    let mut offset: u64 = 0;
+    let mut prev_offset = 0;
+
+    offsets_writer.write_vbyte(index.term_index_map.len() as u32);
+
+    for (_, idx) in index.term_index_map.iter() {
+        offsets_writer.write_gamma(offset as u32 - prev_offset);
+        prev_offset = offset as u32;
+
+        let postings: &BTreeMap<u32, u32> = &index.postings[*idx];
+        offset += postings_writer.write_vbyte(postings.len() as u32);
+
+        let mut prev = 0;
+        for (doc_id, frequency) in postings.iter() {
+            offset += postings_writer.write_gamma(doc_id - prev);
+            offset += postings_writer.write_gamma(*frequency);
+            prev = *doc_id;
+        }
+    }
+
+    postings_writer.flush();
+    offsets_writer.flush();
+}
+
+fn write_vocabulary(vocab: &BTreeMap<String, usize>, output_path: &str) {
+    let terms_path = output_path.to_string() + VOCABULARY_ALPHA_EXTENSION;
+    let mut terms_writer = TermsWriter::new(&terms_path);
+
+    let lenghts_path = output_path.to_string() + VOCABULARY_LENGHTS_EXTENSION;
+    let mut lenghts_writer = BitsWriter::new(&lenghts_path);
+
+    for term in vocab.keys() {
+        lenghts_writer.write_gamma(term.len() as u32);
+        terms_writer.write_term(term);
+    }
+
+    lenghts_writer.flush();
+    terms_writer.flush();
+}
+
+fn write_doc_lentghts(document_lenghts: &Vec<u32>, output_path: &str) {
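+    // On-disk layout: a vbyte-encoded document count, then one gamma-coded length per document.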
+    let doc_path = output_path.to_string() + DOCUMENT_LENGHTS_EXTENSION;
+    let mut doc_writer = BitsWriter::new(&doc_path);
+
+    doc_writer.write_vbyte(document_lenghts.len() as u32);
+    document_lenghts.iter().for_each(|l| {
+        doc_writer.write_gamma(*l);
+    });
+
+    doc_writer.flush();
+}
diff --git a/src/index/index.rs b/src/index/index.rs
new file mode 100644
index 0000000..1160323
--- /dev/null
+++ b/src/index/index.rs
@@ -0,0 +1,76 @@
+use std::collections::BTreeMap;
+use tokenizers::Tokenizer;
+
+use super::{builder, loader, text_utils};
+use crate::disk::bits_reader::BitsReader;
+
+pub const POSTINGS_EXTENSION: &str = ".postings";
+pub const OFFSETS_EXTENSION: &str = ".offsets";
+pub const DOCUMENT_LENGHTS_EXTENSION: &str = ".doc_lengths";
+pub const VOCABULARY_ALPHA_EXTENSION: &str = ".alphas";
+pub const VOCABULARY_LENGHTS_EXTENSION: &str = ".term_lengths";
+
+pub struct Index {
+    postings: BitsReader,
+    term_offset_map: BTreeMap<String, u64>,
+    doc_lenghts: Vec<u32>,
+    tokenizer: Tokenizer,
+}
+
+impl Index {
+    pub fn build_index(input_path: &str, output_path: &str, tokenizer_path: &str) {
+        let tokenizer = text_utils::load_tokenizer(tokenizer_path, false);
+        builder::build_index(input_path, output_path, &tokenizer);
+    }
+
+    pub fn load_index(input_path: &str, tokenizer_path: &str) -> Index {
+        Index {
+            postings: loader::build_postings_reader(input_path),
+            term_offset_map: loader::load_terms_to_offsets_map(input_path),
+            doc_lenghts: loader::load_document_lenghts(input_path),
+            tokenizer: text_utils::load_tokenizer(tokenizer_path, false),
+        }
+    }
+
+    pub fn get_postings(&mut self, term: &str) -> Option<Vec<u32>> {
+        let offset = self.term_offset_map.get(term)?;
+        Some(self.get_postings_internal(*offset))
+    }
+
+    fn get_postings_internal(&mut self, offset: u64) -> Vec<u32> {
+        self.postings.seek(offset);
+        let mut prev = 0;
+
+        (0..self.postings.read_vbyte())
+            .map(|_| {
+                prev += self.postings.read_gamma();
+                prev
+            })
+            .collect()
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_build() {
+        Index::build_index(
+            "data/index_unit_test/docs",
+            "data/index_unit_test/index/test",
+            "data/index_unit_test/test_tokenizer",
+        );
+
+        let mut idx = Index::load_index(
+            "data/index_unit_test/index/test",
+            "data/index_unit_test/test_tokenizer",
+        );
+
+        for ele in ["hello", "man", "world"] {
+            assert!(idx.term_offset_map.contains_key(ele));
+        }
+
+        assert_eq!(idx.get_postings("hello").unwrap(), [0, 1]);
+    }
+}
diff --git a/src/index/loader.rs b/src/index/loader.rs
new file mode 100644
index 0000000..8d4ee0f
--- /dev/null
+++ b/src/index/loader.rs
@@ -0,0 +1,51 @@
+use std::collections::BTreeMap;
+
+use super::index::{
+    DOCUMENT_LENGHTS_EXTENSION, OFFSETS_EXTENSION, POSTINGS_EXTENSION, VOCABULARY_ALPHA_EXTENSION,
+    VOCABULARY_LENGHTS_EXTENSION,
+};
+use crate::disk::{bits_reader::BitsReader, terms_reader::TermsReader};
+
+pub fn load_terms_to_offsets_map(input_path: &str) -> BTreeMap<String, u64> {
+    let terms_path: String = input_path.to_string() + VOCABULARY_ALPHA_EXTENSION;
+    let terms_buffer = TermsReader::new(&terms_path).read_to_string();
+
+    let lenghts_path = input_path.to_string() + VOCABULARY_LENGHTS_EXTENSION;
+    let mut lenghts_reader = BitsReader::new(&lenghts_path);
+
+    let offsets_path = input_path.to_string() + OFFSETS_EXTENSION;
+    let mut offsets_reader = BitsReader::new(&offsets_path);
+
+    let num_terms: u32 = offsets_reader.read_vbyte();
+
+    let mut start_term_offset: usize = 0;
+    let mut postings_offset = 0;
+
+    let mut res: BTreeMap<String, u64> = BTreeMap::new();
+
+    for _ in 0..num_terms {
+        let term_length = lenghts_reader.read_gamma() as usize;
+
+        let postings_offset_delta = offsets_reader.read_gamma() as u64;
+        postings_offset += postings_offset_delta;
+
+        res.insert(
+            terms_buffer[start_term_offset..start_term_offset + term_length].to_string(),
+            postings_offset,
+        );
+
+        start_term_offset += term_length;
+    }
+
+    res
+}
+
+pub fn load_document_lenghts(input_path: &str) -> Vec<u32> {
+    let mut reader = BitsReader::new(&(input_path.to_string() + DOCUMENT_LENGHTS_EXTENSION));
+    let n = reader.read_vbyte();
+    (0..n).map(|_| reader.read_gamma() as u32).collect()
+}
+
+pub fn build_postings_reader(input_path: &str) -> BitsReader {
+    BitsReader::new(&(input_path.to_string() + POSTINGS_EXTENSION))
+}
diff --git a/src/indexer/mod.rs b/src/index/mod.rs
similarity index 55%
rename from src/indexer/mod.rs
rename to src/index/mod.rs
index c3c1675..b63cb38 100644
--- a/src/indexer/mod.rs
+++ b/src/index/mod.rs
@@ -1,3 +1,4 @@
-mod disk_utils;
+mod builder;
 pub mod index;
+mod loader;
 mod text_utils;
diff --git a/src/indexer/text_utils.rs b/src/index/text_utils.rs
similarity index 100%
rename from src/indexer/text_utils.rs
rename to src/index/text_utils.rs
diff --git a/src/indexer/disk_utils.rs b/src/indexer/disk_utils.rs
deleted file mode 100644
index 44f3205..0000000
--- a/src/indexer/disk_utils.rs
+++ /dev/null
@@ -1,144 +0,0 @@
-use tokenizers::Tokenizer;
-
-use crate::disk::{
-    bits_reader::BitsReader, bits_writer::BitsWriter, terms_reader::TermsReader,
-    terms_writer::TermsWriter,
-};
-use std::{collections::BTreeMap, fs};
-
-use super::text_utils;
-
-const POSTINGS_EXTENSION: &str = ".postings";
-const OFFSETS_EXTENSION: &str = ".offsets";
-
-const VOCABULARY_ALPHA_EXTENSION: &str = ".alphas";
-const VOCABULARY_LENGHTS_EXTENSION: &str = ".lengths";
-
-pub fn build_in_memory_postings(
-    input_dir: &str,
-    tokenizer: &Tokenizer,
-) -> (BTreeMap<String, usize>, Vec<BTreeMap<u32, u32>>) {
-    let documents =
-        fs::read_dir(input_dir).expect("error while retrieving input directory content");
-
-    let tokenized_docs_iter = documents
-        .into_iter()
-        .map(|p| p.unwrap())
-        .map(|p| fs::read_to_string(p.path()).expect("error while reading file"))
-        .map(|s| text_utils::tokenize(tokenizer, &s));
-
-    let mut words: BTreeMap<String, usize> = BTreeMap::new();
-    let mut in_memory_postings: Vec<BTreeMap<u32, u32>> = Vec::new();
-
-    for (doc_id, tokens) in tokenized_docs_iter.enumerate() {
-        for t in tokens.iter() {
-            let value: Option<&usize> = words.get(t);
-
-            let postings_counter = match value {
-                Some(idx) => &mut in_memory_postings[*idx],
-                None => {
-                    let idx = words.len();
-                    words.insert(t.clone(), idx);
-                    in_memory_postings.push(BTreeMap::new());
-                    &mut in_memory_postings[idx]
-                }
-            };
-            let key = doc_id as u32;
-            postings_counter
-                .entry(key)
-                .and_modify(|count| *count += 1)
-                .or_insert(1);
-        }
-    }
-
-    (words, in_memory_postings)
-}
-
-pub fn write_postings(
-    vocab: &BTreeMap<String, usize>,
-    postings: &[BTreeMap<u32, u32>],
-    output_path: &str,
-) {
-    let postings_path = output_path.to_string() + POSTINGS_EXTENSION;
-    let mut postings_writer = BitsWriter::new(&postings_path);
-
-    let offsets_path = output_path.to_string() + OFFSETS_EXTENSION;
-    let mut offsets_writer = BitsWriter::new(&offsets_path);
-
-    let mut offset: u64 = 0;
-    let mut prev_offset = 0;
-
-    offsets_writer.write_vbyte(vocab.len() as u32);
-
-    for (_, idx) in vocab.iter() {
-        offsets_writer.write_gamma(offset as u32 - prev_offset);
-        prev_offset = offset as u32;
-
-        let postings: &BTreeMap<u32, u32> = &postings[*idx];
-        offset += postings_writer.write_vbyte(postings.len() as u32);
-
-        let mut prev = 0;
-        for (doc_id, frequency) in postings.iter() {
-            offset += postings_writer.write_gamma(doc_id - prev);
-            offset += postings_writer.write_gamma(*frequency);
-            prev = *doc_id;
-        }
-    }
-
-    postings_writer.flush();
-    offsets_writer.flush();
-}
-
-pub fn write_vocabulary(vocab: &BTreeMap<String, usize>, output_path: &str) {
-    let terms_path = output_path.to_string() + VOCABULARY_ALPHA_EXTENSION;
-    let mut terms_writer = TermsWriter::new(&terms_path);
-
-    let lenghts_path = output_path.to_string() + VOCABULARY_LENGHTS_EXTENSION;
-    let mut lenghts_writer = BitsWriter::new(&lenghts_path);
-
-    for term in vocab.keys() {
-        lenghts_writer.write_gamma(term.len() as u32);
-        terms_writer.write_term(term);
-    }
-
-    lenghts_writer.flush();
-    terms_writer.flush();
-}
-
-pub fn read_terms_to_offsets_map(input_path: &str) -> BTreeMap<String, u64> {
-    let terms_path: String = input_path.to_string() + VOCABULARY_ALPHA_EXTENSION;
-    let terms_buffer = TermsReader::new(&terms_path).read_to_string();
-
-    let lenghts_path = input_path.to_string() + VOCABULARY_LENGHTS_EXTENSION;
-    let mut lenghts_reader = BitsReader::new(&lenghts_path);
-
-    let offsets_path = input_path.to_string() + OFFSETS_EXTENSION;
-    let mut offsets_reader = BitsReader::new(&offsets_path);
-
-    let num_terms: u32 = offsets_reader.read_vbyte();
-
-    let mut start_term_offset: usize = 0;
-    let mut postings_offset = 0;
-
-    let mut res: BTreeMap<String, u64> = BTreeMap::new();
-
-    for _ in 0..num_terms {
-        let term_length = lenghts_reader.read_gamma() as usize;
-
-        let postings_offset_delta = offsets_reader.read_gamma() as u64;
-        postings_offset += postings_offset_delta;
-
-        res.insert(
-            terms_buffer[start_term_offset..start_term_offset + term_length].to_string(),
-            postings_offset,
-        );
-
-        start_term_offset += term_length;
-    }
-
-    res
-}
-
-pub fn build_postings_reader(input_path: &str) -> BitsReader {
-    BitsReader::new(&(input_path.to_string() + POSTINGS_EXTENSION))
-}
diff --git a/src/indexer/index.rs b/src/indexer/index.rs
deleted file mode 100644
index b85b7d2..0000000
--- a/src/indexer/index.rs
+++ /dev/null
@@ -1,64 +0,0 @@
-use super::{disk_utils, text_utils::load_tokenizer};
-use crate::disk::bits_reader::BitsReader;
-use std::collections::BTreeMap;
-
-pub struct Index {
-    postings: BitsReader,
-    terms_to_offsets: BTreeMap<String, u64>,
-}
-
-impl Index {
-    pub fn build_index(input_dir: &str, output_path: &str, tokenizer_path: &str) {
-        let tokenizer = load_tokenizer(tokenizer_path, false);
-        let (words, postings) = disk_utils::build_in_memory_postings(input_dir, &tokenizer);
-
-        disk_utils::write_postings(&words, &postings, output_path);
-        disk_utils::write_vocabulary(&words, output_path);
-    }
-
-    pub fn load_index(input_path: &str) -> Index {
-        Index {
-            postings: disk_utils::build_postings_reader(input_path),
-            terms_to_offsets: disk_utils::read_terms_to_offsets_map(input_path),
-        }
-    }
-
-    pub fn get_postings(&mut self, term: &str) -> Option<Vec<u32>> {
-        let offset = self.terms_to_offsets.get(term)?;
-        Some(self.get_postings_internal(*offset))
-    }
-
-    fn get_postings_internal(&mut self, offset: u64) -> Vec<u32> {
-        self.postings.seek(offset);
-        let mut prev = 0;
-
-        (0..self.postings.read_vbyte())
-            .map(|_| {
-                prev += self.postings.read_gamma();
-                prev
-            })
-            .collect()
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use super::*;
-
-    #[test]
-    fn test_build() {
-        Index::build_index(
-            "data/index_unit_test/docs",
-            "data/index_unit_test/index/test",
-            "data/index_unit_test/test_tokenizer",
-        );
-
-        let mut idx = Index::load_index("data/index_unit_test/index/test");
-
-        for ele in ["hello", "man", "world"] {
-            assert!(idx.terms_to_offsets.contains_key(ele));
-        }
-
-        assert_eq!(idx.get_postings("hello").unwrap(), [0, 1]);
-    }
-}
diff --git a/src/lib.rs b/src/lib.rs
index ee74592..7ae8e5c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,2 +1,2 @@
 pub mod disk;
-pub mod indexer;
+pub mod index;
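
Usage note (reviewer sketch, not part of the patch): after this change the public surface is Index::build_index, Index::load_index, and Index::get_postings, as exercised by the unit test in src/index/index.rs. A minimal driver, assuming a crate named "search" (the crate name is not visible in this diff) and the test fixture paths used above:

    use search::index::index::Index;

    fn main() {
        // Build the on-disk index from a directory of documents, then reload it.
        Index::build_index(
            "data/index_unit_test/docs",
            "data/index_unit_test/index/test",
            "data/index_unit_test/test_tokenizer",
        );
        let mut idx = Index::load_index(
            "data/index_unit_test/index/test",
            "data/index_unit_test/test_tokenizer",
        );

        // Postings come back as absolute document ids, decoded from gamma-coded deltas.
        if let Some(docs) = idx.get_postings("hello") {
            println!("hello -> {:?}", docs);
        }
    }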