-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
258 additions
and
210 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
use std::{collections::BTreeMap, fs}; | ||
use tokenizers::Tokenizer; | ||
|
||
use crate::disk::{bits_writer::BitsWriter, terms_writer::TermsWriter}; | ||
|
||
use super::{ | ||
index::{ | ||
DOCUMENT_LENGHTS_EXTENSION, OFFSETS_EXTENSION, POSTINGS_EXTENSION, | ||
VOCABULARY_ALPHA_EXTENSION, VOCABULARY_LENGHTS_EXTENSION, | ||
}, | ||
text_utils, | ||
}; | ||
|
||
/// Inverted index accumulated in memory before being serialized to disk.
pub struct InMemoryIndex {
    // Maps each term to its slot in `postings`; BTreeMap keeps terms in
    // ascending order, which the on-disk vocabulary layout relies on.
    term_index_map: BTreeMap<String, usize>,
    // One postings list per term: doc id -> term frequency in that document.
    postings: Vec<BTreeMap<u32, u32>>,
    // Token count of each document, indexed by doc id.
    document_lenghts: Vec<u32>,
}
|
||
pub fn build_index(input_dir: &str, output_path: &str, tokenizer: &Tokenizer) { | ||
let index = build_in_memory(input_dir, tokenizer); | ||
write_postings(&index, output_path); | ||
write_vocabulary(&index.term_index_map, output_path); | ||
write_doc_lentghts(&index.document_lenghts, output_path); | ||
} | ||
|
||
fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer) -> InMemoryIndex { | ||
let documents = | ||
fs::read_dir(input_dir).expect("error while retrieving input directory content"); | ||
|
||
let tokenized_docs_iter = documents | ||
.into_iter() | ||
.map(|p| p.unwrap()) | ||
.map(|p| fs::read_to_string(p.path()).expect("error while reading file")) | ||
.map(|s| text_utils::tokenize(tokenizer, &s)); | ||
|
||
let mut term_index_map: BTreeMap<String, usize> = BTreeMap::new(); | ||
let mut postings: Vec<BTreeMap<u32, u32>> = Vec::new(); | ||
let mut document_lenghts: Vec<u32> = Vec::new(); | ||
|
||
for (doc_id, tokens) in tokenized_docs_iter.enumerate() { | ||
document_lenghts.push(tokens.len() as u32); | ||
|
||
for t in tokens.iter() { | ||
let value: Option<&usize> = term_index_map.get(t); | ||
|
||
let postings_counter = match value { | ||
Some(idx) => &mut postings[*idx], | ||
None => { | ||
let idx = term_index_map.len(); | ||
term_index_map.insert(t.clone(), idx); | ||
postings.push(BTreeMap::new()); | ||
&mut postings[idx] | ||
} | ||
}; | ||
let key = doc_id as u32; | ||
postings_counter | ||
.entry(key) | ||
.and_modify(|count| *count += 1) | ||
.or_insert(1); | ||
} | ||
} | ||
|
||
InMemoryIndex { | ||
term_index_map, | ||
postings, | ||
document_lenghts, | ||
} | ||
} | ||
|
||
fn write_postings(index: &InMemoryIndex, output_path: &str) { | ||
let postings_path = output_path.to_string() + POSTINGS_EXTENSION; | ||
let mut postings_writer = BitsWriter::new(&postings_path); | ||
|
||
let offsets_path = output_path.to_string() + OFFSETS_EXTENSION; | ||
let mut offsets_writer = BitsWriter::new(&offsets_path); | ||
|
||
let mut offset: u64 = 0; | ||
let mut prev_offset = 0; | ||
|
||
offsets_writer.write_vbyte(index.term_index_map.len() as u32); | ||
|
||
for (_, idx) in index.term_index_map.iter() { | ||
offsets_writer.write_gamma(offset as u32 - prev_offset); | ||
prev_offset = offset as u32; | ||
|
||
let postings: &BTreeMap<u32, u32> = &index.postings[*idx]; | ||
offset += postings_writer.write_vbyte(postings.len() as u32); | ||
|
||
let mut prev = 0; | ||
for (doc_id, frequency) in postings.iter() { | ||
offset += postings_writer.write_gamma(doc_id - prev); | ||
offset += postings_writer.write_gamma(*frequency); | ||
prev = *doc_id; | ||
} | ||
} | ||
|
||
postings_writer.flush(); | ||
offsets_writer.flush(); | ||
} | ||
|
||
fn write_vocabulary(vocab: &BTreeMap<String, usize>, output_path: &str) { | ||
let terms_path = output_path.to_string() + VOCABULARY_ALPHA_EXTENSION; | ||
let mut terms_writer = TermsWriter::new(&terms_path); | ||
|
||
let lenghts_path = output_path.to_string() + VOCABULARY_LENGHTS_EXTENSION; | ||
let mut lenghts_writer = BitsWriter::new(&lenghts_path); | ||
|
||
for term in vocab.keys() { | ||
lenghts_writer.write_gamma(term.len() as u32); | ||
terms_writer.write_term(term); | ||
} | ||
|
||
lenghts_writer.flush(); | ||
terms_writer.flush(); | ||
} | ||
|
||
fn write_doc_lentghts(document_lenghts: &Vec<u32>, output_path: &str) { | ||
let doc_path = output_path.to_string() + DOCUMENT_LENGHTS_EXTENSION; | ||
let mut doc_writer = BitsWriter::new(&doc_path); | ||
|
||
doc_writer.write_vbyte(document_lenghts.len() as u32); | ||
document_lenghts.iter().for_each(|l| { | ||
doc_writer.write_gamma(*l); | ||
}); | ||
|
||
doc_writer.flush(); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
use std::collections::BTreeMap; | ||
use tokenizers::Tokenizer; | ||
|
||
use super::{builder, loader, text_utils}; | ||
use crate::disk::bits_reader::BitsReader; | ||
|
||
// File-name suffixes for the on-disk index components. All files of one index
// share a single base path and differ only by these extensions.
pub const POSTINGS_EXTENSION: &str = ".postings";
pub const OFFSETS_EXTENSION: &str = ".offsets";
pub const DOCUMENT_LENGHTS_EXTENSION: &str = ".doc_lengths";
pub const VOCABULARY_ALPHA_EXTENSION: &str = ".alphas";
pub const VOCABULARY_LENGHTS_EXTENSION: &str = ".term_lengths";
|
||
/// Handle over a loaded on-disk index: the vocabulary and document lengths
/// are held in memory, while postings are read lazily from disk.
pub struct Index {
    // Bit-level reader over the postings file; seeked per query.
    postings: BitsReader,
    // term -> absolute bit offset of that term's postings list.
    term_offset_map: BTreeMap<String, u64>,
    // Token count per document, indexed by doc id.
    doc_lenghts: Vec<u32>,
    // Tokenizer matching the one used at build time.
    tokenizer: Tokenizer,
}
|
||
impl Index { | ||
pub fn build_index(input_path: &str, output_path: &str, tokenizer_path: &str) { | ||
let tokenizer = text_utils::load_tokenizer(tokenizer_path, false); | ||
builder::build_index(input_path, output_path, &tokenizer); | ||
} | ||
|
||
pub fn load_index(input_path: &str, tokenizer_path: &str) -> Index { | ||
Index { | ||
postings: loader::build_postings_reader(input_path), | ||
term_offset_map: loader::load_terms_to_offsets_map(input_path), | ||
doc_lenghts: loader::load_document_lenghts(input_path), | ||
tokenizer: text_utils::load_tokenizer(tokenizer_path, false), | ||
} | ||
} | ||
|
||
pub fn get_postings(&mut self, term: &str) -> Option<Vec<u32>> { | ||
let offset = self.term_offset_map.get(term)?; | ||
Some(self.get_postings_internal(*offset)) | ||
} | ||
|
||
fn get_postings_internal(&mut self, offset: u64) -> Vec<u32> { | ||
self.postings.seek(offset); | ||
let mut prev = 0; | ||
|
||
(0..self.postings.read_vbyte()) | ||
.map(|_| { | ||
prev += self.postings.read_gamma(); | ||
prev | ||
}) | ||
.collect() | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod test {
    use super::*;

    // End-to-end smoke test: builds an index from the on-disk fixture corpus,
    // reloads it, and checks vocabulary membership plus one postings list.
    // NOTE(review): depends on fixtures under data/index_unit_test/ existing
    // relative to the working directory, and writes index files there.
    #[test]
    fn test_build() {
        Index::build_index(
            "data/index_unit_test/docs",
            "data/index_unit_test/index/test",
            "data/index_unit_test/test_tokenizer",
        );

        let mut idx = Index::load_index(
            "data/index_unit_test/index/test",
            "data/index_unit_test/test_tokenizer",
        );

        // Every fixture term must round-trip through the vocabulary files.
        for ele in ["hello", "man", "world"] {
            assert!(idx.term_offset_map.contains_key(ele));
        }

        // "hello" occurs in fixture documents 0 and 1.
        assert_eq!(idx.get_postings("hello").unwrap(), [0, 1]);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
use std::collections::BTreeMap; | ||
|
||
use super::index::{ | ||
DOCUMENT_LENGHTS_EXTENSION, OFFSETS_EXTENSION, POSTINGS_EXTENSION, VOCABULARY_ALPHA_EXTENSION, | ||
VOCABULARY_LENGHTS_EXTENSION, | ||
}; | ||
use crate::disk::{bits_reader::BitsReader, terms_reader::TermsReader}; | ||
|
||
pub fn load_terms_to_offsets_map(input_path: &str) -> BTreeMap<String, u64> { | ||
let terms_path: String = input_path.to_string() + VOCABULARY_ALPHA_EXTENSION; | ||
let terms_buffer = TermsReader::new(&terms_path).read_to_string(); | ||
|
||
let lenghts_path = input_path.to_string() + VOCABULARY_LENGHTS_EXTENSION; | ||
let mut lenghts_reader = BitsReader::new(&lenghts_path); | ||
|
||
let offsets_path = input_path.to_string() + OFFSETS_EXTENSION; | ||
let mut offsets_reader = BitsReader::new(&offsets_path); | ||
|
||
let num_terms: u32 = offsets_reader.read_vbyte(); | ||
|
||
let mut start_term_offset: usize = 0; | ||
let mut postings_offset = 0; | ||
|
||
let mut res: BTreeMap<String, u64> = BTreeMap::new(); | ||
|
||
for _ in 0..num_terms { | ||
let term_length = lenghts_reader.read_gamma() as usize; | ||
|
||
let postings_offset_delta = offsets_reader.read_gamma() as u64; | ||
postings_offset += postings_offset_delta; | ||
|
||
res.insert( | ||
terms_buffer[start_term_offset..start_term_offset + term_length].to_string(), | ||
postings_offset, | ||
); | ||
|
||
start_term_offset += term_length; | ||
} | ||
|
||
res | ||
} | ||
|
||
pub fn load_document_lenghts(input_path: &str) -> Vec<u32> { | ||
let mut reader = BitsReader::new(&(input_path.to_string() + DOCUMENT_LENGHTS_EXTENSION)); | ||
let n = reader.read_vbyte(); | ||
(0..n).map(|_| reader.read_gamma() as u32).collect() | ||
} | ||
|
||
pub fn build_postings_reader(input_path: &str) -> BitsReader { | ||
BitsReader::new(&(input_path.to_string() + POSTINGS_EXTENSION)) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
mod disk_utils; | ||
mod builder; | ||
pub mod index; | ||
mod loader; | ||
mod text_utils; |
File renamed without changes.
Oops, something went wrong.