Commit 544c201: doc len on disk

tomfran committed Dec 23, 2023
1 parent 26c28bc
Showing 8 changed files with 258 additions and 210 deletions.
128 changes: 128 additions & 0 deletions src/index/builder.rs
@@ -0,0 +1,128 @@
use std::{collections::BTreeMap, fs};
use tokenizers::Tokenizer;

use crate::disk::{bits_writer::BitsWriter, terms_writer::TermsWriter};

use super::{
    index::{
        DOCUMENT_LENGTHS_EXTENSION, OFFSETS_EXTENSION, POSTINGS_EXTENSION,
        VOCABULARY_ALPHA_EXTENSION, VOCABULARY_LENGTHS_EXTENSION,
    },
    text_utils,
};

struct InMemoryIndex {
    term_index_map: BTreeMap<String, usize>,
    postings: Vec<BTreeMap<u32, u32>>,
    document_lengths: Vec<u32>,
}

pub fn build_index(input_dir: &str, output_path: &str, tokenizer: &Tokenizer) {
    let index = build_in_memory(input_dir, tokenizer);
    write_postings(&index, output_path);
    write_vocabulary(&index.term_index_map, output_path);
    write_doc_lengths(&index.document_lengths, output_path);
}

fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer) -> InMemoryIndex {
    let documents =
        fs::read_dir(input_dir).expect("error while retrieving input directory content");

    let tokenized_docs_iter = documents
        .into_iter()
        .map(|p| p.unwrap())
        .map(|p| fs::read_to_string(p.path()).expect("error while reading file"))
        .map(|s| text_utils::tokenize(tokenizer, &s));

    let mut term_index_map: BTreeMap<String, usize> = BTreeMap::new();
    let mut postings: Vec<BTreeMap<u32, u32>> = Vec::new();
    let mut document_lengths: Vec<u32> = Vec::new();

    for (doc_id, tokens) in tokenized_docs_iter.enumerate() {
        document_lengths.push(tokens.len() as u32);

        for t in tokens.iter() {
            let value = term_index_map.get(t);

            // each term maps to a per-document frequency counter
            let postings_counter = match value {
                Some(idx) => &mut postings[*idx],
                None => {
                    let idx = term_index_map.len();
                    term_index_map.insert(t.clone(), idx);
                    postings.push(BTreeMap::new());
                    &mut postings[idx]
                }
            };
            let key = doc_id as u32;
            postings_counter
                .entry(key)
                .and_modify(|count| *count += 1)
                .or_insert(1);
        }
    }

    InMemoryIndex {
        term_index_map,
        postings,
        document_lengths,
    }
}

fn write_postings(index: &InMemoryIndex, output_path: &str) {
    let postings_path = output_path.to_string() + POSTINGS_EXTENSION;
    let mut postings_writer = BitsWriter::new(&postings_path);

    let offsets_path = output_path.to_string() + OFFSETS_EXTENSION;
    let mut offsets_writer = BitsWriter::new(&offsets_path);

    let mut offset: u64 = 0;
    let mut prev_offset = 0;

    offsets_writer.write_vbyte(index.term_index_map.len() as u32);

    for idx in index.term_index_map.values() {
        // offsets are gap-encoded: store the distance from the previous term's offset
        offsets_writer.write_gamma(offset as u32 - prev_offset);
        prev_offset = offset as u32;

        let postings = &index.postings[*idx];
        offset += postings_writer.write_vbyte(postings.len() as u32);

        // doc ids are gap-encoded as well; frequencies are written as-is
        let mut prev = 0;
        for (doc_id, frequency) in postings.iter() {
            offset += postings_writer.write_gamma(doc_id - prev);
            offset += postings_writer.write_gamma(*frequency);
            prev = *doc_id;
        }
    }

    postings_writer.flush();
    offsets_writer.flush();
}

fn write_vocabulary(vocab: &BTreeMap<String, usize>, output_path: &str) {
    let terms_path = output_path.to_string() + VOCABULARY_ALPHA_EXTENSION;
    let mut terms_writer = TermsWriter::new(&terms_path);

    let lengths_path = output_path.to_string() + VOCABULARY_LENGTHS_EXTENSION;
    let mut lengths_writer = BitsWriter::new(&lengths_path);

    for term in vocab.keys() {
        lengths_writer.write_gamma(term.len() as u32);
        terms_writer.write_term(term);
    }

    lengths_writer.flush();
    terms_writer.flush();
}

fn write_doc_lengths(document_lengths: &[u32], output_path: &str) {
    let doc_path = output_path.to_string() + DOCUMENT_LENGTHS_EXTENSION;
    let mut doc_writer = BitsWriter::new(&doc_path);

    // header: number of documents, then one gamma-coded length per document
    doc_writer.write_vbyte(document_lengths.len() as u32);
    document_lengths.iter().for_each(|l| {
        doc_writer.write_gamma(*l);
    });

    doc_writer.flush();
}
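
For context, a round-trip sketch of the new document-lengths file: a vbyte-coded document count followed by one gamma code per length. This assumes the BitsWriter/BitsReader API visible elsewhere in this diff; the function and the path are hypothetical and not part of the commit.

use crate::disk::{bits_reader::BitsReader, bits_writer::BitsWriter};

fn doc_lengths_round_trip() {
    let path = "/tmp/example.doc_lengths"; // hypothetical path
    let lengths: Vec<u32> = vec![3, 7, 5];

    // write side, mirroring write_doc_lengths above
    let mut writer = BitsWriter::new(path);
    writer.write_vbyte(lengths.len() as u32);
    lengths.iter().for_each(|l| {
        writer.write_gamma(*l);
    });
    writer.flush();

    // read side, mirroring loader::load_document_lengths
    let mut reader = BitsReader::new(path);
    let n = reader.read_vbyte();
    let decoded: Vec<u32> = (0..n).map(|_| reader.read_gamma() as u32).collect();
    assert_eq!(decoded, lengths);
}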
76 changes: 76 additions & 0 deletions src/index/index.rs
@@ -0,0 +1,76 @@
use std::collections::BTreeMap;
use tokenizers::Tokenizer;

use super::{builder, loader, text_utils};
use crate::disk::bits_reader::BitsReader;

pub const POSTINGS_EXTENSION: &str = ".postings";
pub const OFFSETS_EXTENSION: &str = ".offsets";
pub const DOCUMENT_LENGTHS_EXTENSION: &str = ".doc_lengths";
pub const VOCABULARY_ALPHA_EXTENSION: &str = ".alphas";
pub const VOCABULARY_LENGTHS_EXTENSION: &str = ".term_lengths";

pub struct Index {
    postings: BitsReader,
    term_offset_map: BTreeMap<String, u64>,
    doc_lengths: Vec<u32>,
    tokenizer: Tokenizer,
}

impl Index {
    pub fn build_index(input_path: &str, output_path: &str, tokenizer_path: &str) {
        let tokenizer = text_utils::load_tokenizer(tokenizer_path, false);
        builder::build_index(input_path, output_path, &tokenizer);
    }

    pub fn load_index(input_path: &str, tokenizer_path: &str) -> Index {
        Index {
            postings: loader::build_postings_reader(input_path),
            term_offset_map: loader::load_terms_to_offsets_map(input_path),
            doc_lengths: loader::load_document_lengths(input_path),
            tokenizer: text_utils::load_tokenizer(tokenizer_path, false),
        }
    }

    pub fn get_postings(&mut self, term: &str) -> Option<Vec<u32>> {
        let offset = self.term_offset_map.get(term)?;
        Some(self.get_postings_internal(*offset))
    }

    fn get_postings_internal(&mut self, offset: u64) -> Vec<u32> {
        self.postings.seek(offset);
        let mut prev = 0;

        // undo the gap encoding; the writer stores (doc-id gap, frequency)
        // pairs, so the frequency must be consumed even though it is unused here
        (0..self.postings.read_vbyte())
            .map(|_| {
                prev += self.postings.read_gamma();
                let _frequency = self.postings.read_gamma();
                prev
            })
            .collect()
    }
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_build() {
        Index::build_index(
            "data/index_unit_test/docs",
            "data/index_unit_test/index/test",
            "data/index_unit_test/test_tokenizer",
        );

        let mut idx = Index::load_index(
            "data/index_unit_test/index/test",
            "data/index_unit_test/test_tokenizer",
        );

        for ele in ["hello", "man", "world"] {
            assert!(idx.term_offset_map.contains_key(ele));
        }

        assert_eq!(idx.get_postings("hello").unwrap(), [0, 1]);
    }
}
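
Nothing in this commit consumes doc_lengths yet. As a hedged sketch of why they are worth persisting, a hypothetical accessor on Index (not in the diff) could expose the average document length, the kind of collection statistic a length-normalized ranking function needs:

impl Index {
    // hypothetical helper, not part of this commit: average document
    // length over the collection (assumes a non-empty index)
    pub fn avg_doc_length(&self) -> f64 {
        let total: u64 = self.doc_lengths.iter().map(|&l| l as u64).sum();
        total as f64 / self.doc_lengths.len() as f64
    }
}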
51 changes: 51 additions & 0 deletions src/index/loader.rs
@@ -0,0 +1,51 @@
use std::collections::BTreeMap;

use super::index::{
    DOCUMENT_LENGTHS_EXTENSION, OFFSETS_EXTENSION, POSTINGS_EXTENSION, VOCABULARY_ALPHA_EXTENSION,
    VOCABULARY_LENGTHS_EXTENSION,
};
use crate::disk::{bits_reader::BitsReader, terms_reader::TermsReader};

pub fn load_terms_to_offsets_map(input_path: &str) -> BTreeMap<String, u64> {
    let terms_path = input_path.to_string() + VOCABULARY_ALPHA_EXTENSION;
    let terms_buffer = TermsReader::new(&terms_path).read_to_string();

    let lengths_path = input_path.to_string() + VOCABULARY_LENGTHS_EXTENSION;
    let mut lengths_reader = BitsReader::new(&lengths_path);

    let offsets_path = input_path.to_string() + OFFSETS_EXTENSION;
    let mut offsets_reader = BitsReader::new(&offsets_path);

    let num_terms: u32 = offsets_reader.read_vbyte();

    let mut start_term_offset: usize = 0;
    let mut postings_offset = 0;

    let mut res: BTreeMap<String, u64> = BTreeMap::new();

    for _ in 0..num_terms {
        let term_length = lengths_reader.read_gamma() as usize;

        // offsets are stored gap-encoded; accumulate them back into absolute values
        let postings_offset_delta = offsets_reader.read_gamma() as u64;
        postings_offset += postings_offset_delta;

        res.insert(
            terms_buffer[start_term_offset..start_term_offset + term_length].to_string(),
            postings_offset,
        );

        start_term_offset += term_length;
    }

    res
}

pub fn load_document_lengths(input_path: &str) -> Vec<u32> {
    let mut reader = BitsReader::new(&(input_path.to_string() + DOCUMENT_LENGTHS_EXTENSION));
    let n = reader.read_vbyte();
    (0..n).map(|_| reader.read_gamma() as u32).collect()
}

pub fn build_postings_reader(input_path: &str) -> BitsReader {
    BitsReader::new(&(input_path.to_string() + POSTINGS_EXTENSION))
}
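
To make the vocabulary layout concrete: with the unit-test terms "hello", "man", "world", the .alphas file holds the sorted concatenation and .term_lengths the gamma-coded lengths, so terms are recovered by slicing. A standalone sketch, with the buffer and lengths hard-coded for illustration:

fn slice_terms() {
    let buffer = "hellomanworld"; // .alphas: terms concatenated in sorted order
    let lengths = [5usize, 3, 5]; // .term_lengths: gamma-decoded term lengths

    let mut start = 0;
    for len in lengths {
        println!("{}", &buffer[start..start + len]); // hello, man, world
        start += len;
    }
}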
3 changes: 2 additions & 1 deletion src/indexer/mod.rs → src/index/mod.rs
@@ -1,3 +1,4 @@
-mod disk_utils;
+mod builder;
 pub mod index;
+mod loader;
 mod text_utils;
File renamed without changes.