-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
258 additions
and
210 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
use std::{collections::BTreeMap, fs}; | ||
use tokenizers::Tokenizer; | ||
|
||
use crate::disk::{bits_writer::BitsWriter, terms_writer::TermsWriter}; | ||
|
||
use super::{ | ||
index::{ | ||
DOCUMENT_LENGHTS_EXTENSION, OFFSETS_EXTENSION, POSTINGS_EXTENSION, | ||
VOCABULARY_ALPHA_EXTENSION, VOCABULARY_LENGHTS_EXTENSION, | ||
}, | ||
text_utils, | ||
}; | ||
|
||
/// Inverted index accumulated in memory before being serialized to disk.
pub struct InMemoryIndex {
    // Maps each term to its slot in `postings`; BTreeMap keeps terms in
    // ascending order, which the on-disk vocabulary layout relies on.
    term_index_map: BTreeMap<String, usize>,
    // One postings list per term: doc id -> term frequency in that document.
    postings: Vec<BTreeMap<u32, u32>>,
    // Token count of each document, indexed by doc id.
    document_lenghts: Vec<u32>,
}
|
||
pub fn build_index(input_dir: &str, output_path: &str, tokenizer: &Tokenizer) { | ||
let index = build_in_memory(input_dir, tokenizer); | ||
write_postings(&index, output_path); | ||
write_vocabulary(&index.term_index_map, output_path); | ||
write_doc_lentghts(&index.document_lenghts, output_path); | ||
} | ||
|
||
fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer) -> InMemoryIndex { | ||
let documents = | ||
fs::read_dir(input_dir).expect("error while retrieving input directory content"); | ||
|
||
let tokenized_docs_iter = documents | ||
.into_iter() | ||
.map(|p| p.unwrap()) | ||
.map(|p| fs::read_to_string(p.path()).expect("error while reading file")) | ||
.map(|s| text_utils::tokenize(tokenizer, &s)); | ||
|
||
let mut term_index_map: BTreeMap<String, usize> = BTreeMap::new(); | ||
let mut postings: Vec<BTreeMap<u32, u32>> = Vec::new(); | ||
let mut document_lenghts: Vec<u32> = Vec::new(); | ||
|
||
for (doc_id, tokens) in tokenized_docs_iter.enumerate() { | ||
document_lenghts.push(tokens.len() as u32); | ||
|
||
for t in tokens.iter() { | ||
let value: Option<&usize> = term_index_map.get(t); | ||
|
||
let postings_counter = match value { | ||
Some(idx) => &mut postings[*idx], | ||
None => { | ||
let idx = term_index_map.len(); | ||
term_index_map.insert(t.clone(), idx); | ||
postings.push(BTreeMap::new()); | ||
&mut postings[idx] | ||
} | ||
}; | ||
let key = doc_id as u32; | ||
postings_counter | ||
.entry(key) | ||
.and_modify(|count| *count += 1) | ||
.or_insert(1); | ||
} | ||
} | ||
|
||
InMemoryIndex { | ||
term_index_map, | ||
postings, | ||
document_lenghts, | ||
} | ||
} | ||
|
||
fn write_postings(index: &InMemoryIndex, output_path: &str) { | ||
let postings_path = output_path.to_string() + POSTINGS_EXTENSION; | ||
let mut postings_writer = BitsWriter::new(&postings_path); | ||
|
||
let offsets_path = output_path.to_string() + OFFSETS_EXTENSION; | ||
let mut offsets_writer = BitsWriter::new(&offsets_path); | ||
|
||
let mut offset: u64 = 0; | ||
let mut prev_offset = 0; | ||
|
||
offsets_writer.write_vbyte(index.term_index_map.len() as u32); | ||
|
||
for (_, idx) in index.term_index_map.iter() { | ||
offsets_writer.write_gamma(offset as u32 - prev_offset); | ||
prev_offset = offset as u32; | ||
|
||
let postings: &BTreeMap<u32, u32> = &index.postings[*idx]; | ||
offset += postings_writer.write_vbyte(postings.len() as u32); | ||
|
||
let mut prev = 0; | ||
for (doc_id, frequency) in postings.iter() { | ||
offset += postings_writer.write_gamma(doc_id - prev); | ||
offset += postings_writer.write_gamma(*frequency); | ||
prev = *doc_id; | ||
} | ||
} | ||
|
||
postings_writer.flush(); | ||
offsets_writer.flush(); | ||
} | ||
|
||
fn write_vocabulary(vocab: &BTreeMap<String, usize>, output_path: &str) { | ||
let terms_path = output_path.to_string() + VOCABULARY_ALPHA_EXTENSION; | ||
let mut terms_writer = TermsWriter::new(&terms_path); | ||
|
||
let lenghts_path = output_path.to_string() + VOCABULARY_LENGHTS_EXTENSION; | ||
let mut lenghts_writer = BitsWriter::new(&lenghts_path); | ||
|
||
for term in vocab.keys() { | ||
lenghts_writer.write_gamma(term.len() as u32); | ||
terms_writer.write_term(term); | ||
} | ||
|
||
lenghts_writer.flush(); | ||
terms_writer.flush(); | ||
} | ||
|
||
fn write_doc_lentghts(document_lenghts: &Vec<u32>, output_path: &str) { | ||
let doc_path = output_path.to_string() + DOCUMENT_LENGHTS_EXTENSION; | ||
let mut doc_writer = BitsWriter::new(&doc_path); | ||
|
||
doc_writer.write_vbyte(document_lenghts.len() as u32); | ||
document_lenghts.iter().for_each(|l| { | ||
doc_writer.write_gamma(*l); | ||
}); | ||
|
||
doc_writer.flush(); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
use std::collections::BTreeMap; | ||
use tokenizers::Tokenizer; | ||
|
||
use super::{builder, loader, text_utils}; | ||
use crate::disk::bits_reader::BitsReader; | ||
|
||
// File-name suffixes for the on-disk index components. All files of one index
// share a single base path and differ only by these extensions.
pub const POSTINGS_EXTENSION: &str = ".postings";
pub const OFFSETS_EXTENSION: &str = ".offsets";
pub const DOCUMENT_LENGHTS_EXTENSION: &str = ".doc_lengths";
pub const VOCABULARY_ALPHA_EXTENSION: &str = ".alphas";
pub const VOCABULARY_LENGHTS_EXTENSION: &str = ".term_lengths";
|
||
/// Handle over a loaded on-disk index: the vocabulary and document lengths
/// are held in memory, while postings are read lazily from disk.
pub struct Index {
    // Bit-level reader over the postings file; seeked per query.
    postings: BitsReader,
    // term -> absolute bit offset of that term's postings list.
    term_offset_map: BTreeMap<String, u64>,
    // Token count per document, indexed by doc id.
    doc_lenghts: Vec<u32>,
    // Tokenizer matching the one used at build time.
    tokenizer: Tokenizer,
}
|
||
impl Index { | ||
pub fn build_index(input_path: &str, output_path: &str, tokenizer_path: &str) { | ||
let tokenizer = text_utils::load_tokenizer(tokenizer_path, false); | ||
builder::build_index(input_path, output_path, &tokenizer); | ||
} | ||
|
||
pub fn load_index(input_path: &str, tokenizer_path: &str) -> Index { | ||
Index { | ||
postings: loader::build_postings_reader(input_path), | ||
term_offset_map: loader::load_terms_to_offsets_map(input_path), | ||
doc_lenghts: loader::load_document_lenghts(input_path), | ||
tokenizer: text_utils::load_tokenizer(tokenizer_path, false), | ||
} | ||
} | ||
|
||
pub fn get_postings(&mut self, term: &str) -> Option<Vec<u32>> { | ||
let offset = self.term_offset_map.get(term)?; | ||
Some(self.get_postings_internal(*offset)) | ||
} | ||
|
||
fn get_postings_internal(&mut self, offset: u64) -> Vec<u32> { | ||
self.postings.seek(offset); | ||
let mut prev = 0; | ||
|
||
(0..self.postings.read_vbyte()) | ||
.map(|_| { | ||
prev += self.postings.read_gamma(); | ||
prev | ||
}) | ||
.collect() | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod test {
    use super::*;

    // End-to-end smoke test: builds an index from the on-disk fixture corpus,
    // reloads it, and checks vocabulary membership plus one postings list.
    // NOTE(review): depends on fixtures under data/index_unit_test/ existing
    // relative to the working directory, and writes index files there.
    #[test]
    fn test_build() {
        Index::build_index(
            "data/index_unit_test/docs",
            "data/index_unit_test/index/test",
            "data/index_unit_test/test_tokenizer",
        );

        let mut idx = Index::load_index(
            "data/index_unit_test/index/test",
            "data/index_unit_test/test_tokenizer",
        );

        // Every fixture term must round-trip through the vocabulary files.
        for ele in ["hello", "man", "world"] {
            assert!(idx.term_offset_map.contains_key(ele));
        }

        // "hello" occurs in fixture documents 0 and 1.
        assert_eq!(idx.get_postings("hello").unwrap(), [0, 1]);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
use std::collections::BTreeMap; | ||
|
||
use super::index::{ | ||
DOCUMENT_LENGHTS_EXTENSION, OFFSETS_EXTENSION, POSTINGS_EXTENSION, VOCABULARY_ALPHA_EXTENSION, | ||
VOCABULARY_LENGHTS_EXTENSION, | ||
}; | ||
use crate::disk::{bits_reader::BitsReader, terms_reader::TermsReader}; | ||
|
||
pub fn load_terms_to_offsets_map(input_path: &str) -> BTreeMap<String, u64> { | ||
let terms_path: String = input_path.to_string() + VOCABULARY_ALPHA_EXTENSION; | ||
let terms_buffer = TermsReader::new(&terms_path).read_to_string(); | ||
|
||
let lenghts_path = input_path.to_string() + VOCABULARY_LENGHTS_EXTENSION; | ||
let mut lenghts_reader = BitsReader::new(&lenghts_path); | ||
|
||
let offsets_path = input_path.to_string() + OFFSETS_EXTENSION; | ||
let mut offsets_reader = BitsReader::new(&offsets_path); | ||
|
||
let num_terms: u32 = offsets_reader.read_vbyte(); | ||
|
||
let mut start_term_offset: usize = 0; | ||
let mut postings_offset = 0; | ||
|
||
let mut res: BTreeMap<String, u64> = BTreeMap::new(); | ||
|
||
for _ in 0..num_terms { | ||
let term_length = lenghts_reader.read_gamma() as usize; | ||
|
||
let postings_offset_delta = offsets_reader.read_gamma() as u64; | ||
postings_offset += postings_offset_delta; | ||
|
||
res.insert( | ||
terms_buffer[start_term_offset..start_term_offset + term_length].to_string(), | ||
postings_offset, | ||
); | ||
|
||
start_term_offset += term_length; | ||
} | ||
|
||
res | ||
} | ||
|
||
pub fn load_document_lenghts(input_path: &str) -> Vec<u32> { | ||
let mut reader = BitsReader::new(&(input_path.to_string() + DOCUMENT_LENGHTS_EXTENSION)); | ||
let n = reader.read_vbyte(); | ||
(0..n).map(|_| reader.read_gamma() as u32).collect() | ||
} | ||
|
||
pub fn build_postings_reader(input_path: &str) -> BitsReader { | ||
BitsReader::new(&(input_path.to_string() + POSTINGS_EXTENSION)) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
mod disk_utils; | ||
mod builder; | ||
pub mod index; | ||
mod loader; | ||
mod text_utils; |
File renamed without changes.
Oops, something went wrong.