Skip to content

Commit

Permalink
removed terms writer and reader
Browse files: browse the repository at this point in the history
  • Loading branch information
tomfran committed Jan 13, 2024
1 parent 1855658 commit 47c069d
Show file tree
Hide file tree
Showing 6 changed files with 9 additions and 86 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Search engine written in Rust, based on an inverted index on disk.

**IO**
- [x] Classes for writing and reading bit-streams;
- [ ] Proper strings writer and reader.
- [x] Proper strings writer and reader.

**Text preprocessing**
- [x] Tokenization;
Expand Down
2 changes: 0 additions & 2 deletions src/disk/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
pub mod bits_reader;
pub mod bits_writer;
mod file_utils;
pub mod terms_reader;
pub mod terms_writer;
24 changes: 0 additions & 24 deletions src/disk/terms_reader.rs

This file was deleted.

30 changes: 0 additions & 30 deletions src/disk/terms_writer.rs

This file was deleted.

1 change: 0 additions & 1 deletion src/index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ pub const POSTINGS_EXTENSION: &str = ".postings";
pub const OFFSETS_EXTENSION: &str = ".offsets";
pub const DOCUMENT_LENGHTS_EXTENSION: &str = ".doc_lengths";
pub const VOCABULARY_ALPHA_EXTENSION: &str = ".alphas";
pub const VOCABULARY_LENGHTS_EXTENSION: &str = ".term_lengths";

pub struct Index {
postings: BitsReader,
Expand Down
36 changes: 8 additions & 28 deletions src/index/vocabulary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,58 +2,38 @@ use std::collections::BTreeMap;

use fxhash::FxHashMap;

use crate::disk::{
bits_reader::BitsReader, bits_writer::BitsWriter, terms_reader::TermsReader,
terms_writer::TermsWriter,
};
use crate::disk::{bits_reader::BitsReader, bits_writer::BitsWriter};

use super::{OFFSETS_EXTENSION, VOCABULARY_ALPHA_EXTENSION, VOCABULARY_LENGHTS_EXTENSION};
use super::{OFFSETS_EXTENSION, VOCABULARY_ALPHA_EXTENSION};

pub fn write_vocabulary(vocab: &BTreeMap<String, usize>, output_path: &str) {
let terms_path = output_path.to_string() + VOCABULARY_ALPHA_EXTENSION;
let mut terms_writer = TermsWriter::new(&terms_path);
let mut terms_writer = BitsWriter::new(&terms_path);

let lenghts_path = output_path.to_string() + VOCABULARY_LENGHTS_EXTENSION;
let mut lenghts_writer = BitsWriter::new(&lenghts_path);
vocab.keys().for_each(|s| {
terms_writer.write_str(s);
});

for term in vocab.keys() {
lenghts_writer.write_gamma(term.len() as u32);
terms_writer.write_term(term);
}

lenghts_writer.flush();
terms_writer.flush();
}

pub fn load_vocabulary(input_path: &str) -> FxHashMap<String, u64> {
let terms_path: String = input_path.to_string() + VOCABULARY_ALPHA_EXTENSION;
let terms_buffer = TermsReader::new(&terms_path).read_to_string();

let lenghts_path = input_path.to_string() + VOCABULARY_LENGHTS_EXTENSION;
let mut lenghts_reader = BitsReader::new(&lenghts_path);
let mut terms_reader = BitsReader::new(&terms_path);

let offsets_path = input_path.to_string() + OFFSETS_EXTENSION;
let mut offsets_reader = BitsReader::new(&offsets_path);

let num_terms: u32 = offsets_reader.read_vbyte();

let mut start_term_offset: usize = 0;
let mut postings_offset = 0;

let mut res = FxHashMap::default();

for _ in 0..num_terms {
let term_length = lenghts_reader.read_gamma() as usize;

let postings_offset_delta = offsets_reader.read_gamma() as u64;
postings_offset += postings_offset_delta;

res.insert(
terms_buffer[start_term_offset..start_term_offset + term_length].to_string(),
postings_offset,
);

start_term_offset += term_length;
res.insert(terms_reader.read_str(), postings_offset);
}

res
Expand Down

0 comments on commit 47c069d

Please sign in to comment.