
Commit

cleanup
tomfran committed Dec 17, 2023
1 parent c938e62 commit b345df8
Showing 12 changed files with 58 additions and 42 deletions.
7 changes: 5 additions & 2 deletions .gitignore
@@ -16,6 +16,9 @@ Cargo.lock
.idea

# Added by cargo

/target
/data/

# Data files
/data/wiki-data
/data/index_unit_test/index
/data/test
1 change: 1 addition & 0 deletions data/index_unit_test/docs/1.txt
@@ -0,0 +1 @@
hello world
1 change: 1 addition & 0 deletions data/index_unit_test/docs/2.txt
@@ -0,0 +1 @@
hello man
5 changes: 0 additions & 5 deletions src/disk/bits_reader.rs
@@ -112,12 +112,9 @@ mod test {

use super::*;
use crate::disk::bits_writer::BitsWriter;
use std::fs::create_dir_all;

#[test]
fn test_read() {
create_dir_all("data/test/").expect("error while creating test dir");

let mut w = BitsWriter::new("data/test/writer_unit.bin");

(1..100).for_each(|i| {
@@ -138,8 +135,6 @@

#[test]
fn test_seek() {
create_dir_all("data/test/").expect("error while creating test dir");

let mut w = BitsWriter::new("data/test/writer_seek.bin");

let offset = (0..1000).map(|i| w.write_gamma(i)).sum();
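(The create_dir_all calls dropped from these tests are no longer needed: directory creation now happens inside the writers themselves, through the src/disk/utils.rs helper introduced later in this commit.)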
9 changes: 4 additions & 5 deletions src/disk/bits_writer.rs
@@ -3,16 +3,18 @@ use std::{
io::{BufWriter, Write},
};

use super::utils;

pub struct BitsWriter {
file: BufWriter<File>,
buffer: u128,
written: u32,
}

impl BitsWriter {
pub fn new(filename: &str) -> BitsWriter {
pub fn new(path: &str) -> BitsWriter {
BitsWriter {
file: BufWriter::new(File::create(filename).expect("Can not create output file")),
file: BufWriter::new(utils::create_and_open_file(path)),
buffer: 0,
written: 0,
}
@@ -94,7 +96,6 @@ impl BitsWriter {
mod test {

use super::*;
use std::fs::create_dir_all;

#[test]
fn test_gamma_coding() {
@@ -120,8 +121,6 @@

#[test]
fn test_buffer_overflow() {
create_dir_all("data/test/").expect("error while creating test dir");

let word = (1 << 10) - 1;
let len = 10;

1 change: 1 addition & 0 deletions src/disk/mod.rs
@@ -2,3 +2,4 @@ pub mod bits_reader;
pub mod bits_writer;
pub mod terms_reader;
pub mod terms_writer;
mod utils;
6 changes: 4 additions & 2 deletions src/disk/terms_writer.rs
@@ -3,14 +3,16 @@ use std::{
io::{BufWriter, Write},
};

use super::utils;

pub struct TermsWriter {
file: BufWriter<File>,
}

impl TermsWriter {
pub fn new(filename: &str) -> TermsWriter {
pub fn new(path: &str) -> TermsWriter {
TermsWriter {
file: BufWriter::new(File::create(filename).expect("Can not create output file")),
file: BufWriter::new(utils::create_and_open_file(path)),
}
}

11 changes: 11 additions & 0 deletions src/disk/utils.rs
@@ -0,0 +1,11 @@
use std::{
fs::{create_dir_all, File},
path::Path,
};

pub fn create_and_open_file(filename: &str) -> File {
let path = Path::new(filename);
path.parent().map(create_dir_all);

File::create(path).expect("error while creating file")
}
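The new helper is what lets the explicit create_dir_all calls disappear from the tests above: any missing parent directories are created before the file is opened. A minimal sketch of how it might be exercised, assuming a test module inside utils.rs (the test name and path below are illustrative, not part of this commit):

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_create_in_nested_dir() {
        // hypothetical nested path: parent directories are created on demand
        let file = create_and_open_file("data/test/nested/utils_unit.bin");
        assert!(file.metadata().unwrap().is_file());
    }
}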
36 changes: 18 additions & 18 deletions src/indexer/disk_utils.rs
@@ -19,9 +19,7 @@ pub fn build_in_memory_postings(
let documents =
fs::read_dir(input_dir).expect("error while retrieving input directory content");

println!("{:?}", documents);
let tokens_regex = tokens::build_tokenization_regex();

let tokenized_docs_iter = documents
.into_iter()
.map(|p| p.unwrap())
@@ -61,9 +59,9 @@ pub fn write_postings(
output_path: &str,
) {
let postings_path = output_path.to_string() + POSTINGS_EXTENSION;
let offsets_path = output_path.to_string() + OFFSETS_EXTENSION;

let mut postings_writer = BitsWriter::new(&postings_path);

let offsets_path = output_path.to_string() + OFFSETS_EXTENSION;
let mut offsets_writer = BitsWriter::new(&offsets_path);

let mut offset: u64 = 0;
@@ -91,11 +89,11 @@ }
}

pub fn write_vocabulary(vocab: &BTreeMap<String, usize>, output_path: &str) {
let alphas_path = output_path.to_string() + VOCABULARY_ALPHA_EXTENSION;
let lenghts_path = output_path.to_string() + VOCABULARY_LENGHTS_EXTENSION;
let terms_path = output_path.to_string() + VOCABULARY_ALPHA_EXTENSION;
let mut terms_writer = TermsWriter::new(&terms_path);

let lenghts_path = output_path.to_string() + VOCABULARY_LENGHTS_EXTENSION;
let mut lenghts_writer = BitsWriter::new(&lenghts_path);
let mut terms_writer = TermsWriter::new(&alphas_path);

for term in vocab.keys() {
lenghts_writer.write_gamma(term.len() as u32);
@@ -107,32 +105,34 @@ pub fn write_vocabulary(vocab: &BTreeMap<String, usize>, output_path: &str) {
}

pub fn read_terms_to_offsets_map(input_path: &str) -> BTreeMap<String, u64> {
let alphas_path: String = input_path.to_string() + VOCABULARY_ALPHA_EXTENSION;
let terms_path: String = input_path.to_string() + VOCABULARY_ALPHA_EXTENSION;
let terms_buffer = TermsReader::new(&terms_path).read_to_string();

let lenghts_path = input_path.to_string() + VOCABULARY_LENGHTS_EXTENSION;
let offsets_path = input_path.to_string() + OFFSETS_EXTENSION;
let mut lenghts_reader = BitsReader::new(&lenghts_path);

let offsets_path = input_path.to_string() + OFFSETS_EXTENSION;
let mut offsets_reader = BitsReader::new(&offsets_path);
let terms_buffer = TermsReader::new(&alphas_path).read_to_string();
let mut lenghts_reader = BitsReader::new(&lenghts_path);

let n = offsets_reader.read_vbyte();
let num_terms: u32 = offsets_reader.read_vbyte();

let mut start_term_offset: usize = 0;
let mut postings_offset = 0;

let mut res: BTreeMap<String, u64> = BTreeMap::new();

for _ in 0..n {
let terms_delta = lenghts_reader.read_gamma() as usize;
let x = offsets_reader.read_gamma() as u64;
postings_offset += x;
for _ in 0..num_terms {
let term_length = lenghts_reader.read_gamma() as usize;

let postings_offset_delta = offsets_reader.read_gamma() as u64;
postings_offset += postings_offset_delta;

res.insert(
terms_buffer[start_term_offset..start_term_offset + terms_delta].to_string(),
terms_buffer[start_term_offset..start_term_offset + term_length].to_string(),
postings_offset,
);

start_term_offset += terms_delta;
start_term_offset += term_length;
}

res
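The renamed variables spell out the vocabulary layout: all terms are stored as one concatenated string, their lengths are gamma-coded, and each term's postings offset is stored as a gamma-coded delta from the previous one. A small worked sketch of the same reconstruction over plain in-memory values (the sample lengths and deltas are illustrative, not taken from the repository):

use std::collections::BTreeMap;

fn main() {
    // concatenation of ["hello", "man", "world"] plus per-term lengths and offset deltas
    let terms_buffer = "hellomanworld";
    let term_lengths = [5usize, 3, 5];
    let offset_deltas = [0u64, 10, 25];

    let mut start: usize = 0;
    let mut postings_offset: u64 = 0;
    let mut res: BTreeMap<String, u64> = BTreeMap::new();

    for (len, delta) in term_lengths.iter().zip(offset_deltas.iter()) {
        postings_offset += delta;
        res.insert(terms_buffer[start..start + len].to_string(), postings_offset);
        start += len;
    }

    // deltas accumulate into absolute offsets: hello -> 0, man -> 10, world -> 35
    assert_eq!(res["man"], 10);
    assert_eq!(res["world"], 35);
}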
16 changes: 11 additions & 5 deletions src/indexer/index.rs
@@ -43,13 +43,19 @@ impl Index {
mod test {
use super::*;

// #[test]
#[test]
fn test_build() {
Index::build_index("data/dummy/docs", "data/dummy/index/dum");
Index::build_index(
"data/index_unit_test/docs",
"data/index_unit_test/index/test",
);

let mut idx = Index::load_index("data/dummy/index/dum");
let mut idx = Index::load_index("data/index_unit_test/index/test");

println!("{:?}", idx.terms_to_offsets);
println!("{:?}", idx.get_postings("my"));
for ele in ["hello", "man", "world"] {
assert!(idx.terms_to_offsets.contains_key(ele));
}

assert_eq!(idx.get_postings("hello").unwrap(), [0, 1]);
}
}
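The two fixture documents added above (1.txt containing "hello world", 2.txt containing "hello man") explain the assertions: "hello" occurs in both documents, so its postings list is [0, 1], while "man" and "world" each come from a single document yet still show up in terms_to_offsets.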
4 changes: 2 additions & 2 deletions src/text/tokens.rs
@@ -1,6 +1,6 @@
use regex::Regex;

pub fn tokenize(s: &String, re: &Regex) -> Vec<String> {
pub fn tokenize(s: &str, re: &Regex) -> Vec<String> {
let vec: Vec<String> = re
.replace_all(s, "")
.to_lowercase()
@@ -23,7 +23,7 @@ mod test {
#[test]
fn test_tokenization() {
let r = build_tokenization_regex();
let mut t = tokenize(&"123#Hello, __World!".to_string(), &r);
let mut t = tokenize("123#Hello, __World!", &r);
t.sort();

assert_eq!(t, ["hello", "world"]);
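Switching the parameter from &String to &str lets callers pass string literals directly, and existing call sites keep working because &String coerces to &str. A minimal sketch, assuming text::tokens is publicly exported from the search crate (the module path is an assumption, not shown in this diff):

use search::text::tokens::{build_tokenization_regex, tokenize};

fn main() {
    let re = build_tokenization_regex();

    // a string literal now works directly against the &str parameter
    let mut from_literal = tokenize("123#Hello, __World!", &re);
    from_literal.sort();
    assert_eq!(from_literal, ["hello", "world"]);

    // an owned String still works unchanged: &String coerces to &str
    let owned = String::from("123#Hello, __World!");
    assert_eq!(tokenize(&owned, &re), tokenize("123#Hello, __World!", &re));
}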
3 changes: 0 additions & 3 deletions tests/read_write_integration_test.rs
@@ -1,11 +1,8 @@
use rand::Rng;
use search::disk::{bits_reader::BitsReader, bits_writer::BitsWriter};
use std::fs::create_dir_all;

#[test]
fn test_read_write() {
create_dir_all("data/test/").expect("error while creating test dir");

let path = "data/test/writer_io_integration.bin";

let n = 100_000;
