Commit

Draft in memory index
tomfran committed Dec 7, 2023
1 parent 691c182 commit 61282ee
Showing 4 changed files with 114 additions and 7 deletions.
12 changes: 7 additions & 5 deletions README.md
@@ -4,11 +4,13 @@ Search engine written in Rust, based on an inverted index on disk.

**Implementation status**
- [x] IO classes for writing and reading bit-streams;
- [ ] Structure to hold the vocabulary;
- [ ] Text preprocessing;
- [ ] In-memory datasets index construction;
- [ ] Disk-based partial index construction and merging;
- [ ] Additional indexes to support things such as spelling correction.
- [ ] Text preprocessing:
- [x] Tokenization;
- [ ] Stemming.
- [ ] Index construction:
- [ ] [In progress] In-memory datasets index construction;
- [ ] Disk-based partial index construction and merging;
- [ ] Additional indexes to support things such as spelling correction.

**References**

104 changes: 104 additions & 0 deletions src/index.rs
@@ -0,0 +1,104 @@
use std::{
collections::{BTreeMap, HashMap, HashSet},
fs, vec,
};

use crate::{
bits::{self, reader::Reader},
text::tokens,
};

const OUTPUT_DIR: &str = "data/index";
const POSTINGS_EXTENSION: &str = ".postings";
const OFFSETS_EXTENSION: &str = ".offsets";

pub struct Index {
postings: Reader,
offsets: Vec<u64>,
vocabulary: BTreeMap<String, u64>,
}

impl Index {
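    /// Read every document in `input_dir`, tokenize it, and build the index in memory:
    /// a term -> slot map plus, for each term, a doc_id -> term-frequency map.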
fn build_in_memory_postings(
input_dir: &str,
) -> (BTreeMap<String, usize>, Vec<BTreeMap<u32, u32>>) {
let documents =
fs::read_dir(input_dir).expect("error while retrieving input directory content");

println!("{:?}", documents);
let tokens_regex = tokens::build_tokenization_regex();

let tokenized_docs_iter = documents
.into_iter()
.map(|p| p.unwrap())
.map(|p| fs::read_to_string(p.path()).expect("error while reading file"))
.map(|s| tokens::tokenize(&s, &tokens_regex));

let mut words: BTreeMap<String, usize> = BTreeMap::new();
let mut in_memory_postings: Vec<BTreeMap<u32, u32>> = Vec::new();

for (doc_id, tokens) in tokenized_docs_iter.enumerate() {
for t in tokens.iter() {
let value: Option<&usize> = words.get(t);

let postings_counter = match value {
Some(idx) => &mut in_memory_postings[*idx],
None => {
let idx = words.len();
words.insert(t.clone(), idx);
in_memory_postings.push(BTreeMap::new());
&mut in_memory_postings[idx]
}
};
let key = doc_id as u32;
postings_counter
.entry(key)
.and_modify(|count| *count += 1)
.or_insert(1);
}
}

(words, in_memory_postings)
}

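    /// Serialize the in-memory index: per term (in vocabulary order) a vbyte-coded
    /// posting count followed by gamma-coded (doc-id gap, frequency) pairs, with the
    /// gamma-coded distance between consecutive posting lists going to the offsets file.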
fn write_to_file(vocab: &BTreeMap<String, usize>, postings: &Vec<BTreeMap<u32, u32>>) {
let postings_path = OUTPUT_DIR.to_string() + "/index" + POSTINGS_EXTENSION;
let offsets_path = OUTPUT_DIR.to_string() + "/index" + OFFSETS_EXTENSION;

let mut postings_writer = bits::writer::Writer::new(&postings_path);
let mut offsets_writer = bits::writer::Writer::new(&offsets_path);

let mut offset: u64 = 0;
let mut prev_offset = 0;
for (_, idx) in vocab.iter() {
            // Record the gap from the previous posting list's start to this one's start.
            offsets_writer.write_gamma(offset as u32 - prev_offset);
            prev_offset = offset as u32;

let postings = &postings[*idx];
offset += postings_writer.write_vbyte(postings.len() as u32);

let mut prev = 0;
for (doc_id, frequency) in postings.iter() {
offset += postings_writer.write_gamma(doc_id - prev);
offset += postings_writer.write_gamma(*frequency);
prev = *doc_id;
}

}

postings_writer.flush();
offsets_writer.flush();
}
}

// #[cfg(test)]
// mod test {
// use super::*;

// #[test]
// fn test_build() {
// let (a, b) = Index::build_in_memory_postings("data/wiki-data/docs");

// Index::write_to_file(&a, &b);
// }
// }
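
For orientation (not part of the commit): `write_to_file` above lays each posting list out as a vbyte-coded posting count followed by gamma-coded (doc-ID gap, frequency) pairs, and the offsets file stores the gamma-coded distance between the starts of consecutive lists. The sketch below, with a hypothetical `delta_encode` helper, only illustrates the doc-ID gap step under those assumptions.

```rust
// Illustration only: doc IDs come out of each per-term BTreeMap in ascending
// order, so storing gaps instead of absolute IDs keeps the gamma codes short.
fn delta_encode(doc_ids: &[u32]) -> Vec<u32> {
    let mut prev = 0;
    doc_ids
        .iter()
        .map(|&id| {
            let gap = id - prev;
            prev = id;
            gap
        })
        .collect()
}

fn main() {
    // A posting list over documents 3, 7 and 12 is written as gaps 3, 4, 5.
    assert_eq!(delta_encode(&[3, 7, 12]), vec![3, 4, 5]);
}
```
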
1 change: 1 addition & 0 deletions src/lib.rs
@@ -1,2 +1,3 @@
pub mod bits;
pub mod index;
pub mod text;
4 changes: 2 additions & 2 deletions src/text/tokens.rs
@@ -1,6 +1,6 @@
use regex::Regex;

pub fn tokenize(s: &str, re: Regex) -> Vec<String> {
pub fn tokenize(s: &String, re: &Regex) -> Vec<String> {
let vec: Vec<String> = re
.replace_all(s, "")
.to_lowercase()
@@ -23,7 +23,7 @@ mod test {
#[test]
fn test_tokenization() {
let r = build_tokenization_regex();
let mut t = tokenize("123#Hello, __World!", r);
let mut t = tokenize(&"123#Hello, __World!".to_string(), &r);
t.sort();

assert_eq!(t, ["hello", "world"]);
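
A possible extra test (not in this commit) showing why `tokenize` now borrows the `Regex`: one compiled pattern can be reused across many documents, exactly as `build_in_memory_postings` does in src/index.rs.

```rust
// Hypothetical addition to the test module in src/text/tokens.rs.
#[test]
fn test_regex_reuse() {
    let r = build_tokenization_regex();
    // The same compiled regex tokenizes several documents without being recompiled.
    for doc in ["123#Hello, __World!", "More --text-- here"] {
        assert!(!tokenize(&doc.to_string(), &r).is_empty());
    }
}
```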
