Commit

Draft in memory index
tomfran committed Dec 7, 2023
1 parent 691c182 commit 61282ee
Showing 4 changed files with 114 additions and 7 deletions.
12 changes: 7 additions & 5 deletions README.md
@@ -4,11 +4,13 @@ Search engine written in Rust, based on an inverted index on disk.

**Implementation status**
- [x] IO classes for writing and reading bit-streams;
- [ ] Structure to hold the vocabulary;
- [ ] Text preprocessing;
- [ ] In-memory datasets index construction;
- [ ] Disk-based partial index construction and merging;
- [ ] Additional indexes to support things such as spelling correction.
- [ ] Text preprocessing:
- [x] Tokenization;
- [ ] Stemming.
- [ ] Index construction:
- [ ] [In progress] In-memory datasets index construction;
- [ ] Disk-based partial index construction and merging;
- [ ] Additional indexes to support things such as spelling correction.

**References**

104 changes: 104 additions & 0 deletions src/index.rs
@@ -0,0 +1,104 @@
use std::{
collections::{BTreeMap, HashMap, HashSet},
fs, vec,
};

use crate::{
bits::{self, reader::Reader},
text::tokens,
};

const OUTPUT_DIR: &str = "data/index";
const POSTINGS_EXTENSION: &str = ".postings";
const OFFSETS_EXTENSION: &str = ".offsets";

pub struct Index {
postings: Reader,
offsets: Vec<u64>,
vocabulary: BTreeMap<String, u64>,
}

impl Index {
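    /// Read every document in `input_dir`, tokenize it, and build the index in memory:
    /// a term -> slot map plus, for each term, a doc_id -> term-frequency map.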
fn build_in_memory_postings(
input_dir: &str,
) -> (BTreeMap<String, usize>, Vec<BTreeMap<u32, u32>>) {
let documents =
fs::read_dir(input_dir).expect("error while retrieving input directory content");

println!("{:?}", documents);
let tokens_regex = tokens::build_tokenization_regex();

let tokenized_docs_iter = documents
.into_iter()
.map(|p| p.unwrap())
.map(|p| fs::read_to_string(p.path()).expect("error while reading file"))
.map(|s| tokens::tokenize(&s, &tokens_regex));

let mut words: BTreeMap<String, usize> = BTreeMap::new();
let mut in_memory_postings: Vec<BTreeMap<u32, u32>> = Vec::new();

for (doc_id, tokens) in tokenized_docs_iter.enumerate() {
for t in tokens.iter() {
let value: Option<&usize> = words.get(t);

let postings_counter = match value {
Some(idx) => &mut in_memory_postings[*idx],
None => {
let idx = words.len();
words.insert(t.clone(), idx);
in_memory_postings.push(BTreeMap::new());
&mut in_memory_postings[idx]
}
};
let key = doc_id as u32;
postings_counter
.entry(key)
.and_modify(|count| *count += 1)
.or_insert(1);
}
}

(words, in_memory_postings)
}

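    /// Serialize the in-memory index: per term (in vocabulary order) a vbyte-coded
    /// posting count followed by gamma-coded (doc-id gap, frequency) pairs, with the
    /// gamma-coded distance between consecutive posting lists going to the offsets file.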
fn write_to_file(vocab: &BTreeMap<String, usize>, postings: &Vec<BTreeMap<u32, u32>>) {
let postings_path = OUTPUT_DIR.to_string() + "/index" + POSTINGS_EXTENSION;
let offsets_path = OUTPUT_DIR.to_string() + "/index" + OFFSETS_EXTENSION;

let mut postings_writer = bits::writer::Writer::new(&postings_path);
let mut offsets_writer = bits::writer::Writer::new(&offsets_path);

let mut offset: u64 = 0;
let mut prev_offset = 0;
for (_, idx) in vocab.iter() {
            // Record the gap from the previous posting list's start to this one's start.
            offsets_writer.write_gamma(offset as u32 - prev_offset);
            prev_offset = offset as u32;

let postings = &postings[*idx];
offset += postings_writer.write_vbyte(postings.len() as u32);

let mut prev = 0;
for (doc_id, frequency) in postings.iter() {
offset += postings_writer.write_gamma(doc_id - prev);
offset += postings_writer.write_gamma(*frequency);
prev = *doc_id;
}

}

postings_writer.flush();
offsets_writer.flush();
}
}

// #[cfg(test)]
// mod test {
// use super::*;

// #[test]
// fn test_build() {
// let (a, b) = Index::build_in_memory_postings("data/wiki-data/docs");

// Index::write_to_file(&a, &b);
// }
// }
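
For orientation (not part of the commit): `write_to_file` above lays each posting list out as a vbyte-coded posting count followed by gamma-coded (doc-ID gap, frequency) pairs, and the offsets file stores the gamma-coded distance between the starts of consecutive lists. The sketch below, with a hypothetical `delta_encode` helper, only illustrates the doc-ID gap step under those assumptions.

```rust
// Illustration only: doc IDs come out of each per-term BTreeMap in ascending
// order, so storing gaps instead of absolute IDs keeps the gamma codes short.
fn delta_encode(doc_ids: &[u32]) -> Vec<u32> {
    let mut prev = 0;
    doc_ids
        .iter()
        .map(|&id| {
            let gap = id - prev;
            prev = id;
            gap
        })
        .collect()
}

fn main() {
    // A posting list over documents 3, 7 and 12 is written as gaps 3, 4, 5.
    assert_eq!(delta_encode(&[3, 7, 12]), vec![3, 4, 5]);
}
```
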
1 change: 1 addition & 0 deletions src/lib.rs
@@ -1,2 +1,3 @@
pub mod bits;
pub mod index;
pub mod text;
4 changes: 2 additions & 2 deletions src/text/tokens.rs
@@ -1,6 +1,6 @@
use regex::Regex;

pub fn tokenize(s: &str, re: Regex) -> Vec<String> {
pub fn tokenize(s: &String, re: &Regex) -> Vec<String> {
let vec: Vec<String> = re
.replace_all(s, "")
.to_lowercase()
@@ -23,7 +23,7 @@ mod test {
#[test]
fn test_tokenization() {
let r = build_tokenization_regex();
let mut t = tokenize("123#Hello, __World!", r);
let mut t = tokenize(&"123#Hello, __World!".to_string(), &r);
t.sort();

assert_eq!(t, ["hello", "world"]);
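
A possible extra test (not in this commit) showing why `tokenize` now borrows the `Regex`: one compiled pattern can be reused across many documents, exactly as `build_in_memory_postings` does in src/index.rs.

```rust
// Hypothetical addition to the test module in src/text/tokens.rs.
#[test]
fn test_regex_reuse() {
    let r = build_tokenization_regex();
    // The same compiled regex tokenizes several documents without being recompiled.
    for doc in ["123#Hello, __World!", "More --text-- here"] {
        assert!(!tokenize(&doc.to_string(), &r).is_empty());
    }
}
```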
