Skip to content

Commit

Permalink
in-memory postings with dummy tokenization
Browse files Browse the repository at this point in the history
  • Loading branch information
tomfran committed Nov 26, 2023
1 parent 8ec0403 commit 6213edc
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 60 deletions.
49 changes: 37 additions & 12 deletions src/index.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
use std::fs;
use std::{collections::HashMap, fs};

use crate::trie::Trie;

pub struct Index {
vocab: Trie,
postings: Vec<Vec<i32>>,
postings: Vec<Vec<usize>>,
}

impl Index {
Expand All @@ -13,20 +13,33 @@ impl Index {
Index { vocab, postings }
}

fn build_internal(directory: &str) -> (Trie, Vec<Vec<i32>>) {
fn build_internal(directory: &str) -> (Trie, Vec<Vec<usize>>) {
let paths = fs::read_dir(directory).unwrap();

let mut vocab = Trie::new();
let mut seen: HashMap<String, usize> = HashMap::new();
let mut vocab: Trie = Trie::new();
let mut postings = Vec::new();

let mut i = 0;

for path in paths {
let s = fs::read_to_string(path.unwrap().path()).unwrap();
for (doc_idx, path) in paths.enumerate() {
// replace a fixed set of punctuation characters with spaces
let s = fs::read_to_string(path.unwrap().path())
.unwrap()
.replace(&['(', ')', ',', '\"', '.', ';', ':', '\''][..], " ");

// split and insert into the vocabulary if needed
for w in s.split_ascii_whitespace() {
vocab.insert(w, i);
i += 1;
let lowercase_word = w.to_lowercase();
let to_insert = lowercase_word.clone();

if !seen.contains_key(to_insert.as_str()) {
println!("New word {to_insert}");
let word_idx = seen.len();
vocab.insert(to_insert.as_str(), word_idx);
seen.insert(to_insert, word_idx);
postings.push(Vec::new())
}

postings[*seen.get(lowercase_word.as_str()).unwrap()].push(doc_idx);
}
}

Expand All @@ -41,8 +54,20 @@ mod tests {

#[test]
fn test_build() {
let i = Index::new("test_data");
let i = Index::new("test_data/index");

assert!(i.vocab.get("apple").is_some());
assert!(i.vocab.get("park").is_some());

let apple_idx = i.vocab.get("apple").unwrap();
let binary_idx = i.vocab.get("binary").unwrap();

let apple_postings = i.postings.get(apple_idx).unwrap();
let binary_postings = i.postings.get(binary_idx).unwrap();

assert!(apple_postings.len() == 2);
assert!(binary_postings.len() == 1);

assert!(i.vocab.get("In").is_some())
assert!(!apple_postings.contains(binary_postings.first().unwrap()));
}
}
12 changes: 6 additions & 6 deletions src/trie.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::collections::{HashMap, VecDeque};

#[derive(Default)]
struct Node {
value: Option<i32>,
value: Option<usize>,
children: HashMap<char, Node>,
}

Expand All @@ -17,7 +17,7 @@ impl Trie {
}
}

pub fn insert(&mut self, word: &str, value: i32) {
pub fn insert(&mut self, word: &str, value: usize) {
let mut node = &mut self.root;

for c in word.chars() {
Expand All @@ -27,11 +27,11 @@ impl Trie {
node.value = Some(value);
}

pub fn get(&self, word: &str) -> Option<i32> {
pub fn get(&self, word: &str) -> Option<usize> {
self.get_internal(word).and_then(|n| n.value)
}

pub fn get_by_prefix(&self, prefix: &str) -> Vec<i32> {
pub fn get_by_prefix(&self, prefix: &str) -> Vec<usize> {
self.get_internal(prefix)
.map_or_else(Vec::new, |n| self.visit(n))
}
Expand All @@ -49,8 +49,8 @@ impl Trie {
Some(node)
}

fn visit(&self, node: &Node) -> Vec<i32> {
let mut res: Vec<i32> = Vec::new();
fn visit(&self, node: &Node) -> Vec<usize> {
let mut res: Vec<usize> = Vec::new();
let mut queue: VecDeque<&Node> = VecDeque::new();
queue.push_back(node);

Expand Down
42 changes: 0 additions & 42 deletions test_data/doc.txt

This file was deleted.

1 change: 1 addition & 0 deletions test_data/index/1
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
The apple fell from the tree branch.
1 change: 1 addition & 0 deletions test_data/index/2
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
A binary tree is a sorted data structure.
1 change: 1 addition & 0 deletions test_data/index/3
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Apple Park is filled with trees.

0 comments on commit 6213edc

Please sign in to comment.