refactored index, skeleton query processor
tomfran committed Dec 24, 2023
1 parent 39348f5 commit c7e5e30
Showing 4 changed files with 68 additions and 12 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -21,4 +21,5 @@ Cargo.lock
 # Data files
 /data/wiki-data
 /data/index_unit_test/index
-/data/test
+/data/test
+/data/small
56 changes: 45 additions & 11 deletions src/index/mod.rs
@@ -20,6 +20,16 @@ pub struct Index {
     tokenizer: Tokenizer,
 }
 
+pub struct PostingList {
+    documents: Vec<PostingEntry>,
+    collection_frequency: u32,
+}
+
+pub struct PostingEntry {
+    document_id: u32,
+    document_frequency: u32,
+}
+
 impl Index {
     pub fn build_index(input_path: &str, output_path: &str, tokenizer_path: &str) {
         let tokenizer = text_utils::load_tokenizer(tokenizer_path, false);
@@ -35,21 +45,36 @@ impl Index {
         }
     }
 
-    pub fn get_postings(&mut self, term: &str) -> Option<Vec<u32>> {
+    pub fn get_term(&mut self, term: &str) -> Option<PostingList> {
         let offset = self.term_offset_map.get(term)?;
-        Some(self.get_postings_internal(*offset))
-    }
 
-    fn get_postings_internal(&mut self, offset: u64) -> Vec<u32> {
-        self.postings.seek(offset);
-        let mut prev = 0;
+        self.postings.seek(*offset);
+        let mut document_id = 0;
 
-        (0..self.postings.read_vbyte())
+        let documents: Vec<PostingEntry> = (0..self.postings.read_vbyte())
             .map(|_| {
-                prev += self.postings.read_gamma();
-                prev
+                let doc_id_delta = self.postings.read_gamma();
+                let document_frequency = self.postings.read_gamma();
+
+                document_id += doc_id_delta;
+
+                PostingEntry {
+                    document_id,
+                    document_frequency,
+                }
             })
-            .collect()
+            .collect();
+
+        let collection_frequency = documents.len() as u32;
+
+        Some(PostingList {
+            documents,
+            collection_frequency,
+        })
     }
 
     pub fn tokenize_query(&self, query: &str) -> Vec<String> {
         text_utils::tokenize(&self.tokenizer, query)
     }
 }

@@ -74,6 +99,15 @@ mod test {
             assert!(idx.term_offset_map.contains_key(ele));
         }
 
-        assert_eq!(idx.get_postings("hello").unwrap(), [0, 1]);
+        let pl = idx.get_term("hello").unwrap();
+        assert_eq!(
+            pl.documents
+                .iter()
+                .map(|d| d.document_id)
+                .collect::<Vec<u32>>(),
+            [0, 1]
+        );
+
+        assert_eq!(pl.collection_frequency, 2);
     }
 }
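
For context on the new decoding loop: a posting list on disk begins with a vbyte-coded entry count, followed by one gamma-coded document-id delta and one gamma-coded term frequency per entry, which is why `get_term` accumulates a running `document_id`. Below is a minimal sketch of classic Elias gamma decoding, assuming the textbook convention (a value v >= 1 is written as N zeros followed by the N+1 binary digits of v); the names are illustrative, and the repo's actual `BitsReader::read_gamma` may differ in bit order and in how zero deltas are represented.

// A minimal sketch of Elias gamma decoding over a bit slice, under the
// textbook convention described above. Illustrative names only; not the
// repo's BitsReader API.
fn read_gamma(bits: &[bool], pos: &mut usize) -> u32 {
    // Count leading zeros; this is the payload length minus one.
    let mut n = 0;
    while !bits[*pos] {
        n += 1;
        *pos += 1;
    }
    // Read n + 1 bits, starting with the terminating '1' just found.
    let mut value = 0u32;
    for _ in 0..=n {
        value = (value << 1) | bits[*pos] as u32;
        *pos += 1;
    }
    value
}

fn main() {
    // "00100" decodes to 4 and the following "1" decodes to 1, so a
    // delta stream starting from 0 would yield document ids 4 and 5.
    let bits: Vec<bool> = "001001".chars().map(|c| c == '1').collect();
    let mut pos = 0;
    assert_eq!(read_gamma(&bits, &mut pos), 4);
    assert_eq!(read_gamma(&bits, &mut pos), 1);
}

Delta-plus-gamma coding keeps doc-id lists small on disk because gaps between sorted ids are typically short codes.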
1 change: 1 addition & 0 deletions src/lib.rs
@@ -1,2 +1,3 @@
 pub mod disk;
 pub mod index;
+pub mod query;
20 changes: 20 additions & 0 deletions src/query/mod.rs
@@ -0,0 +1,20 @@
+use crate::index::Index;
+
+struct QueryProcessor {
+    index: Index,
+}
+
+impl QueryProcessor {
+    pub fn build_query_processor(
+        index_input_path: &str,
+        index_tokenizer_path: &str,
+    ) -> QueryProcessor {
+        QueryProcessor {
+            index: Index::load_index(index_input_path, index_tokenizer_path),
+        }
+    }
+
+    pub fn query(query: &str) -> Vec<u32> {
+        todo!()
+    }
+}
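
`query` is left as a `todo!()` skeleton in this commit. One plausible shape for it, sketched here under stated assumptions rather than taken from the repo, is conjunctive (AND) evaluation: tokenize the query, fetch each term's posting list, and intersect the document ids. The sketch assumes a `&mut self` receiver (since `get_term` takes `&mut self`) and that `PostingList::documents` and `PostingEntry::document_id` are readable from the query module, which would require making those fields `pub(crate)` or adding accessors.

// A hypothetical conjunctive (AND) implementation, not the one that
// eventually landed: tokenize, fetch each term's posting list, and
// intersect the document ids.
pub fn query(&mut self, query: &str) -> Vec<u32> {
    let mut result: Option<Vec<u32>> = None;

    for token in self.index.tokenize_query(query) {
        // A term missing from the index empties a conjunctive query.
        let postings = match self.index.get_term(&token) {
            Some(p) => p,
            None => return Vec::new(),
        };
        let ids: Vec<u32> = postings.documents.iter().map(|d| d.document_id).collect();

        result = Some(match result {
            None => ids,
            // Posting lists arrive in ascending id order, so a merge-style
            // intersection would be linear; contains() keeps the sketch short.
            Some(acc) => acc.into_iter().filter(|id| ids.contains(id)).collect(),
        });
    }

    result.unwrap_or_default()
}

A ranked variant would instead keep `document_frequency` and `collection_frequency` around for tf-idf or BM25 scoring rather than returning bare document ids.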
