Skip to content

Commit

Permalink
reader seek
Browse files Browse the repository at this point in the history
  • Loading branch information
tomfran committed Dec 6, 2023
1 parent 9104e68 commit 746bd6d
Show file tree
Hide file tree
Showing 7 changed files with 97 additions and 34 deletions.
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ name = "search"
path = "src/lib.rs"

[dependencies]
rand = "0.8"
rand = "0.8"
regex = "1"
56 changes: 44 additions & 12 deletions src/bits/reader.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,17 @@
use std::{
fs::File,
io::{BufReader, Read},
io::{BufReader, Read, Seek, SeekFrom},
};

#[allow(dead_code)]
const BUFFER_SIZE: u32 = 128;

#[allow(dead_code)]
/// Bit-level reader over a file; data is pulled in through a 16-byte scratch array
/// and exposed as a single 128-bit buffer.
pub struct Reader {
// Buffered handle on the file being read.
file: BufReader<File>,
// 128-bit value decoded from `byte_buffer` each time the buffer is refilled.
buffer: u128,
// 16-byte scratch array filled with `read_exact` during a refill.
byte_buffer: [u8; 16],
// Reset to 0 after every refill; presumably the number of bits of `buffer`
// already consumed (TODO confirm against `read_internal`).
read: u32,
}

#[allow(dead_code)]
impl Reader {
pub fn new(filename: &str) -> Reader {
let mut r = Reader {
Expand Down Expand Up @@ -91,41 +88,76 @@ impl Reader {
.read_exact(&mut self.byte_buffer)
.expect("error while filling byte buffer");

self.buffer = u128::from_be_bytes(self.byte_buffer);
self.buffer = u128::from_le_bytes(self.byte_buffer);
self.read = 0;
}

/// Positions the reader at `bit_offset`, expressed in bits from the start of the file.
///
/// The underlying file is moved to the byte that contains the target bit, the
/// buffer is refilled from that position, and the remaining 0..=7 bits are then
/// consumed through `read_internal`.
///
/// # Panics
///
/// Panics with "error while seeking reader" if the underlying seek fails.
pub fn seek(&mut self, bit_offset: u64) {
    let (whole_bytes, leftover_bits) = (bit_offset / 8, bit_offset % 8);

    self.file
        .seek(SeekFrom::Start(whole_bytes))
        .expect("error while seeking reader");
    self.fill_buffer();

    // A byte-aligned offset needs no extra bit skipping.
    if leftover_bits != 0 {
        self.read_internal(leftover_bits as u32);
    }
}
}

#[cfg(test)]
mod test {

use std::fs::create_dir_all;

use super::*;
use crate::bits::writer::Writer;
use std::fs::create_dir_all;

#[test]
fn test_read_gamma() {
fn test_read() {
create_dir_all("data/test/").expect("error while creating test dir");

let mut w = Writer::new("data/test/writer_unit.bin");

for i in 1..100 {
w.write_gamma(i);
w.write_vbyte(i);
}
for i in 1..100 {
w.write_vbyte(i);
w.write_gamma(i);
}
w.flush();

let mut r = Reader::new("data/test/writer_unit.bin");

for i in 1..100 {
let a = r.read_gamma();
let a = r.read_vbyte();
assert_eq!(i, a);
}
for i in 1..100 {
let a = r.read_vbyte();
let a = r.read_gamma();
assert_eq!(i, a);
}
}

#[test]
fn test_seek() {
    create_dir_all("data/test/").expect("error while creating test dir");

    let path = "data/test/writer_seek.bin";
    let mut writer = Writer::new(path);

    // Sum of the bit lengths reported by the writer for 0..1000; this is the bit
    // offset at which the value written next begins.
    let target_offset: u64 = (0..1000).map(|value| writer.write_gamma(value)).sum();

    writer.write_gamma(10);
    writer.flush();

    let mut reader = Reader::new(path);
    reader.seek(target_offset);

    assert_eq!(reader.read_gamma(), 10);
}
}
35 changes: 16 additions & 19 deletions src/bits/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,12 @@ use std::{
io::{BufWriter, Write},
};

const BUFFER_SIZE: u32 = 128;

#[allow(dead_code)]
/// Bit-level writer that accumulates payloads in a 128-bit buffer and writes the
/// buffer's bytes out through a `BufWriter`.
pub struct Writer {
// Buffered handle on the file the buffer bytes are written to.
file: BufWriter<File>,
// Pending bits; each new payload is OR-ed in, shifted left by `written`.
buffer: u128,
// Number of bits of `buffer` currently occupied by pending payloads.
written: u32,
}

#[allow(dead_code)]
impl Writer {
pub fn new(filename: &str) -> Writer {
Writer {
Expand All @@ -22,9 +18,9 @@ impl Writer {
}
}

pub fn write_gamma(&mut self, n: u32) {
pub fn write_gamma(&mut self, n: u32) -> u64 {
let (gamma, len) = Writer::int_to_gamma(n + 1);
self.write_internal(gamma, len);
self.write_internal(gamma, len)
}

fn int_to_gamma(n: u32) -> (u128, u32) {
Expand All @@ -34,9 +30,9 @@ impl Writer {
(gamma, 2 * msb + 1)
}

pub fn write_vbyte(&mut self, n: u32) {
pub fn write_vbyte(&mut self, n: u32) -> u64 {
let (vbyte, len) = Writer::int_to_vbyte(n + 1);
self.write_internal(vbyte, len);
self.write_internal(vbyte, len)
}

fn int_to_vbyte(n: u32) -> (u128, u32) {
Expand All @@ -56,25 +52,26 @@ impl Writer {
(vbyte, 8 * byte_num)
}

fn write_internal(&mut self, payload: u128, len: u32) {
let free = BUFFER_SIZE - self.written;
fn write_internal(&mut self, payload: u128, len: u32) -> u64 {
let free = 128 - self.written;
self.buffer |= payload << self.written;

if free > len {
self.written += len;
return;
} else {
self.update_buffer();
if len > free {
self.buffer |= payload >> free;
self.written += len - free;
}
}

self.update_buffer();
if len > free {
self.buffer |= payload >> free;
self.written += len - free;
}
len as u64
}

fn update_buffer(&mut self) {
self.file
.write_all(&self.buffer.to_be_bytes())
.write_all(&self.buffer.to_le_bytes())
.expect("error while writing buffer to BufWriter");

self.buffer = 0;
Expand All @@ -86,6 +83,7 @@ impl Writer {
self.update_buffer();
}

self.update_buffer();
self.file
.flush()
.expect("error while flushing BufWriter buffer");
Expand All @@ -95,9 +93,8 @@ impl Writer {
#[cfg(test)]
mod test {

use std::fs::create_dir_all;

use super::*;
use std::fs::create_dir_all;

#[test]
fn test_gamma_coding() {
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
pub mod bits;
pub mod text;
1 change: 1 addition & 0 deletions src/text/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pub mod tokens;
31 changes: 31 additions & 0 deletions src/text/tokens.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
use regex::Regex;

/// Splits `s` into lowercase tokens.
///
/// Every match of `re` is removed from the input, the remaining text is
/// lowercased, and the result is split on whitespace. Tokens are returned in
/// the order they appear in `s`.
pub fn tokenize(s: &str, re: Regex) -> Vec<String> {
    let stripped = re.replace_all(s, "");
    let lowered = stripped.to_lowercase();

    lowered.split_whitespace().map(String::from).collect()
}

/// Builds the regex used to strip unwanted characters before tokenization.
///
/// The pattern matches any single character that is neither an ASCII letter
/// nor whitespace.
///
/// # Panics
///
/// Panics if the pattern fails to compile.
pub fn build_tokenization_regex() -> Regex {
    const STRIP_PATTERN: &str = r"[^a-zA-Z\s]";

    Regex::new(STRIP_PATTERN).unwrap()
}

#[cfg(test)]
mod test {
    use super::{build_tokenization_regex, tokenize};

    /// Tokenizing "123#Hello, __World!" with the default regex yields exactly
    /// the tokens "hello" and "world".
    #[test]
    fn test_tokenization() {
        let mut tokens = tokenize("123#Hello, __World!", build_tokenization_regex());
        tokens.sort();

        assert_eq!(tokens, ["hello", "world"]);
    }
}
4 changes: 2 additions & 2 deletions tests/read_write_integration_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ fn test_read_write() {

values.iter().zip(coding.iter_mut()).for_each(|(v, c)| {
if *c % 2 == 0 {
writer.write_vbyte(*v)
writer.write_vbyte(*v);
} else {
writer.write_gamma(*v)
writer.write_gamma(*v);
}
});
writer.flush();
Expand Down

0 comments on commit 746bd6d

Please sign in to comment.