Skip to content

Commit

Permalink
Merge pull request #16 from andyquinterom/custom_interner
Browse files Browse the repository at this point in the history
feat: Adds custom (faster) interner
  • Loading branch information
andyquinterom authored Jun 18, 2024
2 parents cf95679 + 5932392 commit f05f3fc
Show file tree
Hide file tree
Showing 7 changed files with 192 additions and 25 deletions.
3 changes: 1 addition & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "orbweaver"
version = "0.10.4"
version = "0.11.0"
edition = "2021"
authors = ["ixpantia <hola@ixpantia.com>", "Andrés F. Quintero <andres@ixpantia.com>"]
description = "Crate designed for effortless construction and analysis of graph data structures."
Expand All @@ -15,7 +15,6 @@ ureq = "2.9.7"

[dependencies]
fxhash = "0.2.1"
string-interner = { version = "0.17", features = ["serde"] }
rayon = "1.10.0"
serde = { version = "1.0.202", features = ["derive", "rc"], optional = true }
serde_cbor = { version = "0.11.2", optional = true }
Expand Down
2 changes: 2 additions & 0 deletions benches/directed_graph.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ pub fn criterion_benchmark(c: &mut Criterion) {

println!("Done building the graph!");

c.bench_function("dg_get_nodes", |b| b.iter(|| graph_dg.nodes()));

c.bench_function("dg_get_parents", |b| {
b.iter(|| graph_dg.parents(black_box(["1f6a329de1d9c26602fe1ee8ce81ca98"])))
});
Expand Down
4 changes: 2 additions & 2 deletions src/directed/acyclic/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::{directed::DirectedGraph, prelude::*};
use std::{ops::Deref};
use std::ops::Deref;
mod topological_sort;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
Expand Down Expand Up @@ -101,7 +101,7 @@ impl DirectedAcyclicGraph {
Ok(all_paths
.split(|&n| n == PATH_DELIM)
.filter(|p| !p.is_empty())
.map(|path| self.resolve_mul(path.iter().copied()))
.map(|path| self.resolve_mul_slice(path))
.collect())
}

Expand Down
44 changes: 23 additions & 21 deletions src/directed/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,23 @@ use fxhash::FxBuildHasher;
use rayon::prelude::*;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use string_interner::backend::BucketBackend;
use string_interner::StringInterner;
//use string_interner::backend::BucketBackend;
//use string_interner::StringInterner;
use crate::interner::{InternerBuilder, Resolver};

use self::acyclic::DirectedAcyclicGraph;
use self::get_rel2_on_rel1::get_values_on_rel_map;
use crate::prelude::*;
use std::cell::UnsafeCell;
use std::collections::{HashMap, HashSet, VecDeque};
use std::ops::Not;
use std::rc::Rc;

#[derive(Clone)]
pub struct DirectedGraphBuilder {
pub(crate) parents: Vec<u32>,
pub(crate) children: Vec<u32>,
pub(crate) interner: StringInterner<BucketBackend, FxBuildHasher>,
pub(crate) interner: InternerBuilder,
}

#[derive(Default)]
Expand All @@ -36,7 +38,7 @@ pub(crate) struct InternalBufs {

#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct DirectedGraph {
pub(crate) interner: StringInterner<BucketBackend, FxBuildHasher>,
pub(crate) interner: Rc<Resolver>,
pub(crate) leaves: Vec<u32>,
pub(crate) roots: Vec<u32>,
pub(crate) nodes: Vec<u32>,
Expand All @@ -47,7 +49,7 @@ pub struct DirectedGraph {
/// Key: Child | Value: Parents
pub(crate) parent_map: HashMap<u32, HashSet<u32, FxBuildHasher>, FxBuildHasher>,
pub(crate) n_edges: usize,
#[serde(skip_serializing, skip_deserializing)]
#[cfg_attr(feature = "serde", serde(skip_serializing, skip_deserializing))]
pub(crate) buf: InternalBufs,
}

Expand Down Expand Up @@ -76,7 +78,7 @@ fn find_roots(parents: &[u32], children: &[u32]) -> Vec<u32> {
impl DirectedGraphBuilder {
pub fn new() -> Self {
DirectedGraphBuilder {
interner: StringInterner::with_hasher(FxBuildHasher::default()),
interner: InternerBuilder::new(),
children: Vec::new(),
parents: Vec::new(),
}
Expand Down Expand Up @@ -149,7 +151,7 @@ impl DirectedGraphBuilder {
}

DirectedGraph {
interner: self.interner,
interner: Rc::new(self.interner.build()),
leaves,
roots,
nodes,
Expand Down Expand Up @@ -250,19 +252,19 @@ impl DirectedGraph {

#[inline(always)]
pub(crate) fn resolve(&self, val: u32) -> &str {
unsafe { self.interner.resolve_unchecked(std::mem::transmute(val)) }
unsafe { self.interner.resolve_unchecked(val) }
}

#[inline(always)]
pub(crate) fn resolve_mul(&self, nodes: impl IntoIterator<Item = u32>) -> Vec<&str> {
nodes.into_iter().map(|node| self.resolve(node)).collect()
pub(crate) fn resolve_mul_slice(&self, nodes: &[u32]) -> Vec<&str> {
unsafe { self.interner.resolve_many_unchecked_from_slice(nodes) }
}

#[inline(always)]
pub(crate) fn get_internal(&self, val: impl AsRef<str>) -> GraphInteractionResult<u32> {
self.interner
.get(val.as_ref())
.map(|v| unsafe { std::mem::transmute(v) })
.map(|v| v.get())
.ok_or_else(|| GraphInteractionError::node_not_exists(val))
}

Expand Down Expand Up @@ -306,7 +308,7 @@ impl DirectedGraph {
let res = unsafe { self.u32x1_vec_1() };
self.get_internal_mul(nodes, nodes_buf)?;
self.children_u32(nodes_buf, res);
Ok(self.resolve_mul(res.drain(..)))
Ok(self.resolve_mul_slice(res))
}

#[inline]
Expand All @@ -330,7 +332,7 @@ impl DirectedGraph {
let res = unsafe { self.u32x1_vec_1() };
self.get_internal_mul(nodes, nodes_buf)?;
self.parents_u32(nodes_buf, res);
Ok(self.resolve_mul(res.drain(..)))
Ok(self.resolve_mul_slice(res))
}

pub fn has_parents(
Expand Down Expand Up @@ -420,7 +422,7 @@ impl DirectedGraph {
}
}

Ok(self.resolve_mul(path_buf.drain(..)))
Ok(self.resolve_mul_slice(path_buf))
}

/// Finds all paths on a DG using BFS
Expand Down Expand Up @@ -465,7 +467,7 @@ impl DirectedGraph {
Ok(all_paths
.split(|&n| n == PATH_DELIM)
.filter(|p| !p.is_empty())
.map(|path| self.resolve_mul(path.iter().copied()))
.map(|path| self.resolve_mul_slice(path))
.collect())
}

Expand Down Expand Up @@ -496,11 +498,11 @@ impl DirectedGraph {
least_common_parents.sort_unstable();
least_common_parents.dedup();

Ok(self.resolve_mul(least_common_parents.iter().copied()))
Ok(self.resolve_mul_slice(least_common_parents))
}

pub fn get_all_leaves(&self) -> Vec<&str> {
self.resolve_mul(self.leaves.iter().copied())
self.resolve_mul_slice(&self.leaves)
}

fn get_leaves_under_u32(
Expand Down Expand Up @@ -533,11 +535,11 @@ impl DirectedGraph {
let visited = unsafe { self.u32x1_set_0() };
self.get_internal_mul(nodes, nodes_buf)?;
self.get_leaves_under_u32(nodes_buf, leaves, visited);
Ok(self.resolve_mul(leaves.drain(..)))
Ok(self.resolve_mul_slice(leaves))
}

pub fn get_all_roots(&self) -> Vec<&str> {
self.resolve_mul(self.roots.iter().copied())
self.resolve_mul_slice(&self.roots)
}

fn get_roots_over_u32(
Expand Down Expand Up @@ -569,7 +571,7 @@ impl DirectedGraph {
let visited = unsafe { self.u32x1_set_0() };
self.get_internal_mul(nodes, nodes_buf)?;
self.get_roots_over_u32(nodes_buf, roots, visited);
Ok(self.resolve_mul(roots.drain(..)))
Ok(self.resolve_mul_slice(roots))
}

fn subset_u32(&self, node: u32) -> DirectedGraph {
Expand Down Expand Up @@ -644,7 +646,7 @@ impl DirectedGraph {
}

pub fn nodes(&self) -> Vec<&str> {
self.resolve_mul(self.nodes.iter().copied())
self.resolve_mul_slice(&self.nodes)
}

pub fn len(&self) -> usize {
Expand Down
158 changes: 158 additions & 0 deletions src/interner.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
use std::{collections::HashMap, num::NonZeroU32};

use fxhash::FxBuildHasher;

/// Mutable string interner used while strings are still being added.
///
/// Every distinct string is mapped to a non-zero `u32` symbol; `build`
/// turns the collected strings into an immutable `Resolver`.
#[derive(Clone)]
pub(crate) struct InternerBuilder {
// Symbol that will be assigned to the next previously unseen string.
// `new` starts it at 1 and `get_or_intern` bumps it after each insertion.
count: NonZeroU32,
// Every string interned so far, keyed to the symbol it was assigned.
map_strs: HashMap<Box<str>, NonZeroU32>,
}

impl InternerBuilder {
    /// Creates an empty builder. The first interned string receives symbol `1`.
    pub(crate) fn new() -> Self {
        InternerBuilder {
            // Symbols start at 1 because `build` reserves slot 0 of the
            // resolver's string table for a placeholder "".
            count: NonZeroU32::new(1).expect("1 is non-zero"),
            map_strs: HashMap::new(),
        }
    }

    /// Returns the symbol for `val`, assigning the next free symbol the first
    /// time a given string is seen.
    ///
    /// # Panics
    ///
    /// Panics when the symbol counter can no longer be advanced (it has
    /// reached `u32::MAX`). Handing out the same symbol twice would silently
    /// make two different strings indistinguishable, so this is treated as a
    /// broken invariant rather than saturating.
    pub(crate) fn get_or_intern(&mut self, val: impl AsRef<str>) -> NonZeroU32 {
        let key = val.as_ref();

        // Look up with the borrowed key first so that hits never allocate.
        if let Some(&sym) = self.map_strs.get(key) {
            return sym;
        }

        let sym = self.count;
        // Advance the counter before inserting so a failure leaves the map
        // untouched.
        self.count = sym
            .checked_add(1)
            .expect("interner symbol space exhausted: cannot assign a unique u32 symbol");
        self.map_strs.insert(key.into(), sym);
        sym
    }

    /// Freezes the builder into an immutable [`Resolver`].
    ///
    /// All string bytes are copied into one contiguous arena. The resolver's
    /// string table is ordered by symbol, with a placeholder `""` in slot 0 so
    /// that a symbol can be used directly as an index.
    pub(crate) fn build(self) -> Resolver {
        let n_strs = self.map_strs.len();
        let total_bytes: usize = self.map_strs.keys().map(|k| k.len()).sum();

        // One (symbol, byte offset into the arena, byte length) entry per string.
        let mut spans = Vec::with_capacity(n_strs);
        let mut arena = Vec::with_capacity(total_bytes);
        for (key, sym) in self.map_strs {
            let bytes = key.as_bytes();
            spans.push((sym, arena.len(), bytes.len()));
            arena.extend_from_slice(bytes);
        }

        // The arena is frozen here; its heap allocation is not resized or
        // written to again, so pointers derived from it stay valid.
        let arena: Box<[u8]> = arena.into_boxed_slice();
        let arena_ptr = arena.as_ptr();

        // The map iterates in arbitrary order; restore symbol order so that
        // `strs[sym]` is the string that was assigned `sym`.
        spans.sort_unstable_by_key(|&(sym, _, _)| sym);

        let mut strs: Vec<&'static str> = Vec::with_capacity(n_strs + 1);
        strs.push("");
        let mut strs_map: HashMap<&'static str, NonZeroU32, _> =
            HashMap::with_capacity_and_hasher(n_strs, Default::default());
        for (sym, start, len) in spans {
            // SAFETY: `start..start + len` is exactly the range that was filled
            // from a `str` in the loop above, so it is in bounds and valid
            // UTF-8. The `'static` lifetime is a fiction: the bytes live in
            // `arena`, which is stored in the returned `Resolver` next to
            // these references and therefore lives as long as they do.
            let s: &'static str = unsafe {
                std::str::from_utf8_unchecked(std::slice::from_raw_parts(arena_ptr.add(start), len))
            };
            strs_map.insert(s, sym);
            strs.push(s);
        }

        Resolver {
            strs_map,
            strs: strs.into_boxed_slice(),
            arena,
        }
    }
}

/// Immutable symbol table produced by `InternerBuilder::build`.
pub(crate) struct Resolver {
// NOTE: the `'static` lifetimes below are not really static. This struct is
// self-referential, which the compiler cannot express or check:
//
// every `&'static str` points at bytes stored in `arena`, so a string is
// only valid for as long as the `arena` field of this value is alive.
// Reverse lookup: string -> symbol.
strs_map: HashMap<&'static str, NonZeroU32, FxBuildHasher>,
// Forward lookup: the symbol is used as the index. `build` puts a
// placeholder "" in slot 0 because symbols start at 1.
strs: Box<[&'static str]>,
// Backing bytes for every string above. The field is never read directly;
// it only keeps the allocation alive.
#[allow(unused)]
arena: Box<[u8]>,
}

impl Resolver {
    /// Looks up the symbol assigned to `val`, or `None` if the string is not
    /// in the table.
    #[inline(always)]
    pub(crate) fn get(&self, val: &str) -> Option<NonZeroU32> {
        let hit: Option<&NonZeroU32> = self.strs_map.get(val);
        hit.copied()
    }

    /// Returns the string stored at index `sym` without a bounds check.
    ///
    /// # Safety
    ///
    /// `sym` must be a valid index into the string table, i.e.
    /// `sym as usize` must be smaller than the number of stored entries.
    #[inline(always)]
    pub(crate) unsafe fn resolve_unchecked(&self, sym: u32) -> &str {
        let idx = sym as usize;
        // SAFETY: the caller guarantees that `idx` is in bounds.
        let entry: &&'static str = self.strs.get_unchecked(idx);
        *entry
    }

    /// Resolves every symbol in `syms`, in order, without bounds checks.
    ///
    /// # Safety
    ///
    /// Each element of `syms` must be a valid index into the string table.
    #[inline(always)]
    pub(crate) unsafe fn resolve_many_unchecked_from_slice(&self, syms: &[u32]) -> Vec<&str> {
        let mut resolved = Vec::with_capacity(syms.len());
        for &sym in syms {
            // SAFETY: the caller guarantees that every symbol is in bounds.
            resolved.push(*self.strs.get_unchecked(sym as usize));
        }
        resolved
    }
}

#[cfg(feature = "serde")]
impl serde::Serialize for Resolver {
    /// Writes the resolver as a flat sequence of its strings in table order,
    /// including the placeholder `""` stored at index 0.
    ///
    /// Only the string table is written; the reverse map and the arena are
    /// not part of the serialized form.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::SerializeSeq;

        let entries: &[&'static str] = &self.strs;
        let mut seq = serializer.serialize_seq(Some(entries.len()))?;
        entries
            .iter()
            .try_for_each(|entry| seq.serialize_element(entry))?;
        seq.end()
    }
}

#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for Resolver {
    /// Rebuilds a resolver from a sequence of strings.
    ///
    /// The first element of the sequence is skipped (the `Serialize` impl
    /// writes the index-0 placeholder there). The remaining strings receive
    /// symbols 1, 2, 3, ... in the order they appear.
    ///
    /// # Errors
    ///
    /// Fails with a custom `"Duplicate value"` error when the same string
    /// appears more than once after the first element.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        use std::collections::hash_map::Entry;

        let values = <Vec<Box<str>> as serde::Deserialize>::deserialize(deserializer)?;
        let mut builder = InternerBuilder::new();

        // The builder's map is filled directly so that duplicates can be
        // rejected instead of being merged into one symbol.
        for name in values.into_iter().skip(1) {
            let next_sym = builder.count;
            if let Entry::Vacant(slot) = builder.map_strs.entry(name) {
                slot.insert(next_sym);
            } else {
                return Err(serde::de::Error::custom("Duplicate value"));
            }
            builder.count = next_sym.saturating_add(1);
        }

        Ok(builder.build())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Interns "Hello" and then "World", returning both symbols together with
    /// the resolver built from them.
    fn hello_world() -> (std::num::NonZeroU32, std::num::NonZeroU32, Resolver) {
        let mut builder = InternerBuilder::new();
        let hello = builder.get_or_intern("Hello");
        let world = builder.get_or_intern("World");
        (hello, world, builder.build())
    }

    /// Symbols are handed out from 1 in insertion order, and the built table
    /// keeps a placeholder in slot 0.
    #[test]
    fn can_build_interner() {
        let (hello, world, resolver) = hello_world();
        assert_eq!(resolver.strs, Box::from(["", "Hello", "World"]));
        assert_eq!(hello.get(), 1);
        assert_eq!(world.get(), 2);
    }

    /// The strings must stay readable after the resolver value is moved,
    /// since they point into the resolver's own arena.
    #[test]
    fn can_access_after_move() {
        let (hello, world, resolver) = hello_world();
        assert_eq!(resolver.strs, Box::from(["", "Hello", "World"]));
        assert_eq!(hello.get(), 1);
        assert_eq!(world.get(), 2);

        let moved = resolver;

        assert_eq!(moved.strs, Box::from(["", "Hello", "World"]));
    }
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
pub mod directed;
pub mod error;
mod interner;
pub mod readwrite;

// Prelude of data types and functionality.
Expand Down
Loading

0 comments on commit f05f3fc

Please sign in to comment.