Skip to content

Commit

Permalink
adding Batson
Browse files Browse the repository at this point in the history
  • Loading branch information
samuelcolvin committed Sep 12, 2024
1 parent 43a493b commit cda94cd
Show file tree
Hide file tree
Showing 17 changed files with 2,573 additions and 4 deletions.
5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
members = [
"crates/jiter",
"crates/jiter-python",
"crates/batson",
"crates/fuzz",
]
resolver = "2"
Expand All @@ -28,5 +29,9 @@ inherits = "release"
debug = true

[workspace.dependencies]
jiter = { path = "crates/jiter", version = "0.5.0" }
pyo3 = { version = "0.22.0" }
pyo3-build-config = { version = "0.22.0" }
bencher = "0.1.5"
paste = "1.0.7"
codspeed-bencher-compat = "2.7.1"
42 changes: 42 additions & 0 deletions crates/batson/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
[package]
name = "batson"
description = "Binary Alternative To (J)SON. Designed to be very fast to query."
readme = "../../README.md"
version = {workspace = true}
edition = {workspace = true}
authors = {workspace = true}
license = {workspace = true}
keywords = {workspace = true}
categories = {workspace = true}
homepage = {workspace = true}
repository = {workspace = true}

[dependencies]
bytemuck = { version = "1.17.1", features = ["aarch64_simd", "derive", "align_offset"] }
jiter = { workspace = true }
serde = "1.0.210"
serde_json = "1.0.128"
simdutf8 = { version = "0.1.4", features = ["aarch64_neon"] }
smallvec = "2.0.0-alpha.7"

[dev-dependencies]
bencher = { workspace = true }
paste = { workspace = true }
codspeed-bencher-compat = { workspace = true }

[[bench]]
name = "main"
harness = false

[lints.clippy]
dbg_macro = "deny"
print_stdout = "deny"
print_stderr = "deny"
# in general we lint against the pedantic group, but we will whitelist
# certain lints which we don't want to enforce (for now)
pedantic = { level = "deny", priority = -1 }
missing_errors_doc = "allow"
cast_possible_truncation = "allow" # TODO remove
cast_sign_loss = "allow" # TODO remove
cast_possible_wrap = "allow" # TODO remove
checked_conversions = "allow" # TODO remove
16 changes: 16 additions & 0 deletions crates/batson/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# batson

Binary Alternative To (J)SON. Designed to be very fast to query.

Inspired by Postgres' [JSONB type](https://github.com/postgres/postgres/commit/d9134d0a355cfa447adc80db4505d5931084278a?diff=unified&w=0) and Snowflake's [VARIANT type](https://www.youtube.com/watch?v=jtjOfggD4YY).

For a relatively small JSON document (3KB), batson is 14 to 126x faster than Jiter, and 106 to 588x faster than Serde.

```
test medium_get_str_found_batson ... bench: 51 ns/iter (+/- 1)
test medium_get_str_found_jiter ... bench: 755 ns/iter (+/- 66)
test medium_get_str_found_serde ... bench: 5,420 ns/iter (+/- 93)
test medium_get_str_missing_batson ... bench: 9 ns/iter (+/- 0)
test medium_get_str_missing_jiter ... bench: 1,135 ns/iter (+/- 46)
test medium_get_str_missing_serde ... bench: 5,292 ns/iter (+/- 324)
```
213 changes: 213 additions & 0 deletions crates/batson/benches/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
use codspeed_bencher_compat::{benchmark_group, benchmark_main, Bencher};
use std::hint::black_box;

use std::fs::File;
use std::io::Read;

use batson::get::{get_str, BatsonPath};
use batson::{batson_to_json_string, encode_from_json};
use jiter::JsonValue;

/// Read the entire file at `path` into a `String`.
///
/// # Panics
///
/// Panics if the file cannot be opened or its contents cannot be read as UTF-8. The panic
/// message includes `path`, because the benchmarks pass paths relative to the working
/// directory and a bare "No such file or directory" does not say which file was missing.
fn read_file(path: &str) -> String {
    let mut file = File::open(path).unwrap_or_else(|err| panic!("failed to open {path}: {err}"));
    let mut contents = String::new();
    file.read_to_string(&mut contents)
        .unwrap_or_else(|err| panic!("failed to read {path}: {err}"));
    contents
}

/// taken from <https://github.com/datafusion-contrib/datafusion-functions-json/blob/v0.41.0/src/common.rs#L184-L216>
mod jiter_find {
    use jiter::{Jiter, Peek};

    /// One step of a lookup path into a JSON document.
    #[derive(Debug)]
    pub enum JsonPath<'s> {
        /// Descend into an object by key.
        Key(&'s str),
        /// Descend into an array by position.
        Index(usize),
        /// A step that can never match (produced from a negative `i32` index).
        None,
    }

    impl From<u64> for JsonPath<'_> {
        /// Panics if `index` does not fit in a `usize`.
        fn from(index: u64) -> Self {
            Self::Index(usize::try_from(index).unwrap())
        }
    }

    impl From<i32> for JsonPath<'_> {
        /// Negative indices become [`JsonPath::None`] rather than panicking.
        fn from(index: i32) -> Self {
            usize::try_from(index).map_or(Self::None, Self::Index)
        }
    }

    impl<'s> From<&'s str> for JsonPath<'s> {
        fn from(key: &'s str) -> Self {
            Self::Key(key)
        }
    }

    /// Walk `path` through the JSON text, returning the parser positioned at the target value
    /// together with the peeked kind of that value.
    ///
    /// Returns `None` when `opt_json` is `None`, when any jiter call fails or yields `None`,
    /// or when a path step does not fit the value it is applied to (a key on a non-object,
    /// an index on a non-array, or a `JsonPath::None` step).
    pub fn jiter_json_find<'j>(opt_json: Option<&'j str>, path: &[JsonPath]) -> Option<(Jiter<'j>, Peek)> {
        let mut jiter = Jiter::new(opt_json?.as_bytes());
        let mut peek = jiter.peek().ok()?;
        for element in path {
            peek = match element {
                JsonPath::Key(wanted) if peek == Peek::Object => {
                    // Skip value after value until the current key equals the wanted one.
                    let mut candidate = jiter.known_object().ok()??;
                    while candidate != *wanted {
                        jiter.next_skip().ok()?;
                        candidate = jiter.next_key().ok()??;
                    }
                    jiter.peek().ok()?
                }
                JsonPath::Index(position) if peek == Peek::Array => {
                    // Skip `position` items; the peek of the item we stop on is the result.
                    let mut item = jiter.known_array().ok()??;
                    for _ in 0..*position {
                        jiter.known_skip(item).ok()?;
                        item = jiter.array_step().ok()??;
                    }
                    item
                }
                _ => return None,
            };
        }
        Some((jiter, peek))
    }

    /// Look up `path` in the JSON text and return the value there as an owned `String`,
    /// or `None` if the lookup fails or the value found is not a string.
    pub fn get_str(json_data: Option<&str>, path: &[JsonPath]) -> Option<String> {
        let (mut jiter, peek) = jiter_json_find(json_data, path)?;
        if peek == Peek::String {
            Some(jiter.known_str().ok()?.to_owned())
        } else {
            None
        }
    }
}

/// Baseline lookup built on `serde_json`: parse the whole document, then walk the path.
mod serde_find {
    use batson::get::BatsonPath;
    use serde_json::Value;

    /// Parse `json_data` and return the string found at `path` as an owned `String`.
    ///
    /// Returns `None` if the bytes are not valid JSON, if any path step does not match
    /// (missing key, out-of-range index, or a step applied to the wrong kind of value),
    /// or if the value reached is not a string.
    pub fn get_str(json_data: &[u8], path: &[BatsonPath]) -> Option<String> {
        let root: Value = serde_json::from_slice(json_data).ok()?;
        let target = path.iter().try_fold(&root, |node, step| match (step, node) {
            (BatsonPath::Key(name), Value::Object(fields)) => fields.get(*name),
            (BatsonPath::Index(position), Value::Array(items)) => items.get(*position),
            _ => None,
        })?;
        target.as_str().map(str::to_owned)
    }
}

/// Parse `json` with jiter and encode the resulting value as batson bytes.
///
/// Panics if the input is not valid JSON or if encoding fails.
fn json_to_batson(json: &[u8]) -> Vec<u8> {
    let parsed = JsonValue::parse(json, false).unwrap();
    let encoded = encode_from_json(&parsed);
    encoded.unwrap()
}

/// Benchmark: batson `get_str` for the path `person.linkedin.handle` in the medium fixture.
fn medium_get_str_found_batson(bench: &mut Bencher) {
    // Reading and encoding happen once, outside the timed closure; only the lookup is measured.
    let source = read_file("../jiter/benches/medium_response.json");
    let batson_data = json_to_batson(source.as_bytes());
    let path: Vec<BatsonPath> = vec!["person".into(), "linkedin".into(), "handle".into()];
    bench.iter(|| black_box(get_str(black_box(&batson_data), &path)));
}

/// Benchmark: jiter-based `get_str` for the path `person.linkedin.handle` in the medium fixture.
fn medium_get_str_found_jiter(bench: &mut Bencher) {
    let document = read_file("../jiter/benches/medium_response.json");
    let path: Vec<jiter_find::JsonPath> = vec!["person".into(), "linkedin".into(), "handle".into()];
    // The JSON text is parsed inside the timed closure on every iteration.
    bench.iter(|| black_box(jiter_find::get_str(black_box(Some(&document)), &path)));
}

/// Benchmark: serde_json-based `get_str` for the path `person.linkedin.handle` in the medium fixture.
///
/// Unlike the other lookups, the result is unwrapped, so this benchmark panics if the
/// value is not found.
fn medium_get_str_found_serde(bench: &mut Bencher) {
    let source = read_file("../jiter/benches/medium_response.json");
    let raw = source.as_bytes();
    let path: Vec<BatsonPath> = vec!["person".into(), "linkedin".into(), "handle".into()];
    bench.iter(|| black_box(serde_find::get_str(black_box(raw), &path).unwrap()));
}

/// Benchmark: batson `get_str` for the path `squid.linkedin.handle` in the medium fixture.
fn medium_get_str_missing_batson(bench: &mut Bencher) {
    // Reading and encoding happen once, outside the timed closure; only the lookup is measured.
    let source = read_file("../jiter/benches/medium_response.json");
    let batson_data = json_to_batson(source.as_bytes());
    let path: Vec<BatsonPath> = vec!["squid".into(), "linkedin".into(), "handle".into()];
    bench.iter(|| black_box(get_str(black_box(&batson_data), &path)));
}

/// Benchmark: jiter-based `get_str` for the path `squid.linkedin.handle` in the medium fixture.
fn medium_get_str_missing_jiter(bench: &mut Bencher) {
    let document = read_file("../jiter/benches/medium_response.json");
    let path: Vec<jiter_find::JsonPath> = vec!["squid".into(), "linkedin".into(), "handle".into()];
    // The JSON text is parsed inside the timed closure on every iteration.
    bench.iter(|| black_box(jiter_find::get_str(black_box(Some(&document)), &path)));
}

/// Benchmark: serde_json-based `get_str` for the path `squid.linkedin.handle` in the medium fixture.
fn medium_get_str_missing_serde(bench: &mut Bencher) {
    let source = read_file("../jiter/benches/medium_response.json");
    let raw = source.as_bytes();
    let path: Vec<BatsonPath> = vec!["squid".into(), "linkedin".into(), "handle".into()];
    bench.iter(|| black_box(serde_find::get_str(black_box(raw), &path)));
}

/// Benchmark: convert the batson-encoded medium fixture back to a JSON string.
fn medium_convert_batson_to_json(bench: &mut Bencher) {
    // The JSON -> batson encoding is done once up front and is not part of the measurement.
    let source = read_file("../jiter/benches/medium_response.json");
    let batson_data = json_to_batson(source.as_bytes());
    bench.iter(|| black_box(batson_to_json_string(black_box(&batson_data)).unwrap()));
}

/// Benchmark: parse the medium JSON fixture with jiter and encode it as batson.
///
/// Both the parse and the encode run inside the timed closure.
fn medium_convert_json_to_batson(bench: &mut Bencher) {
    let source = read_file("../jiter/benches/medium_response.json");
    let raw = source.as_bytes();
    bench.iter(|| {
        let parsed = JsonValue::parse(raw, false).unwrap();
        black_box(encode_from_json(&parsed).unwrap())
    });
}

// Collect every benchmark function defined above into the `benches` group, then hand that
// group to `benchmark_main!`. A benchmark function that is not listed here is not part of
// the group.
benchmark_group!(
benches,
medium_get_str_found_batson,
medium_get_str_found_jiter,
medium_get_str_found_serde,
medium_get_str_missing_batson,
medium_get_str_missing_jiter,
medium_get_str_missing_serde,
medium_convert_batson_to_json,
medium_convert_json_to_batson
);
benchmark_main!(benches);
53 changes: 53 additions & 0 deletions crates/batson/examples/read_file.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
use batson::get::BatsonPath;
use batson::{batson_to_json_string, encode_from_json};
use jiter::JsonValue;
use std::fs::File;
use std::io::Read;

/// Round-trip a JSON file through batson and print size statistics.
///
/// The first argument is the JSON file to read. The optional second argument is a
/// `.`-separated path (see `to_batson_path`) that is looked up with `batson::get::get_str`
/// and timed. Finally the batson -> JSON output is re-encoded and converted again to check
/// that the JSON text is stable, and the output JSON is printed.
///
/// Panics (with a descriptive message) on a missing argument, unreadable file, invalid JSON
/// or any batson failure.
// The crate manifest denies `clippy::print_stdout`, but printing is the whole purpose of
// this example binary, so the lint is lifted for `main` only.
#[allow(clippy::print_stdout)]
fn main() {
    let filename = std::env::args().nth(1).expect(
        r#"
No arguments provided!
Usage:
cargo run --example read_file file.json [path]
"#,
    );

    let mut file = File::open(&filename).expect("failed to open file");
    let mut json = Vec::new();
    file.read_to_end(&mut json).expect("failed to read file");

    let json_value = JsonValue::parse(&json, false).expect("invalid JSON");
    let batson = encode_from_json(&json_value).expect("failed to construct batson data");
    println!("json length: {}", json.len());
    println!("batson length: {}", batson.len());

    let output_json = batson_to_json_string(&batson).expect("failed to convert batson to JSON");
    println!("output json length: {}", output_json.len());

    if let Some(path) = std::env::args().nth(2) {
        let path: Vec<BatsonPath> = path.split('.').map(to_batson_path).collect();
        // Time only the lookup itself, not the path construction or the printing.
        let start = std::time::Instant::now();
        let value = batson::get::get_str(&batson, &path).expect("failed to get value");
        let elapsed = start.elapsed();
        println!("Found value: {value:?} (time taken: {elapsed:?})");
    }

    println!("reloading to check round-trip");
    let json_value = JsonValue::parse(output_json.as_bytes(), false).expect("invalid JSON");
    let batson = encode_from_json(&json_value).expect("failed to construct batson data");
    let output_json2 = batson_to_json_string(&batson).expect("failed to convert batson to JSON");
    println!("JSON unchanged after re-encoding: {:?}", output_json == output_json2);

    // Inlined format argument, matching the style used for `{value:?}` above.
    println!("\n\noutput json:\n{output_json}");
}

/// Convert one `.`-separated path segment into a `BatsonPath`.
///
/// A segment made up solely of ASCII digits that fits in a `usize` becomes an array index.
/// Everything else is treated as an object key. That includes the empty segment (as produced
/// by splitting `a..b`), non-ASCII numerals such as `²`, and digit runs too large for
/// `usize`; each of these used to reach `parse().unwrap()` and panic.
fn to_batson_path(s: &str) -> BatsonPath {
    // Require plain ASCII digits first so that inputs `parse` would otherwise accept,
    // such as "+1", keep being treated as keys.
    if !s.is_empty() && s.bytes().all(|b| b.is_ascii_digit()) {
        if let Ok(index) = s.parse::<usize>() {
            return index.into();
        }
    }
    s.into()
}
Loading

0 comments on commit cda94cd

Please sign in to comment.