Skip to content

Commit

Permalink
Partial JSON parsing support trailing strings, fix #82
Browse files Browse the repository at this point in the history
  • Loading branch information
samuelcolvin committed May 21, 2024
1 parent 1fbedbf commit 50706b3
Show file tree
Hide file tree
Showing 10 changed files with 145 additions and 80 deletions.
4 changes: 2 additions & 2 deletions crates/fuzz/fuzz_targets/compare_skip.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@ fuzz_target!(|json: String| {
let json_data = json.as_bytes();
match JsonValue::parse(json_data, false) {
Ok(_) => {
let mut jiter = Jiter::new(json_data, false);
let mut jiter = Jiter::new(json_data, false, false);
jiter.next_skip().unwrap();
jiter.finish().unwrap();
}
Err(json_error) => {
let mut jiter = Jiter::new(json_data, false);
let mut jiter = Jiter::new(json_data, false, false);
let jiter_error = match jiter.next_skip() {
Ok(_) => jiter.finish().unwrap_err(),
Err(e) => e,
Expand Down
17 changes: 16 additions & 1 deletion crates/jiter-python/tests/test_jiter.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def test_extracted_value_error():
def test_partial_array():
json = b'["string", true, null, 1, "foo'
parsed = jiter.from_json(json, allow_partial=True)
assert parsed == ["string", True, None, 1]
assert parsed == ["string", True, None, 1, "foo"]

# test that stopping at every point is ok
for i in range(1, len(json)):
Expand Down Expand Up @@ -90,6 +90,21 @@ def test_partial_object():
assert isinstance(parsed, dict)


def test_partial_object_string():
    """Partial parsing keeps a trailing object value string that has no closing quote."""
    json = b'{"a": 1, "b": 2, "c": "foo'
    parsed = jiter.from_json(json, allow_partial=True)
    assert parsed == {"a": 1, "b": 2, "c": "foo"}

    # test that stopping at every point is ok: parse each proper prefix of the
    # input (json[:i]) rather than re-parsing the full input on every iteration
    for i in range(1, len(json)):
        parsed = jiter.from_json(json[:i], allow_partial=True)
        assert isinstance(parsed, dict)

    json = b'{"title": "Pride and Prejudice", "author": "Jane A'
    parsed = jiter.from_json(json, allow_partial=True)
    assert parsed == {"title": "Pride and Prejudice", "author": "Jane A"}


def test_partial_nested():
json = b'{"a": 1, "b": 2, "c": [1, 2, {"d": 1, '
parsed = jiter.from_json(json, allow_partial=True)
Expand Down
2 changes: 1 addition & 1 deletion crates/jiter/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ fn main() {
"+44 2345678"
]
}"#;
let mut jiter = Jiter::new(json_data.as_bytes(), true);
let mut jiter = Jiter::new(json_data.as_bytes(), true, false);
assert_eq!(jiter.next_object().unwrap(), Some("name"));
assert_eq!(jiter.next_str().unwrap(), "John Doe");
assert_eq!(jiter.next_key().unwrap(), Some("age"));
Expand Down
14 changes: 11 additions & 3 deletions crates/jiter/src/jiter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ pub struct Jiter<'j> {
parser: Parser<'j>,
tape: Tape,
allow_inf_nan: bool,
allow_partial_strings: bool,
}

impl Clone for Jiter<'_> {
Expand All @@ -24,6 +25,7 @@ impl Clone for Jiter<'_> {
parser: self.parser.clone(),
tape: Tape::default(),
allow_inf_nan: self.allow_inf_nan,
allow_partial_strings: self.allow_partial_strings,
}
}
}
Expand All @@ -34,12 +36,13 @@ impl<'j> Jiter<'j> {
/// # Arguments
/// - `data`: The JSON data to be parsed.
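/// - `allow_inf_nan`: Whether to allow `NaN`, `Infinity` and `-Infinity` as numbers.
/// - `allow_partial_strings`: Whether to accept a string value that is cut off by the end of the
///   input (no closing quote), returning the content read so far instead of an EOF error.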
pub fn new(data: &'j [u8], allow_inf_nan: bool) -> Self {
pub fn new(data: &'j [u8], allow_inf_nan: bool, allow_partial_strings: bool) -> Self {
Self {
data,
parser: Parser::new(data),
tape: Tape::default(),
allow_inf_nan,
allow_partial_strings,
}
}

Expand Down Expand Up @@ -186,7 +189,10 @@ impl<'j> Jiter<'j> {

/// Knowing the next value is a string, parse it.
pub fn known_str(&mut self) -> JiterResult<&str> {
match self.parser.consume_string::<StringDecoder>(&mut self.tape) {
match self
.parser
.consume_string::<StringDecoder>(&mut self.tape, self.allow_partial_strings)
{
Ok(output) => Ok(output.as_str()),
Err(e) => Err(e.into()),
}
Expand All @@ -203,7 +209,9 @@ impl<'j> Jiter<'j> {

/// Knowing the next value is a string, parse it and return bytes from the original JSON data.
pub fn known_bytes(&mut self) -> JiterResult<&[u8]> {
let range = self.parser.consume_string::<StringDecoderRange>(&mut self.tape)?;
let range = self
.parser
.consume_string::<StringDecoderRange>(&mut self.tape, self.allow_partial_strings)?;
Ok(&self.data[range])
}

Expand Down
10 changes: 7 additions & 3 deletions crates/jiter/src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -192,11 +192,15 @@ impl<'j> Parser<'j> {
self.consume_ident(NULL_REST)
}

pub fn consume_string<'t, D: AbstractStringDecoder<'t, 'j>>(&mut self, tape: &'t mut Tape) -> JsonResult<D::Output>
pub fn consume_string<'t, D: AbstractStringDecoder<'t, 'j>>(
&mut self,
tape: &'t mut Tape,
allow_partial: bool,
) -> JsonResult<D::Output>
where
'j: 't,
{
let (output, index) = D::decode(self.data, self.index, tape)?;
let (output, index) = D::decode(self.data, self.index, tape, allow_partial)?;
self.index = index;
Ok(output)
}
Expand All @@ -216,7 +220,7 @@ impl<'j> Parser<'j> {
where
'j: 't,
{
let (output, index) = D::decode(self.data, self.index, tape)?;
let (output, index) = D::decode(self.data, self.index, tape, false)?;
self.index = index;
if let Some(next) = self.eat_whitespace() {
if next == b':' {
Expand Down
4 changes: 3 additions & 1 deletion crates/jiter/src/python.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,9 @@ impl<'j, StringCache: StringMaybeCache, KeyCheck: MaybeKeyCheck> PythonParser<'j
Ok(false.to_object(py).into_bound(py))
}
Peek::String => {
let s = self.parser.consume_string::<StringDecoder>(&mut self.tape)?;
let s = self
.parser
.consume_string::<StringDecoder>(&mut self.tape, self.allow_partial)?;
Ok(StringCache::get_value(py, s.as_str(), s.ascii_only()).into_any())
}
Peek::Array => {
Expand Down
3 changes: 2 additions & 1 deletion crates/jiter/src/simd_aarch64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,7 @@ pub fn decode_string_chunk(
data: &[u8],
mut index: usize,
mut ascii_only: bool,
allow_partial: bool,
) -> JsonResult<(StringChunk, bool, usize)> {
while let Some(byte_chunk) = data.get(index..index + SIMD_STEP) {
let byte_vec = load_slice(byte_chunk);
Expand All @@ -216,7 +217,7 @@ pub fn decode_string_chunk(
}
}
// we got near the end of the string, fall back to the slow path
StringChunk::decode_fallback(data, index, ascii_only)
StringChunk::decode_fallback(data, index, ascii_only, allow_partial)
}

#[rustfmt::skip]
Expand Down
73 changes: 53 additions & 20 deletions crates/jiter/src/string_decoder.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use std::borrow::Cow;
use std::fmt::Debug;
use std::ops::Range;
use std::str::{from_utf8, from_utf8_unchecked};

Expand All @@ -13,9 +14,14 @@ pub trait AbstractStringDecoder<'t, 'j>
where
'j: 't,
{
type Output;

fn decode(data: &'j [u8], index: usize, tape: &'t mut Tape) -> JsonResult<(Self::Output, usize)>;
type Output: Debug;

fn decode(
data: &'j [u8],
index: usize,
tape: &'t mut Tape,
allow_partial: bool,
) -> JsonResult<(Self::Output, usize)>;
}

pub struct StringDecoder;
Expand Down Expand Up @@ -69,15 +75,22 @@ where
{
type Output = StringOutput<'t, 'j>;

fn decode(data: &'j [u8], index: usize, tape: &'t mut Tape) -> JsonResult<(Self::Output, usize)> {
fn decode(
data: &'j [u8],
index: usize,
tape: &'t mut Tape,
allow_partial: bool,
) -> JsonResult<(Self::Output, usize)> {
let start = index + 1;

match decode_chunk(data, start, true)? {
(StringChunk::Quote, ascii_only, index) => {
match decode_chunk(data, start, true, allow_partial)? {
(StringChunk::StringEnd, ascii_only, index) => {
let s = to_str(&data[start..index], ascii_only, start)?;
Ok((StringOutput::Data(s, ascii_only), index + 1))
}
(StringChunk::Backslash, ascii_only, index) => decode_to_tape(data, index, tape, start, ascii_only),
(StringChunk::Backslash, ascii_only, index) => {
decode_to_tape(data, index, tape, start, ascii_only, allow_partial)
}
}
}
}
Expand All @@ -88,6 +101,7 @@ fn decode_to_tape<'t, 'j>(
tape: &'t mut Tape,
start: usize,
mut ascii_only: bool,
allow_partial: bool,
) -> JsonResult<(StringOutput<'t, 'j>, usize)> {
tape.clear();
let mut chunk_start = start;
Expand Down Expand Up @@ -115,8 +129,8 @@ fn decode_to_tape<'t, 'j>(
return json_err!(EofWhileParsingString, index);
}

match decode_chunk(data, index, ascii_only)? {
(StringChunk::Quote, ascii_only, new_index) => {
match decode_chunk(data, index, ascii_only, allow_partial)? {
(StringChunk::StringEnd, ascii_only, new_index) => {
tape.extend_from_slice(&data[index..new_index]);
index = new_index + 1;
let s = to_str(tape, ascii_only, start)?;
Expand All @@ -132,31 +146,41 @@ fn decode_to_tape<'t, 'j>(
}

#[inline(always)]
pub fn decode_chunk(data: &[u8], index: usize, ascii_only: bool) -> JsonResult<(StringChunk, bool, usize)> {
pub fn decode_chunk(
data: &[u8],
index: usize,
ascii_only: bool,
allow_partial: bool,
) -> JsonResult<(StringChunk, bool, usize)> {
// TODO x86_64: use simd

#[cfg(target_arch = "aarch64")]
{
crate::simd_aarch64::decode_string_chunk(data, index, ascii_only)
crate::simd_aarch64::decode_string_chunk(data, index, ascii_only, allow_partial)
}
#[cfg(not(target_arch = "aarch64"))]
{
StringChunk::decode_fallback(data, index, ascii_only)
StringChunk::decode_fallback(data, index, ascii_only, allow_partial)
}
}

pub(crate) enum StringChunk {
Quote,
StringEnd,
Backslash,
}

impl StringChunk {
#[inline(always)]
pub fn decode_fallback(data: &[u8], mut index: usize, mut ascii_only: bool) -> JsonResult<(Self, bool, usize)> {
pub fn decode_fallback(
data: &[u8],
mut index: usize,
mut ascii_only: bool,
allow_partial: bool,
) -> JsonResult<(Self, bool, usize)> {
while let Some(next) = data.get(index) {
if !JSON_ASCII[*next as usize] {
match &CHAR_TYPE[*next as usize] {
CharType::Quote => return Ok((Self::Quote, ascii_only, index)),
CharType::Quote => return Ok((Self::StringEnd, ascii_only, index)),
CharType::Backslash => return Ok((Self::Backslash, ascii_only, index)),
CharType::ControlChar => return json_err!(ControlCharacterWhileParsingString, index),
CharType::Other => {
Expand All @@ -166,7 +190,11 @@ impl StringChunk {
}
index += 1;
}
json_err!(EofWhileParsingString, index)
if allow_partial {
Ok((Self::StringEnd, ascii_only, index))
} else {
json_err!(EofWhileParsingString, index)
}
}

/// decode an array (generally from SIMD) return the result of the chunk, or none if the non-ascii character
Expand All @@ -181,7 +209,7 @@ impl StringChunk {
for u8_char in data {
if !JSON_ASCII[u8_char as usize] {
return match &CHAR_TYPE[u8_char as usize] {
CharType::Quote => Some(Ok((Self::Quote, ascii_only, *index))),
CharType::Quote => Some(Ok((Self::StringEnd, ascii_only, *index))),
CharType::Backslash => Some(Ok((Self::Backslash, ascii_only, *index))),
CharType::ControlChar => Some(json_err!(ControlCharacterWhileParsingString, *index)),
CharType::Other => {
Expand Down Expand Up @@ -338,13 +366,18 @@ where
{
type Output = Range<usize>;

fn decode(data: &'j [u8], mut index: usize, _tape: &'t mut Tape) -> JsonResult<(Self::Output, usize)> {
fn decode(
data: &'j [u8],
mut index: usize,
_tape: &'t mut Tape,
allow_partial: bool,
) -> JsonResult<(Self::Output, usize)> {
index += 1;
let start = index;

loop {
index = match decode_chunk(data, index, true)? {
(StringChunk::Quote, _, index) => {
index = match decode_chunk(data, index, true, allow_partial)? {
(StringChunk::StringEnd, _, index) => {
let r = start..index;
return Ok((r, index + 1));
}
Expand Down
4 changes: 2 additions & 2 deletions crates/jiter/src/value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ fn take_value<'j, 's>(
Ok(JsonValue::Null)
}
Peek::String => {
let s: StringOutput<'_, 'j> = parser.consume_string::<StringDecoder>(tape)?;
let s: StringOutput<'_, 'j> = parser.consume_string::<StringDecoder>(tape, false)?;
Ok(JsonValue::Str(create_cow(s)))
}
Peek::Array => {
Expand Down Expand Up @@ -242,7 +242,7 @@ pub(crate) fn take_value_skip(
Peek::False => parser.consume_false(),
Peek::Null => parser.consume_null(),
Peek::String => {
parser.consume_string::<StringDecoderRange>(tape)?;
parser.consume_string::<StringDecoderRange>(tape, false)?;
Ok(())
}
Peek::Array => {
Expand Down
Loading

0 comments on commit 50706b3

Please sign in to comment.