Skip to content

Commit

Permalink
add PartialMode
Browse files Browse the repository at this point in the history
  • Loading branch information
samuelcolvin committed May 21, 2024
1 parent bc8b7b6 commit 23fb3aa
Show file tree
Hide file tree
Showing 6 changed files with 106 additions and 19 deletions.
7 changes: 5 additions & 2 deletions crates/jiter-python/jiter.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ def from_json(
*,
allow_inf_nan: bool = True,
cache_strings: Literal[True, False, "all", "keys", "none"] = True,
allow_partial: bool = False,
allow_partial: Literal[True, False, "off", "on", "trailing-strings"] = False,
catch_duplicate_keys: bool = False,
) -> Any:
"""
Expand All @@ -20,7 +20,10 @@ def from_json(
- True / 'all' - cache all strings
- 'keys' - cache only object keys
- False / 'none' - cache nothing
allow_partial: if True, return parsed content when reaching EOF without closing objects and arrays
allow_partial: How to handle incomplete JSON input:
- False / 'off' - raise an exception if the input is incomplete
- True / 'on' - allow incomplete JSON but discard the last string if it is incomplete
- 'trailing-strings' - allow incomplete JSON, and include the last incomplete string in the output
catch_duplicate_keys: if True, raise an exception if objects contain the same key multiple times
Returns:
Expand Down
6 changes: 3 additions & 3 deletions crates/jiter-python/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::sync::OnceLock;

use pyo3::prelude::*;

use jiter::{map_json_error, python_parse, StringCacheMode};
use jiter::{map_json_error, python_parse, PartialMode, StringCacheMode};

#[pyfunction(
signature = (
Expand All @@ -11,7 +11,7 @@ use jiter::{map_json_error, python_parse, StringCacheMode};
*,
allow_inf_nan=true,
cache_strings=StringCacheMode::All,
allow_partial=false,
allow_partial=PartialMode::Off,
catch_duplicate_keys=false
)
)]
Expand All @@ -20,7 +20,7 @@ pub fn from_json<'py>(
json_data: &[u8],
allow_inf_nan: bool,
cache_strings: StringCacheMode,
allow_partial: bool,
allow_partial: PartialMode,
catch_duplicate_keys: bool,
) -> PyResult<Bound<'py, PyAny>> {
python_parse(
Expand Down
30 changes: 28 additions & 2 deletions crates/jiter-python/tests/test_jiter.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,14 +62,25 @@ def test_extracted_value_error():
def test_partial_array():
json = b'["string", true, null, 1, "foo'
parsed = jiter.from_json(json, allow_partial=True)
assert parsed == ["string", True, None, 1, "foo"]
assert parsed == ["string", True, None, 1]

# test that stopping at every point is ok
for i in range(1, len(json)):
parsed = jiter.from_json(json[:i], allow_partial=True)
assert isinstance(parsed, list)


def test_partial_array_trailing_strings():
    """In 'trailing-strings' mode an unterminated final string is kept in the parsed list."""
    data = b'["string", true, null, 1, "foo'
    result = jiter.from_json(data, allow_partial='trailing-strings')
    assert result == ["string", True, None, 1, "foo"]

    # every non-empty proper prefix of the input must still parse to a list
    for end in range(1, len(data)):
        prefix_result = jiter.from_json(data[:end], allow_partial='trailing-strings')
        assert isinstance(prefix_result, list)


def test_partial_array_first():
json = b"["
parsed = jiter.from_json(json, allow_partial=True)
Expand All @@ -93,7 +104,7 @@ def test_partial_object():
def test_partial_object_string():
json = b'{"a": 1, "b": 2, "c": "foo'
parsed = jiter.from_json(json, allow_partial=True)
assert parsed == {"a": 1, "b": 2, "c": "foo"}
assert parsed == {"a": 1, "b": 2}

# test that stopping at every point is ok
for i in range(1, len(json)):
Expand All @@ -102,6 +113,21 @@ def test_partial_object_string():

json = b'{"title": "Pride and Prejudice", "author": "Jane A'
parsed = jiter.from_json(json, allow_partial=True)
assert parsed == {"title": "Pride and Prejudice"}


def test_partial_object_string_trailing_strings():
    """In 'trailing-strings' mode an unterminated final string value is kept in the parsed dict."""
    json = b'{"a": 1, "b": 2, "c": "foo'
    parsed = jiter.from_json(json, allow_partial='trailing-strings')
    assert parsed == {"a": 1, "b": 2, "c": "foo"}

    # test that stopping at every point is ok: each proper prefix must be parsed
    # in the mode under test (the loop previously re-parsed the full input with
    # allow_partial=True, so it never exercised truncated input in this mode)
    for i in range(1, len(json)):
        parsed = jiter.from_json(json[:i], allow_partial='trailing-strings')
        assert isinstance(parsed, dict)

    json = b'{"title": "Pride and Prejudice", "author": "Jane A'
    parsed = jiter.from_json(json, allow_partial='trailing-strings')
    assert parsed == {"title": "Pride and Prejudice", "author": "Jane A"}


Expand Down
2 changes: 1 addition & 1 deletion crates/jiter/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,4 @@ pub use value::{JsonArray, JsonObject, JsonValue};
#[cfg(feature = "python")]
pub use py_string_cache::{cache_clear, cache_usage, cached_py_string, pystring_fast_new, StringCacheMode};
#[cfg(feature = "python")]
pub use python::{map_json_error, python_parse};
pub use python::{map_json_error, python_parse, PartialMode};
66 changes: 56 additions & 10 deletions crates/jiter/src/python.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
use ahash::AHashSet;
use std::marker::PhantomData;

use pyo3::exceptions::PyValueError;
use pyo3::exceptions::{PyTypeError, PyValueError};
use pyo3::ffi;
use pyo3::prelude::*;
use pyo3::types::{PyDict, PyList, PyString};
use pyo3::types::{PyBool, PyDict, PyList, PyString};

use smallvec::SmallVec;

Expand Down Expand Up @@ -34,12 +34,12 @@ pub fn python_parse<'py>(
json_data: &[u8],
allow_inf_nan: bool,
cache_mode: StringCacheMode,
allow_partial: bool,
partial_mode: impl Into<PartialMode>,
catch_duplicate_keys: bool,
) -> JsonResult<Bound<'py, PyAny>> {
macro_rules! ppp {
($string_cache:ident, $key_check:ident) => {
PythonParser::<$string_cache, $key_check>::parse(py, json_data, allow_inf_nan, allow_partial)
PythonParser::<$string_cache, $key_check>::parse(py, json_data, allow_inf_nan, partial_mode.into())
};
}

Expand Down Expand Up @@ -71,15 +71,15 @@ struct PythonParser<'j, StringCache, KeyCheck> {
tape: Tape,
recursion_limit: u8,
allow_inf_nan: bool,
allow_partial: bool,
partial_mode: PartialMode,
}

impl<'j, StringCache: StringMaybeCache, KeyCheck: MaybeKeyCheck> PythonParser<'j, StringCache, KeyCheck> {
fn parse<'py>(
py: Python<'py>,
json_data: &[u8],
allow_inf_nan: bool,
allow_partial: bool,
partial_mode: PartialMode,
) -> JsonResult<Bound<'py, PyAny>> {
let mut slf = PythonParser {
_string_cache: PhantomData::<StringCache>,
Expand All @@ -88,12 +88,12 @@ impl<'j, StringCache: StringMaybeCache, KeyCheck: MaybeKeyCheck> PythonParser<'j
tape: Tape::default(),
recursion_limit: DEFAULT_RECURSION_LIMIT,
allow_inf_nan,
allow_partial,
partial_mode,
};

let peek = slf.parser.peek()?;
let v = slf.py_take_value(py, peek)?;
if !allow_partial {
if !slf.partial_mode.is_active() {
slf.parser.finish()?;
}
Ok(v)
Expand All @@ -116,7 +116,7 @@ impl<'j, StringCache: StringMaybeCache, KeyCheck: MaybeKeyCheck> PythonParser<'j
Peek::String => {
let s = self
.parser
.consume_string::<StringDecoder>(&mut self.tape, self.allow_partial)?;
.consume_string::<StringDecoder>(&mut self.tape, self.partial_mode.allow_trailing_str())?;
Ok(StringCache::get_value(py, s.as_str(), s.ascii_only()).into_any())
}
Peek::Array => {
Expand Down Expand Up @@ -208,7 +208,7 @@ impl<'j, StringCache: StringMaybeCache, KeyCheck: MaybeKeyCheck> PythonParser<'j
}

fn _allow_partial_err(&self, e: &JsonError) -> bool {
if self.allow_partial {
if self.partial_mode.is_active() {
matches!(
e.error_type,
JsonErrorType::EofWhileParsingList
Expand Down Expand Up @@ -236,6 +236,52 @@ impl<'j, StringCache: StringMaybeCache, KeyCheck: MaybeKeyCheck> PythonParser<'j
}
}

/// How the Python parser treats JSON input that ends before it is complete.
///
/// The default is [`PartialMode::Off`], matching the Python-level default of
/// `allow_partial=False`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum PartialMode {
    /// Incomplete input is an error.
    #[default]
    Off,
    /// Incomplete input is accepted, but an unterminated trailing string is discarded.
    On,
    /// Incomplete input is accepted and an unterminated trailing string is kept in the output.
    TrailingStrings,
}

const PARTIAL_ERROR: &str = "Invalid partial mode, should be `'off'`, `'on'`, `'trailing-strings'` or a `bool`";

/// Extract a `PartialMode` from a Python `bool` or one of the mode-name strings.
impl<'py> FromPyObject<'py> for PartialMode {
    fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> {
        // A Python bool is handled first and mapped through `From<bool>`.
        if let Ok(flag) = ob.downcast::<PyBool>() {
            return Ok(Self::from(flag.is_true()));
        }

        // Anything else must be one of the known mode names: an unknown string is a
        // ValueError, a value that is not a string at all is a TypeError.
        match ob.extract::<&str>() {
            Ok("off") => Ok(Self::Off),
            Ok("on") => Ok(Self::On),
            Ok("trailing-strings") => Ok(Self::TrailingStrings),
            Ok(_) => Err(PyValueError::new_err(PARTIAL_ERROR)),
            Err(_) => Err(PyTypeError::new_err(PARTIAL_ERROR)),
        }
    }
}

/// Convert a plain flag into a mode: `true` becomes `On`, `false` becomes `Off`.
impl From<bool> for PartialMode {
    fn from(enabled: bool) -> Self {
        match enabled {
            true => Self::On,
            false => Self::Off,
        }
    }
}

impl PartialMode {
    /// Returns `true` for every mode except `Off`.
    fn is_active(self) -> bool {
        match self {
            Self::Off => false,
            Self::On | Self::TrailingStrings => true,
        }
    }

    /// Returns `true` only for `TrailingStrings`.
    fn allow_trailing_str(self) -> bool {
        match self {
            Self::TrailingStrings => true,
            Self::Off | Self::On => false,
        }
    }
}

trait MaybeKeyCheck: Default {
fn check(&mut self, key: &str, index: usize) -> JsonResult<()>;
}
Expand Down
14 changes: 13 additions & 1 deletion crates/jiter/tests/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1562,5 +1562,17 @@ fn jiter_skip_invalid_long_float() {
#[test]
fn jiter_value_invalid_long_float() {
let e = JsonValue::parse(br#"2121515572557277572557277e"#, false).unwrap_err();
assert_eq!(e.error_type, JsonErrorType::EofWhileParsingValue,);
assert_eq!(e.error_type, JsonErrorType::EofWhileParsingValue);
}

/// With partial strings allowed, an unterminated string inside an array is
/// returned as-is, and the missing array terminator is reported afterwards.
#[test]
fn jiter_partial_string() {
    let input = br#"["foo"#;
    let mut jiter = Jiter::new(input).with_allow_partial_strings();

    // the array opens and its first element is a string
    let first = jiter.next_array().unwrap();
    assert_eq!(first, Some(Peek::String));

    // the unterminated string is yielded with the content read so far
    let text = jiter.next_str().unwrap();
    assert_eq!(text, "foo");

    // stepping past the string hits EOF inside the list
    let err = jiter.array_step().unwrap_err();
    let expected = JiterErrorType::JsonError(JsonErrorType::EofWhileParsingList);
    assert_eq!(err.error_type, expected);
}

0 comments on commit 23fb3aa

Please sign in to comment.