From 967857ae0d645dcd772a4f635108785d4534c7ac Mon Sep 17 00:00:00 2001 From: Mateusz Gienieczko Date: Fri, 22 Sep 2023 02:28:21 +0100 Subject: [PATCH] feat: fixed empty query on atomic roots - Previously only object and array roots were supported. Ref: #160 --- .vscode/settings.json | 1 + CHANGELOG.md | 18 ++- crates/rsonpath-lib/src/engine.rs | 6 +- crates/rsonpath-lib/src/engine/empty_query.rs | 118 ++++++++++++++++++ crates/rsonpath-lib/src/engine/main.rs | 44 +------ crates/rsonpath-lib/src/input.rs | 23 +++- crates/rsonpath-lib/src/input/borrowed.rs | 5 + crates/rsonpath-lib/src/input/mmap.rs | 7 +- crates/rsonpath-lib/src/input/owned.rs | 5 + crates/rsonpath-lib/src/lib.rs | 6 + crates/rsonpath-lib/src/result.rs | 17 +++ .../tests/input_implementation_tests.rs | 2 +- crates/rsonpath-test-codegen/src/gen.rs | 4 + rsonpath.code-workspace | 1 + 14 files changed, 208 insertions(+), 49 deletions(-) create mode 100644 crates/rsonpath-lib/src/engine/empty_query.rs diff --git a/.vscode/settings.json b/.vscode/settings.json index 9297a097..a5f1e5fb 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -38,6 +38,7 @@ "memmap", "memmem", "Mmap", + "mmaps", "movemask", "ndash", "nondescendant", diff --git a/CHANGELOG.md b/CHANGELOG.md index 5182dbb2..e4b09d3e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,9 +4,25 @@ All notable changes to this project will be documented in this file. ## [unreleased] +### Performance + +- Improved handling of the root-only query `$`. ([#160](https://github.com/V0ldek/rsonpath/issues/160)) + - Full nodes result when asking for root: 2 times throughput increase. + - Indices/count result when asking for root: basically unboundedly faster, + no longer looks at the entire document. + +### Documentation + +- Clarified that the `approximate_spans` guarantees. + - Now documentation mentions that the returned `MatchSpan`s can potentially + have their end indices farther than one would expect the input to logically end, + due to internal padding. + ### Bug fixes -- Fixed a bug when head-skipping to a single-byte key would panic. +- Fixed handling of the root-only query `$` on atomic documents. ([#160](https://github.com/V0ldek/rsonpath/issues/160)) + - Previously only object and array roots were supported. +- Fixed a bug when head-skipping to a single-byte key would panic. ([#281](https://github.com/V0ldek/rsonpath/issues/281)) - This was detected by fuzzing! - The queries `$..["{"]` and `$..["["]` would panic on inputs starting with the bytes `{"` or `["`, respectively. diff --git a/crates/rsonpath-lib/src/engine.rs b/crates/rsonpath-lib/src/engine.rs index a9835a4f..83f6140b 100644 --- a/crates/rsonpath-lib/src/engine.rs +++ b/crates/rsonpath-lib/src/engine.rs @@ -8,6 +8,7 @@ mod head_skipping; pub mod main; mod tail_skipping; pub use main::MainEngine as RsonpathEngine; +mod empty_query; use self::error::EngineError; use crate::{ @@ -57,7 +58,10 @@ pub trait Engine { /// Find the approximate spans of matches on the given [`Input`] and write them to the [`Sink`]. /// /// "Approximate" means that the ends of spans are not guaranteed to be located exactly at the end of a match, - /// but may include trailing whitespace. It is guaranteed that: + /// but may include trailing whitespace. **Importantly**, it may be beyond the rigidly-defined length of the + /// input, as the engine is allowed to pad the input with whitespace to help with processing. For the purposes + /// of this API, it is assumed that every character after the logical end of the input is a whitespace character. + /// With that in mind, it is guaranteed that: /// 1. the span start is exact; /// 2. the span encompasses the entire matched value; /// 3. the only characters included after the value are JSON whitespace characters: diff --git a/crates/rsonpath-lib/src/engine/empty_query.rs b/crates/rsonpath-lib/src/engine/empty_query.rs new file mode 100644 index 00000000..85e41f2d --- /dev/null +++ b/crates/rsonpath-lib/src/engine/empty_query.rs @@ -0,0 +1,118 @@ +//! Special case handlers for the empty query $. +//! +//! The main engine is built with a path automaton in mind, and for simplicity +//! we assume the root opening was already read. This makes it incompatible with +//! an empty query. Instead of rewriting the engine we provide fast-path implementations +//! here. +use crate::{ + engine::{error::EngineError, Input}, + is_json_whitespace, + result::{empty::EmptyRecorder, Match, MatchCount, MatchIndex, MatchSpan, Sink}, + FallibleIterator, BLOCK_SIZE, +}; + +/// Count for an empty query – determine if the root exists. +pub(super) fn count(input: &I) -> Result +where + I: Input, +{ + // Assuming a correct JSON, there is either one root if any non-whitespace character + // occurs in the document, or the document is empty. + if input.seek_non_whitespace_forward(0)?.is_some() { + Ok(1) + } else { + Ok(0) + } +} + +/// Index for an empty query – determine the first index of the root. +pub(super) fn index(input: &I, sink: &mut S) -> Result<(), EngineError> +where + I: Input, + S: Sink, +{ + // Assuming a correct JSON, the root starts at the first non-whitespace character, if any. + if let Some((first_idx, _)) = input.seek_non_whitespace_forward(0)? { + sink.add_match(first_idx) + .map_err(|err| EngineError::SinkError(Box::new(err)))?; + } + + Ok(()) +} + +/// Approximate span for an empty query – determine the first index and the length of the root. +pub(super) fn approx_span(input: &I, sink: &mut S) -> Result<(), EngineError> +where + I: Input, + S: Sink, +{ + // The root spans the entire document, by definition, with the exception of whitespace. + // We need to find the start index exactly, and then can return the length of the rest as the approximate + // length of the root. + // + // Some input know their lengths: bytes already in memory, file mmaps, etc. + // A BufferedInput over an arbitrary Read stream cannot know its length, so we actually + // need to iterate until the end and count the bytes. + if let Some((first_idx, _)) = input.seek_non_whitespace_forward(0)? { + let end_idx = match input.len_hint() { + Some(end_idx) => end_idx, // Known length, just take it. + None => { + // Unknown length, iterate and count. + let mut iter = input.iter_blocks::<_, BLOCK_SIZE>(&EmptyRecorder); + let mut end_idx = 0; + + while (iter.next()?).is_some() { + end_idx += BLOCK_SIZE; + } + + end_idx + } + }; + + sink.add_match(MatchSpan::from_indices(first_idx, end_idx)) + .map_err(|err| EngineError::SinkError(Box::new(err)))?; + } + + Ok(()) +} + +/// Match for an empty query – copy the entire document, trimming whitespace. +pub(super) fn match_(input: &I, sink: &mut S) -> Result<(), EngineError> +where + I: Input, + S: Sink, +{ + // For a full match we need to copy the entire input starting from first non-whitespace, + // and then trim the whitespace from the end. This might be slow if the document is excessively + // padded with whitespace at start and/or end, but that's a pathological case. + let mut iter = input.iter_blocks::<_, BLOCK_SIZE>(&EmptyRecorder); + let mut res: Vec = vec![]; + let mut first_significant_idx = None; + + while let Some(block) = iter.next()? { + if first_significant_idx.is_none() { + // Start of the root not found yet, look for it. + first_significant_idx = block.iter().position(|&x| !is_json_whitespace(x)); + + if let Some(first_idx) = first_significant_idx { + // Start of the root found in this block, copy the relevant part. + res.extend(&block[first_idx..]); + } + } else { + // Start of the root was already found, now we are copying everything. + res.extend(&*block); + } + } + + if let Some(start) = first_significant_idx { + // Trim whitespace if we have a result. + while !res.is_empty() && is_json_whitespace(res[res.len() - 1]) { + res.pop(); + } + + sink.add_match(Match::from_start_and_bytes(start, res)) + .map_err(|err| EngineError::SinkError(Box::new(err)))?; + } + + Ok(()) +} diff --git a/crates/rsonpath-lib/src/engine/main.rs b/crates/rsonpath-lib/src/engine/main.rs index d35ead02..b50bc418 100644 --- a/crates/rsonpath-lib/src/engine/main.rs +++ b/crates/rsonpath-lib/src/engine/main.rs @@ -13,6 +13,7 @@ use crate::{ debug, depth::Depth, engine::{ + empty_query, error::EngineError, head_skipping::{CanHeadSkip, HeadSkip, ResumeState}, tail_skipping::TailSkip, @@ -70,8 +71,7 @@ impl Engine for MainEngine<'_> { let recorder = CountRecorder::new(); if self.automaton.is_empty_query() { - empty_query(input, &recorder, self.simd)?; - return Ok(recorder.into()); + return empty_query::count(input); } simd_dispatch!(self.simd => |simd| { @@ -91,7 +91,7 @@ impl Engine for MainEngine<'_> { let recorder = IndexRecorder::new(sink); if self.automaton.is_empty_query() { - return empty_query(input, &recorder, self.simd); + return empty_query::index(input, sink); } simd_dispatch!(self.simd => |simd| { @@ -111,7 +111,7 @@ impl Engine for MainEngine<'_> { let recorder = ApproxSpanRecorder::new(sink); if self.automaton.is_empty_query() { - return empty_query(input, &recorder, self.simd); + return empty_query::approx_span(input, sink); } simd_dispatch!(self.simd => |simd| { @@ -131,7 +131,7 @@ impl Engine for MainEngine<'_> { let recorder = NodesRecorder::build_recorder(sink); if self.automaton.is_empty_query() { - return empty_query(input, &recorder, self.simd); + return empty_query::match_(input, sink); } simd_dispatch!(self.simd => |simd| { @@ -143,40 +143,6 @@ impl Engine for MainEngine<'_> { } } -fn empty_query<'i, I, R>(input: &'i I, recorder: &R, simd: SimdConfiguration) -> Result<(), EngineError> -where - I: Input + 'i, - R: Recorder>, -{ - let mut iter = input.iter_blocks(recorder); - let mut idx = 0; - - loop { - match iter.next()? { - Some(block) => { - let pos = block.iter().position(|&x| x != b' ' && x != b'\n' && x != b'\t' && x != b'\r'); - match pos { - Some(in_block_idx) => { - recorder.record_match(idx + in_block_idx, Depth::ONE, MatchedNodeType::Atomic)?; - idx += BLOCK_SIZE; - break; - }, - None => idx += BLOCK_SIZE, - } - }, - None => return Ok(()), - } - } - - while (iter.next()?).is_some() { - idx += BLOCK_SIZE; - } - - recorder.record_value_terminator(idx - 1, Depth::ONE)?; - - Ok(()) -} - macro_rules! Classifier { () => { TailSkip< diff --git a/crates/rsonpath-lib/src/input.rs b/crates/rsonpath-lib/src/input.rs index 816a8f49..b2eb4790 100644 --- a/crates/rsonpath-lib/src/input.rs +++ b/crates/rsonpath-lib/src/input.rs @@ -62,6 +62,17 @@ pub trait Input: Sized { where Self: 'i; + /// Return the length of the entire input, if known. + /// + /// This is meant to be used merely as a hint. + /// There are [`Input`] implementations that may not be able to know the entire + /// length a priori, and they should return [`None`]. + #[inline(always)] + #[must_use] + fn len_hint(&self) -> Option { + None + } + /// Iterate over blocks of size `N` of the input. /// `N` has to be a power of two larger than 1. #[must_use] @@ -293,11 +304,11 @@ mod tests { use pretty_assertions::assert_eq; #[test] - fn on_empty_bytes_is_all_zero() { + fn on_empty_bytes_is_all_whitespace() { let result = in_slice::pad_last_block(&[]); assert_eq!(result.absolute_start, 0); - assert_eq!(result.bytes, [0; MAX_BLOCK_SIZE]); + assert_eq!(result.bytes, [b' '; MAX_BLOCK_SIZE]); } #[test] @@ -308,17 +319,17 @@ mod tests { assert_eq!(result.absolute_start, 0); assert_eq!(&result.bytes[0..11], bytes); - assert_eq!(&result.bytes[11..], [0; MAX_BLOCK_SIZE - 11]); + assert_eq!(&result.bytes[11..], [b' '; MAX_BLOCK_SIZE - 11]); } #[test] - fn on_bytes_equal_to_full_block_gives_all_zero() { + fn on_bytes_equal_to_full_block_gives_all_whitespace() { let bytes = [42; MAX_BLOCK_SIZE]; let result = in_slice::pad_last_block(&bytes); assert_eq!(result.absolute_start, MAX_BLOCK_SIZE); - assert_eq!(result.bytes, [0; MAX_BLOCK_SIZE]); + assert_eq!(result.bytes, [b' '; MAX_BLOCK_SIZE]); } #[test] @@ -330,7 +341,7 @@ mod tests { assert_eq!(result.absolute_start, 2 * MAX_BLOCK_SIZE); assert_eq!(result.bytes[0..77], [69; 77]); - assert_eq!(result.bytes[77..], [0; MAX_BLOCK_SIZE - 77]); + assert_eq!(result.bytes[77..], [b' '; MAX_BLOCK_SIZE - 77]); } } diff --git a/crates/rsonpath-lib/src/input/borrowed.rs b/crates/rsonpath-lib/src/input/borrowed.rs index 1db8dcff..2e2a5bb1 100644 --- a/crates/rsonpath-lib/src/input/borrowed.rs +++ b/crates/rsonpath-lib/src/input/borrowed.rs @@ -95,6 +95,11 @@ impl<'a> Input for BorrowedBytes<'a> { type Block<'b, const N: usize> = &'b [u8] where Self: 'b; + #[inline(always)] + fn len_hint(&self) -> Option { + Some((self.bytes.len() / MAX_BLOCK_SIZE + 1) * MAX_BLOCK_SIZE) + } + #[inline(always)] fn iter_blocks<'b, 'r, R, const N: usize>(&'b self, recorder: &'r R) -> Self::BlockIterator<'b, 'r, N, R> where diff --git a/crates/rsonpath-lib/src/input/mmap.rs b/crates/rsonpath-lib/src/input/mmap.rs index df22837f..14e9216d 100644 --- a/crates/rsonpath-lib/src/input/mmap.rs +++ b/crates/rsonpath-lib/src/input/mmap.rs @@ -15,7 +15,7 @@ //! by an order of magnitude to execute the query on a memory map than it is to simply read the //! file into main memory. -use super::{borrowed::BorrowedBytesBlockIterator, error::InputError, in_slice, Input, LastBlock}; +use super::{borrowed::BorrowedBytesBlockIterator, error::InputError, in_slice, Input, LastBlock, MAX_BLOCK_SIZE}; use crate::{query::JsonString, result::InputRecorder}; use memmap2::{Mmap, MmapAsRawDesc}; @@ -55,6 +55,11 @@ impl Input for MmapInput { type Block<'a, const N: usize> = &'a [u8]; + #[inline(always)] + fn len_hint(&self) -> Option { + Some((self.mmap.len() / MAX_BLOCK_SIZE + 1) * MAX_BLOCK_SIZE) + } + #[inline(always)] fn iter_blocks<'a, 'r, R, const N: usize>(&'a self, recorder: &'r R) -> Self::BlockIterator<'a, 'r, N, R> where diff --git a/crates/rsonpath-lib/src/input/owned.rs b/crates/rsonpath-lib/src/input/owned.rs index 99c2957d..298d140a 100644 --- a/crates/rsonpath-lib/src/input/owned.rs +++ b/crates/rsonpath-lib/src/input/owned.rs @@ -225,6 +225,11 @@ impl Input for OwnedBytes { type Block<'a, const N: usize> = &'a [u8]; + #[inline(always)] + fn len_hint(&self) -> Option { + Some((self.len / MAX_BLOCK_SIZE + 1) * MAX_BLOCK_SIZE) + } + #[inline(always)] fn iter_blocks<'a, 'r, R, const N: usize>(&'a self, recorder: &'r R) -> Self::BlockIterator<'a, 'r, N, R> where diff --git a/crates/rsonpath-lib/src/lib.rs b/crates/rsonpath-lib/src/lib.rs index c8ce08a2..1f978ed9 100644 --- a/crates/rsonpath-lib/src/lib.rs +++ b/crates/rsonpath-lib/src/lib.rs @@ -346,3 +346,9 @@ impl Iterator for FallibleIntoIter { } } } + +#[inline(always)] +#[must_use] +pub(crate) fn is_json_whitespace(x: u8) -> bool { + x == b' ' || x == b'\t' || x == b'\n' || x == b'\r' +} diff --git a/crates/rsonpath-lib/src/result.rs b/crates/rsonpath-lib/src/result.rs index 62fd0660..de5d6435 100644 --- a/crates/rsonpath-lib/src/result.rs +++ b/crates/rsonpath-lib/src/result.rs @@ -41,6 +41,19 @@ pub struct Match { } impl MatchSpan { + pub(crate) fn from_indices(start_idx: usize, end_idx: usize) -> Self { + assert!( + start_idx <= end_idx, + "start of span {} is greater than end {}", + start_idx, + end_idx + ); + Self { + start_idx, + len: end_idx - start_idx, + } + } + /// Returns the starting index of the match. #[inline(always)] #[must_use] @@ -65,6 +78,10 @@ impl MatchSpan { } impl Match { + pub(crate) fn from_start_and_bytes(span_start: usize, bytes: Vec) -> Self { + Self { bytes, span_start } + } + /// Returns the JSON contents of the match. #[inline(always)] #[must_use] diff --git a/crates/rsonpath-lib/tests/input_implementation_tests.rs b/crates/rsonpath-lib/tests/input_implementation_tests.rs index 8efc0501..9f130784 100644 --- a/crates/rsonpath-lib/tests/input_implementation_tests.rs +++ b/crates/rsonpath-lib/tests/input_implementation_tests.rs @@ -142,7 +142,7 @@ fn assert_padding_is_correct(result: &[u8], original_length: usize) { ); let padding_length = result.len() - original_length; - let expected_padding: Vec = iter::repeat(0).take(padding_length).collect(); + let expected_padding: Vec = iter::repeat(b' ').take(padding_length).collect(); assert_eq!(&result[original_length..], expected_padding); } diff --git a/crates/rsonpath-test-codegen/src/gen.rs b/crates/rsonpath-test-codegen/src/gen.rs index 7cda9afb..423adeef 100644 --- a/crates/rsonpath-test-codegen/src/gen.rs +++ b/crates/rsonpath-test-codegen/src/gen.rs @@ -254,6 +254,10 @@ fn get_available_results(input: &model::InputSource, query: &model::Query) -> Re end += 1 } + if end == b.len() { + end = usize::MAX; + } + model::ResultApproximateSpan { start: span.start, end_lower_bound: span.end, diff --git a/rsonpath.code-workspace b/rsonpath.code-workspace index deea91da..fc3a6ef4 100644 --- a/rsonpath.code-workspace +++ b/rsonpath.code-workspace @@ -35,6 +35,7 @@ "memchr", "memmap", "Mmap", + "mmaps", "movemask", "nondescendant", "nonescaped",