Root-only query handling #283

Merged · 4 commits · Sep 22, 2023
1 change: 1 addition & 0 deletions .vscode/settings.json
@@ -38,6 +38,7 @@
"memmap",
"memmem",
"Mmap",
"mmaps",
"movemask",
"ndash",
"nondescendant",
18 changes: 17 additions & 1 deletion CHANGELOG.md
@@ -4,9 +4,25 @@ All notable changes to this project will be documented in this file.

## [unreleased]

### Performance

- Improved handling of the root-only query `$`. ([#160](https://github.com/V0ldek/rsonpath/issues/160))
- Full nodes result when asking for root: 2x throughput increase.
- Indices/count result when asking for root: unboundedly faster, as it
no longer looks at the entire document.

### Documentation

- Clarified the `approximate_spans` guarantees.
- The documentation now mentions that the returned `MatchSpan`s can
have their end indices farther than where the input logically ends,
due to internal padding.

### Bug fixes

- Fixed handling of the root-only query `$` on atomic documents. ([#160](https://github.com/V0ldek/rsonpath/issues/160))
- Previously only object and array roots were supported.
- Fixed a bug where head-skipping to a single-byte key would panic. ([#281](https://github.com/V0ldek/rsonpath/issues/281))
- This was detected by fuzzing!
- The queries `$..["{"]` and `$..["["]` would panic
on inputs starting with the bytes `{"` or `["`, respectively.
6 changes: 5 additions & 1 deletion crates/rsonpath-lib/src/engine.rs
@@ -8,6 +8,7 @@ mod head_skipping;
pub mod main;
mod tail_skipping;
pub use main::MainEngine as RsonpathEngine;
mod empty_query;

use self::error::EngineError;
use crate::{
@@ -57,7 +58,10 @@ pub trait Engine {
/// Find the approximate spans of matches on the given [`Input`] and write them to the [`Sink`].
///
/// "Approximate" means that the ends of spans are not guaranteed to be located exactly at the end of a match,
/// but may include trailing whitespace. It is guaranteed that:
/// but may include trailing whitespace. **Importantly**, it may be beyond the rigidly-defined length of the
/// input, as the engine is allowed to pad the input with whitespace to help with processing. For the purposes
/// of this API, it is assumed that every character after the logical end of the input is a whitespace character.
/// With that in mind, it is guaranteed that:
/// 1. the span start is exact;
/// 2. the span encompasses the entire matched value;
/// 3. the only characters included after the value are JSON whitespace characters:
118 changes: 118 additions & 0 deletions crates/rsonpath-lib/src/engine/empty_query.rs
@@ -0,0 +1,118 @@
//! Special case handlers for the empty query $.
//!
//! The main engine is built with a path automaton in mind, and for simplicity
//! we assume the root opening was already read. This makes it incompatible with
//! an empty query. Instead of rewriting the engine we provide fast-path implementations
//! here.
use crate::{
engine::{error::EngineError, Input},
is_json_whitespace,
result::{empty::EmptyRecorder, Match, MatchCount, MatchIndex, MatchSpan, Sink},
FallibleIterator, BLOCK_SIZE,
};

/// Count for an empty query – determine if the root exists.
pub(super) fn count<I>(input: &I) -> Result<MatchCount, EngineError>
where
I: Input,
{
// Assuming a correct JSON, there is either one root if any non-whitespace character
// occurs in the document, or the document is empty.
if input.seek_non_whitespace_forward(0)?.is_some() {
Ok(1)
} else {
Ok(0)
}
}
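
The existence check above can be sketched as a standalone function. The names `count_roots` and `is_json_whitespace` below are illustrative stand-ins over an in-memory slice, not the rsonpath-lib API:

```rust
/// JSON whitespace per RFC 8259: space, tab, line feed, carriage return.
fn is_json_whitespace(b: u8) -> bool {
    matches!(b, b' ' | b'\t' | b'\n' | b'\r')
}

/// Counts matches of the root-only query `$`: 1 if any non-whitespace
/// byte exists (the document has a root), 0 otherwise.
fn count_roots(doc: &[u8]) -> usize {
    if doc.iter().any(|&b| !is_json_whitespace(b)) {
        1
    } else {
        0
    }
}

fn main() {
    assert_eq!(count_roots(b" \t\r\n"), 0);
    assert_eq!(count_roots(b"  null "), 1);
}
```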

/// Index for an empty query &ndash; determine the first index of the root.
pub(super) fn index<I, S>(input: &I, sink: &mut S) -> Result<(), EngineError>
where
I: Input,
S: Sink<MatchIndex>,
{
// Assuming a correct JSON, the root starts at the first non-whitespace character, if any.
if let Some((first_idx, _)) = input.seek_non_whitespace_forward(0)? {
sink.add_match(first_idx)
.map_err(|err| EngineError::SinkError(Box::new(err)))?;
}

Ok(())
}

/// Approximate span for an empty query &ndash; determine the first index and the length of the root.
pub(super) fn approx_span<I, S>(input: &I, sink: &mut S) -> Result<(), EngineError>
where
I: Input,
S: Sink<MatchSpan>,
{
// The root spans the entire document, by definition, with the exception of whitespace.
// We need to find the start index exactly, and then can return the length of the rest as the approximate
// length of the root.
//
// Some inputs know their lengths: bytes already in memory, file mmaps, etc.
// A BufferedInput over an arbitrary Read stream cannot know its length, so we actually
// need to iterate until the end and count the bytes.
if let Some((first_idx, _)) = input.seek_non_whitespace_forward(0)? {
let end_idx = match input.len_hint() {
Some(end_idx) => end_idx, // Known length, just take it.
None => {
// Unknown length, iterate and count.
let mut iter = input.iter_blocks::<_, BLOCK_SIZE>(&EmptyRecorder);
let mut end_idx = 0;

while (iter.next()?).is_some() {
end_idx += BLOCK_SIZE;
}

end_idx
}
};

sink.add_match(MatchSpan::from_indices(first_idx, end_idx))
.map_err(|err| EngineError::SinkError(Box::new(err)))?;
}

Ok(())
}
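
The unknown-length fallback amounts to consuming the block iterator and multiplying the block count by the block size. The sketch below assumes fixed-size blocks and uses illustrative names; the real engine drives its own block iterator and `BLOCK_SIZE` constant:

```rust
const BLOCK_SIZE: usize = 64; // illustrative; the real constant lives in rsonpath-lib

/// Approximates the end index of a stream whose length is unknown up front
/// by counting fixed-size blocks. The result overshoots to a block boundary,
/// which is acceptable because span ends are approximate by contract.
fn approx_end<I: Iterator<Item = [u8; BLOCK_SIZE]>>(blocks: I) -> usize {
    blocks.count() * BLOCK_SIZE
}

fn main() {
    let blocks = vec![[b' '; BLOCK_SIZE]; 3];
    assert_eq!(approx_end(blocks.into_iter()), 3 * BLOCK_SIZE);
}
```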

/// Match for an empty query &ndash; copy the entire document, trimming whitespace.
pub(super) fn match_<I, S>(input: &I, sink: &mut S) -> Result<(), EngineError>
where
I: Input,
S: Sink<Match>,
{
// For a full match we need to copy the entire input starting from first non-whitespace,
// and then trim the whitespace from the end. This might be slow if the document is excessively
// padded with whitespace at start and/or end, but that's a pathological case.
let mut iter = input.iter_blocks::<_, BLOCK_SIZE>(&EmptyRecorder);
let mut res: Vec<u8> = vec![];
let mut first_significant_idx = None;

while let Some(block) = iter.next()? {
if first_significant_idx.is_none() {
// Start of the root not found yet, look for it.
first_significant_idx = block.iter().position(|&x| !is_json_whitespace(x));

if let Some(first_idx) = first_significant_idx {
// Start of the root found in this block, copy the relevant part.
res.extend(&block[first_idx..]);
}
} else {
// Start of the root was already found, now we are copying everything.
res.extend(&*block);
}
}

if let Some(start) = first_significant_idx {
// Trim whitespace if we have a result.
while !res.is_empty() && is_json_whitespace(res[res.len() - 1]) {
res.pop();
}

sink.add_match(Match::from_start_and_bytes(start, res))
.map_err(|err| EngineError::SinkError(Box::new(err)))?;
}

Ok(())
}
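
The copy-and-trim logic strips JSON whitespace from both ends of the document. A minimal sketch over an in-memory slice (illustrative names, not the real streaming implementation, which copies block by block):

```rust
fn is_json_whitespace(b: u8) -> bool {
    matches!(b, b' ' | b'\t' | b'\n' | b'\r')
}

/// Returns the start index of the root and its bytes with surrounding
/// JSON whitespace trimmed, or `None` for an all-whitespace document.
fn trim_root(doc: &[u8]) -> Option<(usize, &[u8])> {
    let start = doc.iter().position(|&b| !is_json_whitespace(b))?;
    let end = doc.iter().rposition(|&b| !is_json_whitespace(b))? + 1;
    Some((start, &doc[start..end]))
}

fn main() {
    assert_eq!(trim_root(b"  42 \n"), Some((2, &b"42"[..])));
    assert_eq!(trim_root(b" \t\r\n"), None);
}
```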
47 changes: 5 additions & 42 deletions crates/rsonpath-lib/src/engine/main.rs
@@ -13,6 +13,7 @@ use crate::{
debug,
depth::Depth,
engine::{
empty_query,
error::EngineError,
head_skipping::{CanHeadSkip, HeadSkip, ResumeState},
tail_skipping::TailSkip,
@@ -70,8 +71,7 @@ impl Engine for MainEngine<'_> {
let recorder = CountRecorder::new();

if self.automaton.is_empty_query() {
empty_query(input, &recorder, self.simd)?;
return Ok(recorder.into());
return empty_query::count(input);
}

simd_dispatch!(self.simd => |simd| {
@@ -91,7 +91,7 @@
let recorder = IndexRecorder::new(sink);

if self.automaton.is_empty_query() {
return empty_query(input, &recorder, self.simd);
return empty_query::index(input, sink);
}

simd_dispatch!(self.simd => |simd| {
@@ -111,7 +111,7 @@
let recorder = ApproxSpanRecorder::new(sink);

if self.automaton.is_empty_query() {
return empty_query(input, &recorder, self.simd);
return empty_query::approx_span(input, sink);
}

simd_dispatch!(self.simd => |simd| {
@@ -131,7 +131,7 @@
let recorder = NodesRecorder::build_recorder(sink);

if self.automaton.is_empty_query() {
return empty_query(input, &recorder, self.simd);
return empty_query::match_(input, sink);
}

simd_dispatch!(self.simd => |simd| {
@@ -143,43 +143,6 @@
}
}

fn empty_query<'i, I, R>(input: &'i I, recorder: &R, simd: SimdConfiguration) -> Result<(), EngineError>
where
I: Input + 'i,
R: Recorder<I::Block<'i, BLOCK_SIZE>>,
{
simd_dispatch!(simd => |simd|
{
let iter = input.iter_blocks(recorder);
let quote_classifier = simd.classify_quoted_sequences(iter);
let mut block_event_source = simd.classify_structural_characters(quote_classifier);

let last_event = block_event_source.next()?;
if let Some(Structural::Opening(_, idx)) = last_event {
let mut depth = Depth::ONE;
recorder.record_match(idx, depth, MatchedNodeType::Complex)?;

while let Some(ev) = block_event_source.next()? {
match ev {
Structural::Closing(_, idx) => {
recorder.record_value_terminator(idx, depth)?;
depth.decrement().map_err(|err| EngineError::DepthBelowZero(idx, err))?;
}
Structural::Colon(_) => (),
Structural::Opening(_, idx) => {
depth
.increment()
.map_err(|err| EngineError::DepthAboveLimit(idx, err))?;
}
Structural::Comma(idx) => recorder.record_value_terminator(idx, depth)?,
}
}
}
});

Ok(())
}

macro_rules! Classifier {
() => {
TailSkip<
25 changes: 18 additions & 7 deletions crates/rsonpath-lib/src/input.rs
@@ -62,6 +62,17 @@ pub trait Input: Sized {
where
Self: 'i;

/// Return the length of the entire input, if known.
///
/// This is meant to be used merely as a hint.
/// There are [`Input`] implementations that may not be able to know the entire
/// length a priori, and they should return [`None`].
#[inline(always)]
#[must_use]
fn len_hint(&self) -> Option<usize> {
None
}
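
The concrete implementations in this PR report a padded length: the logical length rounded up past the end to the next multiple of the block size. The formula can be sketched as follows (with an illustrative block-size constant):

```rust
const MAX_BLOCK_SIZE: usize = 128; // illustrative stand-in for the crate's constant

/// Length an input reports via `len_hint` in this PR: the logical length
/// rounded up to the next multiple of the block size, so that a document
/// filling a block exactly still gets one full block of whitespace padding.
fn padded_len(logical_len: usize) -> usize {
    (logical_len / MAX_BLOCK_SIZE + 1) * MAX_BLOCK_SIZE
}

fn main() {
    assert_eq!(padded_len(0), MAX_BLOCK_SIZE);
    assert_eq!(padded_len(MAX_BLOCK_SIZE - 1), MAX_BLOCK_SIZE);
    assert_eq!(padded_len(MAX_BLOCK_SIZE), 2 * MAX_BLOCK_SIZE);
}
```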

/// Iterate over blocks of size `N` of the input.
/// `N` has to be a power of two larger than 1.
#[must_use]
@@ -169,7 +180,7 @@ pub(super) mod in_slice {

#[inline]
pub(super) fn pad_last_block(bytes: &[u8]) -> LastBlock {
let mut last_block_buf = [0; MAX_BLOCK_SIZE];
let mut last_block_buf = [b' '; MAX_BLOCK_SIZE];
Review comment (Collaborator): I'd consider moving this to a constant, if possible.
Review comment (Collaborator): Though I guess for a test it's ok.
let last_block_start = (bytes.len() / MAX_BLOCK_SIZE) * MAX_BLOCK_SIZE;
let last_block_slice = &bytes[last_block_start..];

@@ -293,11 +304,11 @@ mod tests {
use pretty_assertions::assert_eq;

#[test]
fn on_empty_bytes_is_all_zero() {
fn on_empty_bytes_is_all_whitespace() {
let result = in_slice::pad_last_block(&[]);

assert_eq!(result.absolute_start, 0);
assert_eq!(result.bytes, [0; MAX_BLOCK_SIZE]);
assert_eq!(result.bytes, [b' '; MAX_BLOCK_SIZE]);
}

#[test]
@@ -308,17 +319,17 @@

assert_eq!(result.absolute_start, 0);
assert_eq!(&result.bytes[0..11], bytes);
assert_eq!(&result.bytes[11..], [0; MAX_BLOCK_SIZE - 11]);
assert_eq!(&result.bytes[11..], [b' '; MAX_BLOCK_SIZE - 11]);
}

#[test]
fn on_bytes_equal_to_full_block_gives_all_zero() {
fn on_bytes_equal_to_full_block_gives_all_whitespace() {
let bytes = [42; MAX_BLOCK_SIZE];

let result = in_slice::pad_last_block(&bytes);

assert_eq!(result.absolute_start, MAX_BLOCK_SIZE);
assert_eq!(result.bytes, [0; MAX_BLOCK_SIZE]);
assert_eq!(result.bytes, [b' '; MAX_BLOCK_SIZE]);
}

#[test]
@@ -330,7 +341,7 @@

assert_eq!(result.absolute_start, 2 * MAX_BLOCK_SIZE);
assert_eq!(result.bytes[0..77], [69; 77]);
assert_eq!(result.bytes[77..], [0; MAX_BLOCK_SIZE - 77]);
assert_eq!(result.bytes[77..], [b' '; MAX_BLOCK_SIZE - 77]);
}
}
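
The padding change the tests above verify (spaces instead of zero bytes, so that every byte past the logical end reads as JSON whitespace) can be sketched end to end; this mirrors `pad_last_block` with an assumed block size:

```rust
const MAX_BLOCK_SIZE: usize = 128; // illustrative stand-in for the crate's constant

/// Copies the last partial block of `bytes` into a buffer pre-filled with
/// spaces. Returns the absolute start offset of the last block and the
/// padded buffer.
fn pad_last_block(bytes: &[u8]) -> (usize, [u8; MAX_BLOCK_SIZE]) {
    let mut buf = [b' '; MAX_BLOCK_SIZE];
    let start = (bytes.len() / MAX_BLOCK_SIZE) * MAX_BLOCK_SIZE;
    let tail = &bytes[start..];
    buf[..tail.len()].copy_from_slice(tail);
    (start, buf)
}

fn main() {
    let (start, buf) = pad_last_block(b"abc");
    assert_eq!(start, 0);
    assert_eq!(&buf[..3], b"abc");
    assert!(buf[3..].iter().all(|&b| b == b' '));
}
```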

5 changes: 5 additions & 0 deletions crates/rsonpath-lib/src/input/borrowed.rs
@@ -95,6 +95,11 @@ impl<'a> Input for BorrowedBytes<'a> {

type Block<'b, const N: usize> = &'b [u8] where Self: 'b;

#[inline(always)]
fn len_hint(&self) -> Option<usize> {
Some((self.bytes.len() / MAX_BLOCK_SIZE + 1) * MAX_BLOCK_SIZE)
}

#[inline(always)]
fn iter_blocks<'b, 'r, R, const N: usize>(&'b self, recorder: &'r R) -> Self::BlockIterator<'b, 'r, N, R>
where
2 changes: 1 addition & 1 deletion crates/rsonpath-lib/src/input/buffered.rs
@@ -74,7 +74,7 @@ impl<R: Read> InternalBuffer<R> {
}

if self.chunk_idx == self.bytes.len() {
self.bytes.push(BufferedChunk([0; BUF_SIZE]));
self.bytes.push(BufferedChunk([b' '; BUF_SIZE]));
}

let buf = &mut self.bytes[self.chunk_idx].0;
7 changes: 6 additions & 1 deletion crates/rsonpath-lib/src/input/mmap.rs
@@ -15,7 +15,7 @@
//! by an order of magnitude to execute the query on a memory map than it is to simply read the
//! file into main memory.

use super::{borrowed::BorrowedBytesBlockIterator, error::InputError, in_slice, Input, LastBlock};
use super::{borrowed::BorrowedBytesBlockIterator, error::InputError, in_slice, Input, LastBlock, MAX_BLOCK_SIZE};
use crate::{query::JsonString, result::InputRecorder};
use memmap2::{Mmap, MmapAsRawDesc};

@@ -55,6 +55,11 @@ impl Input for MmapInput {

type Block<'a, const N: usize> = &'a [u8];

#[inline(always)]
fn len_hint(&self) -> Option<usize> {
Some((self.mmap.len() / MAX_BLOCK_SIZE + 1) * MAX_BLOCK_SIZE)
}

#[inline(always)]
fn iter_blocks<'a, 'r, R, const N: usize>(&'a self, recorder: &'r R) -> Self::BlockIterator<'a, 'r, N, R>
where
7 changes: 6 additions & 1 deletion crates/rsonpath-lib/src/input/owned.rs
@@ -116,7 +116,7 @@ impl OwnedBytes {
// SAFETY:
unsafe {
ptr::copy_nonoverlapping(slice.as_ptr(), ptr.as_ptr(), slice.len());
ptr::write_bytes(ptr.as_ptr().add(slice.len()), 0, pad);
ptr::write_bytes(ptr.as_ptr().add(slice.len()), b' ', pad);
};

// SAFETY: At this point we allocated and initialized exactly `size` bytes.
@@ -225,6 +225,11 @@ impl Input for OwnedBytes {

type Block<'a, const N: usize> = &'a [u8];

#[inline(always)]
fn len_hint(&self) -> Option<usize> {
Some((self.len / MAX_BLOCK_SIZE + 1) * MAX_BLOCK_SIZE)
}

#[inline(always)]
fn iter_blocks<'a, 'r, R, const N: usize>(&'a self, recorder: &'r R) -> Self::BlockIterator<'a, 'r, N, R>
where