rsonquery · V0ldek · Sep 9, 2023 · Sep 6, 2023 · Sep 7, 2023 · Sep 9, 2023
diff --git a/Justfile b/Justfile
@@ -139,12 +139,12 @@ test-book:
 
 @add-test name:
     f=`echo {{name}} | sed s/-/_/g` && \
-        cp ./crates/rsonpath-lib/tests/documents/toml/test-template-inline.toml ./crates/rsonpath-lib/tests/documents/toml/$f.toml && \
+        cp ./crates/rsonpath-lib/tests/documents/toml/test_template_inline.toml ./crates/rsonpath-lib/tests/documents/toml/$f.toml && \
         echo "Test template initialised at crates/rsonpath-lib/tests/documents/toml/$f.toml"
 
 @add-test-large name:
     f=`echo {{name}} | sed s/-/_/g` && \
-        cp ./crates/rsonpath-lib/tests/documents/toml/test-template-large.toml ./crates/rsonpath-lib/tests/documents/toml/$f.toml && \
+        cp ./crates/rsonpath-lib/tests/documents/toml/test_template_large.toml ./crates/rsonpath-lib/tests/documents/toml/$f.toml && \
         echo "Test template initialised at crates/rsonpath-lib/tests/documents/toml/$f.toml" && \
         echo "{}" > ./crates/rsonpath-lib/tests/documents/json/large/$f.json && \
         echo "Put your large JSON document as contents of crates/rsonpath-lib/tests/documents/json/large/$f.json"

diff --git a/crates/rsonpath-benchmarks b/crates/rsonpath-benchmarks
diff --git a/crates/rsonpath-lib/src/classification.rs b/crates/rsonpath-lib/src/classification.rs
@@ -46,8 +46,8 @@
 //! let mut resume_state = structural_classifier.stop();
 //! assert_eq!(resume_state.get_idx(), 5);
 //!
-//! // Skip 6 bytes.
-//! resume_state.offset_bytes(6);
+//! // Skip to index 11.
+//! resume_state.forward_to(11);
 //! assert_eq!(resume_state.get_idx(), 11);
 //!
 //! // Resume.
@@ -113,42 +113,61 @@ where
         self.iter.get_offset() + self.block.as_ref().map_or(0, |b| b.idx)
     }
 
-    /// Move the state forward by `count` bytes.
+    /// Move the state forward to `index`.
     ///
     /// # Errors
     /// If the offset crosses block boundaries, then a new block is read from the underlying
     /// [`Input`](crate::input::Input) implementation, which can fail.
     ///
     /// # Panics
-    /// If the `count` is not positive.
+    /// If the `index` is not ahead of the current position of the state ([`get_idx`](ResumeClassifierState::get_idx)).
     #[inline]
     #[allow(clippy::panic_in_result_fn)]
-    pub fn offset_bytes(&mut self, count: isize) -> Result<(), InputError> {
-        assert!(count > 0);
-        let count = count as usize;
+    pub fn forward_to(&mut self, index: usize) -> Result<(), InputError> {
+        let current_block_start = self.iter.get_offset();
+        let current_block_idx = self.block.as_ref().map_or(0, |b| b.idx);
+        let current_idx = current_block_start + current_block_idx;
 
-        let remaining_in_block = self.block.as_ref().map_or(0, |b| b.block.len() - b.idx);
-
-        match self.block.as_mut() {
-            Some(b) if b.block.len() - b.idx > count => {
-                b.idx += count;
-            }
-            _ => {
-                let blocks_to_advance = (count - remaining_in_block) / N;
+        debug!(
+            "Calling forward_to({index}) when the inner iter offset is {current_block_start} and block idx is {current_block_idx:?}"
+        );
 
-                let remainder = (self.block.as_ref().map_or(0, |b| b.idx) + count - blocks_to_advance * N) % N;
+        // We want to move by this much forward, and delta > 0.
+        assert!(index > current_idx);
+        let delta = index - current_idx;
 
-                self.iter.offset(blocks_to_advance as isize);
-                let next_block = self.iter.next()?;
+        // First we virtually pretend to move *backward*, setting the index of the current block to zero,
+        // and adjust the delta to cover that distance. This makes calculations simpler.
+        // Then we need to skip zero or more blocks and set our self.block to the last one we visit.
+        let remaining = delta + current_block_idx;
+        let blocks_to_skip = remaining / N;
+        let remainder = remaining % N;
 
-                self.block = next_block.map(|b| ResumeClassifierBlockState {
-                    block: b,
-                    idx: remainder,
-                });
+        match self.block.as_mut() {
+            Some(b) if blocks_to_skip == 0 => {
+                b.idx = remaining;
+            }
+            Some(_) => {
+                self.block = self
+                    .iter
+                    .offset(blocks_to_skip as isize)?
+                    .map(|b| ResumeClassifierBlockState {
+                        block: b,
+                        idx: remainder,
+                    });
+            }
+            None => {
+                self.block = self
+                    .iter
+                    .offset((blocks_to_skip + 1) as isize)?
+                    .map(|b| ResumeClassifierBlockState {
+                        block: b,
+                        idx: remainder,
+                    });
             }
         }
 
-        debug!("offset_bytes({count}) results in idx moved to {}", self.get_idx());
+        debug!("forward_to({index}) results in idx moved to {}", self.get_idx());
 
         Ok(())
     }

diff --git a/crates/rsonpath-lib/src/classification/quotes.rs b/crates/rsonpath-lib/src/classification/quotes.rs
@@ -38,6 +38,10 @@ use crate::{
 };
 use cfg_if::cfg_if;
 
+/// Result of the [`FallibleIterator`] for quote classification,
+/// and of the [`offset`](`QuoteClassifiedIterator::offset`) function.
+pub type QuoteIterResult<I, M, const N: usize> = Result<Option<QuoteClassifiedBlock<I, M, N>>, InputError>;
+
 /// Input block with a bitmask signifying which characters are within quotes.
 ///
 /// Characters within quotes in the input are guaranteed to have their corresponding
@@ -63,8 +67,11 @@ pub trait QuoteClassifiedIterator<'i, I: InputBlockIterator<'i, N>, M, const N:
     fn get_offset(&self) -> usize;
 
     /// Move the iterator `count` blocks forward.
-    /// Effectively skips `count * Twice<BlockAlignment>::size()` bytes.
-    fn offset(&mut self, count: isize);
+    ///
+    /// # Errors
+    /// At least one new block is read from the underlying
+    /// [`InputBlockIterator`] implementation, which can fail.
+    fn offset(&mut self, count: isize) -> QuoteIterResult<I::Block, M, N>;
 
     /// Flip the bit representing whether the last block ended with a nonescaped quote.
     ///

diff --git a/crates/rsonpath-lib/src/classification/quotes/nosimd.rs b/crates/rsonpath-lib/src/classification/quotes/nosimd.rs
@@ -107,15 +107,15 @@ where
         self.iter.get_offset() - N
     }
 
-    fn offset(&mut self, count: isize) {
-        debug_assert!(count >= 0);
+    fn offset(&mut self, count: isize) -> QuoteIterResult<I::Block, usize, N> {
+        debug_assert!(count > 0);
         debug!("Offsetting by {count}");
 
-        if count == 0 {
-            return;
+        for _ in 0..count - 1 {
+            self.iter.next()?;
         }
 
-        self.iter.offset(count);
+        self.next()
     }
 
     fn flip_quotes_bit(&mut self) {

diff --git a/crates/rsonpath-lib/src/classification/quotes/shared.rs b/crates/rsonpath-lib/src/classification/quotes/shared.rs
@@ -89,15 +89,15 @@ macro_rules! quote_classifier {
                 self.iter.get_offset() - $size
             }
 
-            fn offset(&mut self, count: isize) {
-                debug_assert!(count >= 0);
+            fn offset(&mut self, count: isize) -> QuoteIterResult<I::Block, $mask_ty, $size> {
+                debug_assert!(count > 0);
                 debug!("Offsetting by {count}");
 
-                if count == 0 {
-                    return;
+                for _ in 0..count - 1 {
+                    self.iter.next()?;
                 }
 
-                self.iter.offset(count);
+                self.next()
             }
 
             fn flip_quotes_bit(&mut self) {

diff --git a/crates/rsonpath-lib/src/engine/head_skipping.rs b/crates/rsonpath-lib/src/engine/head_skipping.rs
@@ -12,7 +12,7 @@ use crate::{
     debug,
     depth::Depth,
     engine::EngineError,
-    input::{Input, InputBlockIterator},
+    input::Input,
     query::{
         automaton::{Automaton, State},
         JsonString,
@@ -139,19 +139,10 @@ impl<'b, 'q, I: Input> HeadSkip<'b, 'q, I, BLOCK_SIZE> {
                             .seek_non_whitespace_forward(colon_idx + 1)?
                             .ok_or(EngineError::MissingItem())?;
 
-                        // The goal is initializing the quote classifier correctly.
-                        // We can do it as follows:
-                        // - Initialize it to point to the start of the first block.
-                        // - Now we need to move it to the next_idx. Calculate the offset from that point.
-                        // - Offset by that much plus one.
-                        let start_of_second_block = input_iter.get_offset();
-                        debug_assert!(start_of_second_block >= BLOCK_SIZE);
-                        let start_of_first_block = start_of_second_block - BLOCK_SIZE;
-                        let distance_to_colon = colon_idx - start_of_first_block;
-                        let distance_to_value = next_idx - start_of_first_block - distance_to_colon;
-
                         let (quote_classifier, quote_classified_first_block) =
                             resume_quote_classification(input_iter, first_block);
+                        // Temporarily set the index within the current block to zero.
+                        // This makes sense for the move below.
                         let mut classifier_state = ResumeClassifierState {
                             iter: quote_classifier,
                             block: quote_classified_first_block
@@ -163,26 +154,37 @@ impl<'b, 'q, I: Input> HeadSkip<'b, 'q, I, BLOCK_SIZE> {
                         debug!("Actual match with colon at {colon_idx}");
                         debug!("Next significant character at {next_idx}");
                         debug!("Classifier is at {}", classifier_state.get_idx());
-                        debug!("We want to offset by {distance_to_colon} first, then by {distance_to_value}",);
-
-                        classifier_state.offset_bytes(distance_to_colon as isize)?;
-
-                        // Check if the colon is marked as within quotes.
-                        // If yes, that is an error of state propagation through skipped blocks.
-                        // Flip the quote mask.
+                        debug!("We will forward to {colon_idx} first, then to {next_idx}",);
+
+                        // Now we want to move the entire iterator state so that the current block is quote-classified,
+                        // and correctly points to the place the engine would expect had it found the matching key
+                        // in the regular loop. If the value is atomic, we handle it ourselves. If the value is complex,
+                        // the engine wants to start one byte *after* the opening character.
+                        let resume_idx = if next_c == b'{' || next_c == b'[' {
+                            next_idx + 1
+                        } else {
+                            next_idx
+                        };
+                        classifier_state.forward_to(resume_idx)?;
+
+                        // We now have the block where we want and we ran quote classification, but during the `forward_to`
+                        // call we lose all the flow-through quote information that usually is passed from one block to the next.
+                        // We need to manually verify the soundness of the classification. Fortunately:
+                        // 1. we know that resume_idx is either the start of a value, or one byte after an opening -
+                        //    in a valid JSON this character can be within quotes if and only if it is itself a quote;
+                        // 2. the only way the mask can be wrong is if it is flipped - marks chars within quotes
+                        //    as outside and vice versa - so it suffices to flip it if it is wrong.
                         if let Some(block) = classifier_state.block.as_mut() {
-                            if block.block.within_quotes_mask.is_lit(block.idx) {
+                            let should_be_quoted = block.block.block[block.idx] == b'"';
+                            if block.block.within_quotes_mask.is_lit(block.idx) != should_be_quoted {
                                 debug!("Mask needs flipping!");
                                 block.block.within_quotes_mask = !block.block.within_quotes_mask;
                                 classifier_state.iter.flip_quotes_bit();
                             }
                         }
 
-                        classifier_state.offset_bytes(distance_to_value as isize)?;
-
                         classifier_state = match next_c {
                             b'{' | b'[' => {
-                                classifier_state.offset_bytes(1)?;
                                 debug!("resuming");
                                 if self.is_accepting {
                                     engine.recorder().record_match(

diff --git a/crates/rsonpath-lib/src/input/borrowed.rs b/crates/rsonpath-lib/src/input/borrowed.rs
@@ -9,10 +9,8 @@
 //! This type of input is the fastest to process for the engine,
 //! since there is no additional overhead from loading anything to memory.
 
-use log::debug;
-
 use super::*;
-use crate::{query::JsonString, result::InputRecorder};
+use crate::{debug, query::JsonString, result::InputRecorder};
 
 /// Input wrapping a borrowed [`[u8]`] buffer.
 pub struct BorrowedBytes<'a> {

diff --git a/crates/rsonpath-lib/src/lib.rs b/crates/rsonpath-lib/src/lib.rs
@@ -244,7 +244,7 @@ macro_rules! block {
                     .map(|x| if x.is_ascii_whitespace() { b' ' } else { *x })
                     .collect::<Vec<_>>()
             )
-            .unwrap()
+            .unwrap_or("[INVALID UTF8]")
         );
     };
 }

diff --git a/crates/rsonpath-lib/src/query/parser.rs b/crates/rsonpath-lib/src/query/parser.rs
@@ -355,20 +355,20 @@ mod tests {
 
     #[test]
     fn single_quoted_member_should_not_unescape_backslashes() {
-        let input = r#"\\x"#;
+        let input = r"\\x";
 
         let result = super::single_quoted_member()(input);
 
-        assert_eq!(result, Ok(("", r#"\\x"#.to_owned())));
+        assert_eq!(result, Ok(("", r"\\x".to_owned())));
     }
 
     #[test]
     fn double_quoted_member_should_not_unescape_backslashes() {
-        let input = r#"\\x"#;
+        let input = r"\\x";
 
         let result = super::double_quoted_member()(input);
 
-        assert_eq!(result, Ok(("", r#"\\x"#.to_owned())));
+        assert_eq!(result, Ok(("", r"\\x".to_owned())));
     }
 
     #[test]

diff --git a/crates/rsonpath-lib/tests/classifier_correctness_tests.rs b/crates/rsonpath-lib/tests/classifier_correctness_tests.rs
@@ -144,7 +144,7 @@ mod prop_test {
             let escaped_string: String = value
                 .chars()
                 .map(|c| match c {
-                    '\\' => String::from(r#"\\"#),
+                    '\\' => String::from(r"\\"),
                     '"' => String::from(r#"\""#),
                     x => x.to_string(),
                 })
@@ -187,7 +187,7 @@ mod prop_test {
             Just(Token::OpeningBracket),
             Just(Token::ClosingBrace),
             Just(Token::ClosingBracket),
-            r#"[ -!#-+\--9;-Z^-z|~]+"#.prop_map(Token::Garbage), // ascii characters except structural
+            r"[ -!#-+\--9;-Z^-z|~]+".prop_map(Token::Garbage), // ascii characters except structural
             "[ -~]".prop_map(|x| Token::Quoted(QuotedString::from(&x)))  // all ascii characters
         ]
     }

diff --git a/crates/rsonpath-lib/tests/documents/toml/head_skip_long.toml b/crates/rsonpath-lib/tests/documents/toml/head_skip_long.toml
@@ -0,0 +1,54 @@
+# Define the JSON input for all query test cases.
+[input]
+# Short description of the input structure.
+description = "Long labels to search with head skipping (issue #249)"
+# Set to true only if your specific test input is fully compressed (no extraneous whitespace).
+is_compressed = false
+
+# Inline JSON document.
+[input.source]
+json_string = '''
+{
+  "target": {
+    "please note the important whitespaces after the upcoming comma (pretend indentation is really big)": 42
+  },                                                   
+  "target" : 43,
+  "very long label to search for, like, extremely long, so that the colon occurs really far away from the start of the needle match, which triggers some interesting behavior and might break the head skipping module like in #249": 44
+}
+'''
+
+# Define queries to test on the input.
+[[queries]]
+# First scenario causing the #249 panic - label that is so long that it spans multiple blocks of input.
+query = "$..['very long label to search for, like, extremely long, so that the colon occurs really far away from the start of the needle match, which triggers some interesting behavior and might break the head skipping module like in #249']"
+# Short descritpion of the query semantics.
+description = "select the extremely long label"
+
+[queries.results]
+# Number of expected matches.
+count = 1
+# Byte locations of starts of all matches, in order.
+bytes = [428]
+# Stringified values of all matches, verbatim as in the input,
+# in the same order as above.
+nodes = ['44']
+
+[[queries]]
+# Second scenario causing the #249 panic - extremely specific location of tokens, where
+# the end of a subtree (the closing character) is exactly at the end of a block,
+# the key being looked for is found in the next block,
+# but whitespace cause the colon to occur in the next-next block.
+# Trust me, it makes sense.
+query = "$..target"
+description = "select the label starting exactly at block boundary"
+
+[queries.results]
+# Number of expected matches.
+count = 2
+# Byte locations of starts of all matches, in order.
+bytes = [14, 194]
+# Stringified values of all matches, verbatim as in the input,
+# in the same order as above.
+nodes = ['''{
+    "please note the important whitespaces after the upcoming comma (pretend indentation is really big)": 42
+  }''', '43']