Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix: #249 panic during head skipping #253

Merged
merged 4 commits into from
Sep 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Justfile
Original file line number Diff line number Diff line change
Expand Up @@ -139,12 +139,12 @@ test-book:

@add-test name:
f=`echo {{name}} | sed s/-/_/g` && \
cp ./crates/rsonpath-lib/tests/documents/toml/test-template-inline.toml ./crates/rsonpath-lib/tests/documents/toml/$f.toml && \
cp ./crates/rsonpath-lib/tests/documents/toml/test_template_inline.toml ./crates/rsonpath-lib/tests/documents/toml/$f.toml && \
echo "Test template initialised at crates/rsonpath-lib/tests/documents/toml/$f.toml"

@add-test-large name:
f=`echo {{name}} | sed s/-/_/g` && \
cp ./crates/rsonpath-lib/tests/documents/toml/test-template-large.toml ./crates/rsonpath-lib/tests/documents/toml/$f.toml && \
cp ./crates/rsonpath-lib/tests/documents/toml/test_template_large.toml ./crates/rsonpath-lib/tests/documents/toml/$f.toml && \
echo "Test template initialised at crates/rsonpath-lib/tests/documents/toml/$f.toml" && \
echo "{}" > ./crates/rsonpath-lib/tests/documents/json/large/$f.json && \
echo "Put your large JSON document as contents of crates/rsonpath-lib/tests/documents/json/large/$f.json"
Expand Down
2 changes: 1 addition & 1 deletion crates/rsonpath-benchmarks
Submodule rsonpath-benchmarks updated 1 files
+6 −14 Cargo.lock
65 changes: 42 additions & 23 deletions crates/rsonpath-lib/src/classification.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@
//! let mut resume_state = structural_classifier.stop();
//! assert_eq!(resume_state.get_idx(), 5);
//!
//! // Skip 6 bytes.
//! resume_state.offset_bytes(6);
//! // Skip to index 11.
//! resume_state.forward_to(11);
//! assert_eq!(resume_state.get_idx(), 11);
//!
//! // Resume.
Expand Down Expand Up @@ -113,42 +113,61 @@ where
self.iter.get_offset() + self.block.as_ref().map_or(0, |b| b.idx)
}

/// Move the state forward by `count` bytes.
/// Move the state forward to `index`.
///
/// # Errors
/// If the offset crosses block boundaries, then a new block is read from the underlying
/// [`Input`](crate::input::Input) implementation, which can fail.
///
/// # Panics
/// If the `count` is not positive.
/// If the `index` is not ahead of the current position of the state ([`get_idx`](ResumeClassifierState::get_idx)).
#[inline]
#[allow(clippy::panic_in_result_fn)]
pub fn offset_bytes(&mut self, count: isize) -> Result<(), InputError> {
assert!(count > 0);
let count = count as usize;
pub fn forward_to(&mut self, index: usize) -> Result<(), InputError> {
let current_block_start = self.iter.get_offset();
let current_block_idx = self.block.as_ref().map_or(0, |b| b.idx);
let current_idx = current_block_start + current_block_idx;

let remaining_in_block = self.block.as_ref().map_or(0, |b| b.block.len() - b.idx);

match self.block.as_mut() {
Some(b) if b.block.len() - b.idx > count => {
b.idx += count;
}
_ => {
let blocks_to_advance = (count - remaining_in_block) / N;
debug!(
"Calling forward_to({index}) when the inner iter offset is {current_block_start} and block idx is {current_block_idx:?}"
);

let remainder = (self.block.as_ref().map_or(0, |b| b.idx) + count - blocks_to_advance * N) % N;
// We want to move by this much forward, and delta > 0.
assert!(index > current_idx);
let delta = index - current_idx;

self.iter.offset(blocks_to_advance as isize);
let next_block = self.iter.next()?;
// First we virtually pretend to move *backward*, setting the index of the current block to zero,
// and adjust the delta to cover that distance. This makes calculations simpler.
// Then we need to skip zero or more blocks and set our self.block to the last one we visit.
let remaining = delta + current_block_idx;
let blocks_to_skip = remaining / N;
let remainder = remaining % N;

self.block = next_block.map(|b| ResumeClassifierBlockState {
block: b,
idx: remainder,
});
match self.block.as_mut() {
Some(b) if blocks_to_skip == 0 => {
b.idx = remaining;
}
Some(_) => {
self.block = self
.iter
.offset(blocks_to_skip as isize)?
.map(|b| ResumeClassifierBlockState {
block: b,
idx: remainder,
});
}
None => {
self.block = self
.iter
.offset((blocks_to_skip + 1) as isize)?
.map(|b| ResumeClassifierBlockState {
block: b,
idx: remainder,
});
}
}

debug!("offset_bytes({count}) results in idx moved to {}", self.get_idx());
debug!("forward_to({index}) results in idx moved to {}", self.get_idx());

Ok(())
}
Expand Down
11 changes: 9 additions & 2 deletions crates/rsonpath-lib/src/classification/quotes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ use crate::{
};
use cfg_if::cfg_if;

/// Result of the [`FallibleIterator`] for quote classification,
/// and of the [`offset`](`QuoteClassifiedIterator::offset`) function.
pub type QuoteIterResult<I, M, const N: usize> = Result<Option<QuoteClassifiedBlock<I, M, N>>, InputError>;

/// Input block with a bitmask signifying which characters are within quotes.
///
/// Characters within quotes in the input are guaranteed to have their corresponding
Expand All @@ -63,8 +67,11 @@ pub trait QuoteClassifiedIterator<'i, I: InputBlockIterator<'i, N>, M, const N:
fn get_offset(&self) -> usize;

/// Move the iterator `count` blocks forward.
/// Effectively skips `count * Twice<BlockAlignment>::size()` bytes.
fn offset(&mut self, count: isize);
///
/// # Errors
/// At least one new block is read from the underlying
/// [`InputBlockIterator`] implementation, which can fail.
fn offset(&mut self, count: isize) -> QuoteIterResult<I::Block, M, N>;

/// Flip the bit representing whether the last block ended with a nonescaped quote.
///
Expand Down
10 changes: 5 additions & 5 deletions crates/rsonpath-lib/src/classification/quotes/nosimd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,15 +107,15 @@ where
self.iter.get_offset() - N
}

fn offset(&mut self, count: isize) {
debug_assert!(count >= 0);
fn offset(&mut self, count: isize) -> QuoteIterResult<I::Block, usize, N> {
debug_assert!(count > 0);
V0ldek marked this conversation as resolved.
Show resolved Hide resolved
debug!("Offsetting by {count}");

if count == 0 {
return;
for _ in 0..count - 1 {
self.iter.next()?;
}

self.iter.offset(count);
self.next()
}

fn flip_quotes_bit(&mut self) {
Expand Down
10 changes: 5 additions & 5 deletions crates/rsonpath-lib/src/classification/quotes/shared.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,15 +89,15 @@ macro_rules! quote_classifier {
self.iter.get_offset() - $size
}

fn offset(&mut self, count: isize) {
debug_assert!(count >= 0);
fn offset(&mut self, count: isize) -> QuoteIterResult<I::Block, $mask_ty, $size> {
debug_assert!(count > 0);
debug!("Offsetting by {count}");

if count == 0 {
return;
for _ in 0..count - 1 {
self.iter.next()?;
}

self.iter.offset(count);
self.next()
}

fn flip_quotes_bit(&mut self) {
Expand Down
48 changes: 25 additions & 23 deletions crates/rsonpath-lib/src/engine/head_skipping.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use crate::{
debug,
depth::Depth,
engine::EngineError,
input::{Input, InputBlockIterator},
input::Input,
query::{
automaton::{Automaton, State},
JsonString,
Expand Down Expand Up @@ -139,19 +139,10 @@ impl<'b, 'q, I: Input> HeadSkip<'b, 'q, I, BLOCK_SIZE> {
.seek_non_whitespace_forward(colon_idx + 1)?
.ok_or(EngineError::MissingItem())?;

// The goal is initializing the quote classifier correctly.
// We can do it as follows:
// - Initialize it to point to the start of the first block.
// - Now we need to move it to the next_idx. Calculate the offset from that point.
// - Offset by that much plus one.
let start_of_second_block = input_iter.get_offset();
debug_assert!(start_of_second_block >= BLOCK_SIZE);
let start_of_first_block = start_of_second_block - BLOCK_SIZE;
let distance_to_colon = colon_idx - start_of_first_block;
let distance_to_value = next_idx - start_of_first_block - distance_to_colon;

let (quote_classifier, quote_classified_first_block) =
resume_quote_classification(input_iter, first_block);
// Temporarily set the index within the current block to zero.
// This makes sense for the move below.
let mut classifier_state = ResumeClassifierState {
iter: quote_classifier,
block: quote_classified_first_block
Expand All @@ -163,26 +154,37 @@ impl<'b, 'q, I: Input> HeadSkip<'b, 'q, I, BLOCK_SIZE> {
debug!("Actual match with colon at {colon_idx}");
debug!("Next significant character at {next_idx}");
debug!("Classifier is at {}", classifier_state.get_idx());
debug!("We want to offset by {distance_to_colon} first, then by {distance_to_value}",);

classifier_state.offset_bytes(distance_to_colon as isize)?;

// Check if the colon is marked as within quotes.
// If yes, that is an error of state propagation through skipped blocks.
// Flip the quote mask.
debug!("We will forward to {colon_idx} first, then to {next_idx}",);

// Now we want to move the entire iterator state so that the current block is quote-classified,
// and correctly points to the place the engine would expect had it found the matching key
// in the regular loop. If the value is atomic, we handle it ourselves. If the value is complex,
// the engine wants to start one byte *after* the opening character.
let resume_idx = if next_c == b'{' || next_c == b'[' {
next_idx + 1
} else {
next_idx
};
classifier_state.forward_to(resume_idx)?;

// We now have the block where we want and we ran quote classification, but during the `forward_to`
// call we lose all the flow-through quote information that usually is passed from one block to the next.
// We need to manually verify the soundness of the classification. Fortunately:
// 1. we know that resume_idx is either the start of a value, or one byte after an opening -
// in a valid JSON this character can be within quotes if and only if it is itself a quote;
// 2. the only way the mask can be wrong is if it is flipped - marks chars within quotes
// as outside and vice versa - so it suffices to flip it if it is wrong.
if let Some(block) = classifier_state.block.as_mut() {
if block.block.within_quotes_mask.is_lit(block.idx) {
let should_be_quoted = block.block.block[block.idx] == b'"';
if block.block.within_quotes_mask.is_lit(block.idx) != should_be_quoted {
debug!("Mask needs flipping!");
block.block.within_quotes_mask = !block.block.within_quotes_mask;
classifier_state.iter.flip_quotes_bit();
}
}

classifier_state.offset_bytes(distance_to_value as isize)?;

classifier_state = match next_c {
b'{' | b'[' => {
classifier_state.offset_bytes(1)?;
debug!("resuming");
if self.is_accepting {
engine.recorder().record_match(
Expand Down
4 changes: 1 addition & 3 deletions crates/rsonpath-lib/src/input/borrowed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,8 @@
//! This type of input is the fastest to process for the engine,
//! since there is no additional overhead from loading anything to memory.

use log::debug;

use super::*;
use crate::{query::JsonString, result::InputRecorder};
use crate::{debug, query::JsonString, result::InputRecorder};

/// Input wrapping a borrowed [`[u8]`] buffer.
pub struct BorrowedBytes<'a> {
Expand Down
2 changes: 1 addition & 1 deletion crates/rsonpath-lib/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ macro_rules! block {
.map(|x| if x.is_ascii_whitespace() { b' ' } else { *x })
.collect::<Vec<_>>()
)
.unwrap()
.unwrap_or("[INVALID UTF8]")
);
};
}
Expand Down
8 changes: 4 additions & 4 deletions crates/rsonpath-lib/src/query/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -355,20 +355,20 @@ mod tests {

#[test]
fn single_quoted_member_should_not_unescape_backslashes() {
let input = r#"\\x"#;
let input = r"\\x";

let result = super::single_quoted_member()(input);

assert_eq!(result, Ok(("", r#"\\x"#.to_owned())));
assert_eq!(result, Ok(("", r"\\x".to_owned())));
}

#[test]
fn double_quoted_member_should_not_unescape_backslashes() {
let input = r#"\\x"#;
let input = r"\\x";

let result = super::double_quoted_member()(input);

assert_eq!(result, Ok(("", r#"\\x"#.to_owned())));
assert_eq!(result, Ok(("", r"\\x".to_owned())));
}

#[test]
Expand Down
4 changes: 2 additions & 2 deletions crates/rsonpath-lib/tests/classifier_correctness_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ mod prop_test {
let escaped_string: String = value
.chars()
.map(|c| match c {
'\\' => String::from(r#"\\"#),
'\\' => String::from(r"\\"),
'"' => String::from(r#"\""#),
x => x.to_string(),
})
Expand Down Expand Up @@ -187,7 +187,7 @@ mod prop_test {
Just(Token::OpeningBracket),
Just(Token::ClosingBrace),
Just(Token::ClosingBracket),
r#"[ -!#-+\--9;-Z^-z|~]+"#.prop_map(Token::Garbage), // ascii characters except structural
r"[ -!#-+\--9;-Z^-z|~]+".prop_map(Token::Garbage), // ascii characters except structural
"[ -~]".prop_map(|x| Token::Quoted(QuotedString::from(&x))) // all ascii characters
]
}
Expand Down
54 changes: 54 additions & 0 deletions crates/rsonpath-lib/tests/documents/toml/head_skip_long.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Define the JSON input for all query test cases.
[input]
# Short description of the input structure.
description = "Long labels to search with head skipping (issue #249)"
# Set to true only if your specific test input is fully compressed (no extraneous whitespace).
is_compressed = false

# Inline JSON document.
[input.source]
json_string = '''
{
"target": {
"please note the important whitespaces after the upcoming comma (pretend indentation is really big)": 42
},
"target" : 43,
"very long label to search for, like, extremely long, so that the colon occurs really far away from the start of the needle match, which triggers some interesting behavior and might break the head skipping module like in #249": 44
}
'''

# Define queries to test on the input.
[[queries]]
# First scenario causing the #249 panic - label that is so long that it spans multiple blocks of input.
query = "$..['very long label to search for, like, extremely long, so that the colon occurs really far away from the start of the needle match, which triggers some interesting behavior and might break the head skipping module like in #249']"
# Short descritpion of the query semantics.
description = "select the extremely long label"

[queries.results]
# Number of expected matches.
count = 1
# Byte locations of starts of all matches, in order.
bytes = [428]
# Stringified values of all matches, verbatim as in the input,
# in the same order as above.
nodes = ['44']

[[queries]]
# Second scenario causing the #249 panic - extremely specific location of tokens, where
# the end of a subtree (the closing character) is exactly at the end of a block,
# the key being looked for is found in the next block,
# but whitespace cause the colon to occur in the next-next block.
# Trust me, it makes sense.
query = "$..target"
description = "select the label starting exactly at block boundary"

[queries.results]
# Number of expected matches.
count = 2
# Byte locations of starts of all matches, in order.
bytes = [14, 194]
# Stringified values of all matches, verbatim as in the input,
# in the same order as above.
nodes = ['''{
"please note the important whitespaces after the upcoming comma (pretend indentation is really big)": 42
}''', '43']
Loading