Skip to content

Commit

Permalink
Fixed bugs in parsing block scalars (#435)
Browse files Browse the repository at this point in the history
* separate block scalar content indent level into base and indicated indent levels in lexer

* emit error if a following content line of a block scalar is less indented

* emit error if a leading empty line is more indented than the first non-empty line in a block scalar

* fixed parsing more-indented empty lines in block folded scalar contents in scalar_parser

* fixed clang-tidy error
  • Loading branch information
fktn-k authored Nov 26, 2024
1 parent ca34fe9 commit a3ada06
Show file tree
Hide file tree
Showing 6 changed files with 468 additions and 278 deletions.
2 changes: 1 addition & 1 deletion include/fkYAML/detail/input/block_scalar_header.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ enum class chomping_indicator_t : std::uint8_t {
struct block_scalar_header {
/// Chomping indicator type. Defaults to CLIP.
chomping_indicator_t chomp {chomping_indicator_t::CLIP};
/// Content indentation level of a block scalar. Defaults to 0.
/// NOTE(review): the lexer appears to treat 0 as "no indentation indicator given in the header" -- confirm.
uint32_t indent {0};
};

Expand Down
302 changes: 171 additions & 131 deletions include/fkYAML/detail/input/lexical_analyzer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -221,16 +221,22 @@ class lexical_analyzer {
case '>': {
const str_view sv {m_token_begin_itr, m_end_itr};
const std::size_t header_end_pos = sv.find('\n');
FK_YAML_ASSERT(header_end_pos != str_view::npos);
const uint32_t base_indent = get_current_indent_level(&sv[header_end_pos]);

FK_YAML_ASSERT(!sv.empty());
token.type = (sv[0] == '|') ? lexical_token_t::BLOCK_LITERAL_SCALAR : lexical_token_t::BLOCK_FOLDED_SCALAR;
if (*m_token_begin_itr == '|') {
token.type = lexical_token_t::BLOCK_LITERAL_SCALAR;
}
else {
token.type = lexical_token_t::BLOCK_FOLDED_SCALAR;
}

FK_YAML_ASSERT(header_end_pos != str_view::npos);
const str_view header_line = sv.substr(1, header_end_pos - 1);
m_block_scalar_header = convert_to_block_scalar_header(header_line);

m_token_begin_itr = sv.begin() + (header_end_pos + 1);
scan_block_style_string_token(m_block_scalar_header.indent, token.str);
m_block_scalar_header.indent =
determine_block_scalar_content_range(base_indent, m_block_scalar_header.indent, token.str);

return token;
}
Expand Down Expand Up @@ -299,6 +305,102 @@ class lexical_analyzer {
}

private:
/// @brief Get the indentation level of the line which ends at the given position.
/// @note Each leading space counts as 1. Each leading "- " / "-\t", "? " or ": " / ":\t" indicator counts as 2.
/// Tabs are not counted as indentation characters. If at least one such indicator leads the line and the rest of
/// the line contains no key separator (": " or ":\t"), the position of the last indicator is returned instead.
/// @param p_line_end End position of the target line. It must point into the input buffer; the callers pass the
/// address of the newline code which terminates the line.
/// @return The indentation level of the target line.
uint32_t get_current_indent_level(const char* p_line_end) {
    // Get the beginning position of the current line, i.e., the position right after the previous newline code
    // or the beginning of the input buffer. Looking at the previous character (instead of the current one) makes
    // sure the very first character of the input buffer is inspected too; otherwise a leading newline code would
    // be taken as the beginning of the second line and the indentation would always be calculated as 0.
    const char* const input_begin_itr = m_input_buffer.begin();
    const char* line_begin_itr = p_line_end;
    while (line_begin_itr != input_begin_itr && *(line_begin_itr - 1) != '\n') {
        --line_begin_itr;
    }

    const char* cur_itr = line_begin_itr;

    // get the indentation of the current line.
    uint32_t indent = 0;
    bool indent_found = false;
    // 0: none, 1: block seq item, 2: explicit map key, 3: explicit map value
    uint32_t context = 0;
    while (cur_itr != p_line_end && !indent_found) {
        switch (*cur_itr) {
        case ' ':
            ++indent;
            ++cur_itr;
            break;
        case '-':
            switch (*(cur_itr + 1)) {
            case ' ':
            case '\t':
                indent += 2;
                cur_itr += 2;
                context = 1;
                break;
            default:
                indent_found = true;
                break;
            }
            break;
        case '?':
            if (*(cur_itr + 1) == ' ') {
                indent += 2;
                cur_itr += 2;
                context = 2;
                break;
            }

            indent_found = true;
            break;
        case ':':
            switch (*(cur_itr + 1)) {
            case ' ':
            case '\t':
                indent += 2;
                cur_itr += 2;
                context = 3;
                break;
            default:
                indent_found = true;
                break;
            }
            break;
        default:
            indent_found = true;
            break;
        }
    }

    // If "- ", "? " and/or ": " occur in the leading part of the line.
    if (context > 0) {
        // Check if the rest of the line contains the key separator ": " (or ":\t").
        // If so, the indent value remains the current one.
        // Otherwise, the indent value is changed based on the last occurrence of the above 3.
        // In any case, multiline scalar content must be indented more than the indent value.
        const str_view line_content_part {line_begin_itr + indent, p_line_end};
        std::size_t key_sep_pos = line_content_part.find(": ");
        if (key_sep_pos == str_view::npos) {
            key_sep_pos = line_content_part.find(":\t");
        }

        if (key_sep_pos == str_view::npos) {
            constexpr char targets[] = "-?:";
            // `sizeof(targets)` includes the terminating null character, which is never a valid target.
            FK_YAML_ASSERT(context - 1 < sizeof(targets) - 1);
            // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-constant-array-index)
            const char target_char = targets[context - 1];

            // Find the position of the last occurrence of "- ", "? " or ": ".
            const str_view line_indent_part {line_begin_itr, indent};
            const std::size_t block_seq_item_begin_pos = line_indent_part.find_last_of(target_char);
            FK_YAML_ASSERT(block_seq_item_begin_pos != str_view::npos);
            indent = static_cast<uint32_t>(block_seq_item_begin_pos);
        }
    }

    return indent;
}

/// @brief Skip until a newline code or a null character is found.
void scan_comment() {
FK_YAML_ASSERT(*m_cur_itr == '#');
Expand Down Expand Up @@ -726,98 +828,7 @@ class lexical_analyzer {
switch (sv[pos]) {
case '\n': {
if (indent == std::numeric_limits<uint32_t>::max()) {
// get the beginning position of the current line.
const char* cur_itr = m_token_begin_itr;
const char* input_begin_itr = m_input_buffer.begin();
while (cur_itr != input_begin_itr) {
if (*cur_itr == '\n') {
++cur_itr;
break;
}
--cur_itr;
}

const char* line_begin_itr = cur_itr;

// get the indentation of the current line.
indent = 0;
bool indent_found = false;
// 0: none, 1: block seq item, 2: explicit map key, 3: explicit map value
uint32_t context = 0;
while (cur_itr != m_token_begin_itr && !indent_found) {
switch (*cur_itr) {
case ' ':
case '\t':
++indent;
++cur_itr;
break;
case '-':
switch (*(cur_itr + 1)) {
case ' ':
case '\t':
indent += 2;
cur_itr += 2;
context = 1;
break;
default:
indent_found = true;
break;
}
break;
case '?':
if (*(cur_itr + 1) == ' ') {
indent += 2;
cur_itr += 2;
context = 2;
break;
}

indent_found = true;
break;
case ':':
switch (*(cur_itr + 1)) {
case ' ':
case '\t':
indent += 2;
cur_itr += 2;
context = 3;
break;
default:
indent_found = true;
break;
}
break;
default:
indent_found = true;
break;
}
}

// If "- ", "? " and/or ": " occur in the first line of this plain scalar content.
if (context > 0) {
// Check if the first line contains the key separator ": ".
// If so, the indent value remains the current one.
// Otherwise, the indent value is changed based on the last occurrence of the above 3.
// In any case, multiline plain scalar content must be indented more than the indent value.
const str_view line_content_part {line_begin_itr + indent, &sv[pos]};
std::size_t key_seq_pos = line_content_part.find(": ");
if (key_seq_pos == str_view::npos) {
key_seq_pos = line_content_part.find(":\t");
}

if (key_seq_pos == str_view::npos) {
constexpr char targets[] = "-?:";
FK_YAML_ASSERT(context - 1 < sizeof(targets));
// NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-constant-array-index)
const char target_char = targets[context - 1];

// Find the position of the last occurrence of "- ", "? " or ": ".
const str_view line_indent_part {line_begin_itr, indent};
const std::size_t block_seq_item_begin_pos = line_indent_part.find_last_of(target_char);
FK_YAML_ASSERT(block_seq_item_begin_pos != str_view::npos);
indent = static_cast<uint32_t>(block_seq_item_begin_pos);
}
}
indent = get_current_indent_level(&sv[pos]);
}

constexpr str_view space_filter = " \t\n";
Expand Down Expand Up @@ -903,56 +914,71 @@ class lexical_analyzer {
}

/// @brief Scan a block style string token either in the literal or folded style.
/// @param style The style of the given token, either literal or folded.
/// @param chomp The chomping indicator type of the given token, either strip, keep or clip.
/// @param indent The indent size specified for the given token.
void scan_block_style_string_token(uint32_t& indent, str_view& token) {
/// @param base_indent The base indent level of the block scalar.
/// @param indicated_indent The indicated indent level in the block scalar header. 0 means it's not indicated.
/// @param token Storage for the scanned block scalar range.
/// @return The content indentation level of the block scalar.
uint32_t determine_block_scalar_content_range(uint32_t base_indent, uint32_t indicated_indent, str_view& token) {
const str_view sv {m_token_begin_itr, m_end_itr};

// Handle leading all-space lines.
constexpr str_view space_filter = " \t\n";
const std::size_t first_non_space_pos = sv.find_first_not_of(space_filter);
if (first_non_space_pos == str_view::npos) {
// empty block scalar with no subsequent tokens.
indent = static_cast<uint32_t>(sv.size());
token = sv;
uint32_t cur_indent = 0;
uint32_t max_leading_indent = 0;
const char* cur_itr = m_token_begin_itr;
for (bool stop_increment = false; cur_itr != m_end_itr; ++cur_itr) {
const char c = *cur_itr;
if (c == ' ') {
if (!stop_increment) {
++cur_indent;
}
continue;
}
if (c == '\n') {
max_leading_indent = std::max(cur_indent, max_leading_indent);
cur_indent = 0;
stop_increment = false;
continue;
}
if (c == '\t') {
// Tabs are not counted as an indent character but still part of an empty line.
// See https://yaml.org/spec/1.2.2/#rule-s-indent and https://yaml.org/spec/1.2.2/#64-empty-lines.
stop_increment = true;
continue;
}
break;
}

// all the block scalar contents are empty lines, and no subsequent token exists.
if FK_YAML_UNLIKELY (cur_itr == m_end_itr) {
// Without the following iterator update, lexer cannot reach the end of input buffer and causes infinite
// loops from the next loop. (https://github.com/fktn-k/fkYAML/pull/410)
m_cur_itr = m_end_itr;
return;

token = sv;
// If there's no non-empty line, the content indentation level is equal to the number of spaces on the
// longest line. https://yaml.org/spec/1.2.2/#8111-block-indentation-indicator
return indicated_indent == 0 ? std::max(cur_indent, max_leading_indent) : base_indent + indicated_indent;
}

// get indentation of the first non-space character.
std::size_t last_newline_pos = sv.substr(0, first_non_space_pos).find_last_of('\n');
if (last_newline_pos == str_view::npos) {
// first_non_space_pos is on the first line.
const auto cur_indent = static_cast<uint32_t>(first_non_space_pos);
if (indent == 0) {
indent = cur_indent;
}
else if FK_YAML_UNLIKELY (cur_indent < indent) {
emit_error("A block style scalar is less indented than the indicated level.");
}
// Any leading empty line must not contain more spaces than the first non-empty line.
if FK_YAML_UNLIKELY (cur_indent < max_leading_indent) {
emit_error("Any leading empty line must not be more indented than the first non-empty line.");
}
else {
FK_YAML_ASSERT(last_newline_pos < first_non_space_pos);
const auto cur_indent = static_cast<uint32_t>(first_non_space_pos - last_newline_pos - 1);

// TODO: preserve and compare the last indentation with `cur_indent`
if (indent == 0) {
indent = cur_indent;
}
else if FK_YAML_UNLIKELY (cur_indent < indent) {
emit_error("A block style scalar is less indented than the indicated level.");
}
if (indicated_indent == 0) {
FK_YAML_ASSERT(base_indent < cur_indent);
indicated_indent = cur_indent - base_indent;
}
else if FK_YAML_UNLIKELY (cur_indent < base_indent + indicated_indent) {
emit_error("The first non-empty line in the block scalar is less indented.");
}

last_newline_pos = sv.find('\n', first_non_space_pos + 1);
std::size_t last_newline_pos = sv.find('\n', cur_itr - m_token_begin_itr + 1);
if (last_newline_pos == str_view::npos) {
last_newline_pos = sv.size();
}

const uint32_t content_indent = base_indent + indicated_indent;
while (last_newline_pos < sv.size()) {
std::size_t cur_line_end_pos = sv.find('\n', last_newline_pos + 1);
if (cur_line_end_pos == str_view::npos) {
Expand All @@ -966,8 +992,20 @@ class lexical_analyzer {
}

FK_YAML_ASSERT(last_newline_pos < cur_line_content_begin_pos);
const auto cur_indent = static_cast<uint32_t>(cur_line_content_begin_pos - last_newline_pos - 1);
if (cur_indent < indent && sv[cur_line_content_begin_pos] != '\n') {
cur_indent = static_cast<uint32_t>(cur_line_content_begin_pos - last_newline_pos - 1);
if (cur_indent < content_indent && sv[cur_line_content_begin_pos] != '\n') {
if FK_YAML_UNLIKELY (cur_indent > base_indent) {
// This path assumes an input like the following:
// ```yaml
// foo: |
// text
// invalid # this line is less indented than the content indent level (2)
// # but more indented than the base indent level (0)
// ```
// In such cases, the less indented line cannot be the start of the next token.
emit_error("A content line of the block scalar is less indented.");
}

// Interpret less indented non-space characters as the start of the next token.
break;
}
Expand All @@ -982,6 +1020,8 @@ class lexical_analyzer {

token = sv.substr(0, last_newline_pos);
m_cur_itr = token.end();

return content_indent;
}

/// @brief Handle unescaped control characters.
Expand Down
Loading

0 comments on commit a3ada06

Please sign in to comment.