Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HTML API: Fix CDATA section null and whitespace handling #7230

24 changes: 19 additions & 5 deletions src/wp-includes/html-api/class-wp-html-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -843,10 +843,7 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ): bool {

if ( self::PROCESS_NEXT_NODE === $node_to_process ) {
parent::next_token();
if (
WP_HTML_Tag_Processor::STATE_TEXT_NODE === $this->parser_state ||
WP_HTML_Tag_Processor::STATE_CDATA_NODE === $this->parser_state
) {
if ( WP_HTML_Tag_Processor::STATE_TEXT_NODE === $this->parser_state ) {
parent::subdivide_text_appropriately();
}
}
Expand Down Expand Up @@ -4375,7 +4372,6 @@ private function step_in_foreign_content(): bool {
}

switch ( $op ) {
case '#cdata-section':
case '#text':
/*
* > A character token that is U+0000 NULL
Expand All @@ -4395,6 +4391,24 @@ private function step_in_foreign_content(): bool {
$this->insert_foreign_element( $this->state->current_token, false );
return true;

/*
* CDATA sections are alternate wrappers for text content and therefore
* ought to follow the same rules as text nodes.
*/
case '#cdata-section':
/*
* NULL bytes and whitespace do not change the frameset-ok flag.
*/
$current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];
sirreal marked this conversation as resolved.
Show resolved Hide resolved
$cdata_content_start = $current_token->start + 9;
$cdata_content_length = $current_token->length - 12;
if ( strspn( $this->html, "\0 \t\n\f\r", $cdata_content_start, $cdata_content_length ) !== $cdata_content_length ) {
$this->state->frameset_ok = false;
}

$this->insert_foreign_element( $this->state->current_token, false );
return true;

/*
* > A comment token
*/
Expand Down
101 changes: 43 additions & 58 deletions src/wp-includes/html-api/class-wp-html-tag-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -3337,8 +3337,8 @@ public function get_comment_type(): ?string {
}

/**
* Subdivides a matched text node or CDATA text node, splitting NULL byte sequences
* and decoded whitespace as distinct prefixes.
* Subdivides a matched text node, splitting NULL byte sequences and decoded whitespace as
* distinct prefixes.
*
* Note that once anything that's neither a NULL byte nor decoded whitespace is
* encountered, then the remainder of the text node is left intact as generic text.
Expand Down Expand Up @@ -3368,70 +3368,55 @@ public function get_comment_type(): ?string {
* @return bool Whether the text node was subdivided.
*/
public function subdivide_text_appropriately(): bool {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When viewing with whitespace changes, the diff here is confusing. I've adjusted this method so that it only operates on text nodes.

if ( self::STATE_TEXT_NODE !== $this->parser_state ) {
return false;
}

$this->text_node_classification = self::TEXT_IS_GENERIC;

if ( self::STATE_TEXT_NODE === $this->parser_state ) {
/*
* NULL bytes are treated categorically different than numeric character
* references whose number is zero. `&#x00;` is not the same as `"\x00"`.
*/
$leading_nulls = strspn( $this->html, "\x00", $this->text_starts_at, $this->text_length );
if ( $leading_nulls > 0 ) {
$this->token_length = $leading_nulls;
$this->text_length = $leading_nulls;
$this->bytes_already_parsed = $this->token_starts_at + $leading_nulls;
$this->text_node_classification = self::TEXT_IS_NULL_SEQUENCE;
return true;
}
/*
* NULL bytes are treated categorically different than numeric character
* references whose number is zero. `&#x00;` is not the same as `"\x00"`.
*/
$leading_nulls = strspn( $this->html, "\x00", $this->text_starts_at, $this->text_length );
if ( $leading_nulls > 0 ) {
$this->token_length = $leading_nulls;
$this->text_length = $leading_nulls;
$this->bytes_already_parsed = $this->token_starts_at + $leading_nulls;
$this->text_node_classification = self::TEXT_IS_NULL_SEQUENCE;
return true;
}

/*
* Start a decoding loop to determine the point at which the
* text subdivides. This entails raw whitespace bytes and any
* character reference that decodes to the same.
*/
$at = $this->text_starts_at;
$end = $this->text_starts_at + $this->text_length;
while ( $at < $end ) {
$skipped = strspn( $this->html, " \t\f\r\n", $at, $end - $at );
$at += $skipped;

if ( $at < $end && '&' === $this->html[ $at ] ) {
$matched_byte_length = null;
$replacement = WP_HTML_Decoder::read_character_reference( 'data', $this->html, $at, $matched_byte_length );
if ( isset( $replacement ) && 1 === strspn( $replacement, " \t\f\r\n" ) ) {
$at += $matched_byte_length;
continue;
}
/*
* Start a decoding loop to determine the point at which the
* text subdivides. This entails raw whitespace bytes and any
* character reference that decodes to the same.
*/
$at = $this->text_starts_at;
$end = $this->text_starts_at + $this->text_length;
while ( $at < $end ) {
$skipped = strspn( $this->html, " \t\f\r\n", $at, $end - $at );
$at += $skipped;

if ( $at < $end && '&' === $this->html[ $at ] ) {
$matched_byte_length = null;
$replacement = WP_HTML_Decoder::read_character_reference( 'data', $this->html, $at, $matched_byte_length );
if ( isset( $replacement ) && 1 === strspn( $replacement, " \t\f\r\n" ) ) {
$at += $matched_byte_length;
continue;
}

break;
}

if ( $at > $this->text_starts_at ) {
$new_length = $at - $this->text_starts_at;
$this->text_length = $new_length;
$this->token_length = $new_length;
$this->bytes_already_parsed = $at;
$this->text_node_classification = self::TEXT_IS_WHITESPACE;
return true;
}

return false;
break;
}

// Unlike text nodes, there are no character references within CDATA sections.
if ( self::STATE_CDATA_NODE === $this->parser_state ) {
$leading_nulls = strspn( $this->html, "\x00", $this->text_starts_at, $this->text_length );
if ( $leading_nulls === $this->text_length ) {
$this->text_node_classification = self::TEXT_IS_NULL_SEQUENCE;
return true;
}

$leading_ws = strspn( $this->html, " \t\f\r\n", $this->text_starts_at, $this->text_length );
if ( $leading_ws === $this->text_length ) {
$this->text_node_classification = self::TEXT_IS_WHITESPACE;
return true;
}
if ( $at > $this->text_starts_at ) {
$new_length = $at - $this->text_starts_at;
$this->text_length = $new_length;
$this->token_length = $new_length;
$this->bytes_already_parsed = $at;
$this->text_node_classification = self::TEXT_IS_WHITESPACE;
return true;
}

return false;
Expand Down
Loading