HTML API: Ensure that NULL and whitespace-only CDATA sections don't forbid FRAMESET.

When CDATA sections (which can only occur inside SVG and MathML content) consist only of NULL bytes or whitespace characters, they should not clear the "frameset ok" flag. Previously they always cleared this flag; this patch updates the logic to detect these sequences properly. Developed in #7230 Discussed in https://core.trac.wordpress.org/ticket/61576 Follow-up to [58867]. Props dmsnell, jonsurrell. See #61576. git-svn-id: https://develop.svn.wordpress.org/trunk@58977 602fd350-edb4-49c9-b593-d223f7449a82
WordPress · Sep 3, 2024 · 79c1047 · 79c1047
1 parent 37d896a
commit 79c1047
Show file tree

Hide file tree

Showing 2 changed files with 62 additions and 63 deletions.
diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php
@@ -843,10 +843,7 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ): bool {
 
 		if ( self::PROCESS_NEXT_NODE === $node_to_process ) {
 			parent::next_token();
-			if (
-				WP_HTML_Tag_Processor::STATE_TEXT_NODE === $this->parser_state ||
-				WP_HTML_Tag_Processor::STATE_CDATA_NODE === $this->parser_state
-			) {
+			if ( WP_HTML_Tag_Processor::STATE_TEXT_NODE === $this->parser_state ) {
 				parent::subdivide_text_appropriately();
 			}
 		}
@@ -4375,7 +4372,6 @@ private function step_in_foreign_content(): bool {
 		}
 
 		switch ( $op ) {
-			case '#cdata-section':
 			case '#text':
 				/*
 				 * > A character token that is U+0000 NULL
@@ -4395,6 +4391,24 @@ private function step_in_foreign_content(): bool {
 				$this->insert_foreign_element( $this->state->current_token, false );
 				return true;
 
+			/*
+			 * CDATA sections are alternate wrappers for text content and therefore
+			 * ought to follow the same rules as text nodes.
+			 */
+			case '#cdata-section':
+				/*
+				 * NULL bytes and whitespace do not change the frameset-ok flag.
+				 */
+				$current_token        = $this->bookmarks[ $this->state->current_token->bookmark_name ];
+				$cdata_content_start  = $current_token->start + 9;
+				$cdata_content_length = $current_token->length - 12;
+				if ( strspn( $this->html, "\0 \t\n\f\r", $cdata_content_start, $cdata_content_length ) !== $cdata_content_length ) {
+					$this->state->frameset_ok = false;
+				}
+
+				$this->insert_foreign_element( $this->state->current_token, false );
+				return true;
+
 			/*
 			 * > A comment token
 			 */

diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php
@@ -3337,8 +3337,8 @@ public function get_comment_type(): ?string {
 	}
 
 	/**
-	 * Subdivides a matched text node or CDATA text node, splitting NULL byte sequences
-	 * and decoded whitespace as distinct prefixes.
+	 * Subdivides a matched text node, splitting NULL byte sequences and decoded whitespace as
+	 * distinct prefixes.
 	 *
 	 * Note that once anything that's neither a NULL byte nor decoded whitespace is
 	 * encountered, then the remainder of the text node is left intact as generic text.
@@ -3368,70 +3368,55 @@ public function get_comment_type(): ?string {
 	 * @return bool Whether the text node was subdivided.
 	 */
 	public function subdivide_text_appropriately(): bool {
+		if ( self::STATE_TEXT_NODE !== $this->parser_state ) {
+			return false;
+		}
+
 		$this->text_node_classification = self::TEXT_IS_GENERIC;
 
-		if ( self::STATE_TEXT_NODE === $this->parser_state ) {
-			/*
-			 * NULL bytes are treated categorically different than numeric character
-			 * references whose number is zero. `&#x00;` is not the same as `"\x00"`.
-			 */
-			$leading_nulls = strspn( $this->html, "\x00", $this->text_starts_at, $this->text_length );
-			if ( $leading_nulls > 0 ) {
-				$this->token_length             = $leading_nulls;
-				$this->text_length              = $leading_nulls;
-				$this->bytes_already_parsed     = $this->token_starts_at + $leading_nulls;
-				$this->text_node_classification = self::TEXT_IS_NULL_SEQUENCE;
-				return true;
-			}
+		/*
+		 * NULL bytes are treated categorically different than numeric character
+		 * references whose number is zero. `&#x00;` is not the same as `"\x00"`.
+		 */
+		$leading_nulls = strspn( $this->html, "\x00", $this->text_starts_at, $this->text_length );
+		if ( $leading_nulls > 0 ) {
+			$this->token_length             = $leading_nulls;
+			$this->text_length              = $leading_nulls;
+			$this->bytes_already_parsed     = $this->token_starts_at + $leading_nulls;
+			$this->text_node_classification = self::TEXT_IS_NULL_SEQUENCE;
+			return true;
+		}
 
-			/*
-			 * Start a decoding loop to determine the point at which the
-			 * text subdivides. This entails raw whitespace bytes and any
-			 * character reference that decodes to the same.
-			 */
-			$at  = $this->text_starts_at;
-			$end = $this->text_starts_at + $this->text_length;
-			while ( $at < $end ) {
-				$skipped = strspn( $this->html, " \t\f\r\n", $at, $end - $at );
-				$at     += $skipped;
-
-				if ( $at < $end && '&' === $this->html[ $at ] ) {
-					$matched_byte_length = null;
-					$replacement         = WP_HTML_Decoder::read_character_reference( 'data', $this->html, $at, $matched_byte_length );
-					if ( isset( $replacement ) && 1 === strspn( $replacement, " \t\f\r\n" ) ) {
-						$at += $matched_byte_length;
-						continue;
-					}
+		/*
+		 * Start a decoding loop to determine the point at which the
+		 * text subdivides. This entails raw whitespace bytes and any
+		 * character reference that decodes to the same.
+		 */
+		$at  = $this->text_starts_at;
+		$end = $this->text_starts_at + $this->text_length;
+		while ( $at < $end ) {
+			$skipped = strspn( $this->html, " \t\f\r\n", $at, $end - $at );
+			$at     += $skipped;
+
+			if ( $at < $end && '&' === $this->html[ $at ] ) {
+				$matched_byte_length = null;
+				$replacement         = WP_HTML_Decoder::read_character_reference( 'data', $this->html, $at, $matched_byte_length );
+				if ( isset( $replacement ) && 1 === strspn( $replacement, " \t\f\r\n" ) ) {
+					$at += $matched_byte_length;
+					continue;
 				}
-
-				break;
-			}
-
-			if ( $at > $this->text_starts_at ) {
-				$new_length                     = $at - $this->text_starts_at;
-				$this->text_length              = $new_length;
-				$this->token_length             = $new_length;
-				$this->bytes_already_parsed     = $at;
-				$this->text_node_classification = self::TEXT_IS_WHITESPACE;
-				return true;
 			}
 
-			return false;
+			break;
 		}
 
-		// Unlike text nodes, there are no character references within CDATA sections.
-		if ( self::STATE_CDATA_NODE === $this->parser_state ) {
-			$leading_nulls = strspn( $this->html, "\x00", $this->text_starts_at, $this->text_length );
-			if ( $leading_nulls === $this->text_length ) {
-				$this->text_node_classification = self::TEXT_IS_NULL_SEQUENCE;
-				return true;
-			}
-
-			$leading_ws = strspn( $this->html, " \t\f\r\n", $this->text_starts_at, $this->text_length );
-			if ( $leading_ws === $this->text_length ) {
-				$this->text_node_classification = self::TEXT_IS_WHITESPACE;
-				return true;
-			}
+		if ( $at > $this->text_starts_at ) {
+			$new_length                     = $at - $this->text_starts_at;
+			$this->text_length              = $new_length;
+			$this->token_length             = $new_length;
+			$this->bytes_already_parsed     = $at;
+			$this->text_node_classification = self::TEXT_IS_WHITESPACE;
+			return true;
 		}
 
 		return false;