HTML API: Add support for missing FRAMESET and "after" insertion modes.

As part of work to add more spec support to the HTML API, this patch adds support for the FRAMESET-related insertion modes, as well as the set of missing after insertion modes. These modes run at the end of parsing a document, closing it and taking care of any lingering tags. Developed in https://github.com/wordpress/wordpress-develop/7165 Discussed in https://core.trac.wordpress.org/ticket/61576 Props dmsnell, jonsurrell. See #61576. git-svn-id: https://develop.svn.wordpress.org/trunk@58926 602fd350-edb4-49c9-b593-d223f7449a82
dmsnell · Aug 23, 2024 · 15dca4e · 15dca4e
1 parent 1139a51
commit 15dca4e
Show file tree

Hide file tree

Showing 2 changed files with 331 additions and 6 deletions.
diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php
@@ -3972,7 +3972,71 @@ private function step_in_template(): bool {
 	 * @return bool Whether an element was found.
 	 */
 	private function step_after_body(): bool {
-		$this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_AFTER_BODY . ' state.' );
+		$tag_name   = $this->get_token_name();
+		$token_type = $this->get_token_type();
+		$op_sigil   = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
+		$op         = "{$op_sigil}{$tag_name}";
+
+		switch ( $op ) {
+			/*
+			 * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
+			 * >   U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+			 *
+			 * > Process the token using the rules for the "in body" insertion mode.
+			 */
+			case '#text':
+				$text = $this->get_modifiable_text();
+				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+					return $this->step_in_body();
+				}
+				goto after_body_anything_else;
+				break;
+
+			/*
+			 * > A comment token
+			 */
+			case '#comment':
+			case '#funky-comment':
+			case '#presumptuous-tag':
+				$this->bail( 'Content outside of BODY is unsupported.' );
+				break;
+
+			/*
+			 * > A DOCTYPE token
+			 */
+			case 'html':
+				// Parse error: ignore the token.
+				return $this->step();
+
+			/*
+			 * > A start tag whose tag name is "html"
+			 */
+			case '+HTML':
+				return $this->step_in_body();
+
+			/*
+			 * > An end tag whose tag name is "html"
+			 *
+			 * > If the parser was created as part of the HTML fragment parsing algorithm,
+			 * > this is a parse error; ignore the token. (fragment case)
+			 * >
+			 * > Otherwise, switch the insertion mode to "after after body".
+			 */
+			case '-HTML':
+				if ( isset( $this->context_node ) ) {
+					return $this->step();
+				}
+
+				$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_BODY;
+				return true;
+		}
+
+		/*
+		 * > Parse error. Switch the insertion mode to "in body" and reprocess the token.
+		 */
+		after_body_anything_else:
+		$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
+		return $this->step( self::REPROCESS_CURRENT_NODE );
 	}
 
 	/**
@@ -3991,7 +4055,109 @@ private function step_after_body(): bool {
 	 * @return bool Whether an element was found.
 	 */
 	private function step_in_frameset(): bool {
-		$this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_IN_FRAMESET . ' state.' );
+		$tag_name   = $this->get_token_name();
+		$token_type = $this->get_token_type();
+		$op_sigil   = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
+		$op         = "{$op_sigil}{$tag_name}";
+
+		switch ( $op ) {
+			/*
+			 * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
+			 * >   U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+			 * >
+			 * > Insert the character.
+			 *
+			 * This algorithm effectively strips non-whitespace characters from text and inserts
+			 * them under HTML. This is not supported at this time.
+			 */
+			case '#text':
+				$text = $this->get_modifiable_text();
+				$text = $this->get_modifiable_text();
+				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+					return $this->step_in_body();
+				}
+				$this->bail( 'Non-whitespace characters cannot be handled in frameset.' );
+				break;
+
+			/*
+			 * > A comment token
+			 */
+			case '#comment':
+			case '#funky-comment':
+			case '#presumptuous-tag':
+				$this->insert_html_element( $this->state->current_token );
+				return true;
+
+			/*
+			 * > A DOCTYPE token
+			 */
+			case 'html':
+				// Parse error: ignore the token.
+				return $this->step();
+
+			/*
+			 * > A start tag whose tag name is "html"
+			 */
+			case '+HTML':
+				return $this->step_in_body();
+
+			/*
+			 * > A start tag whose tag name is "frameset"
+			 */
+			case '+FRAMESET':
+				$this->insert_html_element( $this->state->current_token );
+				return true;
+
+			/*
+			 * > An end tag whose tag name is "frameset"
+			 */
+			case '-FRAMESET':
+				/*
+				 * > If the current node is the root html element, then this is a parse error;
+				 * > ignore the token. (fragment case)
+				 */
+				if ( $this->state->stack_of_open_elements->current_node_is( 'HTML' ) ) {
+					return $this->step();
+				}
+
+				/*
+				 * > Otherwise, pop the current node from the stack of open elements.
+				 */
+				$this->state->stack_of_open_elements->pop();
+
+				/*
+				 * > If the parser was not created as part of the HTML fragment parsing algorithm
+				 * > (fragment case), and the current node is no longer a frameset element, then
+				 * > switch the insertion mode to "after frameset".
+				 */
+				if ( ! isset( $this->context_node ) && ! $this->state->stack_of_open_elements->current_node_is( 'FRAMESET' ) ) {
+					$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_FRAMESET;
+				}
+
+				return true;
+
+			/*
+			 * > A start tag whose tag name is "frame"
+			 *
+			 * > Insert an HTML element for the token. Immediately pop the
+			 * > current node off the stack of open elements.
+			 * >
+			 * > Acknowledge the token's self-closing flag, if it is set.
+			 */
+			case '+FRAME':
+				$this->insert_html_element( $this->state->current_token );
+				$this->state->stack_of_open_elements->pop();
+				return true;
+
+			/*
+			 * > A start tag whose tag name is "noframes"
+			 */
+			case '+NOFRAMES':
+				return $this->step_in_head();
+		}
+
+		// Parse error: ignore the token.
+		return $this->step();
 	}
 
 	/**
@@ -4010,7 +4176,67 @@ private function step_in_frameset(): bool {
 	 * @return bool Whether an element was found.
 	 */
 	private function step_after_frameset(): bool {
-		$this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_AFTER_FRAMESET . ' state.' );
+		$tag_name   = $this->get_token_name();
+		$token_type = $this->get_token_type();
+		$op_sigil   = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
+		$op         = "{$op_sigil}{$tag_name}";
+
+		switch ( $op ) {
+			/*
+			 * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
+			 * >   U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+			 * >
+			 * > Insert the character.
+			 *
+			 * This algorithm effectively strips non-whitespace characters from text and inserts
+			 * them under HTML. This is not supported at this time.
+			 */
+			case '#text':
+				$text = $this->get_modifiable_text();
+				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+					return $this->step_in_body();
+				}
+				$this->bail( 'Non-whitespace characters cannot be handled in after frameset' );
+				break;
+
+			/*
+			 * > A comment token
+			 */
+			case '#comment':
+			case '#funky-comment':
+			case '#presumptuous-tag':
+				$this->insert_html_element( $this->state->current_token );
+				return true;
+
+			/*
+			 * > A DOCTYPE token
+			 */
+			case 'html':
+				// Parse error: ignore the token.
+				return $this->step();
+
+			/*
+			 * > A start tag whose tag name is "html"
+			 */
+			case '+HTML':
+				return $this->step_in_body();
+
+			/*
+			 * > An end tag whose tag name is "html"
+			 */
+			case '-HTML':
+				$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_FRAMESET;
+				return true;
+
+			/*
+			 * > A start tag whose tag name is "noframes"
+			 */
+			case '+NOFRAMES':
+				return $this->step_in_head();
+		}
+
+		// Parse error: ignore the token.
+		return $this->step();
 	}
 
 	/**
@@ -4029,7 +4255,52 @@ private function step_after_frameset(): bool {
 	 * @return bool Whether an element was found.
 	 */
 	private function step_after_after_body(): bool {
-		$this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_BODY . ' state.' );
+		$tag_name   = $this->get_token_name();
+		$token_type = $this->get_token_type();
+		$op_sigil   = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
+		$op         = "{$op_sigil}{$tag_name}";
+
+		switch ( $op ) {
+			/*
+			 * > A comment token
+			 */
+			case '#comment':
+			case '#funky-comment':
+			case '#presumptuous-tag':
+				$this->bail( 'Content outside of HTML is unsupported.' );
+				break;
+
+			/*
+			 * > A DOCTYPE token
+			 * > A start tag whose tag name is "html"
+			 *
+			 * > Process the token using the rules for the "in body" insertion mode.
+			 */
+			case 'html':
+			case '+HTML':
+				return $this->step_in_body();
+
+			/*
+			 * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
+			 * >   U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+			 * >
+			 * > Process the token using the rules for the "in body" insertion mode.
+			 */
+			case '#text':
+				$text = $this->get_modifiable_text();
+				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+					return $this->step_in_body();
+				}
+				goto after_after_body_anything_else;
+				break;
+		}
+
+		/*
+		 * > Parse error. Switch the insertion mode to "in body" and reprocess the token.
+		 */
+		after_after_body_anything_else:
+		$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
+		return $this->step( self::REPROCESS_CURRENT_NODE );
 	}
 
 	/**
@@ -4048,7 +4319,57 @@ private function step_after_after_body(): bool {
 	 * @return bool Whether an element was found.
 	 */
 	private function step_after_after_frameset(): bool {
-		$this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_FRAMESET . ' state.' );
+		$tag_name   = $this->get_token_name();
+		$token_type = $this->get_token_type();
+		$op_sigil   = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
+		$op         = "{$op_sigil}{$tag_name}";
+
+		switch ( $op ) {
+			/*
+			 * > A comment token
+			 */
+			case '#comment':
+			case '#funky-comment':
+			case '#presumptuous-tag':
+				$this->bail( 'Content outside of HTML is unsupported.' );
+				break;
+
+			/*
+			 * > A DOCTYPE token
+			 * > A start tag whose tag name is "html"
+			 *
+			 * > Process the token using the rules for the "in body" insertion mode.
+			 */
+			case 'html':
+			case '+HTML':
+				return $this->step_in_body();
+
+			/*
+			 * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
+			 * >   U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+			 * >
+			 * > Process the token using the rules for the "in body" insertion mode.
+			 *
+			 * This algorithm effectively strips non-whitespace characters from text and inserts
+			 * them under HTML. This is not supported at this time.
+			 */
+			case '#text':
+				$text = $this->get_modifiable_text();
+				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+					return $this->step_in_body();
+				}
+				$this->bail( 'Non-whitespace characters cannot be handled in after after frameset.' );
+				break;
+
+			/*
+			 * > A start tag whose tag name is "noframes"
+			 */
+			case '+NOFRAMES':
+				return $this->step_in_head();
+		}
+
+		// Parse error: ignore the token.
+		return $this->step();
 	}
 
 	/**
@@ -4115,7 +4436,8 @@ private function step_in_foreign_content(): bool {
 			 */
 			case '#cdata-section':
 			case '#comment':
-			case '#funky_comment':
+			case '#funky-comment':
+			case '#presumptuous-tag':
 				$this->insert_foreign_element( $this->state->current_token, false );
 				return true;
 

diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php
@@ -33,10 +33,13 @@ class Tests_HtmlApi_Html5lib extends WP_UnitTestCase {
 		'tests1/line0692'        => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly',
 		'tests14/line0022'       => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
 		'tests14/line0055'       => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
+		'tests19/line0488'       => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
+		'tests19/line0500'       => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
 		'tests19/line0965'       => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly.',
 		'tests19/line1079'       => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
 		'tests2/line0207'        => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
 		'tests2/line0686'        => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
+		'tests2/line0697'        => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
 		'tests2/line0709'        => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
 		'tests5/line0013'        => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly.',
 		'tests5/line0077'        => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly.',