HTML API: Allow subdividing text nodes by meaningful prefixes.

HTML parsing rules at times differentiate character tokens that are all null bytes, all whitespace, or other content. This patch introduces a new function which may be used to classify text node sub-regions and lead to more efficient application of these parsing rules. Further, when classified in this way, application code may skip some rules and decoding entirely, improving performance. For example, this can be used to ease the implementation of skipping inter-element whitespace, which is usually not rendered. Developed in WordPress#7236 Discussed in https://core.trac.wordpress.org/ticket/61974 Props dmsnell, jonsurrell. Fixes #61974. git-svn-id: https://develop.svn.wordpress.org/trunk@58970 602fd350-edb4-49c9-b593-d223f7449a82
sirreal · Sep 2, 2024 · 95eb879 · 95eb879
1 parent b37cbf9
commit 95eb879
Show file tree

Hide file tree

Showing 3 changed files with 182 additions and 79 deletions.
diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php
@@ -843,6 +843,12 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ): bool {
 
 		if ( self::PROCESS_NEXT_NODE === $node_to_process ) {
 			parent::next_token();
+			if (
+				WP_HTML_Tag_Processor::STATE_TEXT_NODE === $this->parser_state ||
+				WP_HTML_Tag_Processor::STATE_CDATA_NODE === $this->parser_state
+			) {
+				parent::subdivide_text_appropriately();
+			}
 		}
 
 		// Finish stepping when there are no more tokens in the document.
@@ -1056,8 +1062,7 @@ private function step_initial(): bool {
 			 * Parse error: ignore the token.
 			 */
 			case '#text':
-				$text = $this->get_modifiable_text();
-				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 					return $this->step();
 				}
 				goto initial_anything_else;
@@ -1145,8 +1150,7 @@ private function step_before_html(): bool {
 			 * Parse error: ignore the token.
 			 */
 			case '#text':
-				$text = $this->get_modifiable_text();
-				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 					return $this->step();
 				}
 				goto before_html_anything_else;
@@ -1227,8 +1231,7 @@ private function step_before_head(): bool {
 			 * Parse error: ignore the token.
 			 */
 			case '#text':
-				$text = $this->get_modifiable_text();
-				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 					return $this->step();
 				}
 				goto before_head_anything_else;
@@ -1323,16 +1326,7 @@ private function step_in_head(): bool {
 				 * > U+000A LINE FEED (LF), U+000C FORM FEED (FF),
 				 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
 				 */
-				$text = $this->get_modifiable_text();
-				if ( '' === $text ) {
-					/*
-					 * If the text is empty after processing HTML entities and stripping
-					 * U+0000 NULL bytes then ignore the token.
-					 */
-					return $this->step();
-				}
-
-				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 					// Insert the character.
 					$this->insert_html_element( $this->state->current_token );
 					return true;
@@ -1552,8 +1546,7 @@ private function step_in_head_noscript(): bool {
 			 * Parse error: ignore the token.
 			 */
 			case '#text':
-				$text = $this->get_modifiable_text();
-				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 					return $this->step_in_head();
 				}
 
@@ -1654,8 +1647,7 @@ private function step_after_head(): bool {
 			 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
 			 */
 			case '#text':
-				$text = $this->get_modifiable_text();
-				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 					// Insert the character.
 					$this->insert_html_element( $this->state->current_token );
 					return true;
@@ -1793,8 +1785,6 @@ private function step_in_body(): bool {
 
 		switch ( $op ) {
 			case '#text':
-				$current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];
-
 				/*
 				 * > A character token that is U+0000 NULL
 				 *
@@ -1804,11 +1794,7 @@ private function step_in_body(): bool {
 				 * here, but if there are any other characters in the stream
 				 * the active formats should be reconstructed.
 				 */
-				if (
-					1 <= $current_token->length &&
-					"\x00" === $this->html[ $current_token->start ] &&
-					strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length
-				) {
+				if ( parent::TEXT_IS_NULL_SEQUENCE === $this->text_node_classification ) {
 					// Parse error: ignore the token.
 					return $this->step();
 				}
@@ -1820,8 +1806,7 @@ private function step_in_body(): bool {
 				 * It is probably inter-element whitespace, but it may also
 				 * contain character references which decode only to whitespace.
 				 */
-				$text = $this->get_modifiable_text();
-				if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_GENERIC === $this->text_node_classification ) {
 					$this->state->frameset_ok = false;
 				}
 
@@ -2829,12 +2814,11 @@ private function step_in_table(): bool {
 						'TR' === $current_node_name
 					)
 				) {
-					$text = $this->get_modifiable_text();
 					/*
 					 * If the text is empty after processing HTML entities and stripping
 					 * U+0000 NULL bytes then ignore the token.
 					 */
-					if ( '' === $text ) {
+					if ( parent::TEXT_IS_NULL_SEQUENCE === $this->text_node_classification ) {
 						return $this->step();
 					}
 
@@ -2857,7 +2841,7 @@ private function step_in_table(): bool {
 					 *
 					 * @see https://html.spec.whatwg.org/#parsing-main-intabletext
 					 */
-					if ( strlen( $text ) === strspn( $text, " \t\f\r\n" ) ) {
+					if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 						$this->insert_html_element( $this->state->current_token );
 						return true;
 					}
@@ -3177,16 +3161,7 @@ private function step_in_column_group(): bool {
 			 * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
 			 */
 			case '#text':
-				$text = $this->get_modifiable_text();
-				if ( '' === $text ) {
-					/*
-					 * If the text is empty after processing HTML entities and stripping
-					 * U+0000 NULL bytes then ignore the token.
-					 */
-					return $this->step();
-				}
-
-				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 					// Insert the character.
 					$this->insert_html_element( $this->state->current_token );
 					return true;
@@ -3609,19 +3584,13 @@ private function step_in_select(): bool {
 			 * > Any other character token
 			 */
 			case '#text':
-				$current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];
-
 				/*
 				 * > A character token that is U+0000 NULL
 				 *
 				 * If a text node only comprises null bytes then it should be
 				 * entirely ignored and should not return to calling code.
 				 */
-				if (
-					1 <= $current_token->length &&
-					"\x00" === $this->html[ $current_token->start ] &&
-					strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length
-				) {
+				if ( parent::TEXT_IS_NULL_SEQUENCE === $this->text_node_classification ) {
 					// Parse error: ignore the token.
 					return $this->step();
 				}
@@ -3986,8 +3955,7 @@ private function step_after_body(): bool {
 			 * > Process the token using the rules for the "in body" insertion mode.
 			 */
 			case '#text':
-				$text = $this->get_modifiable_text();
-				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 					return $this->step_in_body();
 				}
 				goto after_body_anything_else;
@@ -4072,9 +4040,7 @@ private function step_in_frameset(): bool {
 			 * them under HTML. This is not supported at this time.
 			 */
 			case '#text':
-				$text = $this->get_modifiable_text();
-				$text = $this->get_modifiable_text();
-				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 					return $this->step_in_body();
 				}
 				$this->bail( 'Non-whitespace characters cannot be handled in frameset.' );
@@ -4193,8 +4159,7 @@ private function step_after_frameset(): bool {
 			 * them under HTML. This is not supported at this time.
 			 */
 			case '#text':
-				$text = $this->get_modifiable_text();
-				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 					return $this->step_in_body();
 				}
 				$this->bail( 'Non-whitespace characters cannot be handled in after frameset' );
@@ -4288,8 +4253,7 @@ private function step_after_after_body(): bool {
 			 * > Process the token using the rules for the "in body" insertion mode.
 			 */
 			case '#text':
-				$text = $this->get_modifiable_text();
-				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 					return $this->step_in_body();
 				}
 				goto after_after_body_anything_else;
@@ -4355,8 +4319,7 @@ private function step_after_after_frameset(): bool {
 			 * them under HTML. This is not supported at this time.
 			 */
 			case '#text':
-				$text = $this->get_modifiable_text();
-				if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
 					return $this->step_in_body();
 				}
 				$this->bail( 'Non-whitespace characters cannot be handled in after after frameset.' );
@@ -4412,6 +4375,7 @@ private function step_in_foreign_content(): bool {
 		}
 
 		switch ( $op ) {
+			case '#cdata-section':
 			case '#text':
 				/*
 				 * > A character token that is U+0000 NULL
@@ -4424,8 +4388,7 @@ private function step_in_foreign_content(): bool {
 				 * It is probably inter-element whitespace, but it may also
 				 * contain character references which decode only to whitespace.
 				 */
-				$text = $this->get_modifiable_text();
-				if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) {
+				if ( parent::TEXT_IS_GENERIC === $this->text_node_classification ) {
 					$this->state->frameset_ok = false;
 				}
 
@@ -4435,7 +4398,6 @@ private function step_in_foreign_content(): bool {
 			/*
 			 * > A comment token
 			 */
-			case '#cdata-section':
 			case '#comment':
 			case '#funky-comment':
 			case '#presumptuous-tag':