Skip to content

Commit

Permalink
HTML API: Allow subdividing text nodes by meaningful prefixes.
Browse files Browse the repository at this point in the history
HTML parsing rules at times differentiate between character tokens that are all null bytes, all whitespace, or other content. This patch introduces a new function that classifies text node sub-regions, allowing these parsing rules to be applied more efficiently.

Further, when text nodes are classified in this way, application code may skip some rules and decoding entirely, improving performance. For example, this makes it easier to skip inter-element whitespace, which is usually not rendered.

Developed in WordPress#7236
Discussed in https://core.trac.wordpress.org/ticket/61974

Props dmsnell, jonsurrell.
Fixes #61974.


git-svn-id: https://develop.svn.wordpress.org/trunk@58970 602fd350-edb4-49c9-b593-d223f7449a82
  • Loading branch information
dmsnell committed Sep 2, 2024
1 parent b37cbf9 commit 95eb879
Show file tree
Hide file tree
Showing 3 changed files with 182 additions and 79 deletions.
88 changes: 25 additions & 63 deletions src/wp-includes/html-api/class-wp-html-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -843,6 +843,12 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ): bool {

if ( self::PROCESS_NEXT_NODE === $node_to_process ) {
parent::next_token();
if (
WP_HTML_Tag_Processor::STATE_TEXT_NODE === $this->parser_state ||
WP_HTML_Tag_Processor::STATE_CDATA_NODE === $this->parser_state
) {
parent::subdivide_text_appropriately();
}
}

// Finish stepping when there are no more tokens in the document.
Expand Down Expand Up @@ -1056,8 +1062,7 @@ private function step_initial(): bool {
* Parse error: ignore the token.
*/
case '#text':
$text = $this->get_modifiable_text();
if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
return $this->step();
}
goto initial_anything_else;
Expand Down Expand Up @@ -1145,8 +1150,7 @@ private function step_before_html(): bool {
* Parse error: ignore the token.
*/
case '#text':
$text = $this->get_modifiable_text();
if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
return $this->step();
}
goto before_html_anything_else;
Expand Down Expand Up @@ -1227,8 +1231,7 @@ private function step_before_head(): bool {
* Parse error: ignore the token.
*/
case '#text':
$text = $this->get_modifiable_text();
if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
return $this->step();
}
goto before_head_anything_else;
Expand Down Expand Up @@ -1323,16 +1326,7 @@ private function step_in_head(): bool {
* > U+000A LINE FEED (LF), U+000C FORM FEED (FF),
* > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
*/
$text = $this->get_modifiable_text();
if ( '' === $text ) {
/*
* If the text is empty after processing HTML entities and stripping
* U+0000 NULL bytes then ignore the token.
*/
return $this->step();
}

if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
// Insert the character.
$this->insert_html_element( $this->state->current_token );
return true;
Expand Down Expand Up @@ -1552,8 +1546,7 @@ private function step_in_head_noscript(): bool {
* Parse error: ignore the token.
*/
case '#text':
$text = $this->get_modifiable_text();
if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
return $this->step_in_head();
}

Expand Down Expand Up @@ -1654,8 +1647,7 @@ private function step_after_head(): bool {
* > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
*/
case '#text':
$text = $this->get_modifiable_text();
if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
// Insert the character.
$this->insert_html_element( $this->state->current_token );
return true;
Expand Down Expand Up @@ -1793,8 +1785,6 @@ private function step_in_body(): bool {

switch ( $op ) {
case '#text':
$current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];

/*
* > A character token that is U+0000 NULL
*
Expand All @@ -1804,11 +1794,7 @@ private function step_in_body(): bool {
* here, but if there are any other characters in the stream
* the active formats should be reconstructed.
*/
if (
1 <= $current_token->length &&
"\x00" === $this->html[ $current_token->start ] &&
strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length
) {
if ( parent::TEXT_IS_NULL_SEQUENCE === $this->text_node_classification ) {
// Parse error: ignore the token.
return $this->step();
}
Expand All @@ -1820,8 +1806,7 @@ private function step_in_body(): bool {
* It is probably inter-element whitespace, but it may also
* contain character references which decode only to whitespace.
*/
$text = $this->get_modifiable_text();
if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) {
if ( parent::TEXT_IS_GENERIC === $this->text_node_classification ) {
$this->state->frameset_ok = false;
}

Expand Down Expand Up @@ -2829,12 +2814,11 @@ private function step_in_table(): bool {
'TR' === $current_node_name
)
) {
$text = $this->get_modifiable_text();
/*
* If the text is empty after processing HTML entities and stripping
* U+0000 NULL bytes then ignore the token.
*/
if ( '' === $text ) {
if ( parent::TEXT_IS_NULL_SEQUENCE === $this->text_node_classification ) {
return $this->step();
}

Expand All @@ -2857,7 +2841,7 @@ private function step_in_table(): bool {
*
* @see https://html.spec.whatwg.org/#parsing-main-intabletext
*/
if ( strlen( $text ) === strspn( $text, " \t\f\r\n" ) ) {
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
$this->insert_html_element( $this->state->current_token );
return true;
}
Expand Down Expand Up @@ -3177,16 +3161,7 @@ private function step_in_column_group(): bool {
* > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
*/
case '#text':
$text = $this->get_modifiable_text();
if ( '' === $text ) {
/*
* If the text is empty after processing HTML entities and stripping
* U+0000 NULL bytes then ignore the token.
*/
return $this->step();
}

if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
// Insert the character.
$this->insert_html_element( $this->state->current_token );
return true;
Expand Down Expand Up @@ -3609,19 +3584,13 @@ private function step_in_select(): bool {
* > Any other character token
*/
case '#text':
$current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];

/*
* > A character token that is U+0000 NULL
*
* If a text node only comprises null bytes then it should be
* entirely ignored and should not return to calling code.
*/
if (
1 <= $current_token->length &&
"\x00" === $this->html[ $current_token->start ] &&
strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length
) {
if ( parent::TEXT_IS_NULL_SEQUENCE === $this->text_node_classification ) {
// Parse error: ignore the token.
return $this->step();
}
Expand Down Expand Up @@ -3986,8 +3955,7 @@ private function step_after_body(): bool {
* > Process the token using the rules for the "in body" insertion mode.
*/
case '#text':
$text = $this->get_modifiable_text();
if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
return $this->step_in_body();
}
goto after_body_anything_else;
Expand Down Expand Up @@ -4072,9 +4040,7 @@ private function step_in_frameset(): bool {
* them under HTML. This is not supported at this time.
*/
case '#text':
$text = $this->get_modifiable_text();
$text = $this->get_modifiable_text();
if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
return $this->step_in_body();
}
$this->bail( 'Non-whitespace characters cannot be handled in frameset.' );
Expand Down Expand Up @@ -4193,8 +4159,7 @@ private function step_after_frameset(): bool {
* them under HTML. This is not supported at this time.
*/
case '#text':
$text = $this->get_modifiable_text();
if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
return $this->step_in_body();
}
$this->bail( 'Non-whitespace characters cannot be handled in after frameset' );
Expand Down Expand Up @@ -4288,8 +4253,7 @@ private function step_after_after_body(): bool {
* > Process the token using the rules for the "in body" insertion mode.
*/
case '#text':
$text = $this->get_modifiable_text();
if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
return $this->step_in_body();
}
goto after_after_body_anything_else;
Expand Down Expand Up @@ -4355,8 +4319,7 @@ private function step_after_after_frameset(): bool {
* them under HTML. This is not supported at this time.
*/
case '#text':
$text = $this->get_modifiable_text();
if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
return $this->step_in_body();
}
$this->bail( 'Non-whitespace characters cannot be handled in after after frameset.' );
Expand Down Expand Up @@ -4412,6 +4375,7 @@ private function step_in_foreign_content(): bool {
}

switch ( $op ) {
case '#cdata-section':
case '#text':
/*
* > A character token that is U+0000 NULL
Expand All @@ -4424,8 +4388,7 @@ private function step_in_foreign_content(): bool {
* It is probably inter-element whitespace, but it may also
* contain character references which decode only to whitespace.
*/
$text = $this->get_modifiable_text();
if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) {
if ( parent::TEXT_IS_GENERIC === $this->text_node_classification ) {
$this->state->frameset_ok = false;
}

Expand All @@ -4435,7 +4398,6 @@ private function step_in_foreign_content(): bool {
/*
* > A comment token
*/
case '#cdata-section':
case '#comment':
case '#funky-comment':
case '#presumptuous-tag':
Expand Down
Loading

0 comments on commit 95eb879

Please sign in to comment.