From 7e76592a8fec0ee9e6a8dcb3cfda558fbc07ecec Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Tue, 15 Nov 2022 16:36:30 -0700 Subject: [PATCH] Tag Processor: Add ability to stop at tag closers, if requested `WP_HTML_Tag_Processor` introduces the ability to walk through an HTML document and modify attributes on tag openers. At times it could be useful to stop also at tag closers in order to perform augmented inspection and querying of a document. In this patch we're opening up a mode that allows just that; if querying with the `[ "tag_closers" => "visit" ]` query parrameter then in addition to stopping at tag openers, this will allow stopping at all tag tokens regardless of open or close status. --- .../html/class-wp-html-tag-processor.php | 137 +++++++++++++----- phpunit/html/wp-html-tag-processor-test.php | 56 +++++++ 2 files changed, 159 insertions(+), 34 deletions(-) diff --git a/lib/experimental/html/class-wp-html-tag-processor.php b/lib/experimental/html/class-wp-html-tag-processor.php index 6e72fc38ff411..acdf8d97a95a9 100644 --- a/lib/experimental/html/class-wp-html-tag-processor.php +++ b/lib/experimental/html/class-wp-html-tag-processor.php @@ -221,6 +221,14 @@ class WP_HTML_Tag_Processor { */ private $sought_match_offset; + /** + * Whether to visit tag closers, e.g. , when walking an input document. + * + * @since 6.2.0 + * @var boolean + */ + private $stop_on_tag_closers; + /** * The updated HTML document. * @@ -276,6 +284,29 @@ class WP_HTML_Tag_Processor { */ private $tag_name_length; + /** + * Byte offset in input document where current tag token ends. + * + * Example: + * ``` + *
... + * 0 1 | + * 01234567890123456 + * --- tag name ends at 14 + * ``` + * + * @since 6.2.0 + * @var ?int + */ + private $tag_ends_at; + + /** + * Whether the current tag is an opening tag, e.g.
, or a closing tag, e.g.
. + * + * @var boolean + */ + private $is_closing_tag; + /** * Lazily-built index of attributes found within an HTML tag, keyed by the attribute name. * @@ -412,7 +443,16 @@ public function next_tag( $query = null ) { return false; } - $this->parse_tag_opener_attributes(); + while ( $this->parse_next_attribute() ) { + continue; + } + + $tag_ends_at = strpos( $this->html, '>', $this->parsed_bytes ); + if ( false === $tag_ends_at ) { + return false; + } + $this->tag_ends_at = $tag_ends_at; + $this->parsed_bytes = $tag_ends_at; if ( $this->matches() ) { ++$already_found; @@ -495,7 +535,9 @@ private function skip_rcdata( $tag_name ) { continue; } - $this->skip_tag_closer_attributes(); + while ( $this->parse_next_attribute() ) { + continue; + } $at = $this->parsed_bytes; if ( $at >= strlen( $this->html ) ) { return false; @@ -620,12 +662,14 @@ private function skip_script_data() { if ( $is_closing ) { $this->parsed_bytes = $at; - $this->skip_tag_closer_attributes(); - if ( $this->parsed_bytes >= $doc_length ) { return false; } + while ( $this->parse_next_attribute() ) { + continue; + } + if ( '>' === $html[ $this->parsed_bytes ] ) { ++$this->parsed_bytes; return true; @@ -656,6 +700,13 @@ private function parse_next_tag() { return false; } + if ( '/' === $this->html[ $at + 1 ] ) { + $this->is_closing_tag = true; + $at++; + } else { + $this->is_closing_tag = false; + } + /* * HTML tag names must start with [a-zA-Z] otherwise they are not tags. * For example, "<3" is rendered as text, not a tag opener. This means @@ -777,35 +828,12 @@ private function parse_next_tag() { return false; } - /** - * Parses all attributes of the current tag. - * - * @since 6.2.0 - */ - private function parse_tag_opener_attributes() { - while ( $this->parse_next_attribute() ) { - continue; - } - } - - /** - * Skips all attributes of the current tag. - * - * @since 6.2.0 - */ - private function skip_tag_closer_attributes() { - while ( $this->parse_next_attribute( 'tag-closer' ) ) { - continue; - } - } - /** * Parses the next attribute. * - * @param string $context tag-opener or tag-closer. * @since 6.2.0 */ - private function parse_next_attribute( $context = 'tag-opener' ) { + private function parse_next_attribute() { // Skip whitespace and slashes. $this->parsed_bytes += strspn( $this->html, " \t\f\r\n/", $this->parsed_bytes ); if ( $this->parsed_bytes >= strlen( $this->html ) ) { @@ -872,7 +900,7 @@ private function parse_next_attribute( $context = 'tag-opener' ) { return false; } - if ( 'tag-opener' !== $context ) { + if ( $this->is_closing_tag ) { return true; } @@ -914,6 +942,8 @@ private function after_tag() { $this->apply_attributes_updates(); $this->tag_name_starts_at = null; $this->tag_name_length = null; + $this->tag_ends_at = null; + $this->is_closing_tag = null; $this->attributes = array(); } @@ -1159,6 +1189,25 @@ public function get_tag() { return strtoupper( $tag_name ); } + /** + * Indicates if the current tag token is a tag closer. + * + * Example: + * + * $p = new WP_HTML_Tag_Processor( '
' ); + * $p->next_tag( [ 'tag_name' => 'div', 'tag_closers' => 'visit' ] ); + * $p->is_tag_closer() === false; + * + * $p->next_tag( [ 'tag_name' => 'div', 'tag_closers' => 'visit' ] ); + * $p->is_tag_closer() === true; + *
+ * + * @return bool + */ + public function is_tag_closer() { + return $this->is_closing_tag; + } + /** * Updates or creates a new attribute on the currently matched tag with the value passed. * @@ -1175,8 +1224,8 @@ public function get_tag() { * @throws Exception When WP_DEBUG is true and the attribute name is invalid. */ public function set_attribute( $name, $value ) { - if ( null === $this->tag_name_starts_at ) { - return; + if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) { + return false; } /* @@ -1286,8 +1335,8 @@ public function set_attribute( $name, $value ) { * @param string $name The attribute name to remove. */ public function remove_attribute( $name ) { - if ( ! isset( $this->attributes[ $name ] ) ) { - return; + if ( $this->is_closing_tag || ! isset( $this->attributes[ $name ] ) ) { + return false; } /* @@ -1316,6 +1365,10 @@ public function remove_attribute( $name ) { * @param string $class_name The class name to add. */ public function add_class( $class_name ) { + if ( $this->is_closing_tag ) { + return false; + } + if ( null !== $this->tag_name_starts_at ) { $this->classname_updates[ $class_name ] = self::ADD_CLASS; } @@ -1329,6 +1382,10 @@ public function add_class( $class_name ) { * @param string $class_name The class name to remove. */ public function remove_class( $class_name ) { + if ( $this->is_closing_tag ) { + return false; + } + if ( null !== $this->tag_name_starts_at ) { $this->classname_updates[ $class_name ] = self::REMOVE_CLASS; } @@ -1392,7 +1449,9 @@ public function get_updated_html() { // Parse the attributes in the updated markup. $this->attributes = array(); - $this->parse_tag_opener_attributes(); + while ( $this->parse_next_attribute() ) { + continue; + } return $this->html; } @@ -1407,6 +1466,7 @@ public function get_updated_html() { * * @type string|null $tag_name Which tag to find, or `null` for "any tag." * @type string|null $class_name Tag must contain this class name to match. + * @type string $tag_closers "visit" or "skip": whether to stop on tag closers, e.g.
. * } */ private function parse_query( $query ) { @@ -1418,6 +1478,7 @@ private function parse_query( $query ) { $this->sought_tag_name = null; $this->sought_class_name = null; $this->sought_match_offset = 1; + $this->stop_on_tag_closers = false; // A single string value means "find the tag of this name". if ( is_string( $query ) ) { @@ -1441,6 +1502,10 @@ private function parse_query( $query ) { if ( isset( $query['match_offset'] ) && is_int( $query['match_offset'] ) && 0 < $query['match_offset'] ) { $this->sought_match_offset = $query['match_offset']; } + + if ( isset( $query['tag_closers'] ) ) { + $this->stop_on_tag_closers = 'visit' === $query['tag_closers']; + } } @@ -1452,6 +1517,10 @@ private function parse_query( $query ) { * @return boolean */ private function matches() { + if ( $this->is_closing_tag && ! $this->stop_on_tag_closers ) { + return false; + } + // Do we match a case-insensitive HTML tag name? if ( null !== $this->sought_tag_name ) { /* diff --git a/phpunit/html/wp-html-tag-processor-test.php b/phpunit/html/wp-html-tag-processor-test.php index e66b1d50758d0..273cbddddea4b 100644 --- a/phpunit/html/wp-html-tag-processor-test.php +++ b/phpunit/html/wp-html-tag-processor-test.php @@ -237,6 +237,35 @@ public function test_next_tag_should_return_false_for_a_non_existing_tag() { $this->assertFalse( $p->next_tag( 'p' ), 'Querying a non-existing tag did not return false' ); } + /** + * @covers next_tag + * @covers is_tag_closer + */ + public function test_next_tag_should_stop_on_closers_only_when_requested() { + $p = new WP_HTML_Tag_Processor( '
' ); + $this->assertTrue( $p->next_tag( array( 'tag_name' => 'div' ) ), 'Did not find desired tag opener' ); + $this->assertFalse( $p->next_tag( array( 'tag_name' => 'div' ) ), 'Visited an unwanted tag, a tag closer' ); + + $p = new WP_HTML_Tag_Processor( '
' ); + $p->next_tag( + array( + 'tag_name' => 'div', + 'tag_closers' => 'visit', + ) + ); + $this->assertFalse( $p->is_tag_closer(), 'Indicated a tag opener is a tag closer' ); + $this->assertTrue( + $p->next_tag( + array( + 'tag_name' => 'div', + 'tag_closers' => 'visit', + ) + ), + 'Did not stop at desired tag closer' + ); + $this->assertTrue( $p->is_tag_closer(), 'Indicated a tag closer is a tag opener' ); + } + /** * @ticket 56299 * @@ -255,6 +284,33 @@ public function test_set_attribute_on_a_non_existing_tag_does_not_change_the_mar ); } + public function test_attribute_ops_on_tag_closer_do_not_change_the_markup() { + $p = new WP_HTML_Tag_Processor( '
' ); + $p->next_tag( + array( + 'tag_name' => 'div', + 'tag_closers' => 'visit', + ) + ); + $this->assertFalse( $p->is_tag_closer(), 'Skipped tag opener' ); + $p->next_tag( + array( + 'tag_name' => 'div', + 'tag_closers' => 'visit', + ) + ); + $this->assertTrue( $p->is_tag_closer(), 'Skipped tag closer' ); + $this->assertFalse( $p->set_attribute( 'id', 'test' ), "Allowed setting an attribute on a tag closer when it shouldn't have" ); + $this->assertFalse( $p->remove_attribute( 'invalid-id' ), "Allowed removing an attribute on a tag closer when it shouldn't have" ); + $this->assertFalse( $p->add_class( 'sneaky' ), "Allowed adding a class on a tag closer when it shouldn't have" ); + $this->assertFalse( $p->remove_class( 'not-appearing-in-this-test' ), "Allowed removing a class on a tag closer when it shouldn't have" ); + $this->assertSame( + '
', + $p->get_updated_html(), + 'Calling get_updated_html after updating a non-existing tag returned an HTML that was different from the original HTML' + ); + } + /** * Passing a double quote inside of an attribute values could lead to an XSS attack as follows: *