Skip to content

Commit

Permalink
HTML API: Add functions to read inner and outer HTML.
Browse files Browse the repository at this point in the history
  • Loading branch information
dmsnell committed Aug 3, 2023
1 parent 0c23a14 commit 060d219
Show file tree
Hide file tree
Showing 4 changed files with 484 additions and 9 deletions.
45 changes: 45 additions & 0 deletions src/wp-includes/html-api/class-wp-html-open-elements.php
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,17 @@ class WP_HTML_Open_Elements {
*/
public $stack = array();

/**
* Holds functions added to be called after popping an element off the stack.
*
* Listeners are passed the WP_HTML_Token for the item that was removed.
*
* Entries are appended by `with_pop_listener()` and invoked, in the order
* they were appended, at the end of `after_element_pop()`.
*
* @since 6.4.0
*
* @var callable[]
*/
private $after_pop_listeners = array();

/**
* Whether a P element is in button scope currently.
*
Expand Down Expand Up @@ -428,5 +439,39 @@ public function after_element_pop( $item ) {
$this->has_p_in_button_scope = $this->has_element_in_button_scope( 'P' );
break;
}

// Call any listeners that are registered.
foreach ( $this->after_pop_listeners as $listener ) {
call_user_func( $listener, $item );
}
}

/**
* Creates a context in which a given listener is called after
* popping an element off of the stack of open elements.
*
* It's unlikely that you will need this function. It exists
* to aid an optimization in the `WP_HTML_Processor` and the
* strange form of calling a generator inside a `foreach`
* loop ensures that proper cleanup of the listener occurs.
*
* Example:
*
* $did_close = false;
* $closed_a_p = function ( $item ) use ( &$did_close ) { $did_close = 'P' === $item->node_name; };
* foreach ( $stack_of_open_elements->with_pop_listener( $closed_a_p ) ) {
* while ( ! $did_close && $processor->next_tag() ) {
* // This loop executes until _any_ P element is closed.
* }
* }
*
* @since 6.4.0
*
* @param callable $listener Called with the WP_HTML_Token for the item that was popped off of the stack.
*/
public function with_pop_listener( $listener ) {
	// Registration happens when the generator is first advanced, i.e. when the `foreach` starts.
	$this->after_pop_listeners[] = $listener;

	/*
	 * The listener must be removed however the calling `foreach` ends.
	 * Code placed after a bare `yield` only runs if the generator is resumed,
	 * which does not happen when the loop body exits through `break`, `return`,
	 * or a thrown exception; the listener would then stay registered and keep
	 * firing on every later pop. A `finally` block also runs when a suspended
	 * generator is destroyed, so the cleanup is guaranteed in all of those cases.
	 */
	try {
		yield;
	} finally {
		// Listeners are expected to be nested, so the most recently added one is this one.
		array_pop( $this->after_pop_listeners );
	}
}
}
194 changes: 185 additions & 9 deletions src/wp-includes/html-api/class-wp-html-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,7 @@ public function __construct( $html, $use_the_static_create_methods_instead = nul
* failed it's possible to request the last error. This can be
* helpful to know if it's possible to fix something or to give up.
*
* Example
* Example:
*
* $p = WP_HTML_Processor::createFragment( '<template><strong><button><em><p><em>' );
* false === $p->next_tag();
Expand Down Expand Up @@ -418,6 +418,93 @@ public function next_tag( $query = null ) {
return false;
}

/**
* Returns the raw HTML content inside a matched tag.
*
* "Markup" differs from inner HTML in that it returns the raw HTML inside the matched tag.
* This means that it's possible this returns HTML without matching tags, or with HTML attributes
* serialized differently than a DOM API would return.
*
* Example:
*
* $processor = WP_HTML_Processor::createFragment( '<div><p>Inside <em>P</em> <i>tags</div>' );
* $processor->next_tag( 'P' );
* 'Inside <em>P</em> <i>tags' === $processor->get_inner_markup();
*
* @since 6.4.0
*
* @throws Exception When unable to allocate a bookmark for internal tracking of the open tag.
*
* @return string|null The inner markup if available, else NULL.
*/
public function get_inner_markup() {
	// Nothing is matched, so there is no element whose contents could be read.
	if ( null === $this->get_tag() ) {
		return null;
	}

	// Remember the opening tag, walk forward until it is closed, then remember where that was.
	$this->set_bookmark( 'start' );
	$stopped_on_tag = $this->step_until_tag_is_closed();
	$this->set_bookmark( 'end' );

	/*
	 * When stepping stopped on a tag, the inner markup is everything between the
	 * opening tag and that tag. Otherwise the document ran out first and the
	 * inner markup continues to the end of the document.
	 */
	$markup = $stopped_on_tag
		? $this->substr_bookmarks( 'after', 'start', 'before', 'end' )
		: $this->substr_bookmark( 'after', 'start' );

	// Return to the originally-matched tag and drop the temporary bookmarks.
	$this->seek( 'start' );
	$this->release_bookmark( 'start' );
	$this->release_bookmark( 'end' );

	return $markup;
}

/**
* Returns the raw HTML content around a matched tag, including the tag itself.
*
* "Markup" differs from outer HTML in that it returns the raw HTML of the matched tag and its contents.
* This means that it's possible this returns HTML without matching tags, or with HTML attributes
* serialized differently than a DOM API would return.
*
* Example:
*
* $processor = WP_HTML_Processor::createFragment( '<div><p>Inside <em>P</em> <i>tags</div>' );
* $processor->next_tag( 'P' );
* '<p>Inside <em>P</em> <i>tags' === $processor->get_outer_markup();
*
* @since 6.4.0
*
* @throws Exception When unable to allocate a bookmark for internal tracking of the open tag.
*
* @return string|null The outer markup if available, else NULL.
*/
public function get_outer_markup() {
	// Nothing is matched, so there is no element to serialize.
	if ( null === $this->get_tag() ) {
		return null;
	}

	// Remember the opening tag and its name, walk forward until it is closed, then mark that spot.
	$this->set_bookmark( 'start' );
	$opening_name   = $this->current_token->node_name;
	$stopped_on_tag = $this->step_until_tag_is_closed();
	$this->set_bookmark( 'end' );

	if ( ! $stopped_on_tag ) {
		// The document ended first, so the outer markup runs to the end of the document.
		$outer_markup = $this->substr_bookmark( 'before', 'start' );
	} else {
		/*
		 * Only include the tag that stepping stopped on when it is the explicit
		 * closer for the starting element. Any other tag that caused the element
		 * to be closed is not part of it, so the markup ends just before that tag.
		 */
		$is_own_closer = $this->get_tag() === $opening_name && $this->is_tag_closer();
		$clip_end_at   = $is_own_closer ? 'after' : 'before';
		$outer_markup  = $this->substr_bookmarks( 'before', 'start', $clip_end_at, 'end' );
	}

	// Return to the originally-matched tag and drop the temporary bookmarks.
	$this->seek( 'start' );
	$this->release_bookmark( 'start' );
	$this->release_bookmark( 'end' );

	return $outer_markup;
}

/**
* Steps through the HTML document and stops at the next tag, if any.
*
Expand All @@ -438,12 +525,9 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
$this->state->stack_of_open_elements->pop();
}

parent::next_tag( self::VISIT_EVERYTHING );
}

// Finish stepping when there are no more tokens in the document.
if ( null === $this->get_tag() ) {
return false;
if ( ! parent::next_tag( self::VISIT_EVERYTHING ) ) {
return false;
}
}

$this->current_token = new WP_HTML_Token(
Expand Down Expand Up @@ -474,9 +558,9 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
/**
* Computes the HTML breadcrumbs for the currently-matched node, if matched.
*
* Breadcrumbs start at the outer-most parent and descend toward the matched element.
* Breadcrumbs start at the outermost parent and descend toward the matched element.
*
* Example
* Example:
*
* $p = WP_HTML_Processor::createFragment( '<p><strong><em><img></em></strong></p>' );
* $p->next_tag( 'IMG' );
Expand Down Expand Up @@ -723,6 +807,98 @@ private function bookmark_tag() {
return "{$this->bookmark_counter}";
}

/**
* Steps through the HTML document until the current open tag is closed.
*
* @since 6.4.0
*
* @throws Exception When unable to allocate bookmark for internal tracking.
*
* @return bool|null true if a closing tag was found, false if not, and null if not starting at a matched tag.
*/
private function step_until_tag_is_closed() {
	// There is no starting tag to follow unless something is currently matched.
	if ( null === $this->get_tag() ) {
		return null;
	}

	/** @var WP_HTML_Token $opening_token The token that is current when this function is called. */
	$opening_token = $this->current_token;

	/** @var bool $still_open Stays true until the opening token is popped off the stack of open elements. */
	$still_open = true;

	/**
	 * Pop listener: clears the flag once the popped node is the very
	 * token this search started from (compared by identity).
	 *
	 * @since 6.4.0
	 *
	 * @see WP_HTML_Open_Elements::with_pop_listener()
	 *
	 * @param WP_HTML_Token $popped Node that was popped.
	 */
	$on_pop = function ( $popped ) use ( &$still_open, $opening_token ) {
		if ( $popped === $opening_token ) {
			$still_open = false;
		}
	};

	/*
	 * Without the listener, every step into a new element would require walking
	 * the stack of open elements to see whether the starting tag is still on it.
	 * Because the listener is called for each element that is popped, checking
	 * whether the starting tag has been closed is a single boolean read instead.
	 *
	 * The `foreach` exists only to create a context in which the listener is
	 * registered; the loop body runs once and the listener is removed afterwards.
	 */
	$open_elements = $this->state->stack_of_open_elements;
	foreach ( $open_elements->with_pop_listener( $on_pop ) as $_ ) {
		// Keep stepping until the document runs out or the starting tag has left the stack.
		while ( true ) {
			$found_tag = $this->step();

			if ( ! $found_tag || ! $still_open ) {
				break;
			}
		}
	}

	return $found_tag;
}

/**
* Returns a substring of the input HTML document from a bookmark until the end.
*
* @since 6.4.0
*
* @param string $start_position "before" to clip before bookmark, "after" to clip after.
* @param string $start Bookmark name at which to start clipping.
* @return string Clipped substring of input HTML document.
*/
private function substr_bookmark( $start_position, $start ) {
	// Bookmarks are looked up under their underscore-prefixed key.
	$span = $this->bookmarks[ "_{$start}" ];

	// Any position other than "before" is treated as "after".
	if ( 'before' === $start_position ) {
		$from = $span->start;
	} else {
		// NOTE(review): presumably `end` is the offset of the bookmark's last byte, hence the +1; confirm.
		$from = $span->end + 1;
	}

	return substr( $this->html, $from );
}

/**
* Returns a substring of the input HTML document delimited by bookmarks.
*
* @since 6.4.0
*
* @param string $start_position "before" to clip before bookmark, "after" to clip after.
* @param string $start Bookmark name at which to start clipping.
* @param string $end_position "before" to clip before bookmark, "after" to clip after.
* @param string $end Bookmark name at which to end clipping.
* @return string Clipped substring of input HTML document.
*/
private function substr_bookmarks( $start_position, $start, $end_position, $end ) {
	// Bookmarks are looked up under their underscore-prefixed keys.
	$opening_span = $this->bookmarks[ "_{$start}" ];
	$closing_span = $this->bookmarks[ "_{$end}" ];

	// For both edges, any position other than "before" is treated as "after".
	if ( 'before' === $start_position ) {
		$from = $opening_span->start;
	} else {
		$from = $opening_span->end + 1;
	}

	if ( 'before' === $end_position ) {
		$until = $closing_span->start;
	} else {
		$until = $closing_span->end + 1;
	}

	// The third argument of substr() is a length, not an end offset.
	$length = $until - $from;

	return substr( $this->html, $from, $length );
}

/*
* HTML semantic overrides for Tag Processor
*/
Expand Down
Loading

0 comments on commit 060d219

Please sign in to comment.