-
Notifications
You must be signed in to change notification settings - Fork 2.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
HTML API: Allow additional fragment contexts. #7141
base: trunk
Are you sure you want to change the base?
Changes from 5 commits
704ab8e
691d39e
7005b6c
101d345
5583818
4291703
d47eadd
d146461
a175677
77f1a38
640f458
c3f1a81
40f4967
4c652ad
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -281,24 +281,41 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { | |
* | ||
* ## Current HTML Support | ||
* | ||
* - The only supported context is `<body>`, which is the default value. | ||
* - The only supported document encoding is `UTF-8`, which is the default value. | ||
* | ||
* @todo Verify that creating a fragment in self-contained elements works. | ||
* | ||
* @since 6.4.0 | ||
* @since 6.6.0 Returns `static` instead of `self` so it can create subclass instances. | ||
* @since 6.7.0 Can create fragment in any context. | ||
* | ||
* @param string $html Input HTML fragment to process. | ||
* @param string $context Context element for the fragment, given as a single HTML tag; defaults to `<body>`. | ||
* @param string $encoding Text encoding of the document; must be default of 'UTF-8'. | ||
* @return static|null The created processor if successful, otherwise null. | ||
*/ | ||
public static function create_fragment( $html, $context = '<body>', $encoding = 'UTF-8' ) { | ||
if ( '<body>' !== $context || 'UTF-8' !== $encoding ) { | ||
if ( 'UTF-8' !== $encoding ) { | ||
return null; | ||
} | ||
|
||
$context_processor = new WP_HTML_Tag_Processor( $context ); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not sure how deeply you want to comment, but a comment here could help readers understand what you were up to. |
||
if ( ! $context_processor->next_token() || '#tag' !== $context_processor->get_token_type() ) { | ||
return null; | ||
} | ||
|
||
$context_tag = $context_processor->get_tag(); | ||
$context_attributes = array(); | ||
foreach ( $context_processor->get_attribute_names_with_prefix( '' ) as $name ) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Having an empty prefix here looks strange. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks @apermo. If this looks strange, it's supposed to 🙃 |
||
$context_attributes[ $name ] = $context_processor->get_attribute( $name ); | ||
} | ||
|
||
if ( $context_processor->next_token() ) { | ||
return null; | ||
} | ||
|
||
$processor = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE ); | ||
$processor->state->context_node = array( 'BODY', array() ); | ||
$processor->state->context_node = array( $context_tag, $context_attributes ); | ||
$processor->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If I understood your PR correctly, it's about allowing insertion into anything other than body, or is body in this case ambiguous? So content body vs […]. Anyway, I'm uncertain whether this is intentional or if you forgot to touch this. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The fragment parser is synonymous with […], so if you knew you were inside a […] […]. Basically, this is not something most people will need to use, but it will be used by […]. |
||
$processor->state->encoding = $encoding; | ||
$processor->state->encoding_confidence = 'certain'; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
<?php | ||
/** | ||
* Unit tests covering WP_HTML_Processor fragment parsing functionality. | ||
* | ||
* @package WordPress | ||
* @subpackage HTML-API | ||
* | ||
* @since 6.7.0 | ||
* | ||
* @group html-api | ||
* | ||
* @coversDefaultClass WP_HTML_Processor | ||
*/ | ||
class Tests_HtmlApi_WpHtmlProcessorFragmentParsing extends WP_UnitTestCase {
	/**
	 * Verifies that SCRIPT fragment parses behave as they should.
	 *
	 * The inner HTML is parsed as a fragment whose context element is SCRIPT,
	 * serialized again through `normalize_html()`, and compared against the
	 * expected output. A `null` expectation means the processor is expected
	 * to report an error while parsing.
	 *
	 * @dataProvider data_script_fragments
	 *
	 * @param string      $inner_html    HTML to parse in SCRIPT fragment.
	 * @param string|null $expected_html Expected output of the parse, or `null` if unsupported.
	 */
	public function test_script_tag( string $inner_html, ?string $expected_html ) {
		$processor = WP_HTML_Processor::create_fragment( $inner_html, '<script></script>' );

		/*
		 * `create_fragment()` is documented to return `null` on failure. Fail with a
		 * readable message here instead of a TypeError inside `normalize_html()`.
		 */
		$this->assertNotNull(
			$processor,
			'Failed to create a fragment processor in the SCRIPT context.'
		);

		$normalized = self::normalize_html( $processor );

		if ( isset( $expected_html ) ) {
			$this->assertSame(
				$expected_html,
				$normalized,
				'Failed to properly parse SCRIPT fragment.'
			);
		} else {
			$this->assertNull(
				$normalized,
				"Should have bailed when parsing but didn't."
			);
		}
	}

	/**
	 * Data provider.
	 *
	 * Each dataset is `array( $inner_html, $expected_html )`, where a `null`
	 * expectation marks input the processor should refuse to parse.
	 *
	 * @ticket 61576
	 *
	 * @return array[]
	 */
	public static function data_script_fragments() {
		return array(
			'Basic SCRIPT'      => array( 'const x = 5 < y;', 'const x = 5 < y;' ),
			'Text after SCRIPT' => array( 'const x = 5 < y;</script>test', null ),
			'Tag after SCRIPT'  => array( 'end</script><img>', null ),
			// NOTE(review): the expected value differs from the input (`\</script>` and `<img'>`) — confirm this is intentional and not a typo.
			'Double escape'     => array( "<!--<script>\nconsole.log('</script>');\n-->\nconsole.log('<img>');", "<!--<script>\nconsole.log('\</script>');\n-->\nconsole.log('<img'>);" ),
		);
	}

	/**
	 * Produces normalized HTML output given a processor as input, which has not
	 * yet started to proceed through its document.
	 *
	 * Text tokens are emitted through their modifiable text. Tags are rebuilt
	 * from their name and attributes, with attribute values double-quoted.
	 * Token types other than `#text` and `#tag` are not serialized.
	 *
	 * This can be used with a full or a fragment parser.
	 *
	 * @param WP_HTML_Processor $processor HTML Processor in READY state at the beginning of its input.
	 * @return string|null Normalized HTML from input processor, or `null` if the
	 *                     processor reported an error while parsing.
	 */
	private static function normalize_html( WP_HTML_Processor $processor ): ?string {
		$html = '';

		while ( $processor->next_token() ) {
			$token_name = $processor->get_token_name();
			$token_type = $processor->get_token_type();
			$is_closer  = $processor->is_tag_closer();

			switch ( $token_type ) {
				case '#text':
					$html .= $processor->get_modifiable_text();
					break;

				case '#tag':
					if ( $is_closer ) {
						$html .= "</{$token_name}>";
						break;
					}

					$html .= "<{$token_name}";

					// The list of names may be `null`; treat that as "no attributes".
					$names = $processor->get_attribute_names_with_prefix( '' ) ?? array();
					foreach ( $names as $name ) {
						$value = $processor->get_attribute( $name );
						if ( true === $value ) {
							// Boolean attribute: emit the bare name without a value.
							$html .= " {$name}";
						} else {
							// Escape double quotes so the value cannot end the quoted attribute early.
							$value = str_replace( '"', '&quot;', $value );
							$html .= " {$name}=\"{$value}\"";
						}
					}

					// Always terminate the opening tag, whether or not attributes were emitted.
					$html .= '>';

					// When the tag token itself carries text, emit it followed by the closing tag.
					$text = $processor->get_modifiable_text();
					if ( '' !== $text ) {
						$html .= "{$text}</{$token_name}>";
					}
					break;
			}
		}

		// A recorded error means the parse was abandoned; signal that with `null`.
		if ( null !== $processor->get_last_error() ) {
			return null;
		}

		return $html;
	}
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"must be default of ..."?
My internal autocomplete expected an "or" here. Which likely is equivalent, but I was expecting it.
And I think at least for `$context` you need to update it, since your change was about allowing contexts other than body.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
thanks for noticing.
`must be default of UTF-8` and `must be default or UTF-8` are very different: only the default value has been allowed (and the default is UTF-8). Some day we might open it up to other values, but this is there to communicate intentionally that this is a UTF-8-only interface at the moment.