Skip to content

Commit

Permalink
HTML API: Parse DOCTYPE tokens and set HTML parser mode accordingly.
Browse files Browse the repository at this point in the history
This patch adds until-now missing code to parse the structure of HTML DOCTYPE declarations. The DOCTYPE is mostly unused but can dictate the document compatibility mode, which governs whether CSS class names match in an ASCII-case-insensitive way or not, and whether TABLE elements close an open P element.

The DOCTYPE information is made available through a new method on the Tag Processor, `get_doctype_info()`.

Developed in WordPress#7195
Discussed in https://core.trac.wordpress.org/ticket/61576

Props dmsnell, jonsurrell.
See #61576.


git-svn-id: https://develop.svn.wordpress.org/trunk@58925 602fd350-edb4-49c9-b593-d223f7449a82
  • Loading branch information
dmsnell committed Aug 23, 2024
1 parent b515d22 commit 1139a51
Show file tree
Hide file tree
Showing 8 changed files with 792 additions and 19 deletions.
1 change: 1 addition & 0 deletions phpcs.xml.dist
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,7 @@
in the parsing, and distance the code from its standard. -->
<rule ref="Generic.PHP.DiscourageGoto.Found">
<exclude-pattern>/wp-includes/html-api/class-wp-html-processor\.php</exclude-pattern>
<exclude-pattern>/wp-includes/html-api/class-wp-html-doctype-info\.php</exclude-pattern>
</rule>

<!-- Exclude sample config from modernization to prevent breaking CI workflows based on WP-CLI scaffold.
Expand Down
616 changes: 616 additions & 0 deletions src/wp-includes/html-api/class-wp-html-doctype-info.php

Large diffs are not rendered by default.

12 changes: 5 additions & 7 deletions src/wp-includes/html-api/class-wp-html-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -1076,26 +1076,24 @@ private function step_initial(): bool {
* > A DOCTYPE token
*/
case 'html':
$contents = $this->get_modifiable_text();
if ( ' html' !== $contents ) {
/*
* @todo When the HTML Tag Processor fully parses the DOCTYPE declaration,
* this code should examine the contents to set the compatability mode.
*/
$this->bail( 'Cannot process any DOCTYPE other than a normative HTML5 doctype.' );
$doctype = $this->get_doctype_info();
if ( null !== $doctype && 'quirks' === $doctype->indicated_compatability_mode ) {
$this->state->document_mode = WP_HTML_Processor_State::QUIRKS_MODE;
}

/*
* > Then, switch the insertion mode to "before html".
*/
$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML;
$this->insert_html_element( $this->state->current_token );
return true;
}

/*
* > Anything else
*/
initial_anything_else:
$this->state->document_mode = WP_HTML_Processor_State::QUIRKS_MODE;
$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML;
return $this->step( self::REPROCESS_CURRENT_NODE );
}
Expand Down
23 changes: 22 additions & 1 deletion src/wp-includes/html-api/class-wp-html-tag-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -4026,6 +4026,27 @@ private function matches(): bool {
return true;
}

/**
 * Retrieves the parsed DOCTYPE declaration for the currently-matched token.
 *
 * A DOCTYPE token can show up in many places in an HTML document and is
 * ignored in most of them. The main parsing functions only find the basic
 * shape of such a token and do not perform detailed parsing.
 *
 * Calling this method performs the full parse: the raw span of the current
 * token is cut out of the input HTML and handed to
 * `WP_HTML_Doctype_Info::from_doctype_token()`, whose result is returned as-is.
 *
 * @return WP_HTML_Doctype_Info|null The DOCTYPE declaration information or `null` if not
 *                                   currently at a DOCTYPE node.
 */
public function get_doctype_info(): ?WP_HTML_Doctype_Info {
	if ( self::STATE_DOCTYPE === $this->parser_state ) {
		// The full token text, from its first byte through its last.
		$raw_doctype_token = substr( $this->html, $this->token_starts_at, $this->token_length );

		return WP_HTML_Doctype_Info::from_doctype_token( $raw_doctype_token );
	}

	return null;
}

/**
* Parser Ready State.
*
Expand Down Expand Up @@ -4117,7 +4138,7 @@ private function matches(): bool {

/**
* Indicates that the parser has found a DOCTYPE node and it's
* possible to read and modify its modifiable text.
* possible to read its DOCTYPE information via `get_doctype_info()`.
*
* @since 6.5.0
*
Expand Down
1 change: 1 addition & 0 deletions src/wp-settings.php
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,7 @@
require ABSPATH . WPINC . '/html-api/html5-named-character-references.php';
require ABSPATH . WPINC . '/html-api/class-wp-html-attribute-token.php';
require ABSPATH . WPINC . '/html-api/class-wp-html-span.php';
require ABSPATH . WPINC . '/html-api/class-wp-html-doctype-info.php';
require ABSPATH . WPINC . '/html-api/class-wp-html-text-replacement.php';
require ABSPATH . WPINC . '/html-api/class-wp-html-decoder.php';
require ABSPATH . WPINC . '/html-api/class-wp-html-tag-processor.php';
Expand Down
118 changes: 118 additions & 0 deletions tests/phpunit/tests/html-api/wpHtmlDoctypeInfo.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
<?php
/**
* Unit tests covering WP_HTML_Doctype_Info functionality.
*
* @package WordPress
* @subpackage HTML-API
*/

/**
 * Exercises WP_HTML_Doctype_Info::from_doctype_token() against raw DOCTYPE
 * tokens, checking the parsed name, public identifier, system identifier,
 * and the compatability mode each declaration indicates.
 *
 * @group html-api
 *
 * @coversDefaultClass WP_HTML_Doctype_Info
 */
class Tests_HtmlApi_WpHtmlDoctypeInfo extends WP_UnitTestCase {
	/**
	 * Ensures that a well-formed DOCTYPE token is parsed into the expected parts.
	 *
	 * @ticket 61576
	 *
	 * @covers ::from_doctype_token
	 *
	 * @dataProvider data_parseable_raw_doctypes
	 *
	 * @param string      $html                 Raw DOCTYPE token, from `<!` through the closing `>`.
	 * @param string      $expected_compat_mode Expected value of `indicated_compatability_mode`.
	 * @param string|null $expected_name        Expected DOCTYPE name, or `null` when none is present.
	 * @param string|null $expected_public_id   Expected public identifier, or `null` when none is present.
	 * @param string|null $expected_system_id   Expected system identifier, or `null` when none is present.
	 */
	public function test_doctype_doc_info(
		string $html,
		string $expected_compat_mode,
		?string $expected_name = null,
		?string $expected_public_id = null,
		?string $expected_system_id = null
	) {
		$doctype = WP_HTML_Doctype_Info::from_doctype_token( $html );
		$this->assertNotNull(
			$doctype,
			"Should have parsed the following doctype declaration: {$html}"
		);

		$this->assertSame(
			$expected_compat_mode,
			$doctype->indicated_compatability_mode,
			'Failed to infer the expected document compatability mode.'
		);

		$this->assertSame(
			$expected_name,
			$doctype->name,
			'Failed to parse the expected DOCTYPE name.'
		);

		$this->assertSame(
			$expected_public_id,
			$doctype->public_identifier,
			'Failed to parse the expected DOCTYPE public identifier.'
		);

		$this->assertSame(
			$expected_system_id,
			$doctype->system_identifier,
			'Failed to parse the expected DOCTYPE system identifier.'
		);
	}

	/**
	 * Data provider.
	 *
	 * Each data set is a raw DOCTYPE token followed by the expected compatability
	 * mode and, optionally, the expected name, public identifier, and system
	 * identifier. Omitted trailing values are expected to be `null`.
	 *
	 * @return array[]
	 */
	public static function data_parseable_raw_doctypes(): array {
		return array(
			'Missing doctype name' => array( '<!DOCTYPE>', 'quirks' ),
			'HTML5 doctype' => array( '<!DOCTYPE html>', 'no-quirks', 'html' ),
			'HTML5 doctype no whitespace before name' => array( '<!DOCTYPEhtml>', 'no-quirks', 'html' ),
			// Previously labelled "XHTML doctype", but the identifiers are those of HTML 4.01 Strict.
			'HTML 4.01 Strict doctype' => array( '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">', 'no-quirks', 'html', '-//W3C//DTD HTML 4.01//EN', 'http://www.w3.org/TR/html4/strict.dtd' ),
			'SVG doctype' => array( '<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">', 'quirks', 'svg', '-//W3C//DTD SVG 1.1//EN', 'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd' ),
			'MathML doctype' => array( '<!DOCTYPE math PUBLIC "-//W3C//DTD MathML 2.0//EN" "http://www.w3.org/Math/DTD/mathml2/mathml2.dtd">', 'quirks', 'math', '-//W3C//DTD MathML 2.0//EN', 'http://www.w3.org/Math/DTD/mathml2/mathml2.dtd' ),
			'Doctype with null byte replacement' => array( "<!DOCTYPE null-\0 PUBLIC '\0' '\0\0'>", 'quirks', "null-\u{FFFD}", "\u{FFFD}", "\u{FFFD}\u{FFFD}" ),
			'Uppercase doctype' => array( '<!DOCTYPE UPPERCASE>', 'quirks', 'uppercase' ),
			'Lowercase doctype' => array( '<!doctype lowercase>', 'quirks', 'lowercase' ),
			'Doctype with whitespace' => array( "<!DOCTYPE\n\thtml\f\rPUBLIC\r\n''\t''>", 'no-quirks', 'html', '', '' ),
			'Doctype trailing characters' => array( "<!DOCTYPE html PUBLIC '' '' Anything (except closing angle bracket) is just fine here !!!>", 'no-quirks', 'html', '', '' ),
			'An ugly no-quirks doctype' => array( "<!dOcTyPehtml\tPublIC\"pub-id\"'sysid'>", 'no-quirks', 'html', 'pub-id', 'sysid' ),
			'Missing public ID' => array( '<!DOCTYPE html PUBLIC>', 'quirks', 'html' ),
			'Missing system ID' => array( '<!DOCTYPE html SYSTEM>', 'quirks', 'html' ),
			'Missing close quote public ID' => array( "<!DOCTYPE html PUBLIC 'xyz>", 'quirks', 'html', 'xyz' ),
			'Missing close quote system ID' => array( "<!DOCTYPE html SYSTEM 'xyz>", 'quirks', 'html', null, 'xyz' ),
			'Missing close quote system ID with public' => array( "<!DOCTYPE html PUBLIC 'abc' 'xyz>", 'quirks', 'html', 'abc', 'xyz' ),
			'Bogus characters instead of system/public' => array( '<!DOCTYPE html FOOBAR>', 'quirks', 'html' ),
			'Bogus characters instead of PUBLIC quote' => array( "<!DOCTYPE html PUBLIC x ''''>", 'quirks', 'html' ),
			'Bogus characters instead of SYSTEM quote' => array( "<!DOCTYPE html SYSTEM x ''>", 'quirks', 'html' ),
			/*
			 * The name is the England flag tag sequence. It is spelled with explicit
			 * code point escapes, matching the expected value, because the tag
			 * characters are invisible in source and easily lost when editing.
			 */
			'Emoji' => array( "<!DOCTYPE \u{1F3F4}\u{E0067}\u{E0062}\u{E0065}\u{E006E}\u{E0067}\u{E007F} PUBLIC \"🔥\" \"😈\">", 'quirks', "\u{1F3F4}\u{E0067}\u{E0062}\u{E0065}\u{E006E}\u{E0067}\u{E007F}", '🔥', '😈' ),
			'Bogus characters instead of SYSTEM quote after public' => array( "<!DOCTYPE html PUBLIC ''x''>", 'quirks', 'html', '' ),
			'Special quirks mode if system unset' => array( '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Frameset//">', 'quirks', 'html', '-//W3C//DTD HTML 4.01 Frameset//' ),
			'Special limited-quirks mode if system set' => array( '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Frameset//" "">', 'limited-quirks', 'html', '-//W3C//DTD HTML 4.01 Frameset//', '' ),
		);
	}

	/**
	 * Ensures that input which is not exactly one complete DOCTYPE token is rejected.
	 *
	 * @ticket 61576
	 *
	 * @covers ::from_doctype_token
	 *
	 * @dataProvider invalid_inputs
	 *
	 * @param string $html Input expected to be rejected with a `null` return.
	 */
	public function test_invalid_inputs_return_null( string $html ) {
		$this->assertNull( WP_HTML_Doctype_Info::from_doctype_token( $html ) );
	}

	/**
	 * Data provider.
	 *
	 * Each data set is a single string that must not be accepted as a DOCTYPE token.
	 *
	 * @return array[]
	 */
	public static function invalid_inputs(): array {
		return array(
			'Empty string' => array( '' ),
			'Other HTML' => array( '<div>' ),
			'DOCTYPE after HTML' => array( 'x<!DOCTYPE>' ),
			'DOCTYPE before HTML' => array( '<!DOCTYPE>x' ),
			'Incomplete DOCTYPE' => array( '<!DOCTYPE' ),
			'Pseudo DOCTYPE containing ">"' => array( '<!DOCTYPE html PUBLIC ">">' ),
		);
	}
}
24 changes: 13 additions & 11 deletions tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class Tests_HtmlApi_Html5lib extends WP_UnitTestCase {
const SKIP_TESTS = array(
'comments01/line0155' => 'Unimplemented: Need to access raw comment text on non-normative comments.',
'comments01/line0169' => 'Unimplemented: Need to access raw comment text on non-normative comments.',
'doctype01/line0380' => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly',
'html5test-com/line0129' => 'Unimplemented: Need to access raw comment text on non-normative comments.',
'noscript01/line0014' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
'tests1/line0692' => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly',
Expand Down Expand Up @@ -115,7 +116,7 @@ public function data_external_html5lib_tests() {

$test_context_element = $test[1];

if ( self::should_skip_test( $test_context_element, $test_name, $test[3] ) ) {
if ( self::should_skip_test( $test_context_element, $test_name ) ) {
continue;
}

Expand All @@ -133,7 +134,7 @@ public function data_external_html5lib_tests() {
*
* @return bool True if the test case should be skipped. False otherwise.
*/
private static function should_skip_test( ?string $test_context_element, string $test_name, string $expected_tree ): bool {
private static function should_skip_test( ?string $test_context_element, string $test_name ): bool {
if ( null !== $test_context_element && 'body' !== $test_context_element ) {
return true;
}
Expand Down Expand Up @@ -189,6 +190,15 @@ private static function build_tree_representation( ?string $fragment_context, st
}

switch ( $token_type ) {
case '#doctype':
$doctype = $processor->get_doctype_info();
$output .= "<!DOCTYPE {$doctype->name}";
if ( null !== $doctype->public_identifier || null !== $doctype->system_identifier ) {
$output .= " \"{$doctype->public_identifier}\" \"{$doctype->system_identifier}\"";
}
$output .= ">\n";
break;

case '#tag':
$namespace = $processor->get_namespace();
$tag_name = 'html' === $namespace
Expand Down Expand Up @@ -450,15 +460,7 @@ public static function parse_html5_dat_testfile( $filename ) {
*/
case 'document':
if ( '|' === $line[0] ) {
/*
* The next_token() method these tests rely on do not stop
* at doctype nodes. Strip doctypes from output.
* @todo Restore this line if and when the processor
* exposes doctypes.
*/
if ( '| <!DOCTYPE ' !== substr( $line, 0, 12 ) ) {
$test_dom .= substr( $line, 2 );
}
$test_dom .= substr( $line, 2 );
} else {
// This is a text node that includes unescaped newlines.
// Everything else should be singles lines starting with "| ".
Expand Down
16 changes: 16 additions & 0 deletions tests/phpunit/tests/html-api/wpHtmlTagProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -2939,4 +2939,20 @@ public function test_unclosed_funky_comment_input_too_short() {
$this->assertFalse( $processor->next_tag() );
$this->assertTrue( $processor->paused_at_incomplete_token() );
}

/**
 * Confirms that the Tag Processor exposes DOCTYPE info for a plain HTML5 doctype.
 *
 * After advancing to the first token of `<!DOCTYPE html>`, the reported
 * info must carry the name "html", indicate "no-quirks" mode, and have
 * neither a public nor a system identifier.
 *
 * @ticket 61576
 */
public function test_doctype_doc_name() {
	$tag_processor = new WP_HTML_Tag_Processor( '<!DOCTYPE html>' );
	$found_token   = $tag_processor->next_token();
	$this->assertTrue( $found_token );

	$doctype_info = $tag_processor->get_doctype_info();
	$this->assertNotNull( $doctype_info );

	// Parsed name and the compatability mode it implies.
	$this->assertSame( 'html', $doctype_info->name );
	$this->assertSame( 'no-quirks', $doctype_info->indicated_compatability_mode );

	// A bare HTML5 doctype carries no identifiers.
	$this->assertNull( $doctype_info->public_identifier );
	$this->assertNull( $doctype_info->system_identifier );
}
}

0 comments on commit 1139a51

Please sign in to comment.