From a7fe382566e1df945882fb3172a28089bbf54efb Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Mon, 26 Aug 2024 15:32:37 -0500 Subject: [PATCH] WIP: XML API: Introduce XML Processor. See also https://github.com/adamziel/wordpress-develop/pull/44 See also https://github.com/adamziel/wordpress-develop/pull/43 See also https://github.com/WordPress/wordpress-develop/compare/trunk...sirreal:wordpress-develop:xml-processor-continue?expand=1 --- .../xml-api/class-wp-xml-malformed-error.php | 32 ++ .../xml-api/class-wp-xml-processor.php | 434 ++++++++++++++++++ src/wp-settings.php | 1 + 3 files changed, 467 insertions(+) create mode 100644 src/wp-includes/xml-api/class-wp-xml-malformed-error.php create mode 100644 src/wp-includes/xml-api/class-wp-xml-processor.php diff --git a/src/wp-includes/xml-api/class-wp-xml-malformed-error.php b/src/wp-includes/xml-api/class-wp-xml-malformed-error.php new file mode 100644 index 0000000000000..58afcce637152 --- /dev/null +++ b/src/wp-includes/xml-api/class-wp-xml-malformed-error.php @@ -0,0 +1,32 @@ +level_of_concern = $level_of_concern; + } + + public function get_level_of_concern(): int { + switch ( $this->level_of_concern ) { + case WP_XML_Processor::CONCERNED_ABOUT_EVERYTHING: + return 'EVERYTHING'; + + case WP_XML_Processor::CONCERNED_ABOUT_CONTENT: + return 'CONTENT'; + + case WP_XML_Processor::CONCERNED_ABOUT_BENIGN_SYNTAX: + return 'BENIGN_SYNTAX'; + + case WP_XML_Processor::CONCERNED_ABOUT_INVALID_SYNTAX: + return 'INVALID_SYNTAX'; + + case WP_XML_Processor::CONCERNED_ABOUT_UNRESOLVABLES: + return 'UNRESOLVABLE'; + + default: + throw new ValueError( "Unknown level of concern: {$this->level_of_concern}" ); + } + } +} diff --git a/src/wp-includes/xml-api/class-wp-xml-processor.php b/src/wp-includes/xml-api/class-wp-xml-processor.php new file mode 100644 index 0000000000000..60d92aba182ad --- /dev/null +++ b/src/wp-includes/xml-api/class-wp-xml-processor.php @@ -0,0 +1,434 @@ +xml = $xml; + } + + /** + * Advances to the next syntactic token in the XML document. + * + * @since {WP_VERSION} + * + * @return bool Whether a well-formed token was found. + */ + public function next_token(): bool { + try { + return $this->base_class_next_token(); + } catch ( WP_XML_Malformed_Error $e ) { + /* + * @todo Return `true` after preventing normative functions like `get_token_name()` + * from returning values. `get_malformed_token_name()` family of functions + * should take their place. + */ + return false; + } + } + + /** + * Advances to the next syntactic token in the XML document. + * + * @since {WP_VERSION} + * + * @throws WP_XML_Malformed_Error When encountering an error of sufficient concern. + * + * @return bool + */ + public function base_class_next_token(): bool { + $xml = $this->xml; + $at = $this->next_byte_at; + $end = strlen( $xml ); + $last_byte_at = $end - 1; + + while ( $at < $end ) { + $was_at = $at; + + /* + * Transitions in the parsing model occur at the following: + * + * - `<` starts tags, comments, CDATA, DTDs, PI nodes, and declarations + * - `&` starts entity refs and character refs. + * - `%` starts parameter-entity refs. + * + * @todo Probably only need to transition on `<`. + */ + $next_transition_after = strcspn( $xml, '<&%', $at, $end - $at ); + $at = $at + $next_transition_after; + if ( $at >= $end ) { + // @todo Return a text node? + } + + if ( $at > $was_at ) { + // @todo Create a text node. + } + + // Officially move to where this token begins. + $this->next_byte_at = $at; + + // Enter a comment. + if ( 0 === substr_compare( $xml, '', min( $end, $text_starts_at ) ); + + // Maybe the comment ends in the next chunk. + if ( false === $closer_at ) { + $this->parser_state = self::STATE_INCOMPLETE; + return false; + } + + $this->parser_state = self::STATE_COMMENT; + $this->next_byte_at = $closer_at + 3; + + if ( + $this->level_of_concern >= self::CONCERNED_ABOUT_CONTENT && + ( $closer_at - $text_starts_at ) !== self::skip_chars( $xml, $text_starts_at, $closer_at ) + ) { + $this->malformed( + self::CONCERNED_ABOUT_CONTENT, + 'Comment nodes must contain only allowable characters.' + ); + } + + if ( + $this->level_of_concern >= self::CONCERNED_ABOUT_BENIGN_SYNTAX && + strpos( $xml, '--', $text_starts_at ) < $closer_at + ) { + /* + * In SGML, the double-hyphen delimits comments within markup + * declarations. In order to maintain compatability with SGML + * parsers, XML thus forbids their inclusion inside comments. + * + * Since WordPress is not concerned with SGML parsers, this + * parser considers this a benign syntax error, as it does not + * confuse the understanding of the structure of the input. + */ + $this->malformed( + self::CONCERNED_ABOUT_BENIGN_SYNTAX, + 'Comment nodes must not contain a double-hyphen.' + ); + } + + return true; + } + + // Enter a Processing Instruction Node. + if ( 0 === substr_compare( $xml, '', min( $end, $text_starts_at ) ); + + if ( false === $closer_at ) { + $this->parser_state = self::STATE_INCOMPLETE; + return false; + } + + $target_length = self::skip_whitespace( $xml, $text_starts_at, $closer_at ); + + // "< ? ? >" + if ( $this->level_of_concern >= self::CONCERNED_ABOUT_INVALID_SYNTAX ) { + if ( 0 === $target_length ) { + $this->malformed( + self::CONCERNED_ABOUT_INVALID_SYNTAX, + 'Processing instruction nodes must contain a target indicating to which application the instruction is directed.' + ); + } + + if ( self::skip_name( $xml, $text_starts_at ) !== $target_length ) { + $this->malformed( + self::CONCERNED_ABOUT_INVALID_SYNTAX, + 'Processing instruction target names must contain only allowable characters.' + ); + } + } + + // This is an XML Declaration. + if ( 3 === $target_length && 0 === substr_compare( $xml, $text_starts_at, 'xml', 3, false ) ) { + $this->parser_state = self::STATE_XML_DECLARATION; + + // @todo Parse version and encoding and standalone declaration. + return true; + } + + if ( $this->level_of_concern >= self::CONCERNED_ABOUT_BENIGN_SYNTAX ) { + if ( 3 === $target_length && 0 === substr_compare( $xml, $text_starts_at, 'xml', 3, true ) ) { + $this->malformed( + self::CONCERNED_ABOUT_BENIGN_SYNTAX, + 'Processing instruction target names matching "XML" (in a case-insensitive manner) are reserved names.' + ); + } + + $character_data_length = $closer_at - ( $text_starts_at + $target_length ); + if ( self::skip_chars( $xml, $text_starts_at + $target_length, $closer_at ) !== $character_data_length ) { + $this->malformed( + self::CONCERNED_ABOUT_CONTENT, + 'Processing instruction nodes must contain only allowable characters.' + ); + } + } + + return true; + } + } + + // @todo Placeholder until other tokens are supported. + return false; + } + + /** + * Pauses the parser and indicates that the currently-matched token is malformed. + * + * @since {WP_VERSION} + * + * @see self::CONCERNED_ABOUT_EVERYTHING + * @see self::CONCERNED_ABOUT_CONTENT + * @see self::CONCERNED_ABOUT_BENIGN_SYNTAX + * @see self::CONCERNED_ABOUT_INVALID_SYNTAX + * @see self::CONCERNED_ABOUT_UNRESOLVABLES + * + * @throws WP_XML_Malformed_Error This function is a helper to throw this error. + * + * @param int $level_of_concern One of the levels-of-concern constants in this class. + * @param string $reason Explanation for why the markup is malformed. + */ + protected function malformed( int $level_of_concern, string $reason ): void { + $this->parser_state |= self::STATE_MALFORMED; + throw new WP_XML_Malformed_Error( $level_of_concern, $reason ); + } + + /** + * Starts parsing an XML document. + * + * > document ::= prolog element Misc* + * + * @see https://www.w3.org/TR/xml/#NT-document + * + * @return bool + */ + private function step_initial(): bool { + // Look for prolog + } + + /* + * Parser helpers. + */ + + /** + * Parses a sequence of XML `Char`. + * + * > [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] + * > // any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. + * + * @see https://www.w3.org/TR/xml/#dt-character + * + * @param string $xml + * @param int $start_at + * @param int|null $end_at + * + * @return int + */ + public static function skip_chars( string $xml, int $start_at, int $end_at = null ): int { + $at = $start_at; + $end = $end_at ?? strlen( $xml ); + + while ( $at < $end ) { + $code_point = ord( $xml[ $at ] ); + + if ( + ( 0x20 <= $code_point && $code_point <= 0x7F ) || + 0x09 === $code_point || 0x0A === $code_point || 0x0D === $code_point + ) { + ++$at; + continue; + } + + break; + } + + return $at - $start_at; + } + + /** + * Parses an XML `Name`. + * + * > [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | + * > [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | + * > [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | + * > [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | + * > [#x10000-#xEFFFF] + * > [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | + * > [#x0300-#x036F] | [#x203F-#x2040] + * > [5] Name ::= NameStartChar (NameChar)* + * + * @see https://www.w3.org/TR/xml/#NT-Name + * + * @since {WP_VERSION} + * + * @param int $at Byte offset into input document to begin parsing name. + * @return int|null Byte length of parsed name, if found, otherwise `null`. + */ + public static function skip_name( string $xml, int $at ): int { + $starting_chars = self::skip_name_start_chars( $xml, $at ); + if ( 0 === $starting_chars ) { + return 0; + } + + return $starting_chars + self::skip_name_chars( $xml, $at + $starting_chars ); + } + + public static function skip_name_chars( string $xml, int $at ): int { + $end = strlen( $xml ); + $was_at = $at; + + while ( $at < $end ) { + $at += strspn( $xml, ':ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz-0123456789.', $at, $end - $at ); + /* + * @todo Parse multi-byte UTF-8 characters with `utf8_parse_next()`. + * + * Example: + * + * $code_point = utf_parse_next( $xml, $at + $stride, $consumed_bytes ); + * if ( null === $code_point ) { + * throw new ValueError( 'Invalid UTF-8 cannot appear in an XML Name' ); + * } + * + * if ( 0xB7 === $code_point || ( 0xC0 <= $code_point && $code_point <= D6 ) || ... ) { + * $at += $consumed_bytes; + * continue; + * } + */ + + break; + } + + return $at - $was_at; + } + + public static function skip_name_start_chars( string $xml, int $at ): int { + $end = strlen( $xml ); + $was_at = $at; + + while ( $at < $end ) { + $at += strspn( $xml, ':ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz', $at, $end - $at ); + /* + * @todo Parse multi-byte UTF-8 characters with `utf8_parse_next()`. + * + * Example: + * + * $code_point = utf_parse_next( $xml, $at + $stride, $consumed_bytes ); + * if ( null === $code_point ) { + * throw new ValueError( 'Invalid UTF-8 cannot appear in an XML Name' ); + * } + * + * if ( ( 0xC0 <= $code_point && $code_point <= D6 ) || ... ) { + * $at += $consumed_bytes; + * continue; + * } + */ + + break; + } + + return $at - $was_at; + } + + public static function skip_whitespace( string $xml, int $at, int $end = null ): int { + return strspn( $xml, "\x20\x09\x0D\x0A", $at, $end ); + } + + /* + * Constants + */ + + /* + * Parser states. + */ + + const STATE_READY = 0; + + const STATE_TEXT_NODE = 1; + + const STATE_START_TAG = self::STATE_TEXT_NODE << 1; + + const STATE_END_TAG = self::STATE_START_TAG << 1; + + const STATE_EMPTY_TAG = self::STATE_END_TAG << 1; + + const STATE_COMMENT = self::STATE_EMPTY_TAG << 1; + + const STATE_CDATA_SECTION = self::STATE_COMMENT << 1; + + const STATE_DOCTYPE = self::STATE_CDATA_SECTION << 1; + + const STATE_PI_NODE = self::STATE_DOCTYPE << 1; + + const STATE_XML_DECLARATION = self::STATE_PI_NODE << 1; + + const STATE_COMPLETE = self::STATE_XML_DECLARATION << 1; + + const STATE_INCOMPLETE = self::STATE_COMPLETE << 1; + + const STATE_MALFORMED = self::STATE_INCOMPLETE << 1; + + /* + * Levels of concern. + */ + + + const CONCERNED_ABOUT_UNRESOLVABLES = 0; + + const CONCERNED_ABOUT_INVALID_SYNTAX = self::CONCERNED_ABOUT_UNRESOLVABLES + 1; + + const CONCERNED_ABOUT_BENIGN_SYNTAX = self::CONCERNED_ABOUT_INVALID_SYNTAX + 1; + + const CONCERNED_ABOUT_CONTENT = self::CONCERNED_ABOUT_BENIGN_SYNTAX + 1; + + const CONCERNED_ABOUT_EVERYTHING = self::CONCERNED_ABOUT_CONTENT + 1; + +} diff --git a/src/wp-settings.php b/src/wp-settings.php index d3dfe5776ee88..532250b0980f4 100644 --- a/src/wp-settings.php +++ b/src/wp-settings.php @@ -263,6 +263,7 @@ require ABSPATH . WPINC . '/html-api/class-wp-html-stack-event.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-processor-state.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-processor.php'; +require ABSPATH . WPINC . '/xml-api/class-wp-xml-processor.php'; require ABSPATH . WPINC . '/class-wp-http.php'; require ABSPATH . WPINC . '/class-wp-http-streams.php'; require ABSPATH . WPINC . '/class-wp-http-curl.php';