From d4563edef832c195e0405c744dfce8afaf95c144 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 28 Nov 2024 08:47:48 +0100 Subject: [PATCH 1/3] Only allow context in ::create_fragment Partial revert of https://core.trac.wordpress.org/changeset/59467. --- .../html-api/class-wp-html-processor.php | 36 ++----- .../html-api/wpHtmlProcessorHtml5lib.php | 98 +++++++++++-------- 2 files changed, 64 insertions(+), 70 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 1be795c5c7de2..f665fa55c3e5a 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -279,44 +279,24 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * form is provided because a context element may have attributes that * impact the parse, such as with a SCRIPT tag and its `type` attribute. * - * Example: - * - * // Usually, snippets of HTML ought to be processed in the default `` context: - * $processor = WP_HTML_Processor::create_fragment( '

Hi

' ); - * - * // Some fragments should be processed in the correct context like this SVG: - * $processor = WP_HTML_Processor::create_fragment( '', '' ); - * - * // This fragment with TD tags should be processed in a TR context: - * $processor = WP_HTML_Processor::create_fragment( - * '123', - * '' - * ); - * - * In order to create a fragment processor at the correct location, the - * provided fragment will be processed as part of a full HTML document. - * The processor will search for the last opener tag in the document and - * create a fragment processor at that location. The document will be - * forced into "no-quirks" mode by including the HTML5 doctype. - * - * For advanced usage and precise control over the context element, use - * `WP_HTML_Processor::create_full_processor()` and - * `WP_HTML_Processor::create_fragment_at_current_node()`. + * ## Current HTML Support * - * UTF-8 is the only allowed encoding. If working with a document that - * isn't UTF-8, first convert the document to UTF-8, then pass in the - * converted HTML. + * - The only supported context is `<body>`, which is the default value. + * - The only supported document encoding is `UTF-8`, which is the default value. * * @since 6.4.0 * @since 6.6.0 Returns `static` instead of `self` so it can create subclass instances. - * @since 6.8.0 Can create fragments with any context element. * * @param string $html Input HTML fragment to process. - * @param string $context Context element for the fragment. Defaults to `<body>`. + * @param string $context Context element for the fragment, must be default of `<body>`. * @param string $encoding Text encoding of the document; must be default of 'UTF-8'. * @return static|null The created processor if successful, otherwise null. 
 */ public static function create_fragment( $html, $context = '<body>', $encoding = 'UTF-8' ) { + if ( '<body>' !== $context || 'UTF-8' !== $encoding ) { + return null; + } + $context_processor = static::create_full_parser( "<!DOCTYPE html>{$context}", $encoding ); if ( null === $context_processor ) { return null; diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php index 5e0c3b77f8732..7abe63a859954 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php @@ -153,55 +153,69 @@ private static function should_skip_test( ?string $test_context_element, string * @return string|null Tree structure of parsed HTML, if supported, else null. */ private static function build_tree_representation( ?string $fragment_context, string $html ) { + $processor = null; if ( $fragment_context ) { - /* - * If the string of characters starts with "svg ", the context - * element is in the SVG namespace and the substring after - * "svg " is the local name. If the string of characters starts - * with "math ", the context element is in the MathML namespace - * and the substring after "math " is the local name. - * Otherwise, the context element is in the HTML namespace and - * the string is the local name. - */ - if ( str_starts_with( $fragment_context, 'svg ' ) ) { - $tag_name = substr( $fragment_context, 4 ); - if ( 'svg' === $tag_name ) { - $fragment_context_html = ''; + if ( 'body' === $fragment_context ) { + $processor = WP_HTML_Processor::create_fragment( $html ); + } else { + + /* + * If the string of characters starts with "svg ", the context + * element is in the SVG namespace and the substring after + * "svg " is the local name. If the string of characters starts + * with "math ", the context element is in the MathML namespace + * and the substring after "math " is the local name. 
+ * Otherwise, the context element is in the HTML namespace and + * the string is the local name. + */ + if ( str_starts_with( $fragment_context, 'svg ' ) ) { + $tag_name = substr( $fragment_context, 4 ); + if ( 'svg' === $tag_name ) { + $parent_processor = WP_HTML_Processor::create_full_parser( '' ); + } else { + $parent_processor = WP_HTML_Processor::create_full_parser( "<{$tag_name}>" ); + } + $parent_processor->next_tag( $tag_name ); + } elseif ( str_starts_with( $fragment_context, 'math ' ) ) { + $tag_name = substr( $fragment_context, 5 ); + if ( 'math' === $tag_name ) { + $parent_processor = WP_HTML_Processor::create_full_parser( '' ); + } else { + $parent_processor = WP_HTML_Processor::create_full_parser( "<{$tag_name}>" ); + } + $parent_processor->next_tag( $tag_name ); } else { - $fragment_context_html = "<{$tag_name}>"; + if ( in_array( + $fragment_context, + array( + 'caption', + 'col', + 'colgroup', + 'tbody', + 'td', + 'tfoot', + 'th', + 'thead', + 'tr', + ), + true + ) ) { + $parent_processor = WP_HTML_Processor::create_full_parser( "
<{$fragment_context}>" ); + $parent_processor->next_tag(); + } else { + $parent_processor = WP_HTML_Processor::create_full_parser( "<{$fragment_context}>" ); + } + $parent_processor->next_tag( $fragment_context ); } - } elseif ( str_starts_with( $fragment_context, 'math ' ) ) { - $tag_name = substr( $fragment_context, 5 ); - if ( 'math' === $tag_name ) { - $fragment_context_html = ''; - } else { - $fragment_context_html = "<{$tag_name}>"; + if ( null !== $parent_processor->get_unsupported_exception() ) { + throw $parent_processor->get_unsupported_exception(); } - } else { - // Tags that only appear in tables need a special case. - if ( in_array( - $fragment_context, - array( - 'caption', - 'col', - 'colgroup', - 'tbody', - 'td', - 'tfoot', - 'th', - 'thead', - 'tr', - ), - true - ) ) { - $fragment_context_html = "
<{$fragment_context}>"; - } else { - $fragment_context_html = "<{$fragment_context}>"; + if ( null !== $parent_processor->get_last_error() ) { + throw new Exception( $parent_processor->get_last_error() ); } + $processor = $parent_processor->create_fragment_at_current_node( $html ); } - $processor = WP_HTML_Processor::create_fragment( $html, $fragment_context_html ); - if ( null === $processor ) { throw new WP_HTML_Unsupported_Exception( "Could not create a parser with the given fragment context: {$fragment_context}.", '', 0, '', array(), array() ); } From 792763b0731411b15d32b6dc21adf9cdf047a984 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 28 Nov 2024 08:52:17 +0100 Subject: [PATCH 2/3] Make create_fragment_at_current_node private Keep this method private for iteration as an internal method to continue improving the interface. Introduced in https://core.trac.wordpress.org/changeset/59444 --- .../html-api/class-wp-html-processor.php | 2 +- .../html-api/wpHtmlProcessorHtml5lib.php | 80 +++---------------- 2 files changed, 10 insertions(+), 72 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index f665fa55c3e5a..e88757ec7b4c2 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -455,7 +455,7 @@ function ( WP_HTML_Token $token ): void { * @param string $html Input HTML fragment to process. * @return static|null The created processor if successful, otherwise null. 
*/ - public function create_fragment_at_current_node( string $html ) { + private function create_fragment_at_current_node( string $html ) { if ( $this->get_token_type() !== '#tag' || $this->is_tag_closer() ) { _doing_it_wrong( __METHOD__, diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php index 7abe63a859954..a03a9ab806a93 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php @@ -138,6 +138,10 @@ public function data_external_html5lib_tests() { * @return bool True if the test case should be skipped. False otherwise. */ private static function should_skip_test( ?string $test_context_element, string $test_name ): bool { + if ( null !== $test_context_element && 'body' !== $test_context_element ) { + return true; + } + if ( array_key_exists( $test_name, self::SKIP_TESTS ) ) { return true; } @@ -153,77 +157,11 @@ private static function should_skip_test( ?string $test_context_element, string * @return string|null Tree structure of parsed HTML, if supported, else null. */ private static function build_tree_representation( ?string $fragment_context, string $html ) { - $processor = null; - if ( $fragment_context ) { - if ( 'body' === $fragment_context ) { - $processor = WP_HTML_Processor::create_fragment( $html ); - } else { - - /* - * If the string of characters starts with "svg ", the context - * element is in the SVG namespace and the substring after - * "svg " is the local name. If the string of characters starts - * with "math ", the context element is in the MathML namespace - * and the substring after "math " is the local name. - * Otherwise, the context element is in the HTML namespace and - * the string is the local name. 
- */ - if ( str_starts_with( $fragment_context, 'svg ' ) ) { - $tag_name = substr( $fragment_context, 4 ); - if ( 'svg' === $tag_name ) { - $parent_processor = WP_HTML_Processor::create_full_parser( '' ); - } else { - $parent_processor = WP_HTML_Processor::create_full_parser( "<{$tag_name}>" ); - } - $parent_processor->next_tag( $tag_name ); - } elseif ( str_starts_with( $fragment_context, 'math ' ) ) { - $tag_name = substr( $fragment_context, 5 ); - if ( 'math' === $tag_name ) { - $parent_processor = WP_HTML_Processor::create_full_parser( '' ); - } else { - $parent_processor = WP_HTML_Processor::create_full_parser( "<{$tag_name}>" ); - } - $parent_processor->next_tag( $tag_name ); - } else { - if ( in_array( - $fragment_context, - array( - 'caption', - 'col', - 'colgroup', - 'tbody', - 'td', - 'tfoot', - 'th', - 'thead', - 'tr', - ), - true - ) ) { - $parent_processor = WP_HTML_Processor::create_full_parser( "
<{$fragment_context}>" ); - $parent_processor->next_tag(); - } else { - $parent_processor = WP_HTML_Processor::create_full_parser( "<{$fragment_context}>" ); - } - $parent_processor->next_tag( $fragment_context ); - } - if ( null !== $parent_processor->get_unsupported_exception() ) { - throw $parent_processor->get_unsupported_exception(); - } - if ( null !== $parent_processor->get_last_error() ) { - throw new Exception( $parent_processor->get_last_error() ); - } - $processor = $parent_processor->create_fragment_at_current_node( $html ); - } - - if ( null === $processor ) { - throw new WP_HTML_Unsupported_Exception( "Could not create a parser with the given fragment context: {$fragment_context}.", '', 0, '', array(), array() ); - } - } else { - $processor = WP_HTML_Processor::create_full_parser( $html ); - if ( null === $processor ) { - throw new Exception( 'Could not create a full parser.' ); - } + $processor = $fragment_context + ? WP_HTML_Processor::create_fragment( $html, "<{$fragment_context}>" ) + : WP_HTML_Processor::create_full_parser( $html ); + if ( null === $processor ) { + throw new WP_HTML_Unsupported_Exception( "Could not create a parser with the given fragment context: {$fragment_context}.", '', 0, '', array(), array() ); } $output = ''; From 77e3f74822eb099ffb288fbb0be90521aa6435ab Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 28 Nov 2024 11:23:52 +0100 Subject: [PATCH 3/3] Remove unused fragment test suite --- .../wpHtmlProcessorFragmentParsing.php | 178 ------------------ 1 file changed, 178 deletions(-) delete mode 100644 tests/phpunit/tests/html-api/wpHtmlProcessorFragmentParsing.php diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorFragmentParsing.php b/tests/phpunit/tests/html-api/wpHtmlProcessorFragmentParsing.php deleted file mode 100644 index 4913fa07eb412..0000000000000 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorFragmentParsing.php +++ /dev/null @@ -1,178 +0,0 @@ -' ); - $this->assertTrue( $processor->next_tag( 'SVG' ) 
); - - $fragment = $processor->create_fragment_at_current_node( "\0preceded-by-nul-byte
" ); - - $this->assertSame( 'svg', $fragment->get_namespace() ); - $this->assertTrue( $fragment->next_token() ); - - /* - * In HTML parsing, a nul byte would be ignored. - * In SVG it should be replaced with a replacement character. - */ - $this->assertSame( '#text', $fragment->get_token_type() ); - $this->assertSame( "\u{FFFD}", $fragment->get_modifiable_text() ); - - $this->assertTrue( $fragment->next_tag( 'RECT' ) ); - $this->assertSame( 'svg', $fragment->get_namespace() ); - - $this->assertTrue( $fragment->next_tag( 'CIRCLE' ) ); - $this->assertSame( array( 'HTML', 'SVG', 'CIRCLE' ), $fragment->get_breadcrumbs() ); - $this->assertTrue( $fragment->next_tag( 'foreignObject' ) ); - $this->assertSame( 'svg', $fragment->get_namespace() ); - } - - /** - * @ticket 62357 - */ - public function test_create_fragment_at_current_node_in_foreign_content_integration_point() { - $processor = WP_HTML_Processor::create_full_parser( '' ); - $this->assertTrue( $processor->next_tag( 'foreignObject' ) ); - - $fragment = $processor->create_fragment_at_current_node( "\0not-preceded-by-nul-byte" ); - - // Nothing has been processed, the html namespace should be used for parsing as an integration point. - $this->assertSame( 'html', $fragment->get_namespace() ); - - // HTML parsing transforms IMAGE into IMG. - $this->assertTrue( $fragment->next_tag( 'IMG' ) ); - - $this->assertTrue( $fragment->next_token() ); - - // In HTML parsing, the nul byte is ignored and the text is reached. - $this->assertSame( '#text', $fragment->get_token_type() ); - $this->assertSame( 'not-preceded-by-nul-byte', $fragment->get_modifiable_text() ); - - /* - * svg:foreignObject is an HTML integration point, so the processor should be in the HTML namespace. - * RECT is an HTML element here, meaning it may have the self-closing flag but does not self-close. 
- */ - $this->assertTrue( $fragment->next_tag( 'RECT' ) ); - $this->assertSame( array( 'HTML', 'FOREIGNOBJECT', 'RECT' ), $fragment->get_breadcrumbs() ); - $this->assertSame( 'html', $fragment->get_namespace() ); - $this->assertTrue( $fragment->has_self_closing_flag() ); - $this->assertTrue( $fragment->expects_closer() ); - } - - /** - * @expectedIncorrectUsage WP_HTML_Processor::create_fragment_at_current_node - * @ticket 62357 - */ - public function test_prevent_fragment_creation_on_closers() { - $processor = WP_HTML_Processor::create_full_parser( '

' ); - $processor->next_tag( 'P' ); - $processor->next_tag( - array( - 'tag_name' => 'P', - 'tag_closers' => 'visit', - ) - ); - $this->assertSame( 'P', $processor->get_tag() ); - $this->assertTrue( $processor->is_tag_closer() ); - $this->assertNull( $processor->create_fragment_at_current_node( 'fragment HTML' ) ); - } - - /** - * Verifies that the fragment parser doesn't allow invalid context nodes. - * - * This includes void elements and self-contained elements because they can - * contain no inner HTML. Operations on self-contained elements should occur - * through methods such as {@see WP_HTML_Tag_Processor::set_modifiable_text}. - * - * @ticket 62584 - * - * @dataProvider data_invalid_fragment_contexts - * - * @param string $context Invalid context node for fragment parser. - */ - public function test_rejects_invalid_fragment_contexts( string $context, string $doing_it_wrong_method_name ) { - $this->setExpectedIncorrectUsage( "WP_HTML_Processor::{$doing_it_wrong_method_name}" ); - $this->assertNull( - WP_HTML_Processor::create_fragment( 'just a test', $context ), - "Should not have been able to create a fragment parser with context node {$context}" - ); - } - - /** - * Data provider. - * - * @return array[] - */ - public static function data_invalid_fragment_contexts() { - return array( - /* - * Invalid contexts. - */ - /* - * The text node is confused with a virtual body open tag. - * This should fail to set a bookmark in `create_fragment` - * but currently does not, it slips through and fails in - * `create_fragment_at_current_node`. - */ - 'Invalid text' => array( 'just some text', 'create_fragment_at_current_node' ), - 'Invalid comment' => array( '', 'create_fragment' ), - 'Invalid closing' => array( '', 'create_fragment' ), - 'Invalid DOCTYPE' => array( '', 'create_fragment' ), - /* - * PLAINTEXT should appear in the unsupported elements, but at the - * moment it's completely unsupported by the processor so - * the context element cannot be found. 
- */ - 'Unsupported PLAINTEXT' => array( '', 'create_fragment' ), - - /* - * Invalid contexts. - */ - 'AREA' => array( '<area>', 'create_fragment_at_current_node' ), - 'BASE' => array( '<base>', 'create_fragment_at_current_node' ), - 'BASEFONT' => array( '<basefont>', 'create_fragment_at_current_node' ), - 'BGSOUND' => array( '<bgsound>', 'create_fragment_at_current_node' ), - 'BR' => array( '<br>', 'create_fragment_at_current_node' ), - 'COL' => array( '<table><colgroup><col>', 'create_fragment_at_current_node' ), - 'EMBED' => array( '<embed>', 'create_fragment_at_current_node' ), - 'FRAME' => array( '<frameset><frame>', 'create_fragment_at_current_node' ), - 'HR' => array( '<hr>', 'create_fragment_at_current_node' ), - 'IMG' => array( '<img>', 'create_fragment_at_current_node' ), - 'INPUT' => array( '<input>', 'create_fragment_at_current_node' ), - 'KEYGEN' => array( '<keygen>', 'create_fragment_at_current_node' ), - 'LINK' => array( '<link>', 'create_fragment_at_current_node' ), - 'META' => array( '<meta>', 'create_fragment_at_current_node' ), - 'PARAM' => array( '<param>', 'create_fragment_at_current_node' ), - 'SOURCE' => array( '<source>', 'create_fragment_at_current_node' ), - 'TRACK' => array( '<track>', 'create_fragment_at_current_node' ), - 'WBR' => array( '<wbr>', 'create_fragment_at_current_node' ), - - /* - * Unsupported elements. Include a tag closer to ensure the element can be found - * and does not pause the parser at an incomplete token. 
- */ - 'IFRAME' => array( '<iframe></iframe>', 'create_fragment_at_current_node' ), - 'NOEMBED' => array( '<noembed></noembed>', 'create_fragment_at_current_node' ), - 'NOFRAMES' => array( '<noframes></noframes>', 'create_fragment_at_current_node' ), - 'SCRIPT' => array( '<script></script>', 'create_fragment_at_current_node' ), - 'SCRIPT with type' => array( '<script type="javascript"></script>', 'create_fragment_at_current_node' ), - 'STYLE' => array( '<style></style>', 'create_fragment_at_current_node' ), - 'TEXTAREA' => array( '<textarea></textarea>', 'create_fragment_at_current_node' ), - 'TITLE' => array( '<title></title>', 'create_fragment_at_current_node' ), - 'XMP' => array( '<xmp></xmp>', 'create_fragment_at_current_node' ), - ); - } -}