From 4a8ca98fc72ed28ec00693a56efe35833c70f751 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Mon, 15 Dec 2025 20:03:45 +0100 Subject: [PATCH 01/60] Auto-escape JavaScript and JSON script tags when necessary --- .../html-api/class-wp-html-tag-processor.php | 245 ++++++++++++++++-- .../wpHtmlTagProcessorModifiableText.php | 149 ++++++++++- 2 files changed, 373 insertions(+), 21 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 31c4bc8a10654..1f2a4a3dc62ac 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -3811,29 +3811,83 @@ public function set_modifiable_text( string $plaintext_content ): bool { switch ( $this->get_tag() ) { case 'SCRIPT': - /** - * This is over-protective, but ensures the update doesn't break - * the HTML structure of the SCRIPT element. + /* + * SCRIPT tag contents can be dangerous. + * + * The text `` could close the SCRIPT element prematurely. * - * More thorough analysis could track the HTML tokenizer states - * and to ensure that the SCRIPT element closes at the expected - * SCRIPT close tag as is done in {@see ::skip_script_data()}. + * The text ``. A SCRIPT element could be prevented from - * closing by contents like `' => array( '', 'Comments end in -->' ), - 'Comment with --!>' => array( '', 'Invalid but legitimate comments end in --!>' ), - 'SCRIPT with ' => array( '', 'Just a ' ), - 'SCRIPT with ' => array( '', 'beforeafter' ), - 'SCRIPT with "', '' => array( '', 'Comments end in -->' ), + 'Comment with --!>' => array( '', 'Invalid but legitimate comments end in --!>' ), + 'Non-JS SCRIPT with ', ' └─────┐ + * │ ▼ │ │ + * │ ┌─────────────────────────────────────────┐ │ + * │ + * │ │ ' + * + * The original source of this graph is included at the bottom of this file. + * * @see https://html.spec.whatwg.org/#restrictions-for-contents-of-script-elements */ private function escape_javascript_script_contents( string $text ): string { @@ -4133,6 +4165,7 @@ static function ( $matches ) { * does not allow backslash escaping of "<", so there's no need to * consider whether the "<" is escaped. * + * @see WP_HTML_Tag_Processor::escape_javascript_script_contents() * @see https://www.json.org/json-en.html */ private function escape_json_script_contents( string $text ): string { @@ -4932,3 +4965,40 @@ public function get_doctype_info(): ?WP_HTML_Doctype_Info { */ const TEXT_IS_WHITESPACE = 'TEXT_IS_WHITESPACE'; } + +/* +# This is the original Graphviz source for the SCRIPT content +# parsinge behavior. It's used in the documention of +# `WP_HTML_Tag_Processor::escape_javascript_script_contents()`. +# ==== +digraph { + rankdir=TB; + + // Entry point + entry [shape=plaintext label="Open script"]; + entry -> script_data; + + // Double-circle states arranged more compactly + data [shape=doublecircle label="Close script"]; + script_data [shape=doublecircle color=blue label="script\ndata"]; + script_data_escaped [shape=circle color=orange label="escaped"]; + script_data_double_escaped [shape=circle color=red label="double\nescaped"]; + + // Group related nodes on same ranks where possible + {rank=same; script_data script_data_escaped script_data_double_escaped} + + script_data -> script_data [label=""]; + script_data_escaped -> script_data_double_escaped [label=" data [label=" script_data [label="-->"]; + script_data_double_escaped -> script_data_escaped [label=" Date: Mon, 22 Dec 2025 18:44:01 +0100 Subject: [PATCH 51/60] Improve linking between escapes --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index fc1f51360daf4..92462873f7dd7 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -4165,7 +4165,7 @@ static function ( $matches ) { * does not allow backslash escaping of "<", so there's no need to * consider whether the "<" is escaped. * - * @see WP_HTML_Tag_Processor::escape_javascript_script_contents() + * For more details, see {@see WP_HTML_Tag_Processor::escape_javascript_script_contents()}. * @see https://www.json.org/json-en.html */ private function escape_json_script_contents( string $text ): string { From 83ff62fab13559e0359a84d3d75e158806dfab62 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Mon, 22 Dec 2025 18:51:16 +0100 Subject: [PATCH 52/60] Fix comments, typos, lints --- .../html-api/class-wp-html-tag-processor.php | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 92462873f7dd7..05df59d40671b 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -3811,14 +3811,14 @@ public function set_modifiable_text( string $plaintext_content ): bool { switch ( $this->get_tag() ) { case 'SCRIPT': - /* - * SCRIPT tag contents can be dangerous: + /** + * Identify risky script contents to escape when possible or reject otherwise: * * - "" could close the SCRIPT element prematurely. - * - "` + * within a a text/plain SCRIPT tag. * * @ticket 61617 * @ticket 62797 @@ -467,7 +468,7 @@ public function test_rejects_dangerous_updates( string $html_with_nonempty_modif $this->assertFalse( $processor->set_modifiable_text( $invalid_update ), - 'Should have reject possibly-compromising modifiable text update.' + 'Should have rejected possibly-compromising modifiable text update.' ); // Flush updates. @@ -490,7 +491,7 @@ public static function data_unallowed_modifiable_text_updates() { 'Comment with -->' => array( '', 'Comments end in -->' ), 'Comment with --!>' => array( '', 'Invalid but legitimate comments end in --!>' ), 'Non-JS SCRIPT with ', '` within a comment or `` - * within a a text/plain SCRIPT tag. + * within a text/plain SCRIPT tag. * * @ticket 61617 * @ticket 62797 From 402ae9f9feb0fb1ff21ffbe44e930d23c5b1ecc2 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Mon, 22 Dec 2025 19:08:04 +0100 Subject: [PATCH 55/60] Fix \c -> \r (carriage return) typo --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 05df59d40671b..a5f46af35077b 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -4134,7 +4134,7 @@ private function is_json_script_tag(): bool { * └─────────▶ ║ ║ └────────── │ escaped │ ─┘ * ╚══════════════╝ └───────────┘ * - * † = Case insensitive 'script' followed by one of ' \t\f\c\n/>' + * † = Case insensitive 'script' followed by one of ' \t\f\r\n/>' * * The original source of this graph is included at the bottom of this file. * @@ -4998,7 +4998,7 @@ public function get_doctype_info(): ?WP_HTML_Doctype_Info { script_data_double_escaped -> script_data [label="-->"]; script_data_double_escaped -> script_data_escaped [label=" Date: Mon, 22 Dec 2025 19:11:16 +0100 Subject: [PATCH 56/60] Add note about not parsing MIME types for JS script tags --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index a5f46af35077b..a0dfcb7e2ff2a 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -3895,6 +3895,9 @@ static function ( $tag_match ) { /** * Indicates if the currently matched tag is a JavaScript script tag. * + * Note that this does not parse a MIME type. This behavior is well-documented in + * in the HTML standard and uses string comparisons, *not* actual MIME Types. + * * @see https://html.spec.whatwg.org/multipage/scripting.html#prepare-the-script-element * * @ignore From eef0ccbd89e00afffc6554a050801ee4cc056718 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Mon, 22 Dec 2025 19:12:13 +0100 Subject: [PATCH 57/60] Add todo comment to is_json_script_tag --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 1 + 1 file changed, 1 insertion(+) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index a0dfcb7e2ff2a..6b87d250288b3 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -4016,6 +4016,7 @@ private function is_javascript_script_tag(): bool { * * @ignore * @todo Consider a public API that is clear and general. + * @todo Use a MIME type parser when available. * * @since 7.0.0 * From 71d268670ba337f763c04baf9830bed074d03546 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Mon, 22 Dec 2025 19:15:04 +0100 Subject: [PATCH 58/60] Re-order tag name termination chars to match elsewhere --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 6b87d250288b3..0ccf83828f15d 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -4146,7 +4146,7 @@ private function is_json_script_tag(): bool { */ private function escape_javascript_script_contents( string $text ): string { return preg_replace_callback( - '~(?Ps)(?Pcript[\\t\\r\\n\\f />])~i', + '~(?Ps)(?Pcript[ \\t\\f\\r\\n/>])~i', static function ( $matches ) { $escaped_s_char = 's' === $matches['S_CHAR'] ? '\\u0073' From d4693a27ffaf7950b7e5aaef4c4d47268c07607d Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Mon, 22 Dec 2025 19:15:37 +0100 Subject: [PATCH 59/60] Fix typo --- tests/phpunit/tests/html-api/wpHtmlTagProcessorScriptTag.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessorScriptTag.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessorScriptTag.php index 9f89f78a2fcd4..f7da39a887d71 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessorScriptTag.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessorScriptTag.php @@ -102,7 +102,7 @@ public static function data_is_javascript_script_tag(): array { 'Script tag with language="jscript"' => array( '', true ), 'Script tag with language="livescript"' => array( '', true ), - // Whitespace is not trimmed in the langauge attribute. + // Whitespace is not trimmed in the language attribute. 'Script tag with language=" javascript"' => array( '', false ), // Non-JavaScript script tags - should NOT be JavaScript. From 4c3b0b21a4d6f795984733d21ceb384a86920cba Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Mon, 29 Dec 2025 17:50:00 +0100 Subject: [PATCH 60/60] Update comments on tag prefixes matching search pattern --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 0ccf83828f15d..d53709051062d 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -3814,8 +3814,8 @@ public function set_modifiable_text( string $plaintext_content ): bool { /** * Identify risky script contents to escape when possible or reject otherwise: * - * - "" could close the SCRIPT element prematurely. - * - "