-
Notifications
You must be signed in to change notification settings - Fork 3.2k
Scripts: Use HTML API to build SCRIPT tags #10639
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: trunk
Are you sure you want to change the base?
Changes from all commits
4a8ca98
4a28ef2
0bef687
cdba027
ba54ae4
a697e9e
2b3d0d0
8246439
1b6b4fd
b3e88e8
27c6371
deebd54
d78cd35
c29c3d9
abeebd6
aaacd6f
3f7f227
1362451
8d30680
3b5ef4e
f8cfdf9
501d201
253b971
bcc02ae
ea03441
c7d1827
a134e82
d4bd4b3
edae8d5
02ca3c0
d058a78
dfb63af
11f51c9
a3e0e27
288b952
a7495dd
614916d
5828382
d8c320c
016a29f
369eefc
d67749d
2869880
d6bfdca
b7099e4
010a2a2
ab53486
3ba2267
64a23b7
c920821
dbd9f55
7bef55a
4333300
504a928
55d4e47
1c84037
2ef0bf0
cb6b990
da28eec
e399bf6
83c1fab
1872681
83ff62f
5979782
d469ae4
402ae9f
6b2c9ba
eef0ccb
71d2686
d4693a2
eb4e091
4c3b0b2
40015c9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3812,28 +3812,29 @@ public function set_modifiable_text( string $plaintext_content ): bool { | |
| switch ( $this->get_tag() ) { | ||
| case 'SCRIPT': | ||
| /** | ||
| * This is over-protective, but ensures the update doesn't break | ||
| * the HTML structure of the SCRIPT element. | ||
| * Identify risky script contents to escape when possible or reject otherwise: | ||
| * | ||
| * More thorough analysis could track the HTML tokenizer states | ||
| * and to ensure that the SCRIPT element closes at the expected | ||
| * SCRIPT close tag as is done in {@see ::skip_script_data()}. | ||
| * - "</script" could close the SCRIPT element prematurely. | ||
| * - "<script" could enter the “script data double escaped state” and prevent the | ||
| * SCRIPT element from closing as expected. | ||
| * | ||
| * A SCRIPT element could be closed prematurely by contents | ||
| * like `</script>`. A SCRIPT element could be prevented from | ||
| * closing by contents like `<!--<script>`. | ||
| * | ||
| * The following strings are essential for dangerous content, | ||
| * although they are insufficient on their own. This trade-off | ||
| * prevents dangerous scripts from being sent to the browser. | ||
| * It is also unlikely to produce HTML that may confuse more | ||
| * basic HTML tooling. | ||
| * @see WP_HTML_Tag_Processor::escape_javascript_script_contents() | ||
| */ | ||
| if ( | ||
| $needs_escaping = | ||
| false !== stripos( $plaintext_content, '</script' ) || | ||
| false !== stripos( $plaintext_content, '<script' ) | ||
| ) { | ||
| return false; | ||
| false !== stripos( $plaintext_content, '<script' ); | ||
| if ( $needs_escaping ) { | ||
| if ( $this->is_javascript_script_tag() ) { | ||
| $plaintext_content = $this->escape_javascript_script_contents( $plaintext_content ); | ||
| } elseif ( $this->is_json_script_tag() ) { | ||
| $plaintext_content = $this->escape_json_script_contents( $plaintext_content ); | ||
| } else { | ||
| /* | ||
| * Other types of script tags cannot be escaped safely because there is | ||
| * no general escaping mechanism for arbitrary types of content. | ||
| */ | ||
| return false; | ||
| } | ||
| } | ||
|
|
||
| $this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement( | ||
|
|
@@ -3891,6 +3892,293 @@ static function ( $tag_match ) { | |
| return false; | ||
| } | ||
|
|
||
| /** | ||
| * Indicates if the currently matched tag is a JavaScript script tag. | ||
| * | ||
| * Note that this does not parse a MIME type. This behavior is well-documented | ||
| * in the HTML standard and uses string comparisons, *not* actual MIME Types. | ||
| * | ||
| * @see https://html.spec.whatwg.org/multipage/scripting.html#prepare-the-script-element | ||
| * | ||
| * @ignore | ||
| * @todo Consider a public API that is clear and general. | ||
| * | ||
| * @since 7.0.0 | ||
| * | ||
| * @return bool True if the script tag will be evaluated as JavaScript. | ||
| */ | ||
| private function is_javascript_script_tag(): bool { | ||
| if ( 'SCRIPT' !== $this->get_tag() || $this->get_namespace() !== 'html' ) { | ||
| return false; | ||
| } | ||
|
|
||
| /* | ||
| * > If any of the following are true: | ||
| * > - el has a type attribute whose value is the empty string; | ||
| * > - el has no type attribute but it has a language attribute and that attribute's | ||
| * > value is the empty string; or | ||
| * > - el has neither a type attribute nor a language attribute, | ||
| * > then let the script block's type string for this script element be "text/javascript". | ||
| * | ||
| * The `true === …` comparisons below treat an attribute that is present without a | ||
| * value the same as one whose value is the empty string. | ||
| * NOTE(review): this assumes get_attribute() reports a valueless attribute as `true` | ||
| * and a missing attribute as null — confirm against get_attribute(). | ||
| */ | ||
| $type_attr = $this->get_attribute( 'type' ); | ||
| $language_attr = $this->get_attribute( 'language' ); | ||
| if ( true === $type_attr || '' === $type_attr ) { | ||
| return true; | ||
| } | ||
| if ( | ||
| null === $type_attr | ||
| && ( null === $language_attr || true === $language_attr || '' === $language_attr ) | ||
| ) { | ||
| return true; | ||
| } | ||
|
|
||
| /* | ||
| * > Otherwise, if el has a type attribute, then let the script block's type string be | ||
| * > the value of that attribute with leading and trailing ASCII whitespace stripped. | ||
| * > Otherwise, el has a non-empty language attribute; let the script block's type string | ||
| * > be the concatenation of "text/" and the value of el's language attribute. | ||
| * | ||
| * When the language branch is taken here, the early returns above have already | ||
| * excluded a null, `true`, or empty-string $language_attr, so it is safe to interpolate. | ||
| * A type attribute containing only whitespace trims to the empty string, which matches | ||
| * none of the cases below and therefore is not treated as JavaScript. | ||
| */ | ||
| $type_string = null !== $type_attr ? trim( $type_attr, " \t\f\r\n" ) : "text/{$language_attr}"; | ||
|
|
||
| /* | ||
| * > If the script block's type string is a JavaScript MIME type essence match, then | ||
| * > set el's type to "classic". | ||
| * | ||
| * > A string is a JavaScript MIME type essence match if it is an ASCII case-insensitive | ||
| * > match for one of the JavaScript MIME type essence strings. | ||
| * | ||
| * > A JavaScript MIME type is any MIME type whose essence is one of the following: | ||
| * > | ||
| * > - application/ecmascript | ||
| * > - application/javascript | ||
| * > - application/x-ecmascript | ||
| * > - application/x-javascript | ||
| * > - text/ecmascript | ||
| * > - text/javascript | ||
| * > - text/javascript1.0 | ||
| * > - text/javascript1.1 | ||
| * > - text/javascript1.2 | ||
| * > - text/javascript1.3 | ||
| * > - text/javascript1.4 | ||
| * > - text/javascript1.5 | ||
| * > - text/jscript | ||
| * > - text/livescript | ||
| * > - text/x-ecmascript | ||
| * > - text/x-javascript | ||
| * | ||
| * @see https://mimesniff.spec.whatwg.org/#javascript-mime-type-essence-match | ||
| * @see https://mimesniff.spec.whatwg.org/#javascript-mime-type | ||
| */ | ||
| switch ( strtolower( $type_string ) ) { | ||
| case 'application/ecmascript': | ||
| case 'application/javascript': | ||
| case 'application/x-ecmascript': | ||
| case 'application/x-javascript': | ||
| case 'text/ecmascript': | ||
| case 'text/javascript': | ||
| case 'text/javascript1.0': | ||
| case 'text/javascript1.1': | ||
| case 'text/javascript1.2': | ||
| case 'text/javascript1.3': | ||
| case 'text/javascript1.4': | ||
| case 'text/javascript1.5': | ||
| case 'text/jscript': | ||
| case 'text/livescript': | ||
| case 'text/x-ecmascript': | ||
| case 'text/x-javascript': | ||
| return true; | ||
|
|
||
| /* | ||
| * > Otherwise, if the script block's type string is an ASCII case-insensitive match for | ||
| * > the string "module", then set el's type to "module". | ||
| * | ||
| * A module is evaluated as JavaScript. | ||
| */ | ||
| case 'module': | ||
| return true; | ||
| } | ||
|
|
||
| /* | ||
| * > Otherwise, if the script block's type string is an ASCII case-insensitive match for the string "importmap", then set el's type to "importmap". | ||
| * > Otherwise, if the script block's type string is an ASCII case-insensitive match for the string "speculationrules", then set el's type to "speculationrules". | ||
| * | ||
| * These conditions indicate JSON content. | ||
| * | ||
| * They are deliberately not matched in this method and fall through to the final | ||
| * `return false`; is_json_script_tag() is the method that recognizes these types. | ||
| */ | ||
|
|
||
| /* | ||
| * > Otherwise, return. (No script is executed, and el's type is left as null.) | ||
| */ | ||
| return false; | ||
| } | ||
|
|
||
| /** | ||
| * Indicates if the currently matched tag is a JSON script tag. | ||
| * | ||
| * @ignore | ||
| * @todo Consider a public API that is clear and general. | ||
| * @todo Use a MIME type parser when available. | ||
| * | ||
| * @since 7.0.0 | ||
| * | ||
| * @return bool True if the script tag should be treated as JSON. | ||
| */ | ||
| private function is_json_script_tag(): bool { | ||
| if ( 'SCRIPT' !== $this->get_tag() || $this->get_namespace() !== 'html' ) { | ||
| return false; | ||
| } | ||
|
|
||
| $type = $this->get_attribute( 'type' ); | ||
| if ( null === $type || true === $type || '' === $type ) { | ||
| return false; | ||
| } | ||
| $type = strtolower( trim( $type, " \t\f\r\n" ) ); | ||
|
|
||
| /* | ||
| * > … | ||
| * > Otherwise, if the script block's type string is an ASCII case-insensitive match for the string "importmap", then set el's type to "importmap". | ||
| * > Otherwise, if the script block's type string is an ASCII case-insensitive match for the string "speculationrules", then set el's type to "speculationrules". | ||
| * @see https://html.spec.whatwg.org/#script-processing-model | ||
| * | ||
| * > A JSON MIME type is any MIME type whose subtype ends in "+json" or whose essence | ||
| * > is "application/json" or "text/json". | ||
| * | ||
| * @todo The JSON MIME type handling handles some common cases but when MIME type parsing is available it should be leveraged here. | ||
| * | ||
| * @see https://mimesniff.spec.whatwg.org/#json-mime-type | ||
| * | ||
| * At this point a null, `true`, or empty-string type attribute value has already | ||
| * returned false, and $type has been trimmed of ASCII whitespace and lowercased. | ||
| * | ||
| * Only exact string matches are recognized below. A type string that carries MIME | ||
| * parameters (e.g. "application/json; charset=utf-8") or that uses a "+json" subtype | ||
| * suffix (e.g. "application/ld+json") does not match any comparison and yields false. | ||
| */ | ||
| if ( | ||
| 'importmap' === $type || | ||
| 'speculationrules' === $type || | ||
| 'application/json' === $type || | ||
| 'text/json' === $type | ||
| ) { | ||
| return true; | ||
| } | ||
|
|
||
| return false; | ||
| } | ||
|
|
||
| /** | ||
| * Escape JavaScript script tag contents. | ||
| * | ||
| * Prevent JavaScript text from modifying the HTML structure of a document and | ||
| * ensure that it's contained within its enclosing SCRIPT tag as intended. | ||
| * | ||
| * JavaScript can be safely escaped with a few exceptions. This is achieved by | ||
| * replacing dangerous sequences like "<script" and "</script" with a form | ||
| * using a Unicode escape sequence "<\u0073cript>" and "</\u0073cript>". | ||
| * | ||
| * This text may appear in the JavaScript in limited ways, all of which support | ||
| * the use of Unicode escape sequences on the "s" character. The escaping is safe | ||
| * to perform in all JavaScript and the modified JavaScript maintains identical | ||
| * behavior with a few exceptions: | ||
| * | ||
| * - Comments. | ||
| * - Tagged templates like `String.raw()` that access “raw” strings. | ||
| * - The `source` property of a RegExp object. | ||
| * | ||
| * For example, this input JavaScript: | ||
| * | ||
| * // A comment: "</script>" | ||
| * | ||
| * console.log( String.raw`</script>` ); | ||
| * | ||
| * const regex = /<script>/; | ||
| * console.log( regex.source ); | ||
| * | ||
| * Is transformed to: | ||
| * | ||
| * // A comment: "</\u0073cript>" | ||
| * | ||
| * console.log( String.raw`</\u0073cript>` ); | ||
| * | ||
| * const regex = /<\u0073cript>/; | ||
| * console.log( regex.source ); | ||
| * | ||
| * Note that the RegExp's matching behavior is equivalent, meaning that | ||
| * `regex.test( '<script>' ) === true` in both the unescaped and | ||
| * escaped versions. | ||
| * | ||
| * JavaScript that relies on behavior affected by this escaping must provide | ||
| * safe script contents in order to avoid this escaping. For example, a raw string | ||
| * may be split up to make its contents safe or avoided altogether: | ||
| * | ||
| * console.log( String.raw`</script>` ); // !!UNSAFE!! Will be escaped. | ||
| * console.log( String.raw`</\u0073cript>` ); // "</\u0073cript>" | ||
| * console.log( String.raw`</scr` + String.raw`ipt>` ); // "</script>" | ||
| * console.log( String.raw`</${"script"}>` ); // "</script>" | ||
| * console.log( "\x3C/script>" ); // "</script>" | ||
| * console.log( "<\/script>" ); // "</script>" | ||
| * | ||
| * The following graph is a simplified interpretation of how HTML interprets the contents | ||
| * of a SCRIPT tag and identifies the closing tag. It is useful to understand what text | ||
| * is dangerous inside of a SCRIPT tag and why different approaches to escaping work. | ||
| * | ||
| * Open script | ||
| * │ | ||
| * │ | ||
| * ▼ | ||
| * ╔═════════════════════════════════════════╗ <!--(…)> | ||
| * ║ ║ (all dashes) | ||
| * ║ script ║ ───────────────┐ | ||
| * ║ data ║ │ | ||
| * ┌────────── ║ ║ ◀──────────────┘ | ||
| * │ ╚═════════════════════════════════════════╝ | ||
| * │ │ ▲ ▲ | ||
| * │ │ <!-- │ --> └─────┐ | ||
| * │ ▼ │ │ | ||
| * │ ┌─────────────────────────────────────────┐ │ | ||
| * │ </script† │ escaped │ │ | ||
| * │ └─────────────────────────────────────────┘ │ | ||
| * │ │ ▲ │ │ --> | ||
| * │ │ </script† │ </script† │ <script† │ | ||
| * │ ▼ │ ▼ │ | ||
| * │ ╔══════════════╗ │ ┌───────────┐ │ | ||
| * │ ║ Close script ║ │ │ double │ │ | ||
| * └─────────▶ ║ ║ └────────── │ escaped │ ─┘ | ||
| * ╚══════════════╝ └───────────┘ | ||
| * | ||
| * † = Case insensitive 'script' followed by one of ' \t\f\r\n/>' | ||
| * | ||
| * The original source of this graph is included at the bottom of this file. | ||
| * | ||
| * @see https://html.spec.whatwg.org/#restrictions-for-contents-of-script-elements | ||
| */ | ||
| private function escape_javascript_script_contents( string $text ): string { | ||
| /* | ||
| * Match "<script" or "</script" (ASCII case-insensitively) only when it is directly | ||
| * followed by a space, tab, form feed, carriage return, line feed, "/" or ">". | ||
| * The "s" is captured on its own so that it alone is rewritten while the | ||
| * surrounding HEAD and TAIL text is emitted unchanged. | ||
| */ | ||
| return preg_replace_callback( | ||
| '~(?P<HEAD></?)(?P<S_CHAR>s)(?P<TAIL>cript[ \\t\\f\\r\\n/>])~i', | ||
| static function ( $matches ) { | ||
| /* | ||
| * Preserve the letter case of the matched character: | ||
| * "\u0073" is the Unicode escape for "s" and "\u0053" for "S". | ||
| */ | ||
| $escaped_s_char = 's' === $matches['S_CHAR'] | ||
| ? '\\u0073' | ||
| : '\\u0053'; | ||
| return "{$matches['HEAD']}{$escaped_s_char}{$matches['TAIL']}"; | ||
| }, | ||
| $text | ||
| ); | ||
| } | ||
|
|
||
| /** | ||
| * Escape JSON script tag contents. | ||
| * | ||
| * Prevent JSON text from modifying the HTML structure of a document and | ||
| * ensure that it's contained within its enclosing SCRIPT tag as intended. | ||
| * | ||
| * JSON can be escaped simply by replacing "<" with its Unicode escape | ||
| * sequence "\u003C". "<" is not part of the JSON syntax and only appears | ||
| * in JSON strings, so it's always safe to escape. Furthermore, JSON does | ||
| * not allow backslash escaping of "<", so there's no need to consider | ||
| * whether the "<" is preceded by an escaping backslash. | ||
| * | ||
| * For more details, see {@see WP_HTML_Tag_Processor::escape_javascript_script_contents()}. | ||
| * @see https://www.json.org/json-en.html | ||
| */ | ||
| private function escape_json_script_contents( string $text ): string { | ||
| return strtr( | ||
| $text, | ||
| array( '<' => '\\u003C' ) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is certainly easy, but I wonder what the relative value is compared to performing the same escape as with the JavaScript elements. My guess is that, from a performance standpoint, the picture is complicated and hard to quantify, as we’re comparing a relatively simple pair of string searches against string allocations and string building.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| ); | ||
| } | ||
|
|
||
| /** | ||
| * Updates or creates a new attribute on the currently matched tag with the passed value. | ||
| * | ||
|
|
@@ -4681,3 +4969,40 @@ public function get_doctype_info(): ?WP_HTML_Doctype_Info { | |
| */ | ||
| const TEXT_IS_WHITESPACE = 'TEXT_IS_WHITESPACE'; | ||
| } | ||
|
|
||
| /* | ||
| # This is the original Graphviz source for the SCRIPT tag | ||
| # parsing behavior. It's used in the documentation for | ||
| # `WP_HTML_Tag_Processor::escape_javascript_script_contents()`. | ||
| # ==== | ||
| digraph { | ||
| rankdir=TB; | ||
|
|
||
| // Entry point | ||
| entry [shape=plaintext label="Open script"]; | ||
| entry -> script_data; | ||
|
|
||
| // Double-circle states arranged more compactly | ||
| data [shape=doublecircle label="Close script"]; | ||
| script_data [shape=doublecircle color=blue label="script\ndata"]; | ||
| script_data_escaped [shape=circle color=orange label="escaped"]; | ||
| script_data_double_escaped [shape=circle color=red label="double\nescaped"]; | ||
|
|
||
| // Group related nodes on same ranks where possible | ||
| {rank=same; script_data script_data_escaped script_data_double_escaped} | ||
|
|
||
| script_data -> script_data [label="<!--(…)>\n(all dashes)"]; | ||
| script_data -> script_data_escaped [label="<!--"]; | ||
| script_data -> data [label="</script†"]; | ||
|
|
||
| script_data_escaped -> script_data [label="-->"]; | ||
| script_data_escaped -> script_data_double_escaped [label="<script†"]; | ||
| script_data_escaped -> data [label="</script†"]; | ||
|
|
||
| script_data_double_escaped -> script_data [label="-->"]; | ||
| script_data_double_escaped -> script_data_escaped [label="</script†"]; | ||
|
|
||
| label="† = Case insensitive 'script' followed by one of ' \\t\\f\\r\\n/>'"; | ||
| labelloc=b; | ||
| } | ||
| */ | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we think about changing the design of the
`is_javascript_script_tag()` methods, we could expose something like a MIME type and allow custom filtering of escapes to allow plugins to provide their own escaping routines rather than reject the updates. For now I see no reason to block progress for this, especially since we already rejected these contents before, but I think we can imagine a situation where someone can replace their SCRIPT contents and we can perform the
`<script` and `</script` checks after the filter, and only then reject if nothing was able to escape.