ChangesList: Expose basic properties of lines as data attributes
[lhc/web/wiklou.git] / includes / Sanitizer.php
index 42b166d..5aaa3ed 100644 (file)
@@ -344,12 +344,12 @@ class Sanitizer {
                        $space = '[\x09\x0a\x0c\x0d\x20]';
                        self::$attribsRegex =
                                "/(?:^|$space)({$attribFirst}{$attrib}*)
-                                 ($space*=$space*
+                                       ($space*=$space*
                                        (?:
-                                        # The attribute value: quoted or alone
-                                         \"([^\"]*)(?:\"|\$)
-                                        | '([^']*)(?:'|\$)
-                                         (((?!$space|>).)*)
+                                               # The attribute value: quoted or alone
+                                               \"([^\"]*)(?:\"|\$)
+                                               | '([^']*)(?:'|\$)
+                                               | (((?!$space|>).)*)
                                        )
                                )?(?=$space|\$)/sx";
                }
@@ -545,7 +545,7 @@ class Sanitizer {
                                                        $badtag = true;
                                                } elseif ( in_array( $t, $tagstack ) && !isset( $htmlnest[$t] ) ) {
                                                        $badtag = true;
-                                               #  Is it a self closed htmlpair ? (bug 5487)
+                                               #  Is it a self closed htmlpair ? (T7487)
                                                } elseif ( $brace == '/>' && isset( $htmlpairs[$t] ) ) {
                                                        // Eventually we'll just remove the self-closing
                                                        // slash, in order to be consistent with HTML5
@@ -782,15 +782,12 @@ class Sanitizer {
 
                        # Allow any attribute beginning with "data-"
                        # However:
-                       # * data-ooui is reserved for ooui
-                       # * data-mw and data-parsoid are reserved for parsoid
-                       # * data-mw-<name here> is reserved for extensions (or core) if
-                       #   they need to communicate some data to the client and want to be
-                       #   sure that it isn't coming from an untrusted user.
+                       # * Disallow data attributes used by MediaWiki code
                        # * Ensure that the attribute is not namespaced by banning
                        #   colons.
-                       if ( !preg_match( '/^data-(?!ooui|mw|parsoid)[^:]*$/i', $attribute )
+                       if ( !preg_match( '/^data-[^:]*$/i', $attribute )
                                && !isset( $whitelist[$attribute] )
+                               || self::isReservedDataAttribute( $attribute )
                        ) {
                                continue;
                        }
@@ -835,7 +832,7 @@ class Sanitizer {
 
                        # NOTE: even though elements using href/src are not allowed directly, supply
                        #       validation code that can be used by tag hook handlers, etc
-                       if ( $attribute === 'href' || $attribute === 'src' ) {
+                       if ( $attribute === 'href' || $attribute === 'src' || $attribute === 'poster' ) {
                                if ( !preg_match( $hrefExp, $value ) ) {
                                        continue; // drop any href or src attributes not using an allowed protocol.
                                        // NOTE: this also drops all relative URLs
@@ -858,6 +855,24 @@ class Sanitizer {
                return $out;
        }
 
+       /**
+        * Given an attribute name, checks whether it is a reserved data attribute
+        * (such as data-mw-foo) which is unavailable to user-generated HTML so MediaWiki
+        * core and extension code can safely use it to communicate with frontend code.
+        * @param string $attr Attribute name; matched case-insensitively, by prefix.
+        * @return bool True if the name starts with data-ooui, data-mw or data-parsoid.
+        */
+       public static function isReservedDataAttribute( $attr ) {
+               // data-ooui is reserved for ooui.
+               // data-mw and data-parsoid are reserved for parsoid.
+               // data-mw-<name here> is reserved for extensions (or core) if
+               // they need to communicate some data to the client and want to be
+               // sure that it isn't coming from an untrusted user.
+               // Prefix match: data-mwfoo is reserved too. Namespaces are ignored since
+               // user-generated HTML can't use them anymore.
+               return (bool)preg_match( '/^data-(ooui|mw|parsoid)/i', $attr );
+       }
+
        /**
         * Merge two sets of HTML attributes.  Conflicting items in the second set
         * will override those in the first, except for 'class' attributes which
@@ -922,7 +937,7 @@ class Sanitizer {
 
                // Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii
                $value = preg_replace_callback(
-                       '/[!-[]-z]/u', // U+FF01 to U+FF5A, excluding U+FF3C (bug 58088)
+                       '/[!-[]-z]/u', // U+FF01 to U+FF5A, excluding U+FF3C (T60088)
                        function ( $matches ) {
                                $cp = UtfNormal\Utils::utf8ToCodepoint( $matches[0] );
                                if ( $cp === false ) {
@@ -1508,7 +1523,7 @@ class Sanitizer {
 
        /**
         * Decode any character references, numeric or named entities,
-        * in the next and normalize the resulting string. (bug 14952)
+        * in the text and normalize the resulting string. (T16952)
         *
         * This is useful for page titles, not for text to be displayed,
         * MediaWiki allows HTML entities to escape normalization as a feature.
@@ -1760,6 +1775,10 @@ class Sanitizer {
                        # true
                        'img'        => array_merge( $common, [ 'alt', 'src', 'width', 'height' ] ),
 
+                       'video'      => array_merge( $common, [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
+                       'source'     => array_merge( $common, [ 'type', 'src' ] ),
+                       'track'      => array_merge( $common, [ 'type', 'src', 'srclang', 'kind', 'label' ] ),
+
                        # 15.2.1
                        'tt'         => $common,
                        'b'          => $common,
@@ -1784,7 +1803,7 @@ class Sanitizer {
                        'rb'         => $common,
                        'rp'         => $common,
                        'rt'         => $common, # array_merge( $common, array( 'rbspan' ) ),
-                       'rtc'         => $common,
+                       'rtc'        => $common,
 
                        # MathML root element, where used for extensions
                        # 'title' may not be 100% valid here; it's XHTML
@@ -1926,7 +1945,7 @@ class Sanitizer {
         *   3.5.
         *
         * This function is an implementation of the specification as requested in
-        * bug 22449.
+        * T24449.
         *
         * Client-side forms will use the same standard validation rules via JS or
         * HTML 5 validation; additional restrictions can be enforced server-side
@@ -1949,7 +1968,7 @@ class Sanitizer {
 
                // Please note strings below are enclosed in brackets [], this make the
                // hyphen "-" a range indicator. Hence it is double backslashed below.
-               // See bug 26948
+               // See T28948
                $rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
                $rfc1034_ldh_str = "a-z0-9\\-";