Merge "Sanitizer: Allow attribute names to use any Unicode "Letter" or "Number""
[lhc/web/wiklou.git] / includes / Sanitizer.php
index c4883ba..b08bc69 100644 (file)
@@ -339,8 +339,8 @@ class Sanitizer {
         */
        static function getAttribsRegex() {
                if ( self::$attribsRegex === null ) {
-                       $attribFirst = '[:A-Z_a-z0-9]';
-                       $attrib = '[:A-Z_a-z-.0-9]';
+                       $attribFirst = "[:_\p{L}\p{N}]";
+                       $attrib = "[:_\.\-\p{L}\p{N}]";
                        $space = '[\x09\x0a\x0c\x0d\x20]';
                        self::$attribsRegex =
                                "/(?:^|$space)({$attribFirst}{$attrib}*)
@@ -351,7 +351,7 @@ class Sanitizer {
                                                | '([^']*)(?:'|\$)
                                                | (((?!$space|>).)*)
                                        )
-                               )?(?=$space|\$)/sx";
+                               )?(?=$space|\$)/sxu";
                }
                return self::$attribsRegex;
        }
@@ -782,21 +782,18 @@ class Sanitizer {
 
                        # Allow any attribute beginning with "data-"
                        # However:
-                       # * data-ooui is reserved for ooui
-                       # * data-mw and data-parsoid are reserved for parsoid
-                       # * data-mw-<name here> is reserved for extensions (or core) if
-                       #   they need to communicate some data to the client and want to be
-                       #   sure that it isn't coming from an untrusted user.
+                       # * Disallow data attributes used by MediaWiki code
                        # * Ensure that the attribute is not namespaced by banning
                        #   colons.
-                       if ( !preg_match( '/^data-(?!ooui|mw|parsoid)[^:]*$/i', $attribute )
+                       if ( !preg_match( '/^data-[^:]*$/i', $attribute )
                                && !isset( $whitelist[$attribute] )
+                               || self::isReservedDataAttribute( $attribute )
                        ) {
                                continue;
                        }
 
                        # Strip javascript "expression" from stylesheets.
-                       # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
+                       # https://msdn.microsoft.com/en-us/library/ms537634.aspx
                        if ( $attribute == 'style' ) {
                                $value = Sanitizer::checkCss( $value );
                        }
@@ -858,6 +855,24 @@ class Sanitizer {
                return $out;
        }
 
+       /**
+        * Checks whether an attribute name is a reserved data attribute (such as
+        * data-mw-foo). Reserved names are unavailable to user-generated HTML so that
+        * MediaWiki core and extension code can safely use them to talk to frontend code.
+        * @param string $attr Attribute name; matched case-insensitively by prefix.
+        * @return bool True if $attr starts with data-ooui, data-mw or data-parsoid.
+        */
+       public static function isReservedDataAttribute( $attr ) {
+               // data-ooui is reserved for OOUI.
+               // data-mw and data-parsoid are reserved for Parsoid.
+               // data-mw-<name here> is reserved for extensions (or core) if
+               // they need to communicate some data to the client and want to be
+               // sure that it isn't coming from an untrusted user.
+               // The pattern is a prefix match, so e.g. data-mwfoo is reserved too.
+               // Namespaced names are not considered: user HTML can't use them anymore.
+               return (bool)preg_match( '/^data-(ooui|mw|parsoid)/i', $attr );
+       }
+
        /**
         * Merge two sets of HTML attributes.  Conflicting items in the second set
         * will override those in the first, except for 'class' attributes which
@@ -891,7 +906,6 @@ class Sanitizer {
         * @return string normalized css
         */
        public static function normalizeCss( $value ) {
-
                // Decode character references like &#123;
                $value = Sanitizer::decodeCharReferences( $value );
 
@@ -1192,7 +1206,7 @@ class Sanitizer {
                ];
 
                $id = urlencode( strtr( $id, ' ', '_' ) );
-               $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
+               $id = strtr( $id, $replace );
 
                if ( !preg_match( '/^[a-zA-Z]/', $id ) && !in_array( 'noninitial', $options ) ) {
                        // Initial character must be a letter!