Merge "Improve docs for Title::getInternalURL/getCanonicalURL"
[lhc/web/wiklou.git] / includes / parser / Sanitizer.php
index 84f8083..518846c 100644 (file)
@@ -349,18 +349,18 @@ class Sanitizer {
 
        /**
         * Regular expression to match HTML/XML attribute pairs within a tag.
-        * Allows some... latitude. Based on,
-        * https://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
-        * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
+        * Based on https://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
+        * Used in Sanitizer::decodeTagAttributes
         * @return string
         */
        static function getAttribsRegex() {
                if ( self::$attribsRegex === null ) {
-                       $attribFirst = "[:_\p{L}\p{N}]";
-                       $attrib = "[:_\.\-\p{L}\p{N}]";
-                       $space = '[\x09\x0a\x0c\x0d\x20]';
+                       $spaceChars = '\x09\x0a\x0c\x0d\x20';
+                       $space = "[{$spaceChars}]";
+                       $attrib = "[^{$spaceChars}\/>=]";
+                       $attribFirst = "(?:{$attrib}|=)";
                        self::$attribsRegex =
-                               "/(?:^|$space)({$attribFirst}{$attrib}*)
+                               "/({$attribFirst}{$attrib}*)
                                        ($space*=$space*
                                        (?:
                                                # The attribute value: quoted or alone
@@ -368,11 +368,29 @@ class Sanitizer {
                                                | '([^']*)(?:'|\$)
                                                | (((?!$space|>).)*)
                                        )
-                               )?(?=$space|\$)/sxu";
+                               )?/sxu";
                }
                return self::$attribsRegex;
        }
 
+       /**
+        * Lazy-initialised attribute name regex, see getAttribNameRegex()
+        */
+       private static $attribNameRegex;
+
+       /**
+        * Used in Sanitizer::decodeTagAttributes to filter attributes.
+        * @return string
+        */
+       static function getAttribNameRegex() {
+               if ( self::$attribNameRegex === null ) {
+                       $attribFirst = "[:_\p{L}\p{N}]";
+                       $attrib = "[:_\.\-\p{L}\p{N}]";
+                       self::$attribNameRegex = "/^({$attribFirst}{$attrib}*)$/sxu";
+               }
+               return self::$attribNameRegex;
+       }
+
        /**
         * Return the various lists of recognized tags
         * @param array $extratags For any extra tags to include
@@ -560,10 +578,8 @@ class Sanitizer {
                                                                        $badtag = true;
                                                                }
                                                        }
-                                               } else {
-                                                       if ( $t == 'table' ) {
-                                                               $tagstack = array_pop( $tablestack );
-                                                       }
+                                               } elseif ( $t == 'table' ) {
+                                                       $tagstack = array_pop( $tablestack );
                                                }
                                                $newparams = '';
                                        } else {
@@ -1357,20 +1373,14 @@ class Sanitizer {
 
        /**
         * Given a string containing a space delimited list of ids, escape each id
-        * to match ids escaped by the escapeId() function.
-        *
-        * @todo remove $options completely in 1.32
+        * to match ids escaped by the escapeIdForAttribute() function.
         *
         * @since 1.27
         *
         * @param string $referenceString Space delimited list of ids
-        * @param string|array $options Deprecated and does nothing.
         * @return string
         */
-       static function escapeIdReferenceList( $referenceString, $options = [] ) {
-               if ( $options ) {
-                       wfDeprecated( __METHOD__ . ' with $options', '1.31' );
-               }
+       public static function escapeIdReferenceList( $referenceString ) {
                # Explode the space delimited list string into an array of tokens
                $references = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );
 
@@ -1434,18 +1444,24 @@ class Sanitizer {
                        return [];
                }
 
-               $attribs = [];
                $pairs = [];
                if ( !preg_match_all(
                        self::getAttribsRegex(),
                        $text,
                        $pairs,
                        PREG_SET_ORDER ) ) {
-                       return $attribs;
+                       return [];
                }
 
+               $attribs = [];
                foreach ( $pairs as $set ) {
                        $attribute = strtolower( $set[1] );
+
+                       // Filter attribute names with unacceptable characters
+                       if ( !preg_match( self::getAttribNameRegex(), $attribute ) ) {
+                               continue;
+                       }
+
                        $value = self::getTagAttributeCallback( $set );
 
                        // Normalize whitespace