Merge "Show protection log on creation-protected pages"
[lhc/web/wiklou.git] / includes / Sanitizer.php
index 8424432..907da16 100644 (file)
@@ -56,6 +56,21 @@ class Sanitizer {
        const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
        const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
 
+       /**
+        * Tells escapeIdForAttribute() to encode the ID using the wiki's primary encoding.
+        *
+        * @since 1.30
+        */
+       const ID_PRIMARY = 0;
+
+       /**
+        * Tells escapeIdForAttribute() to encode the ID using the fallback encoding, or return false
+        * if no fallback is configured.
+        *
+        * @since 1.30
+        */
+       const ID_FALLBACK = 1;
+
        /**
         * List of all named character entities defined in HTML 4.01
         * https://www.w3.org/TR/html4/sgml/entities.html
@@ -339,8 +354,8 @@ class Sanitizer {
         */
        static function getAttribsRegex() {
                if ( self::$attribsRegex === null ) {
-                       $attribFirst = '[:A-Z_a-z0-9]';
-                       $attrib = '[:A-Z_a-z-.0-9]';
+                       $attribFirst = "[:_\p{L}\p{N}]";
+                       $attrib = "[:_\.\-\p{L}\p{N}]";
                        $space = '[\x09\x0a\x0c\x0d\x20]';
                        self::$attribsRegex =
                                "/(?:^|$space)({$attribFirst}{$attrib}*)
@@ -351,7 +366,7 @@ class Sanitizer {
                                                | '([^']*)(?:'|\$)
                                                | (((?!$space|>).)*)
                                        )
-                               )?(?=$space|\$)/sx";
+                               )?(?=$space|\$)/sxu";
                }
                return self::$attribsRegex;
        }
@@ -465,7 +480,7 @@ class Sanitizer {
                extract( self::getRecognizedTagData( $extratags, $removetags ) );
 
                # Remove HTML comments
-               $text = Sanitizer::removeHTMLcomments( $text );
+               $text = self::removeHTMLcomments( $text );
                $bits = explode( '<', $text );
                $text = str_replace( '>', '&gt;', array_shift( $bits ) );
                if ( !MWTidy::isEnabled() ) {
@@ -583,12 +598,12 @@ class Sanitizer {
                                                        call_user_func_array( $processCallback, [ &$params, $args ] );
                                                }
 
-                                               if ( !Sanitizer::validateTag( $params, $t ) ) {
+                                               if ( !self::validateTag( $params, $t ) ) {
                                                        $badtag = true;
                                                }
 
                                                # Strip non-approved attributes from the tag
-                                               $newparams = Sanitizer::fixTagAttributes( $params, $t );
+                                               $newparams = self::fixTagAttributes( $params, $t );
                                        }
                                        if ( !$badtag ) {
                                                $rest = str_replace( '>', '&gt;', $rest );
@@ -629,11 +644,11 @@ class Sanitizer {
                                                                call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
                                                        }
                                                }
-                                               if ( !Sanitizer::validateTag( $params, $t ) ) {
+                                               if ( !self::validateTag( $params, $t ) ) {
                                                        $badtag = true;
                                                }
 
-                                               $newparams = Sanitizer::fixTagAttributes( $params, $t );
+                                               $newparams = self::fixTagAttributes( $params, $t );
                                                if ( !$badtag ) {
                                                        if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) {
                                                                # Interpret self-closing tags as empty tags even when
@@ -710,7 +725,7 @@ class Sanitizer {
         * @return bool
         */
        static function validateTag( $params, $element ) {
-               $params = Sanitizer::decodeTagAttributes( $params );
+               $params = self::decodeTagAttributes( $params );
 
                if ( $element == 'meta' || $element == 'link' ) {
                        if ( !isset( $params['itemprop'] ) ) {
@@ -746,8 +761,8 @@ class Sanitizer {
         * @todo Check for unique id attribute :P
         */
        static function validateTagAttributes( $attribs, $element ) {
-               return Sanitizer::validateAttributes( $attribs,
-                       Sanitizer::attributeWhitelist( $element ) );
+               return self::validateAttributes( $attribs,
+                       self::attributeWhitelist( $element ) );
        }
 
        /**
@@ -795,12 +810,12 @@ class Sanitizer {
                        # Strip javascript "expression" from stylesheets.
                        # https://msdn.microsoft.com/en-us/library/ms537634.aspx
                        if ( $attribute == 'style' ) {
-                               $value = Sanitizer::checkCss( $value );
+                               $value = self::checkCss( $value );
                        }
 
                        # Escape HTML id attributes
                        if ( $attribute === 'id' ) {
-                               $value = Sanitizer::escapeId( $value, 'noninitial' );
+                               $value = self::escapeIdForAttribute( $value, Sanitizer::ID_PRIMARY );
                        }
 
                        # Escape HTML id reference lists
@@ -809,7 +824,7 @@ class Sanitizer {
                                || $attribute === 'aria-labelledby'
                                || $attribute === 'aria-owns'
                        ) {
-                               $value = Sanitizer::escapeIdReferenceList( $value, 'noninitial' );
+                               $value = self::escapeIdReferenceList( $value, 'noninitial' );
                        }
 
                        // RDFa and microdata properties allow URLs, URIs and/or CURIs.
@@ -906,9 +921,8 @@ class Sanitizer {
         * @return string normalized css
         */
        public static function normalizeCss( $value ) {
-
                // Decode character references like &#123;
-               $value = Sanitizer::decodeCharReferences( $value );
+               $value = self::decodeCharReferences( $value );
 
                // Decode escape sequences and line continuation
                // See the grammar in the CSS 2 spec, appendix D.
@@ -1088,14 +1102,14 @@ class Sanitizer {
                        return '';
                }
 
-               $decoded = Sanitizer::decodeTagAttributes( $text );
-               $stripped = Sanitizer::validateTagAttributes( $decoded, $element );
+               $decoded = self::decodeTagAttributes( $text );
+               $stripped = self::validateTagAttributes( $decoded, $element );
 
                if ( $sorted ) {
                        ksort( $stripped );
                }
 
-               return Sanitizer::safeEncodeTagAttributes( $stripped );
+               return self::safeEncodeTagAttributes( $stripped );
        }
 
        /**
@@ -1125,7 +1139,7 @@ class Sanitizer {
         * @return string HTML-encoded text fragment
         */
        static function safeEncodeAttribute( $text ) {
-               $encValue = Sanitizer::encodeAttribute( $text );
+               $encValue = self::encodeAttribute( $text );
 
                # Templates and links may be expanded in later parsing,
                # creating invalid or dangerous output. Suppress this.
@@ -1165,6 +1179,8 @@ class Sanitizer {
         * ambiguous if it's part of something that looks like a percent escape
         * (which don't work reliably in fragments cross-browser).
         *
+        * @deprecated since 1.30, use one of this class' escapeIdFor*() functions
+        *
         * @see https://www.w3.org/TR/html401/types.html#type-name Valid characters
         *   in the id and name attributes
         * @see https://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with
@@ -1187,7 +1203,7 @@ class Sanitizer {
                global $wgExperimentalHtmlIds;
                $options = (array)$options;
 
-               $id = Sanitizer::decodeCharReferences( $id );
+               $id = self::decodeCharReferences( $id );
 
                if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
                        $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
@@ -1216,21 +1232,146 @@ class Sanitizer {
                return $id;
        }
 
+       /**
+        * Escapes an arbitrary, untrusted string (for instance a section name) so that it can
+        * serve as the value of an HTML id attribute.
+        *
+        * WARNING: in contrast to escapeId(), the result is not guaranteed to be HTML safe, so
+        * callers still have to escape it properly before output.
+        *
+        * @param string $id String to escape
+        * @param int $mode One of the ID_* constants, selecting whether the primary or the
+        *     fallback encoding is applied.
+        * @return string|bool Escaped ID, or false when the fallback encoding is requested but
+        *     has not been configured.
+        *
+        * @since 1.30
+        */
+       public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
+               global $wgFragmentMode;
+
+               if ( isset( $wgFragmentMode[$mode] ) ) {
+                       return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
+               }
+
+               if ( $mode !== self::ID_PRIMARY ) {
+                       // A missing non-primary entry is not an error: signal it with false.
+                       return false;
+               }
+
+               throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
+       }
+
+       /**
+        * Escapes an arbitrary, untrusted string (for instance a section name) so that it can
+        * serve as the fragment part of a URL.
+        *
+        * WARNING: in contrast to escapeId(), the result is not guaranteed to be HTML safe, so
+        * callers still have to escape it properly before output.
+        *
+        * @param string $id String to escape
+        * @return string Escaped ID
+        *
+        * @since 1.30
+        */
+       public static function escapeIdForLink( $id ) {
+               global $wgFragmentMode;
+
+               if ( !isset( $wgFragmentMode[self::ID_PRIMARY] ) ) {
+                       throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
+               }
+
+               $primaryMode = $wgFragmentMode[self::ID_PRIMARY];
+
+               // Encode as an ID first, then URL-escape that result for the same mode.
+               $escaped = self::escapeIdInternal( $id, $primaryMode );
+
+               return self::urlEscapeId( $escaped, $primaryMode );
+       }
+
+       /**
+        * Escapes an arbitrary, untrusted string (for instance a section name) so that it can
+        * serve as the fragment part of a URL pointing to an external interwiki.
+        *
+        * @param string $id String to escape
+        * @return string Escaped ID
+        *
+        * @since 1.30
+        */
+       public static function escapeIdForExternalInterwiki( $id ) {
+               global $wgExternalInterwikiFragmentMode;
+
+               $mode = $wgExternalInterwikiFragmentMode;
+               $escaped = self::escapeIdInternal( $id, $mode );
+
+               return self::urlEscapeId( $escaped, $mode );
+       }
+
+       /**
+        * Shared helper of the escapeIdFor*() functions: URL-escapes the ID where required.
+        *
+        * @param string $id String to escape
+        * @param string $mode One of modes from $wgFragmentMode
+        * @return string
+        */
+       private static function urlEscapeId( $id, $mode ) {
+               if ( $mode !== 'html5' ) {
+                       return $id;
+               }
+
+               // Only 'html5' IDs are URL-encoded here; encoded colons are turned back into ':'.
+               return str_replace( '%3A', ':', urlencode( $id ) );
+       }
+
+       /**
+        * Helper for escapeIdFor*() functions. Performs most of the actual escaping.
+        *
+        * @param string $id String to escape
+        * @param string $mode One of modes from $wgFragmentMode
+        * @return string
+        * @throws InvalidArgumentException if $mode is not 'html5', 'legacy' or 'html5-legacy'
+        */
+       private static function escapeIdInternal( $id, $mode ) {
+               $id = self::decodeCharReferences( $id );
+
+               switch ( $mode ) {
+                       case 'html5':
+                               $id = str_replace( ' ', '_', $id );
+                               break;
+                       case 'legacy':
+                               // This corresponds to 'noninitial' mode of the old escapeId()
+                               static $replace = [
+                                       '%3A' => ':',
+                                       '%' => '.'
+                               ];
+                               $id = urlencode( str_replace( ' ', '_', $id ) );
+                               $id = strtr( $id, $replace );
+                               break;
+                       case 'html5-legacy':
+                               $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
+                               $id = trim( $id, '_' );
+                               if ( $id === '' ) {
+                                       // Nothing survived the stripping, e.g. the input was all whitespace.
+                                       $id = '_';
+                               }
+                               break;
+                       default:
+                               throw new InvalidArgumentException( "Invalid mode '$mode' passed to " . __METHOD__ );
+               }
+
+               return $id;
+       }
+
        /**
         * Given a string containing a space delimited list of ids, escape each id
         * to match ids escaped by the escapeIdForAttribute() function.
         *
+        * @todo wfDeprecated() uses of $options in 1.31, remove completely in 1.32
+        *
         * @since 1.27
         *
         * @param string $referenceString Space delimited list of ids
-        * @param string|array $options String or array of strings (default is array()):
-        *   'noninitial': This is a non-initial fragment of an id, not a full id,
-        *       so don't pay attention if the first character isn't valid at the
-        *       beginning of an id.  Only matters if $wgExperimentalHtmlIds is
-        *       false.
-        *   'legacy': Behave the way the old HTML 4-based ID escaping worked even
-        *       if $wgExperimentalHtmlIds is used, so we can generate extra
-        *       anchors and links won't break.
+        * @param string|array $options Deprecated and does nothing.
         * @return string
         */
        static function escapeIdReferenceList( $referenceString, $options = [] ) {
@@ -1239,7 +1380,7 @@ class Sanitizer {
 
                # Escape each token as an id
                foreach ( $references as &$ref ) {
-                       $ref = Sanitizer::escapeId( $ref, $options );
+                       $ref = self::escapeIdForAttribute( $ref );
                }
 
                # Merge the array back to a space delimited list string
@@ -1276,7 +1417,7 @@ class Sanitizer {
         * @return string Escaped input
         */
        static function escapeHtmlAllowEntities( $html ) {
-               $html = Sanitizer::decodeCharReferences( $html );
+               $html = self::decodeCharReferences( $html );
                # It seems wise to escape ' as well as ", as a matter of course.  Can't
                # hurt. Use ENT_SUBSTITUTE so that incorrectly truncated multibyte characters
                # don't cause the entire string to disappear.
@@ -1318,14 +1459,14 @@ class Sanitizer {
 
                foreach ( $pairs as $set ) {
                        $attribute = strtolower( $set[1] );
-                       $value = Sanitizer::getTagAttributeCallback( $set );
+                       $value = self::getTagAttributeCallback( $set );
 
                        // Normalize whitespace
                        $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
                        $value = trim( $value );
 
                        // Decode character references
-                       $attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
+                       $attribs[$attribute] = self::decodeCharReferences( $value );
                }
                return $attribs;
        }
@@ -1341,7 +1482,7 @@ class Sanitizer {
                $attribs = [];
                foreach ( $assoc_array as $attribute => $value ) {
                        $encAttribute = htmlspecialchars( $attribute );
-                       $encValue = Sanitizer::safeEncodeAttribute( $value );
+                       $encValue = self::safeEncodeAttribute( $value );
 
                        $attribs[] = "$encAttribute=\"$encValue\"";
                }
@@ -1428,11 +1569,11 @@ class Sanitizer {
        static function normalizeCharReferencesCallback( $matches ) {
                $ret = null;
                if ( $matches[1] != '' ) {
-                       $ret = Sanitizer::normalizeEntity( $matches[1] );
+                       $ret = self::normalizeEntity( $matches[1] );
                } elseif ( $matches[2] != '' ) {
-                       $ret = Sanitizer::decCharReference( $matches[2] );
+                       $ret = self::decCharReference( $matches[2] );
                } elseif ( $matches[3] != '' ) {
-                       $ret = Sanitizer::hexCharReference( $matches[3] );
+                       $ret = self::hexCharReference( $matches[3] );
                }
                if ( is_null( $ret ) ) {
                        return htmlspecialchars( $matches[0] );
@@ -1469,7 +1610,7 @@ class Sanitizer {
         */
        static function decCharReference( $codepoint ) {
                $point = intval( $codepoint );
-               if ( Sanitizer::validateCodepoint( $point ) ) {
+               if ( self::validateCodepoint( $point ) ) {
                        return sprintf( '&#%d;', $point );
                } else {
                        return null;
@@ -1482,7 +1623,7 @@ class Sanitizer {
         */
        static function hexCharReference( $codepoint ) {
                $point = hexdec( $codepoint );
-               if ( Sanitizer::validateCodepoint( $point ) ) {
+               if ( self::validateCodepoint( $point ) ) {
                        return sprintf( '&#x%x;', $point );
                } else {
                        return null;
@@ -1551,11 +1692,11 @@ class Sanitizer {
         */
        static function decodeCharReferencesCallback( $matches ) {
                if ( $matches[1] != '' ) {
-                       return Sanitizer::decodeEntity( $matches[1] );
+                       return self::decodeEntity( $matches[1] );
                } elseif ( $matches[2] != '' ) {
-                       return Sanitizer::decodeChar( intval( $matches[2] ) );
+                       return self::decodeChar( intval( $matches[2] ) );
                } elseif ( $matches[3] != '' ) {
-                       return Sanitizer::decodeChar( hexdec( $matches[3] ) );
+                       return self::decodeChar( hexdec( $matches[3] ) );
                }
                # Last case should be an ampersand by itself
                return $matches[0];
@@ -1569,7 +1710,7 @@ class Sanitizer {
         * @private
         */
        static function decodeChar( $codepoint ) {
-               if ( Sanitizer::validateCodepoint( $codepoint ) ) {
+               if ( self::validateCodepoint( $codepoint ) ) {
                        return UtfNormal\Utils::codepointToUtf8( $codepoint );
                } else {
                        return UtfNormal\Constants::UTF8_REPLACEMENT;
@@ -1602,7 +1743,7 @@ class Sanitizer {
         * @return array
         */
        static function attributeWhitelist( $element ) {
-               $list = Sanitizer::setupAttributeWhitelist();
+               $list = self::setupAttributeWhitelist();
                return isset( $list[$element] )
                        ? $list[$element]
                        : [];
@@ -1877,7 +2018,7 @@ class Sanitizer {
        static function cleanUrl( $url ) {
                # Normalize any HTML entities in input. They will be
                # re-escaped by makeExternalLink().
-               $url = Sanitizer::decodeCharReferences( $url );
+               $url = self::decodeCharReferences( $url );
 
                # Escape any control characters introduced by the above step
                $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',