ApiLogin: Don't try to add block info if there's no block
[lhc/web/wiklou.git] / includes / Sanitizer.php
index 96193a7..c02bdc9 100644 (file)
@@ -349,9 +349,6 @@ class Sanitizer {
                                          \"([^<\"]*)\"
                                         | '([^<']*)'
                                         |  ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
-                                        |  (\#[0-9a-fA-F]+) # Technically wrong, but lots of
-                                                                                # colors are specified like this.
-                                                                                # We'll be normalizing it.
                                        )
                                )?(?=$space|\$)/sx";
                }
@@ -359,20 +356,13 @@ class Sanitizer {
        }
 
        /**
-        * Cleans up HTML, removes dangerous tags and attributes, and
-        * removes HTML comments
-        * @param string $text
-        * @param callable $processCallback Callback to do any variable or parameter
-        *   replacements in HTML attribute values
-        * @param array|bool $args Arguments for the processing callback
+        * Return the various lists of recognized tags
         * @param array $extratags For any extra tags to include
         * @param array $removetags For any tags (default or extra) to exclude
-        * @return string
+        * @return array
         */
-       public static function removeHTMLtags( $text, $processCallback = null,
-               $args = array(), $extratags = array(), $removetags = array()
-       ) {
-               global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag;
+       public static function getRecognizedTagData( $extratags = array(), $removetags = array() ) {
+               global $wgAllowMicrodataAttributes, $wgAllowImageTag;
 
                static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
                        $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
@@ -381,7 +371,6 @@ class Sanitizer {
                // are changed (like in the screwed up test system) we will re-initialise the settings.
                $globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
                if ( !$staticInitialised || $staticInitialised != $globalContext ) {
-
                        $htmlpairsStatic = array( # Tags that must be closed
                                'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
                                'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
@@ -431,12 +420,44 @@ class Sanitizer {
                        }
                        $staticInitialised = $globalContext;
                }
+
                # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
                $extratags = array_flip( $extratags );
                $removetags = array_flip( $removetags );
                $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
                $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
 
+               return array(
+                       'htmlpairs' => $htmlpairs,
+                       'htmlsingle' => $htmlsingle,
+                       'htmlsingleonly' => $htmlsingleonly,
+                       'htmlnest' => $htmlnest,
+                       'tabletags' => $tabletags,
+                       'htmllist' => $htmllist,
+                       'listtags' => $listtags,
+                       'htmlsingleallowed' => $htmlsingleallowed,
+                       'htmlelements' => $htmlelements,
+               );
+       }
+
+       /**
+        * Cleans up HTML, removes dangerous tags and attributes, and
+        * removes HTML comments
+        * @param string $text
+        * @param callable $processCallback Callback to do any variable or parameter
+        *   replacements in HTML attribute values
+        * @param array|bool $args Arguments for the processing callback
+        * @param array $extratags For any extra tags to include
+        * @param array $removetags For any tags (default or extra) to exclude
+        * @return string
+        */
+       public static function removeHTMLtags( $text, $processCallback = null,
+               $args = array(), $extratags = array(), $removetags = array()
+       ) {
+               global $wgUseTidy;
+
+               extract( self::getRecognizedTagData( $extratags, $removetags ) );
+
                # Remove HTML comments
                $text = Sanitizer::removeHTMLcomments( $text );
                $bits = explode( '<', $text );
@@ -463,9 +484,9 @@ class Sanitizer {
                                                $badtag = true;
                                        } elseif ( $slash ) {
                                                # Closing a tag... is it the one we just opened?
-                                               wfSuppressWarnings();
+                                               MediaWiki\suppressWarnings();
                                                $ot = array_pop( $tagstack );
-                                               wfRestoreWarnings();
+                                               MediaWiki\restoreWarnings();
 
                                                if ( $ot != $t ) {
                                                        if ( isset( $htmlsingleallowed[$ot] ) ) {
@@ -473,32 +494,32 @@ class Sanitizer {
                                                                # and see if we find a match below them
                                                                $optstack = array();
                                                                array_push( $optstack, $ot );
-                                                               wfSuppressWarnings();
+                                                               MediaWiki\suppressWarnings();
                                                                $ot = array_pop( $tagstack );
-                                                               wfRestoreWarnings();
+                                                               MediaWiki\restoreWarnings();
                                                                while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
                                                                        array_push( $optstack, $ot );
-                                                                       wfSuppressWarnings();
+                                                                       MediaWiki\suppressWarnings();
                                                                        $ot = array_pop( $tagstack );
-                                                                       wfRestoreWarnings();
+                                                                       MediaWiki\restoreWarnings();
                                                                }
                                                                if ( $t != $ot ) {
                                                                        # No match. Push the optional elements back again
                                                                        $badtag = true;
-                                                                       wfSuppressWarnings();
+                                                                       MediaWiki\suppressWarnings();
                                                                        $ot = array_pop( $optstack );
-                                                                       wfRestoreWarnings();
+                                                                       MediaWiki\restoreWarnings();
                                                                        while ( $ot ) {
                                                                                array_push( $tagstack, $ot );
-                                                                               wfSuppressWarnings();
+                                                                               MediaWiki\suppressWarnings();
                                                                                $ot = array_pop( $optstack );
-                                                                               wfRestoreWarnings();
+                                                                               MediaWiki\restoreWarnings();
                                                                        }
                                                                }
                                                        } else {
-                                                               wfSuppressWarnings();
+                                                               MediaWiki\suppressWarnings();
                                                                array_push( $tagstack, $ot );
-                                                               wfRestoreWarnings();
+                                                               MediaWiki\restoreWarnings();
 
                                                                # <li> can be nested in <ul> or <ol>, skip those cases:
                                                                if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
@@ -729,7 +750,7 @@ class Sanitizer {
                        }
 
                        # Allow any attribute beginning with "data-"
-                       if ( !preg_match( '/^data-/i', $attribute ) && !isset( $whitelist[$attribute] ) ) {
+                       if ( !preg_match( '/^data-(?!ooui)/i', $attribute ) && !isset( $whitelist[$attribute] ) ) {
                                continue;
                        }
 
@@ -942,7 +963,8 @@ class Sanitizer {
                $value = self::normalizeCss( $value );
 
                // Reject problematic keywords and control characters
-               if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ) {
+               if ( preg_match( '/[\000-\010\013\016-\037\177]/', $value ) ||
+                       strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false ) {
                        return '/* invalid control char */';
                } elseif ( preg_match(
                        '! expression
@@ -1239,10 +1261,7 @@ class Sanitizer {
         * @return string
         */
        private static function getTagAttributeCallback( $set ) {
-               if ( isset( $set[6] ) ) {
-                       # Illegal #XXXXXX color with no quotes.
-                       return $set[6];
-               } elseif ( isset( $set[5] ) ) {
+               if ( isset( $set[5] ) ) {
                        # No quotes.
                        return $set[5];
                } elseif ( isset( $set[4] ) ) {
@@ -1252,9 +1271,10 @@ class Sanitizer {
                        # Double-quoted
                        return $set[3];
                } elseif ( !isset( $set[2] ) ) {
-                       # In XHTML, attributes must have a value.
-                       # For 'reduced' form, return explicitly the attribute name here.
-                       return $set[1];
+                       # In XHTML, attributes must have a value so return an empty string.
+                       # See "Empty attribute syntax",
+                       # http://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
+                       return "";
                } else {
                        throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
                }
@@ -1374,15 +1394,19 @@ class Sanitizer {
        }
 
        /**
-        * Returns true if a given Unicode codepoint is a valid character in XML.
+        * Returns true if a given Unicode codepoint is a valid character in
+        * both HTML5 and XML.
         * @param int $codepoint
         * @return bool
         */
        private static function validateCodepoint( $codepoint ) {
+               # U+000C is valid in HTML5 but not allowed in XML.
+               # U+000D is valid in XML but not allowed in HTML5.
+               # U+007F - U+009F are disallowed in HTML5 (control characters).
                return $codepoint == 0x09
                        || $codepoint == 0x0a
-                       || $codepoint == 0x0d
-                       || ( $codepoint >= 0x20 && $codepoint <= 0xd7ff )
+                       || ( $codepoint >= 0x20 && $codepoint <= 0x7e )
+                       || ( $codepoint >= 0xa0 && $codepoint <= 0xd7ff )
                        || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
                        || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
        }
@@ -1784,6 +1808,11 @@ class Sanitizer {
 
                        $host = preg_replace( $strip, '', $host );
 
+                       // IPv6 host names are bracketed with [].  Url-decode these.
+                       if ( substr_compare( "//%5B", $host, 0, 5 ) === 0 && preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $matches ) ) {
+                               $host = '//[' . $matches[1] . ']' . $matches[2];
+                       }
+
                        // @todo FIXME: Validate hostnames here
 
                        return $protocol . $host . $rest;