Merge "Correctly use $wgFeedLimit in page history feed"
[lhc/web/wiklou.git] / includes / Sanitizer.php
index e757021..7400a5a 100644 (file)
@@ -1,6 +1,6 @@
 <?php
 /**
- * XHTML sanitizer for %MediaWiki.
+ * HTML sanitizer for %MediaWiki.
  *
  * Copyright © 2002-2005 Brion Vibber <brion@pobox.com> et al
  * http://www.mediawiki.org/
@@ -25,7 +25,7 @@
  */
 
 /**
- * XHTML sanitizer for MediaWiki
+ * HTML sanitizer for MediaWiki
  * @ingroup Parser
  */
 class Sanitizer {
@@ -364,7 +364,7 @@ class Sanitizer {
         * @return string
         */
        static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
-               global $wgUseTidy, $wgHtml5, $wgAllowMicrodataAttributes, $wgAllowImageTag;
+               global $wgUseTidy, $wgAllowMicrodataAttributes, $wgAllowImageTag;
 
                static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
                        $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
@@ -373,7 +373,7 @@ class Sanitizer {
 
                // Base our staticInitialised variable off of the global config state so that if the globals
                // are changed (like in the screwed up test system) we will re-initialise the settings.
-               $globalContext = implode( '-', compact( 'wgHtml5', 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
+               $globalContext = implode( '-', compact( 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
                if ( !$staticInitialised || $staticInitialised != $globalContext ) {
 
                        $htmlpairsStatic = array( # Tags that must be closed
@@ -382,24 +382,22 @@ class Sanitizer {
                                'strike', 'strong', 'tt', 'var', 'div', 'center',
                                'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
                                'ruby', 'rt', 'rb', 'rp', 'p', 'span', 'abbr', 'dfn',
-                               'kbd', 'samp'
+                               'kbd', 'samp', 'data', 'time', 'mark'
                        );
-                       if ( $wgHtml5 ) {
-                               $htmlpairsStatic = array_merge( $htmlpairsStatic, array( 'data', 'time', 'mark' ) );
-                       }
                        $htmlsingle = array(
                                'br', 'hr', 'li', 'dt', 'dd'
                        );
                        $htmlsingleonly = array( # Elements that cannot have close tags
                                'br', 'hr'
                        );
-                       if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
+                       if ( $wgAllowMicrodataAttributes ) {
                                $htmlsingle[] = $htmlsingleonly[] = 'meta';
                                $htmlsingle[] = $htmlsingleonly[] = 'link';
                        }
                        $htmlnest = array( # Tags that can be nested--??
                                'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
-                               'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span'
+                               'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
+                               'var', 'kbd', 'samp'
                        );
                        $tabletags = array( # Can only appear inside table, we will close them
                                'td', 'th', 'tr',
@@ -709,7 +707,7 @@ class Sanitizer {
         * @todo Check for unique id attribute :P
         */
        static function validateAttributes( $attribs, $whitelist ) {
-               global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes, $wgHtml5;
+               global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
 
                $whitelist = array_flip( $whitelist );
                $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
@@ -725,8 +723,8 @@ class Sanitizer {
                                continue;
                        }
 
-                       # Allow any attribute beginning with "data-", if in HTML5 mode
-                       if ( !( $wgHtml5 && preg_match( '/^data-/i', $attribute ) ) && !isset( $whitelist[$attribute] ) ) {
+                       # Allow any attribute beginning with "data-"
+                       if ( !preg_match( '/^data-/i', $attribute ) && !isset( $whitelist[$attribute] ) ) {
                                continue;
                        }
 
@@ -742,7 +740,7 @@ class Sanitizer {
 
                        # WAI-ARIA
                        # http://www.w3.org/TR/wai-aria/
-                       # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#wai-aria
+                       # http://www.whatwg.org/html/elements.html#wai-aria
                        # For now we only support role="presentation" until we work out what roles should be
                        # usable by content and we ensure that our code explicitly rejects patterns that
                        # violate HTML5's ARIA restrictions.
@@ -815,9 +813,10 @@ class Sanitizer {
        /**
         * Pick apart some CSS and check it for forbidden or unsafe structures.
         * Returns a sanitized string. This sanitized string will have
-        * character references and escape sequences decoded, and comments
-        * stripped. If the input is just too evil, only a comment complaining
-        * about evilness will be returned.
+        * character references and escape sequences decoded and comments
+        * stripped (unless it is itself one valid comment, in which case the value
+        * will be passed through). If the input is just too evil, only a comment
+        * complaining about evilness will be returned.
         *
         * Currently URL references, 'expression', 'tps' are forbidden.
         *
@@ -858,19 +857,24 @@ class Sanitizer {
                $value = preg_replace_callback( $decodeRegex,
                        array( __CLASS__, 'cssDecodeCallback' ), $value );
 
-               // Remove any comments; IE gets token splitting wrong
-               // This must be done AFTER decoding character references and
-               // escape sequences, because those steps can introduce comments
-               // This step cannot introduce character references or escape
-               // sequences, because it replaces comments with spaces rather
-               // than removing them completely.
-               $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
-
-               // Remove anything after a comment-start token, to guard against
-               // incorrect client implementations.
-               $commentPos = strpos( $value, '/*' );
-               if ( $commentPos !== false ) {
-                       $value = substr( $value, 0, $commentPos );
+               // Let the value through if it's nothing but a single comment, to
+               // allow other functions which may reject it to pass some error
+               // message through.
+               if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
+                       // Remove any comments; IE gets token splitting wrong
+                       // This must be done AFTER decoding character references and
+                       // escape sequences, because those steps can introduce comments
+                       // This step cannot introduce character references or escape
+                       // sequences, because it replaces comments with spaces rather
+                       // than removing them completely.
+                       $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
+
+                       // Remove anything after a comment-start token, to guard against
+                       // incorrect client implementations.
+                       $commentPos = strpos( $value, '/*' );
+                       if ( $commentPos !== false ) {
+                               $value = substr( $value, 0, $commentPos );
+                       }
                }
 
                // Reject problematic keywords and control characters
@@ -934,14 +938,7 @@ class Sanitizer {
                $decoded = Sanitizer::decodeTagAttributes( $text );
                $stripped = Sanitizer::validateTagAttributes( $decoded, $element );
 
-               $attribs = array();
-               foreach ( $stripped as $attribute => $value ) {
-                       $encAttribute = htmlspecialchars( $attribute );
-                       $encValue = Sanitizer::safeEncodeAttribute( $value );
-
-                       $attribs[] = "$encAttribute=\"$encValue\"";
-               }
-               return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
+               return Sanitizer::safeEncodeTagAttributes( $stripped );
        }
 
        /**
@@ -1014,7 +1011,7 @@ class Sanitizer {
         *                                                          in the id and
         *                                                          name attributes
         * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
-        * @see http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#the-id-attribute
+        * @see http://www.whatwg.org/html/elements.html#the-id-attribute
         *   HTML5 definition of id attribute
         *
         * @param string $id id to escape
@@ -1029,10 +1026,10 @@ class Sanitizer {
         * @return String
         */
        static function escapeId( $id, $options = array() ) {
-               global $wgHtml5, $wgExperimentalHtmlIds;
+               global $wgExperimentalHtmlIds;
                $options = (array)$options;
 
-               if ( $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
+               if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
                        $id = Sanitizer::decodeCharReferences( $id );
                        $id = preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $id );
                        $id = trim( $id, '_' );
@@ -1141,6 +1138,24 @@ class Sanitizer {
                return $attribs;
        }
 
+       /**
+        * Build a partial tag string from an associative array of attribute
+        * names and values as returned by decodeTagAttributes.
+        *
+        * @param $assoc_array Array
+        * @return String
+        */
+       public static function safeEncodeTagAttributes( $assoc_array ) {
+               $attribs = array();
+               foreach ( $assoc_array as $attribute => $value ) {
+                       $encAttribute = htmlspecialchars( $attribute );
+                       $encValue = Sanitizer::safeEncodeAttribute( $value );
+
+                       $attribs[] = "$encAttribute=\"$encValue\"";
+               }
+               return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
+       }
+
        /**
         * Pick the appropriate attribute value from a match set from the
         * attribs regex matches.
@@ -1307,12 +1322,12 @@ class Sanitizer {
         * @return Boolean
         */
        private static function validateCodepoint( $codepoint ) {
-               return ($codepoint ==    0x09)
-                       || ($codepoint ==    0x0a)
-                       || ($codepoint ==    0x0d)
-                       || ($codepoint >=    0x20 && $codepoint <=   0xd7ff)
-                       || ($codepoint >=  0xe000 && $codepoint <=   0xfffd)
-                       || ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
+               return $codepoint == 0x09
+                       || $codepoint == 0x0a
+                       || $codepoint == 0x0d
+                       || ( $codepoint >= 0x20 && $codepoint <= 0xd7ff )
+                       || ( $codepoint >= 0xe000 && $codepoint <= 0xfffd )
+                       || ( $codepoint >= 0x10000 && $codepoint <= 0x10ffff );
        }
 
        /**
@@ -1422,10 +1437,10 @@ class Sanitizer {
         * @return Array
         */
        static function setupAttributeWhitelist() {
-               global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;
+               global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
 
                static $whitelist, $staticInitialised;
-               $globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgHtml5', 'wgAllowMicrodataAttributes' ) );
+               $globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgAllowMicrodataAttributes' ) );
 
                if ( isset( $whitelist ) && $staticInitialised == $globalContext ) {
                        return $whitelist;
@@ -1451,8 +1466,8 @@ class Sanitizer {
                        ) );
                }
 
-               if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
-                       # add HTML5 microdata tags as specified by http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model
+               if ( $wgAllowMicrodataAttributes ) {
+                       # add HTML5 microdata tags as specified by http://www.whatwg.org/html/microdata.html#the-microdata-model
                        $common = array_merge( $common, array(
                                'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
                        ) );
@@ -1590,8 +1605,8 @@ class Sanitizer {
                        # 15.3
                        'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
 
-                       # XHTML Ruby annotation text module, simple ruby only.
-                       # http://www.w3c.org/TR/ruby/
+                       # HTML Ruby annotation text module, simple ruby only.
+                       # http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element
                        'ruby'       => $common,
                        # rbc
                        # rtc
@@ -1607,25 +1622,20 @@ class Sanitizer {
                        # HTML 5 section 4.6
                        'bdi' => $common,
 
-               );
-
-               if ( $wgHtml5 ) {
                        # HTML5 elements, defined by:
-                       # http://www.whatwg.org/specs/web-apps/current-work/multipage/
-                       $whitelist += array(
-                               'data' => array_merge( $common, array( 'value' ) ),
-                               'time' => array_merge( $common, array( 'datetime' ) ),
-                               'mark' => $common,
-
-                               // meta and link are only permitted by removeHTMLtags when Microdata
-                               // is enabled so we don't bother adding a conditional to hide these
-                               // Also meta and link are only valid in WikiText as Microdata elements
-                               // (ie: validateTag rejects tags missing the attributes needed for Microdata)
-                               // So we don't bother including $common attributes that have no purpose.
-                               'meta' => array( 'itemprop', 'content' ),
-                               'link' => array( 'itemprop', 'href' ),
-                       );
-               }
+                       # http://www.whatwg.org/html/
+                       'data' => array_merge( $common, array( 'value' ) ),
+                       'time' => array_merge( $common, array( 'datetime' ) ),
+                       'mark' => $common,
+
+                       // meta and link are only permitted by removeHTMLtags when Microdata
+                       // is enabled so we don't bother adding a conditional to hide these
+                       // Also meta and link are only valid in WikiText as Microdata elements
+                       // (ie: validateTag rejects tags missing the attributes needed for Microdata)
+                       // So we don't bother including $common attributes that have no purpose.
+                       'meta' => array( 'itemprop', 'content' ),
+                       'link' => array( 'itemprop', 'href' ),
+               );
 
                $staticInitialised = $globalContext;
 
@@ -1730,7 +1740,7 @@ class Sanitizer {
         * Does a string look like an e-mail address?
         *
         * This validates an email address using an HTML5 specification found at:
-        * http://www.whatwg.org/specs/web-apps/current-work/multipage/states-of-the-type-attribute.html#valid-e-mail-address
+        * http://www.whatwg.org/html/states-of-the-type-attribute.html#valid-e-mail-address
         * Which as of 2011-01-24 says:
         *
         *   A valid e-mail address is a string that matches the ABNF production