Merge "Correctly use $wgFeedLimit in page history feed"

[lhc/web/wiklou.git] / includes / Sanitizer.php
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php

index ed01235..7400a5a 100644 (file)
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -740,7 +740,7 @@ class Sanitizer {
  
                         # WAI-ARIA
                         # http://www.w3.org/TR/wai-aria/
-                       # http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#wai-aria
+                       # http://www.whatwg.org/html/elements.html#wai-aria
                         # For now we only support role="presentation" until we work out what roles should be
                         # usable by content and we ensure that our code explicitly rejects patterns that
                         # violate HTML5's ARIA restrictions.
@@ -813,9 +813,10 @@ class Sanitizer {
         /**
          * Pick apart some CSS and check it for forbidden or unsafe structures.
          * Returns a sanitized string. This sanitized string will have
-        * character references and escape sequences decoded, and comments
-        * stripped. If the input is just too evil, only a comment complaining
-        * about evilness will be returned.
+        * character references and escape sequences decoded and comments
+        * stripped (unless it is itself one valid comment, in which case the value
+        * will be passed through). If the input is just too evil, only a comment
+        * complaining about evilness will be returned.
          *
          * Currently URL references, 'expression', 'tps' are forbidden.
          *
@@ -856,19 +857,24 @@ class Sanitizer {
                 $value = preg_replace_callback( $decodeRegex,
                         array( __CLASS__, 'cssDecodeCallback' ), $value );
  
-               // Remove any comments; IE gets token splitting wrong
-               // This must be done AFTER decoding character references and
-               // escape sequences, because those steps can introduce comments
-               // This step cannot introduce character references or escape
-               // sequences, because it replaces comments with spaces rather
-               // than removing them completely.
-               $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
-
-               // Remove anything after a comment-start token, to guard against
-               // incorrect client implementations.
-               $commentPos = strpos( $value, '/*' );
-               if ( $commentPos !== false ) {
-                       $value = substr( $value, 0, $commentPos );
+               // Let the value through if it's nothing but a single comment, to
+               // allow other functions which may reject it to pass some error
+               // message through.
+               if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
+                       // Remove any comments; IE gets token splitting wrong
+                       // This must be done AFTER decoding character references and
+                       // escape sequences, because those steps can introduce comments
+                       // This step cannot introduce character references or escape
+                       // sequences, because it replaces comments with spaces rather
+                       // than removing them completely.
+                       $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
+
+                       // Remove anything after a comment-start token, to guard against
+                       // incorrect client implementations.
+                       $commentPos = strpos( $value, '/*' );
+                       if ( $commentPos !== false ) {
+                               $value = substr( $value, 0, $commentPos );
+                       }
                 }
  
                 // Reject problematic keywords and control characters
@@ -932,14 +938,7 @@ class Sanitizer {
                 $decoded = Sanitizer::decodeTagAttributes( $text );
                 $stripped = Sanitizer::validateTagAttributes( $decoded, $element );
  
-               $attribs = array();
-               foreach ( $stripped as $attribute => $value ) {
-                       $encAttribute = htmlspecialchars( $attribute );
-                       $encValue = Sanitizer::safeEncodeAttribute( $value );
-
-                       $attribs[] = "$encAttribute=\"$encValue\"";
-               }
-               return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
+               return Sanitizer::safeEncodeTagAttributes( $stripped );
         }
  
         /**
@@ -1012,7 +1011,7 @@ class Sanitizer {
          *                                                          in the id and
          *                                                          name attributes
          * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
-        * @see http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#the-id-attribute
+        * @see http://www.whatwg.org/html/elements.html#the-id-attribute
          *   HTML5 definition of id attribute
          *
          * @param string $id id to escape
@@ -1139,6 +1138,24 @@ class Sanitizer {
                 return $attribs;
         }
  
+       /**
+        * Build a partial tag string (` name="value" ...`) from an associative
+        * array of attribute names and values as returned by decodeTagAttributes.
+        *
+        * @param array $assoc_array Attribute name => value; both get encoded here
+        * @return string Space-prefixed name="value" pairs, or '' if none given
+        */
+       public static function safeEncodeTagAttributes( $assoc_array ) {
+               $attribs = array();
+               foreach ( $assoc_array as $attribute => $value ) {
+                       $encAttribute = htmlspecialchars( $attribute );
+                       $encValue = Sanitizer::safeEncodeAttribute( $value );
+
+                       $attribs[] = "$encAttribute=\"$encValue\"";
+               }
+               return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
+       }
+
         /**
          * Pick the appropriate attribute value from a match set from the
          * attribs regex matches.
@@ -1450,7 +1467,7 @@ class Sanitizer {
                 }
  
                 if ( $wgAllowMicrodataAttributes ) {
-                       # add HTML5 microdata tags as specified by http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model
+                       # add HTML5 microdata tags as specified by http://www.whatwg.org/html/microdata.html#the-microdata-model
                         $common = array_merge( $common, array(
                                 'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
                         ) );
@@ -1589,7 +1606,7 @@ class Sanitizer {
                         'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
  
                         # HTML Ruby annotation text module, simple ruby only.
-                       # http://www.whatwg.org/specs/web-apps/current-work/multipage/text-level-semantics.html#the-ruby-element
+                       # http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element
                         'ruby'       => $common,
                         # rbc
                         # rtc
@@ -1606,7 +1623,7 @@ class Sanitizer {
                         'bdi' => $common,
  
                         # HTML5 elements, defined by:
-                       # http://www.whatwg.org/specs/web-apps/current-work/multipage/
+                       # http://www.whatwg.org/html/
                         'data' => array_merge( $common, array( 'value' ) ),
                         'time' => array_merge( $common, array( 'datetime' ) ),
                         'mark' => $common,
@@ -1723,7 +1740,7 @@ class Sanitizer {
          * Does a string look like an e-mail address?
          *
          * This validates an email address using an HTML5 specification found at:
-        * http://www.whatwg.org/specs/web-apps/current-work/multipage/states-of-the-type-attribute.html#valid-e-mail-address
+        * http://www.whatwg.org/html/states-of-the-type-attribute.html#valid-e-mail-address
          * Which as of 2011-01-24 says:
          *
          *   A valid e-mail address is a string that matches the ABNF production