* Use local context to get messages

[lhc/web/wiklou.git] / includes / Sanitizer.php
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php

index d6abfa2..196abd9 100644 (file)
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -33,18 +33,27 @@ class Sanitizer {
          * Regular expression to match various types of character references in
          * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
          */
-       const CHAR_REFS_REGEX = 
+       const CHAR_REFS_REGEX =
                 '/&([A-Za-z0-9\x80-\xff]+);
                  |&\#([0-9]+);
                  |&\#[xX]([0-9A-Fa-f]+);
                  |(&)/x';
  
+       /**
+        * Blacklist for evil URIs like javascript:
+        * WARNING: DO NOT use this in any place that actually requires blacklisting
+        * for security reasons. There are NUMEROUS[1] ways to bypass blacklisting; the
+        * only way to be secure from javascript: URI based XSS vectors is to whitelist
+        * things that you know are safe and deny everything else.
+        * [1]: http://ha.ckers.org/xss.html
+        */
         const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
         const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
  
         /**
          * List of all named character entities defined in HTML 4.01
          * http://www.w3.org/TR/html4/sgml/entities.html
+        * As well as &apos; which is only defined starting in XHTML1.
          * @private
          */
         static $htmlEntities = array(
@@ -63,6 +72,7 @@ class Sanitizer {
                 'amp'      => 38,
                 'and'      => 8743,
                 'ang'      => 8736,
+               'apos'     => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
                 'Aring'    => 197,
                 'aring'    => 229,
                 'asymp'    => 8776,
@@ -325,7 +335,7 @@ class Sanitizer {
                         $attribFirst = '[:A-Z_a-z0-9]';
                         $attrib = '[:A-Z_a-z-.0-9]';
                         $space = '[\x09\x0a\x0d\x20]';
-                       self::$attribsRegex = 
+                       self::$attribsRegex =
                                 "/(?:^|$space)({$attribFirst}{$attrib}*)
                                   ($space*=$space*
                                         (?:
@@ -447,16 +457,26 @@ class Sanitizer {
                                                                 # and see if we find a match below them
                                                                 $optstack = array();
                                                                 array_push( $optstack, $ot );
-                                                               $ot = @array_pop( $tagstack );
+                                                               wfSuppressWarnings();
+                                                               $ot = array_pop( $tagstack );
+                                                               wfRestoreWarnings();
                                                                 while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
                                                                         array_push( $optstack, $ot );
-                                                                       $ot = @array_pop( $tagstack );
+                                                                       wfSuppressWarnings();
+                                                                       $ot = array_pop( $tagstack );
+                                                                       wfRestoreWarnings();
                                                                 }
                                                                 if ( $t != $ot ) {
                                                                         # No match. Push the optional elements back again
                                                                         $badtag = true;
-                                                                       while ( $ot = @array_pop( $optstack ) ) {
+                                                                       wfSuppressWarnings();
+                                                                       $ot = array_pop( $optstack );
+                                                                       wfRestoreWarnings();
+                                                                       while ( $ot ) {
                                                                                 array_push( $tagstack, $ot );
+                                                                               wfSuppressWarnings();
+                                                                               $ot = array_pop( $optstack );
+                                                                               wfRestoreWarnings();
                                                                         }
                                                                 }
                                                         } else {
@@ -592,6 +612,102 @@ class Sanitizer {
                 return $text;
         }
  
+       /**
+        * Rewrite deprecated presentational attributes of an element as inline CSS.
+        *
+        * Each recognised attribute (align, clear, height, nowrap, size, type,
+        * valign, width) that is present on an element it applies to is removed
+        * from the attribute list and replaced by an equivalent declaration in
+        * the 'style' attribute. Generated declarations are placed in front of
+        * any style text the element already had.
+        *
+        * Nothing is validated here: attribute values are copied into the style
+        * text as given, so validateTagAttributes must be called AFTER this to
+        * make sure the resulting style rule is safe.
+        *
+        * The attributes are returned untouched unless both $wgHtml5 and
+        * $wgCleanupPresentationalAttributes are enabled.
+        *
+        * @param $attribs Array attribute name => value
+        * @param $element String element (tag) name
+        * @return Array
+        */
+       static function fixDeprecatedAttributes( $attribs, $element ) {
+               global $wgHtml5, $wgCleanupPresentationalAttributes;
+
+               // Presentational attributes were only removed in html5, so there is
+               // nothing to clean up when html5 output or the cleanup is switched off
+               $cleanupEnabled = $wgHtml5 && $wgCleanupPresentationalAttributes;
+               if ( !$cleanupEnabled ) {
+                       return $attribs;
+               }
+
+               // Groups of element names shared by several conversions
+               $tableTags = array( 'table' );
+               $cellTags = array( 'td', 'th' );
+               $columnTags = array( 'col', 'colgroup' );
+               $rowGroupTags = array( 'tbody', 'tfoot', 'thead' );
+               $headingTags = array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' );
+
+               // attribute name => array( css property, elements the conversion applies to )
+               $conversions = array(
+                       'align' => array(
+                               'text-align',
+                               array_merge(
+                                       array( 'caption', 'hr', 'div', 'p', 'tr' ),
+                                       $tableTags, $cellTags, $columnTags, $rowGroupTags, $headingTags
+                               )
+                       ),
+                       'clear' => array( 'clear', array( 'br' ) ),
+                       'height' => array( 'height', $cellTags ),
+                       'nowrap' => array( 'white-space', $cellTags ),
+                       'size' => array( 'height', array( 'hr' ) ),
+                       'type' => array( 'list-style-type', array( 'li', 'ol', 'ul' ) ),
+                       'valign' => array( 'vertical-align', array_merge( $cellTags, $columnTags, $rowGroupTags ) ),
+                       'width' => array(
+                               'width',
+                               array_merge( array( 'hr', 'pre' ), $tableTags, $cellTags, $columnTags )
+                       ),
+               );
+
+               // Move upper case or mixed case spellings of the convertible attributes
+               // over to their lowercase name
+               foreach ( $attribs as $name => $rawValue ) {
+                       $lowerName = strtolower( $name );
+                       if ( $name === $lowerName || !array_key_exists( $lowerName, $conversions ) ) {
+                               continue;
+                       }
+                       $attribs[$lowerName] = $rawValue;
+                       unset( $attribs[$name] );
+               }
+
+               $generatedCss = '';
+               foreach ( $conversions as $name => $conversion ) {
+                       $cssProperty = $conversion[0];
+                       $appliesTo = $conversion[1];
+
+                       // Nothing to do when the attribute is irrelevant to this element
+                       // or simply not set on it
+                       if ( !in_array( $element, $appliesTo ) || !array_key_exists( $name, $attribs ) ) {
+                               continue;
+                       }
+
+                       $cssValue = $attribs[$name];
+                       switch ( $name ) {
+                               case 'nowrap':
+                                       // Whatever text the attribute held, the css value is nowrap
+                                       $cssValue = 'nowrap';
+                                       break;
+                               case 'clear':
+                                       // clear="all" is spelled clear: both; in css
+                                       if ( strtolower( $cssValue ) === 'all' ) {
+                                               $cssValue = 'both';
+                                       }
+                                       break;
+                               case 'height':
+                               case 'width':
+                               case 'size':
+                                       // Unitless sizes are pixel counts
+                                       if ( preg_match( '/^[\d.]+$/', $cssValue ) ) {
+                                               $cssValue .= 'px';
+                                       }
+                                       break;
+                       }
+
+                       $generatedCss .= ' ' . $cssProperty . ': ' . $cssValue . ';';
+                       unset( $attribs[$name] );
+               }
+
+               if ( !$generatedCss ) {
+                       return $attribs;
+               }
+
+               // Our rules go first so that the user's own css can still override them
+               if ( isset( $attribs['style'] ) ) {
+                       $generatedCss .= ' ' . $attribs['style'];
+               }
+               $attribs['style'] = trim( $generatedCss );
+
+               return $attribs;
+       }
+
         /**
          * Take an array of attribute names and values and normalize or discard
          * illegal values for the given element type.
@@ -660,7 +776,7 @@ class Sanitizer {
                         }
  
                         //RDFa and microdata properties allow URLs, URIs and/or CURIs. check them for sanity
-                       if ( $attribute === 'rel' || $attribute === 'rev' || 
+                       if ( $attribute === 'rel' || $attribute === 'rev' ||
                                 $attribute === 'about' || $attribute === 'property' || $attribute === 'resource' || #RDFa
                                 $attribute === 'datatype' || $attribute === 'typeof' ||                             #RDFa
                                 $attribute === 'itemid' || $attribute === 'itemprop' || $attribute === 'itemref' || #HTML5 microdata
@@ -668,7 +784,7 @@ class Sanitizer {
  
                                 //Paranoia. Allow "simple" values but suppress javascript
                                 if ( preg_match( self::EVIL_URI_PATTERN, $value ) ) {
-                                       continue; 
+                                       continue;
                                 }
                         }
  
@@ -722,34 +838,40 @@ class Sanitizer {
  
         /**
          * Pick apart some CSS and check it for forbidden or unsafe structures.
-        * Returns a sanitized string, or false if it was just too evil.
+        * Returns a sanitized string. This sanitized string will have
+        * character references and escape sequences decoded, and comments
+        * stripped. If the input is just too evil, only a comment complaining
+        * about evilness will be returned.
          *
          * Currently URL references, 'expression', 'tps' are forbidden.
          *
+        * NOTE: Despite the fact that character references are decoded, the
+        * returned string may contain character references given certain
+        * clever input strings. These character references must
+        * be escaped before the return value is embedded in HTML.
+        *
          * @param $value String
-        * @return Mixed
+        * @return String
          */
         static function checkCss( $value ) {
+               // Decode character references like &#123;
                 $value = Sanitizer::decodeCharReferences( $value );
  
-               // Remove any comments; IE gets token splitting wrong
-               $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
-
-               // Remove anything after a comment-start token, to guard against
-               // incorrect client implementations.
-               $commentPos = strpos( $value, '/*' );
-               if ( $commentPos !== false ) {
-                       $value = substr( $value, 0, $commentPos );
-               }
-
                 // Decode escape sequences and line continuation
                 // See the grammar in the CSS 2 spec, appendix D.
+               // This has to be done AFTER decoding character references.
+               // This means it isn't possible for this function to return
+               // unsanitized escape sequences. It is possible to manufacture
+               // input that contains character references that decode to
+               // escape sequences that decode to character references, but
+               // it's OK for the return value to contain character references
+               // because the caller is supposed to escape those anyway.
                 static $decodeRegex;
                 if ( !$decodeRegex ) {
                         $space = '[\\x20\\t\\r\\n\\f]';
                         $nl = '(?:\\n|\\r\\n|\\r|\\f)';
                         $backslash = '\\\\';
-                       $decodeRegex = "/ $backslash 
+                       $decodeRegex = "/ $backslash
                                 (?:
                                         ($nl) |  # 1. Line continuation
                                         ([0-9A-Fa-f]{1,6})$space? |  # 2. character number
@@ -760,6 +882,21 @@ class Sanitizer {
                 $value = preg_replace_callback( $decodeRegex,
                         array( __CLASS__, 'cssDecodeCallback' ), $value );
  
+               // Remove any comments; IE gets token splitting wrong
+               // This must be done AFTER decoding character references and
+               // escape sequences, because those steps can introduce comments
+               // This step cannot introduce character references or escape
+               // sequences, because it replaces comments with spaces rather
+               // than removing them completely.
+               $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
+
+               // Remove anything after a comment-start token, to guard against
+               // incorrect client implementations.
+               $commentPos = strpos( $value, '/*' );
+               if ( $commentPos !== false ) {
+                       $value = substr( $value, 0, $commentPos );
+               }
+
                 // Reject problematic keywords and control characters
                 if ( preg_match( '/[\000-\010\016-\037\177]/', $value ) ) {
                         return '/* invalid control char */';
@@ -769,6 +906,10 @@ class Sanitizer {
                 return $value;
         }
  
+       /**
+        * @param $matches array
+        * @return String
+        */
         static function cssDecodeCallback( $matches ) {
                 if ( $matches[1] !== '' ) {
                         // Line continuation
@@ -814,8 +955,9 @@ class Sanitizer {
                         return '';
                 }
  
-               $stripped = Sanitizer::validateTagAttributes(
-                       Sanitizer::decodeTagAttributes( $text ), $element );
+               $decoded = Sanitizer::decodeTagAttributes( $text );
+               $decoded = Sanitizer::fixDeprecatedAttributes( $decoded, $element );
+               $stripped = Sanitizer::validateTagAttributes( $decoded, $element );
  
                 $attribs = array();
                 foreach( $stripped as $attribute => $value ) {
@@ -1070,6 +1212,10 @@ class Sanitizer {
                                 Sanitizer::normalizeCharReferences( $text ) ) );
         }
  
+       /**
+        * @param $text string
+        * @return mixed
+        */
         private static function normalizeWhitespace( $text ) {
                 return preg_replace(
                         '/\r\n|[\x20\x0d\x0a\x09]/',
@@ -1153,6 +1299,10 @@ class Sanitizer {
                 }
         }
  
+       /**
+        * @param $codepoint
+        * @return null|string
+        */
         static function decCharReference( $codepoint ) {
                 $point = intval( $codepoint );
                 if( Sanitizer::validateCodepoint( $point ) ) {
@@ -1162,6 +1312,10 @@ class Sanitizer {
                 }
         }
  
+       /**
+        * @param $codepoint
+        * @return null|string
+        */
         static function hexCharReference( $codepoint ) {
                 $point = hexdec( $codepoint );
                 if( Sanitizer::validateCodepoint( $point ) ) {
@@ -1259,7 +1413,7 @@ class Sanitizer {
          * return the UTF-8 encoding of that character. Otherwise, returns
          * pseudo-entity source (eg &foo;)
          *
-        * @param $name Strings
+        * @param $name String
          * @return String
          */
         static function decodeEntity( $name ) {
@@ -1302,7 +1456,7 @@ class Sanitizer {
                 if ( $wgAllowRdfaAttributes ) {
                         #RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
                         $common = array_merge( $common, array(
-                           'about', 'property', 'resource', 'datatype', 'typeof', 
+                           'about', 'property', 'resource', 'datatype', 'typeof',
                         ) );
                 }
  
@@ -1419,7 +1573,7 @@ class Sanitizer {
                         'th'         => array_merge( $common, $tablecell, $tablealign ),
  
                         # 12.2 # NOTE: <a> is not allowed directly, but the attrib whitelist is used from the Parser object
-                       'a'          => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa 
+                       'a'          => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa
  
                         # 13.2
                         # Not usually allowed, but may be used for extension-style hooks
@@ -1500,13 +1654,17 @@ class Sanitizer {
                 return $out;
         }
  
+       /**
+        * @param $url string
+        * @return mixed|string
+        */
         static function cleanUrl( $url ) {
                 # Normalize any HTML entities in input. They will be
                 # re-escaped by makeExternalLink().
                 $url = Sanitizer::decodeCharReferences( $url );
  
                 # Escape any control characters introduced by the above step
-               $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/', 
+               $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/',
                         array( __CLASS__, 'cleanUrlCallback' ), $url );
  
                 # Validate hostname portion
@@ -1530,12 +1688,12 @@ class Sanitizer {
                                 \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
                                 \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
                                 \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
-                               [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
+                               [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
                                 /xuD";
  
                         $host = preg_replace( $strip, '', $host );
  
-                       // @todo Fixme: validate hostnames here
+                       // @todo FIXME: Validate hostnames here
  
                         return $protocol . $host . $rest;
                 } else {
@@ -1543,7 +1701,63 @@ class Sanitizer {
                 }
         }
  
+       /**
+        * Callback used by cleanUrl() with preg_replace_callback(): returns the
+        * URL-encoded form of the text of a single match.
+        *
+        * @param $matches array Match array; only the full match at index 0 is used
+        * @return string The full match passed through urlencode()
+        */
         static function cleanUrlCallback( $matches ) {
                 return urlencode( $matches[0] );
         }
+
+       /**
+        * Does a string look like an e-mail address?
+        *
+        * This validates an email address using an HTML5 specification found at:
+        * http://www.whatwg.org/specs/web-apps/current-work/multipage/states-of-the-type-attribute.html#valid-e-mail-address
+        * Which as of 2011-01-24 says:
+        *
+        *   A valid e-mail address is a string that matches the ABNF production
+        *   1*( atext / "." ) "@" ldh-str *( "." ldh-str ) where atext is defined
+        *   in RFC 5322 section 3.2.3, and ldh-str is defined in RFC 1034 section
+        *   3.5.
+        *
+        * This function is an implementation of the specification as requested in
+        * bug 22449.
+        *
+        * Client-side forms will use the same standard validation rules via JS or
+        * HTML 5 validation; additional restrictions can be enforced server-side
+        * by extensions via the 'isValidEmailAddr' hook. When a hook handler
+        * aborts the hook, whatever it stored in its result argument is returned
+        * as is and the built-in check below is skipped.
+        *
+        * Note that this validation doesn't 100% match RFC 2822, but is believed
+        * to be liberal enough for wide use. Some invalid addresses will still
+        * pass validation here.
+        *
+        * The entire string has to match: an address followed by a trailing
+        * newline is rejected.
+        *
+        * @since 1.18
+        *
+        * @param $addr String E-mail address
+        * @return Bool
+        */
+       public static function validateEmail( $addr ) {
+               $result = null;
+               if( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
+                       return $result;
+               }
+
+               // Please note the strings below are used inside brackets [], which makes
+               // the hyphen "-" a range indicator. Hence it is double backslashed below.
+               // See bug 26948
+               $rfc5322_atext   = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~" ;
+               $rfc1034_ldh_str = "a-z0-9\\-" ;
+
+               // The D modifier stops the final $ from matching in front of a trailing
+               // newline, which would otherwise let "user@example.org\n" through.
+               $HTML5_email_regexp = "/
+               ^                      # start of string
+               [$rfc5322_atext\\.]+    # user part which is liberal :p
+               @                      # at sign
+               [$rfc1034_ldh_str]+       # First domain part
+               (\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
+               $                      # End of string
+               /ixD" ; // case Insensitive, eXtended, Dollar matches at the very end only
+
+               return (bool) preg_match( $HTML5_email_regexp, $addr );
+       }
  }