Add support for Number grouping(commafy) based on CLDR number grouping patterns like...
[lhc/web/wiklou.git] / includes / Sanitizer.php
index 27b17ce..227fa7c 100644 (file)
@@ -39,12 +39,21 @@ class Sanitizer {
                 |&\#[xX]([0-9A-Fa-f]+);
                 |(&)/x';
 
+       /**
+        * Blacklist for evil uris like javascript:
+        * WARNING: DO NOT use this in any place that actually requires blacklisting
+        * for security reasons. There are NUMEROUS[1] ways to bypass blacklisting, the
+        * only way to be secure from javascript: uri based xss vectors is to whitelist
+        * things that you know are safe and deny everything else.
+        * [1]: http://ha.ckers.org/xss.html
+        */
        const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
        const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
 
        /**
         * List of all named character entities defined in HTML 4.01
         * http://www.w3.org/TR/html4/sgml/entities.html
+        * As well as &apos; which is only defined starting in XHTML1.
         * @private
         */
        static $htmlEntities = array(
@@ -63,6 +72,7 @@ class Sanitizer {
                'amp'      => 38,
                'and'      => 8743,
                'ang'      => 8736,
+               'apos'     => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
                'Aring'    => 197,
                'aring'    => 229,
                'asymp'    => 8776,
@@ -592,6 +602,89 @@ class Sanitizer {
                return $text;
        }
 
+       /**
+        * Take an array of attribute names and values and fix some deprecated values
+        * for the given element type.
+        * This does not validate properties, so you should ensure that you call
+        * validateTagAttributes AFTER this to ensure that the resulting style rule
+        * this may add is safe.
+        * 
+        * - Converts most presentational attributes like align into inline css
+        *
+        * @param $attribs Array
+        * @param $element String
+        * @return Array
+        */
+       static function fixDeprecatedAttributes( $attribs, $element ) {
+               global $wgHtml5, $wgCleanupPresentationalAttributes;
+
+               // Presentational attributes were removed from HTML5; when HTML5 output
+               // or this cleanup is switched off they are left exactly as given.
+               if ( !$wgHtml5 || !$wgCleanupPresentationalAttributes ) {
+                       return $attribs;
+               }
+
+               // Element groups shared by several attributes below
+               $table = array( 'table' );
+               $cells = array( 'td', 'th' );
+               $colls = array( 'col', 'colgroup' );
+               $tblocks = array( 'tbody', 'tfoot', 'thead' );
+               $h = array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' );
+
+               // attribute name => array( CSS property, elements the conversion applies to )
+               // NOTE(review): on table and hr the align attribute positions the element
+               // itself, so text-align is only an approximation there -- confirm that
+               // this is acceptable.
+               $presentationalAttribs = array(
+                       'align' => array(
+                               'text-align',
+                               array_merge( array( 'caption', 'hr', 'div', 'p', 'tr' ), $table, $cells, $colls, $tblocks, $h )
+                       ),
+                       'clear' => array( 'clear', array( 'br' ) ),
+                       'height' => array( 'height', $cells ),
+                       'nowrap' => array( 'white-space', $cells ),
+                       'size' => array( 'height', array( 'hr' ) ),
+                       'type' => array( 'list-style-type', array( 'li', 'ol', 'ul' ) ),
+                       'valign' => array( 'vertical-align', array_merge( $cells, $colls, $tblocks ) ),
+                       'width' => array( 'width', array_merge( array( 'hr', 'pre' ), $table, $cells, $colls ) ),
+               );
+
+               // HTML list marker codes are not CSS keywords; translate them to the
+               // matching list-style-type value. The codes are case-sensitive ('a' and
+               // 'A' differ). Any other value (disc, circle, square, ...) is passed
+               // through unchanged.
+               $listStyleTypes = array(
+                       '1' => 'decimal',
+                       'a' => 'lower-alpha',
+                       'A' => 'upper-alpha',
+                       'i' => 'lower-roman',
+                       'I' => 'upper-roman',
+               );
+
+               // Attributes holding a length, which needs a unit in CSS
+               $sizeAttribs = array( 'height', 'width', 'size' );
+
+               $style = "";
+               foreach ( $presentationalAttribs as $attribute => $info ) {
+                       list( $property, $elements ) = $info;
+
+                       // Skip if this attribute is not relevant to this element
+                       if ( !in_array( $element, $elements, true ) ) {
+                               continue;
+                       }
+
+                       // Skip if the attribute is not used
+                       if ( !array_key_exists( $attribute, $attribs ) ) {
+                               continue;
+                       }
+
+                       $value = $attribs[$attribute];
+
+                       if ( $attribute === 'nowrap' ) {
+                               // For nowrap the value should be nowrap instead of whatever text is in the value
+                               $value = 'nowrap';
+                       } elseif ( $attribute === 'type' && isset( $listStyleTypes[$value] ) ) {
+                               $value = $listStyleTypes[$value];
+                       } elseif ( in_array( $attribute, $sizeAttribs, true ) ) {
+                               // Size based properties should have px applied to them if they have no unit
+                               if ( preg_match( '/^[\d.]+$/', $value ) ) {
+                                       $value = "{$value}px";
+                               }
+                       }
+
+                       $style .= " $property: $value;";
+
+                       // The attribute is now expressed as CSS, so drop the original
+                       unset( $attribs[$attribute] );
+               }
+
+               if ( $style !== "" ) {
+                       // Prepend our style rules so that they can be overridden by user css
+                       if ( isset( $attribs['style'] ) ) {
+                               $style .= " " . $attribs['style'];
+                       }
+                       $attribs['style'] = trim( $style );
+               }
+
+               return $attribs;
+       }
+
        /**
         * Take an array of attribute names and values and normalize or discard
         * illegal values for the given element type.
@@ -722,28 +815,34 @@ class Sanitizer {
 
        /**
         * Pick apart some CSS and check it for forbidden or unsafe structures.
-        * Returns a sanitized string, or false if it was just too evil.
+        * Returns a sanitized string. This sanitized string will have
+        * character references and escape sequences decoded, and comments
+        * stripped. If the input is just too evil, only a comment complaining
+        * about evilness will be returned.
         *
         * Currently URL references, 'expression', 'tps' are forbidden.
         *
+        * NOTE: Despite the fact that character references are decoded, the
+        * returned string may contain character references given certain
+        * clever input strings. These character references must
+        * be escaped before the return value is embedded in HTML.
+        * 
         * @param $value String
-        * @return Mixed
+        * @return String
         */
        static function checkCss( $value ) {
+               // Decode character references like &#123;
                $value = Sanitizer::decodeCharReferences( $value );
 
-               // Remove any comments; IE gets token splitting wrong
-               $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
-
-               // Remove anything after a comment-start token, to guard against
-               // incorrect client implementations.
-               $commentPos = strpos( $value, '/*' );
-               if ( $commentPos !== false ) {
-                       $value = substr( $value, 0, $commentPos );
-               }
-
                // Decode escape sequences and line continuation
                // See the grammar in the CSS 2 spec, appendix D.
+               // This has to be done AFTER decoding character references.
+               // This means it isn't possible for this function to return
+               // unsanitized escape sequences. It is possible to manufacture
+               // input that contains character references that decode to
+               // escape sequences that decode to character references, but
+               // it's OK for the return value to contain character references
+               // because the caller is supposed to escape those anyway.
                static $decodeRegex;
                if ( !$decodeRegex ) {
                        $space = '[\\x20\\t\\r\\n\\f]';
@@ -759,6 +858,21 @@ class Sanitizer {
                }
                $value = preg_replace_callback( $decodeRegex,
                        array( __CLASS__, 'cssDecodeCallback' ), $value );
+               
+               // Remove any comments; IE gets token splitting wrong
+               // This must be done AFTER decoding character references and
+               // escape sequences, because those steps can introduce comments
+               // This step cannot introduce character references or escape
+               // sequences, because it replaces comments with spaces rather
+               // than removing them completely.
+               $value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );
+
+               // Remove anything after a comment-start token, to guard against
+               // incorrect client implementations.
+               $commentPos = strpos( $value, '/*' );
+               if ( $commentPos !== false ) {
+                       $value = substr( $value, 0, $commentPos );
+               }
 
                // Reject problematic keywords and control characters
                if ( preg_match( '/[\000-\010\016-\037\177]/', $value ) ) {
@@ -769,6 +883,10 @@ class Sanitizer {
                return $value;
        }
 
+       /**
+        * @param $matches array
+        * @return String
+        */
        static function cssDecodeCallback( $matches ) {
                if ( $matches[1] !== '' ) {
                        // Line continuation
@@ -814,8 +932,9 @@ class Sanitizer {
                        return '';
                }
 
-               $stripped = Sanitizer::validateTagAttributes(
-                       Sanitizer::decodeTagAttributes( $text ), $element );
+               $decoded = Sanitizer::decodeTagAttributes( $text );
+               $decoded = Sanitizer::fixDeprecatedAttributes( $decoded, $element );
+               $stripped = Sanitizer::validateTagAttributes( $decoded, $element );
 
                $attribs = array();
                foreach( $stripped as $attribute => $value ) {
@@ -1070,6 +1189,10 @@ class Sanitizer {
                                Sanitizer::normalizeCharReferences( $text ) ) );
        }
 
+       /**
+        * @param $text string
+        * @return mixed
+        */
        private static function normalizeWhitespace( $text ) {
                return preg_replace(
                        '/\r\n|[\x20\x0d\x0a\x09]/',
@@ -1153,6 +1276,10 @@ class Sanitizer {
                }
        }
 
+       /**
+        * @param $codepoint
+        * @return null|string
+        */
        static function decCharReference( $codepoint ) {
                $point = intval( $codepoint );
                if( Sanitizer::validateCodepoint( $point ) ) {
@@ -1162,6 +1289,10 @@ class Sanitizer {
                }
        }
 
+       /**
+        * @param $codepoint
+        * @return null|string
+        */
        static function hexCharReference( $codepoint ) {
                $point = hexdec( $codepoint );
                if( Sanitizer::validateCodepoint( $point ) ) {
@@ -1259,7 +1390,7 @@ class Sanitizer {
         * return the UTF-8 encoding of that character. Otherwise, returns
         * pseudo-entity source (eg &foo;)
         *
-        * @param $name Strings
+        * @param $name String
         * @return String
         */
        static function decodeEntity( $name ) {
@@ -1500,13 +1631,18 @@ class Sanitizer {
                return $out;
        }
 
+       /**
+        * @param $url string
+        * @return mixed|string
+        */
        static function cleanUrl( $url ) {
                # Normalize any HTML entities in input. They will be
                # re-escaped by makeExternalLink().
                $url = Sanitizer::decodeCharReferences( $url );
 
                # Escape any control characters introduced by the above step
-               $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F\|]/e', "urlencode('\\0')", $url );
+               $url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F\|]/', 
+                       array( __CLASS__, 'cleanUrlCallback' ), $url );
 
                # Validate hostname portion
                $matches = array();
@@ -1534,7 +1670,7 @@ class Sanitizer {
 
                        $host = preg_replace( $strip, '', $host );
 
-                       // @todo Fixme: validate hostnames here
+                       // @todo FIXME: Validate hostnames here
 
                        return $protocol . $host . $rest;
                } else {
@@ -1542,4 +1678,63 @@ class Sanitizer {
                }
        }
 
+       /**
+        * @param $matches array
+        * @return string
+        */
+       static function cleanUrlCallback( $matches ) {
+               // preg_replace_callback() passes the whole matched text as element 0;
+               // the result is that text in percent-encoded form.
+               $matchedText = $matches[0];
+               $encoded = urlencode( $matchedText );
+               return $encoded;
+       }
+
+       /**
+        * Does a string look like an e-mail address?
+        *
+        * This validates an email address using an HTML5 specification found at:
+        * http://www.whatwg.org/specs/web-apps/current-work/multipage/states-of-the-type-attribute.html#valid-e-mail-address
+        * Which as of 2011-01-24 says:
+        *
+        *   A valid e-mail address is a string that matches the ABNF production
+        *   1*( atext / "." ) "@" ldh-str *( "." ldh-str ) where atext is defined
+        *   in RFC 5322 section 3.2.3, and ldh-str is defined in RFC 1034 section
+        *   3.5.
+        *
+        * This function is an implementation of the specification as requested in
+        * bug 22449.
+        *
+        * Client-side forms will use the same standard validation rules via JS or
+        * HTML 5 validation; additional restrictions can be enforced server-side
+        * by extensions via the 'isValidEmailAddr' hook.
+        *
+        * Note that this validation doesn't 100% match RFC 2822, but is believed
+        * to be liberal enough for wide use. Some invalid addresses will still
+        * pass validation here.
+        *
+        * @since 1.18
+        *
+        * @param $addr String E-mail address
+        * @return Bool
+        */
+       public static function validateEmail( $addr ) {
+               // Let hook handlers decide first: when the hook run reports false, the
+               // built-in check is skipped and whatever the handlers left in $result
+               // is returned unchanged (still null if none of them set it).
+               $result = null;
+               if( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
+                       return $result;
+               }
+
+               // Please note that the strings below end up inside brackets [], which
+               // makes the hyphen "-" a range indicator. Hence it is escaped with a
+               // backslash below.
+               // See bug 26948
+               $rfc5322_atext   = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~" ;
+               $rfc1034_ldh_str = "a-z0-9\\-" ;
+
+               // The D modifier makes $ match only at the very end of the subject, so
+               // an address followed by a trailing newline is rejected.
+               $HTML5_email_regexp = "/
+               ^                      # start of string
+               [$rfc5322_atext\\.]+    # user part which is liberal :p
+               @                      # 'at' sign
+               [$rfc1034_ldh_str]+       # First domain part
+               (\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
+               $                      # End of string
+               /ixD" ; // case Insensitive, eXtended, Dollar matches at end only
+
+               return (bool) preg_match( $HTML5_email_regexp, $addr );
+       }
 }