* Fix for r60163: in RC/RCL, hash together all the options, not just namespace, in...
[lhc/web/wiklou.git] / includes / Sanitizer.php
index ab521e8..8533990 100644 (file)
@@ -43,7 +43,7 @@ define( 'MW_CHAR_REFS_REGEX',
 $attrib = '[A-Za-z0-9]';
 $space = '[\x09\x0a\x0d\x20]';
 define( 'MW_ATTRIBS_REGEX',
-       "/(?:^|$space)($attrib+)
+       "/(?:^|$space)((?:xml:|xmlns:)?$attrib+)
          ($space*=$space*
                (?:
                 # The attribute value: quoted or alone
@@ -56,6 +56,16 @@ define( 'MW_ATTRIBS_REGEX',
                )
           )?(?=$space|\$)/sx" );
 
+/**
+ * Regular expression to match URIs that could trigger script execution
+ */
+define( 'MW_EVIL_URI_PATTERN', '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i' );
+
+/**
+ * Regular expression to match namespace attributes
+ */
+define( 'MW_XMLNS_ATTRIBUTE_PATTRN', "/^xmlns:$attrib+$/" );
+
 /**
  * List of all named character entities defined in HTML 4.01
  * http://www.w3.org/TR/html4/sgml/entities.html
@@ -353,7 +363,7 @@ class Sanitizer {
                if ( !$staticInitialised ) {
 
                        $htmlpairsStatic = array( # Tags that must be closed
-                               'a', 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
+                               'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
                                'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
                                'strike', 'strong', 'tt', 'var', 'div', 'center',
                                'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
@@ -400,37 +410,43 @@ class Sanitizer {
                $text = Sanitizer::removeHTMLcomments( $text );
                $bits = explode( '<', $text );
                $text = str_replace( '>', '&gt;', array_shift( $bits ) );
-               if(!$wgUseTidy) {
+               if ( !$wgUseTidy ) {
                        $tagstack = $tablestack = array();
                        foreach ( $bits as $x ) {
                                $regs = array();
+                               # $slash: Does the current element start with a '/'?
+                               # $t: Current element name
+                               # $params: String between element name and >
+                               # $brace: Ending '>' or '/>'
+                               # $rest: Everything until the next element of $bits
                                if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
                                        list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
                                } else {
                                        $slash = $t = $params = $brace = $rest = null;
                                }
 
-                               $badtag = 0;
+                               $badtag = false;
                                if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
                                        # Check our stack
-                                       if ( $slash ) {
-                                               # Closing a tag...
-                                               if( isset( $htmlsingleonly[$t] ) ) {
-                                                       $badtag = 1;
-                                               } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
+                                       if ( $slash && isset( $htmlsingleonly[$t] ) ) {
+                                               $badtag = true;
+                                       } elseif ( $slash ) {
+                                               # Closing a tag... is it the one we just opened?
+                                               $ot = @array_pop( $tagstack );
+                                               if ( $ot != $t ) {
                                                        if ( isset( $htmlsingleallowed[$ot] ) ) {
                                                                # Pop all elements with an optional close tag
                                                                # and see if we find a match below them
                                                                $optstack = array();
-                                                               array_push ($optstack, $ot);
-                                                               while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
-                                                                               isset( $htmlsingleallowed[$ot] ) )
-                                                               {
-                                                                       array_push ($optstack, $ot);
+                                                               array_push( $optstack, $ot );
+                                                               $ot = @array_pop( $tagstack );
+                                                               while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
+                                                                       array_push( $optstack, $ot );
+                                                                       $ot = @array_pop( $tagstack );
                                                                }
                                                                if ( $t != $ot ) {
-                                                                       # No match. Push the optinal elements back again
-                                                                       $badtag = 1;
+                                                                       # No match. Push the optional elements back again
+                                                                       $badtag = true;
                                                                        while ( $ot = @array_pop( $optstack ) ) {
                                                                                array_push( $tagstack, $ot );
                                                                        }
@@ -438,8 +454,8 @@ class Sanitizer {
                                                        } else {
                                                                @array_push( $tagstack, $ot );
                                                                # <li> can be nested in <ul> or <ol>, skip those cases:
-                                                               if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
-                                                                       $badtag = 1;
+                                                               if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
+                                                                       $badtag = true;
                                                                }
                                                        }
                                                } else {
@@ -451,23 +467,23 @@ class Sanitizer {
                                        } else {
                                                # Keep track for later
                                                if ( isset( $tabletags[$t] ) &&
-                                               ! in_array( 'table', $tagstack ) ) {
-                                                       $badtag = 1;
-                                               } else if ( in_array( $t, $tagstack ) &&
-                                               ! isset( $htmlnest [$t ] ) ) {
-                                                       $badtag = 1;
+                                               !in_array( 'table', $tagstack ) ) {
+                                                       $badtag = true;
+                                               } elseif ( in_array( $t, $tagstack ) &&
+                                               !isset( $htmlnest [$t ] ) ) {
+                                                       $badtag = true;
                                                # Is it a self closed htmlpair ? (bug 5487)
-                                               } else if( $brace == '/>' &&
+                                               } elseif ( $brace == '/>' &&
                                                isset( $htmlpairs[$t] ) ) {
-                                                       $badtag = 1;
-                                               } elseif( isset( $htmlsingleonly[$t] ) ) {
+                                                       $badtag = true;
+                                               } elseif ( isset( $htmlsingleonly[$t] ) ) {
                                                        # Hack to force empty tag for uncloseable elements
                                                        $brace = '/>';
-                                               } else if( isset( $htmlsingle[$t] ) ) {
+                                               } elseif ( isset( $htmlsingle[$t] ) ) {
                                                        # Hack to not close $htmlsingle tags
-                                                       $brace = NULL;
-                                               } else if( isset( $tabletags[$t] )
-                                               &&  in_array($t ,$tagstack) ) {
+                                                       $brace = null;
+                                               } elseif ( isset( $tabletags[$t] )
+                                               && in_array( $t, $tagstack ) ) {
                                                        // New table tag but forgot to close the previous one
                                                        $text .= "</$t>";
                                                } else {
@@ -487,7 +503,7 @@ class Sanitizer {
                                                # Strip non-approved attributes from the tag
                                                $newparams = Sanitizer::fixTagAttributes( $params, $t );
                                        }
-                                       if ( ! $badtag ) {
+                                       if ( !$badtag ) {
                                                $rest = str_replace( '>', '&gt;', $rest );
                                                $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
                                                $text .= "<$slash$t$newparams$close>$rest";
@@ -604,14 +620,26 @@ class Sanitizer {
         * @todo Check for unique id attribute :P
         */
        static function validateAttributes( $attribs, $whitelist ) {
+               global $wgAllowRdfaAttributes, $wgAllowMicrodataAttributes;
+
                $whitelist = array_flip( $whitelist );
                $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/';
 
                $out = array();
                foreach( $attribs as $attribute => $value ) {
+                       #allow XML namespace declaration if RDFa is enabled
+                       if ( $wgAllowRdfaAttributes && preg_match( MW_XMLNS_ATTRIBUTE_PATTRN, $attribute ) ) {
+                               if ( !preg_match( MW_EVIL_URI_PATTERN, $value ) ) {
+                                       $out[$attribute] = $value;
+                               }
+
+                               continue;
+                       }
+
                        if( !isset( $whitelist[$attribute] ) ) {
                                continue;
                        }
+
                        # Strip javascript "expression" from stylesheets.
                        # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
                        if( $attribute == 'style' ) {
@@ -623,11 +651,24 @@ class Sanitizer {
                        }
 
                        if ( $attribute === 'id' ) {
-                               global $wgEnforceHtmlIds;
-                               $value = Sanitizer::escapeId( $value,
-                                       $wgEnforceHtmlIds ? 'noninitial' : 'xml' );
+                               $value = Sanitizer::escapeId( $value, 'noninitial' );
+                       }
+
+                       //RDFa and microdata properties allow URLs, URIs and/or CURIEs. Check them for sanity
+                       if ( $attribute === 'rel' || $attribute === 'rev' || 
+                               $attribute === 'about' || $attribute === 'property' || $attribute === 'resource' || #RDFa
+                               $attribute === 'datatype' || $attribute === 'typeof' ||                             #RDFa
+                               $attribute === 'itemid' || $attribute === 'itemprop' || $attribute === 'itemref' || #HTML5 microdata
+                               $attribute === 'itemscope' || $attribute === 'itemtype' ) {                         #HTML5 microdata
+
+                               //Paranoia. Allow "simple" values but suppress javascript
+                               if ( preg_match( MW_EVIL_URI_PATTERN, $value ) ) {
+                                       continue; 
+                               }
                        }
 
+                       # NOTE: even though elements using href/src are not allowed directly, supply
+                       #       validation code that can be used by tag hook handlers, etc
                        if ( $attribute === 'href' || $attribute === 'src' ) {
                                if ( !preg_match( $hrefExp, $value ) ) {
                                        continue; //drop any href or src attributes not using an allowed protocol.
@@ -635,20 +676,33 @@ class Sanitizer {
                                }
                        }
 
-                       //RDFa properties allow URIs. check them
-                       if ( $attribute === 'rel' || $attribute === 'rev' || 
-                               $attribute === 'about' || $attribute === 'property' || $attribute === 'resource' ||
-                               $attribute === 'datatype' || $attribute === 'typeof' ) {  
-                               //Paranoia. Allow "simple" values but suppress javascript
-                               if ( preg_match( '/(^|\s)javascript\s*:/i', $value ) ) {
-                                       continue; 
-                               }
-                       }
-
                        // If this attribute was previously set, override it.
                        // Output should only have one attribute of each name.
                        $out[$attribute] = $value;
                }
+
+               if ( $wgAllowMicrodataAttributes ) {
+                       # There are some complicated validity constraints we need to
+                       # enforce here.  First of all, we don't want to allow non-standard
+                       # itemtypes.
+                       $allowedTypes = array(
+                               'http://microformats.org/profile/hcard',
+                               'http://microformats.org/profile/hcalendar#vevent',
+                               'http://n.whatwg.org/work',
+                       );
+                       if ( isset( $out['itemtype'] ) && !in_array( $out['itemtype'],
+                       $allowedTypes ) ) {
+                               # Kill everything
+                               unset( $out['itemscope'] );
+                       }
+                       # itemtype, itemid, itemref don't make sense without itemscope
+                       if ( !array_key_exists( 'itemscope', $out ) ) {
+                               unset( $out['itemtype'] );
+                               unset( $out['itemid'] );
+                               unset( $out['itemref'] );
+                       }
+                       # TODO: Strip itemprop if we aren't descendants of an itemscope.
+               }
                return $out;
        }
 
@@ -795,63 +849,64 @@ class Sanitizer {
        }
 
        /**
-        * Given a value escape it so that it can be used in an id attribute and
-        * return it, this does not validate the value however (see first link)
+        * Given a value, escape it so that it can be used in an id attribute and
+        * return it.  This will use HTML5 validation if both $wgHtml5 and
+        * $wgExperimentalHtmlIds are true, allowing anything but ASCII
+        * whitespace.  Otherwise it will use HTML 4 rules, which means a narrow
+        * subset of ASCII, with bad characters escaped with lots of dots.
+        *
+        * To ensure we don't have to bother escaping anything, we also strip ', ",
+        * & even if $wgExperimentalHtmlIds is true.  TODO: Is this the best tactic?
+        * We also strip # because it upsets IE6.
         *
         * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
         *                                                          in the id and
         *                                                          name attributes
         * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
+        * @see http://www.whatwg.org/specs/web-apps/current-work/multipage/elements.html#the-id-attribute
+        *   HTML5 definition of id attribute
         *
-        * @param $id String: id to validate
+        * @param $id String: id to escape
         * @param $options Mixed: string or array of strings (default is array()):
         *   'noninitial': This is a non-initial fragment of an id, not a full id,
         *       so don't pay attention if the first character isn't valid at the
-        *       beginning of an id.
-        *   'xml': Don't restrict the id to be HTML4-compatible.  This option
-        *       allows any alphabetic character to be used, per the XML standard.
-        *       Therefore, it also completely changes the type of escaping: instead
-        *       of weird dot-encoding, runs of invalid characters (mostly
-        *       whitespace) are just compressed into a single underscore.
+        *       beginning of an id.  Only matters if $wgExperimentalHtmlIds is
+        *       false.
+        *   'legacy': Behave the way the old HTML 4-based ID escaping worked even
+        *       if $wgExperimentalHtmlIds is used, so we can generate extra
+        *       anchors and links won't break.
         * @return String
         */
        static function escapeId( $id, $options = array() ) {
+               global $wgHtml5, $wgExperimentalHtmlIds;
                $options = (array)$options;
 
-               if ( !in_array( 'xml', $options ) ) {
-                       # HTML4-style escaping
-                       static $replace = array(
-                               '%3A' => ':',
-                               '%' => '.'
-                       );
-
-                       $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
-                       $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
-
-                       if ( !preg_match( '/^[a-zA-Z]/', $id )
-                       && !in_array( 'noninitial', $options ) )  {
-                               // Initial character must be a letter!
-                               $id = "x$id";
+               if ( $wgHtml5 && $wgExperimentalHtmlIds && !in_array( 'legacy', $options ) ) {
+                       $id = Sanitizer::decodeCharReferences( $id );
+                       $id = preg_replace( '/[ \t\n\r\f_\'"&#]+/', '_', $id );
+                       $id = trim( $id, '_' );
+                       if ( $id === '' ) {
+                               # Must have been all whitespace to start with.
+                               return '_';
+                       } else {
+                               return $id;
                        }
-                       return $id;
                }
 
-               # XML-style escaping.  For the patterns used, see the XML 1.0 standard,
-               # 5th edition, NameStartChar and NameChar: <http://www.w3.org/TR/REC-xml/>
-               $nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}'
-                       . '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}'
-                       . '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}';
-               $nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}'
-                       . '\x{203F}-\x{2040}';
-               # Replace _ as well so we don't get multiple consecutive underscores
-               $id = preg_replace( "/([^$nameChar]|_)+/u", '_', $id );
-               $id = trim( $id, '_' );
-
-               if ( !preg_match( "/^[$nameStartChar]/u", $id )
-               && !in_array( 'noninitial', $options ) ) {
-                       $id = "_$id";
-               }
+               # HTML4-style escaping
+               static $replace = array(
+                       '%3A' => ':',
+                       '%' => '.'
+               );
 
+               $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
+               $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
+
+               if ( !preg_match( '/^[a-zA-Z]/', $id )
+               && !in_array( 'noninitial', $options ) )  {
+                       // Initial character must be a letter!
+                       $id = "x$id";
+               }
                return $id;
        }
 
@@ -908,12 +963,11 @@ class Sanitizer {
         * @return Array
         */
        public static function decodeTagAttributes( $text ) {
-               $attribs = array();
-
                if( trim( $text ) == '' ) {
-                       return $attribs;
+                       return array();
                }
 
+               $attribs = array();
                $pairs = array();
                if( !preg_match_all(
                        MW_ATTRIBS_REGEX,
@@ -1173,10 +1227,23 @@ class Sanitizer {
         * @return Array
         */
        static function setupAttributeWhitelist() {
-               $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style',
-                                #RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
-                                'about', 'property', 'resource', 'datatype', 'typeof', 
-                               );
+               global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;
+
+               $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style', 'xml:lang' );
+
+               if ( $wgAllowRdfaAttributes ) {
+                       #RDFa attributes as specified in section 9 of http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
+                       $common = array_merge( $common, array(
+                           'about', 'property', 'resource', 'datatype', 'typeof', 
+                       ) );
+               }
+
+               if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
+                       # add HTML5 microdata attributes as specified by http://www.whatwg.org/specs/web-apps/current-work/multipage/microdata.html#the-microdata-model
+                       $common = array_merge( $common, array(
+                           'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
+                       ) );
+               }
 
                $block = array_merge( $common, array( 'align' ) );
                $tablealign = array( 'align', 'char', 'charoff', 'valign' );
@@ -1283,7 +1350,7 @@ class Sanitizer {
                        'td'         => array_merge( $common, $tablecell, $tablealign ),
                        'th'         => array_merge( $common, $tablecell, $tablealign ),
 
-                       # 12.2
+                       # 12.2 # NOTE: <a> is not allowed directly, but the attrib whitelist is used from the Parser object
                        'a'          => array_merge( $common, array( 'href', 'rel', 'rev' ) ), # rel/rev esp. for RDFa 
 
                        # 13.2
@@ -1399,7 +1466,7 @@ class Sanitizer {
 
                        $host = preg_replace( $strip, '', $host );
 
-                       // @fixme: validate hostnames here
+                       // @todo Fixme: validate hostnames here
 
                        return $protocol . $host . $rest;
                } else {