Merge "Show no size links on image pages with errors"
[lhc/web/wiklou.git] / includes / Sanitizer.php
index 5aa0545..cf6c106 100644 (file)
@@ -364,30 +364,40 @@ class Sanitizer {
         * @return string
         */
        static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
-               global $wgUseTidy;
+               global $wgUseTidy, $wgHtml5, $wgAllowMicrodataAttributes, $wgAllowImageTag;
 
                static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
                        $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
 
                wfProfileIn( __METHOD__ );
 
-               if ( !$staticInitialised ) {
+               // Base our staticInitialised variable off of the global config state so that if the globals
+               // are changed (like in the screwed up test system) we will re-initialise the settings.
+               $globalContext = implode( '-', compact( 'wgHtml5', 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
+               if ( !$staticInitialised || $staticInitialised != $globalContext ) {
 
                        $htmlpairsStatic = array( # Tags that must be closed
                                'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
                                'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
                                'strike', 'strong', 'tt', 'var', 'div', 'center',
                                'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
-                               'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'abbr', 'dfn',
+                               'ruby', 'rt', 'rb', 'rp', 'p', 'span', 'abbr', 'dfn',
                                'kbd', 'samp'
                        );
+                       if ( $wgHtml5 ) {
+                               $htmlpairsStatic = array_merge( $htmlpairsStatic, array( 'data', 'time', 'mark' ) );
+                       }
                        $htmlsingle = array(
                                'br', 'hr', 'li', 'dt', 'dd'
                        );
                        $htmlsingleonly = array( # Elements that cannot have close tags
                                'br', 'hr'
                        );
-                       $htmlnest = array( # Tags that can be nested directly or indirectly
+                       if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
+                               $htmlsingle[] = $htmlsingleonly[] = 'meta';
+                               $htmlsingle[] = $htmlsingleonly[] = 'link';
+                       }
+                       $htmlnest = array( # Tags that can be nested--??
                                'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
                                'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span'
                        );
@@ -401,7 +411,6 @@ class Sanitizer {
                                'li',
                        );
 
-                       global $wgAllowImageTag;
                        if ( $wgAllowImageTag ) {
                                $htmlsingle[] = 'img';
                                $htmlsingleonly[] = 'img';
@@ -416,13 +425,13 @@ class Sanitizer {
                        foreach ( $vars as $var ) {
                                $$var = array_flip( $$var );
                        }
-                       $staticInitialised = true;
+                       $staticInitialised = $globalContext;
                }
                # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
                $extratags = array_flip( $extratags );
                $removetags = array_flip( $removetags );
                $htmlpairs = array_merge( $extratags, $htmlpairsStatic );
-               $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ) , $removetags );
+               $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );
 
                # Remove HTML comments
                $text = Sanitizer::removeHTMLcomments( $text );
@@ -532,6 +541,10 @@ class Sanitizer {
                                                        call_user_func_array( $processCallback, array( &$params, $args ) );
                                                }
 
+                                               if ( !Sanitizer::validateTag( $params, $t ) ) {
+                                                       $badtag = true;
+                                               }
+
                                                # Strip non-approved attributes from the tag
                                                $newparams = Sanitizer::fixTagAttributes( $params, $t );
                                        }
@@ -555,16 +568,24 @@ class Sanitizer {
                                preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
                                $x, $regs );
                                @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
+                               $badtag = false;
                                if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
                                        if( is_callable( $processCallback ) ) {
                                                call_user_func_array( $processCallback, array( &$params, $args ) );
                                        }
+
+                                       if ( !Sanitizer::validateTag( $params, $t ) ) {
+                                               $badtag = true;
+                                       }
+
                                        $newparams = Sanitizer::fixTagAttributes( $params, $t );
-                                       $rest = str_replace( '>', '&gt;', $rest );
-                                       $text .= "<$slash$t$newparams$brace$rest";
-                               } else {
-                                       $text .= '&lt;' . str_replace( '>', '&gt;', $x);
+                                       if ( !$badtag ) {
+                                               $rest = str_replace( '>', '&gt;', $rest );
+                                               $text .= "<$slash$t$newparams$brace$rest";
+                                               continue;
+                                       }
                                }
+                               $text .= '&lt;' . str_replace( '>', '&gt;', $x);
                        }
                }
                wfProfileOut( __METHOD__ );
@@ -583,9 +604,9 @@ class Sanitizer {
         */
        static function removeHTMLcomments( $text ) {
                wfProfileIn( __METHOD__ );
-               while (($start = strpos($text, '<!--')) !== false) {
-                       $end = strpos($text, '-->', $start + 4);
-                       if ($end === false) {
+               while ( ($start = strpos( $text, '<!--' ) ) !== false ) {
+                       $end = strpos( $text, '-->', $start + 4 );
+                       if ( $end === false ) {
                                # Unterminated comment; bail out
                                break;
                        }
@@ -594,22 +615,22 @@ class Sanitizer {
 
                        # Trim space and newline if the comment is both
                        # preceded and followed by a newline
-                       $spaceStart = max($start - 1, 0);
+                       $spaceStart = max( $start - 1, 0 );
                        $spaceLen = $end - $spaceStart;
-                       while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
+                       while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) {
                                $spaceStart--;
                                $spaceLen++;
                        }
-                       while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
+                       while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' )
                                $spaceLen++;
-                       if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
+                       if ( substr( $text, $spaceStart, 1 ) === "\n" and substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
                                # Remove the comment, leading and trailing
                                # spaces, and leave only one newline.
-                               $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
+                               $text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
                        }
                        else {
                                # Remove just the comment.
-                               $text = substr_replace($text, '', $start, $end - $start);
+                               $text = substr_replace( $text, '', $start, $end - $start );
                        }
                }
                wfProfileOut( __METHOD__ );
@@ -617,111 +638,35 @@ class Sanitizer {
        }
 
        /**
-        * Take an array of attribute names and values and fix some deprecated values
-        * for the given element type.
-        * This does not validate properties, so you should ensure that you call
-        * validateTagAttributes AFTER this to ensure that the resulting style rule
-        * this may add is safe.
-        *
-        * - Converts most presentational attributes like align into inline css
+        * Takes attribute names and values for a tag and the tag name and
+        * validates that the tag is allowed to be present.
+        * This DOES NOT validate the attributes, nor does it validate the
+        * tags themselves. This method only handles the special circumstances
+        * where we may want to allow a tag within content but ONLY when it has
+        * specific attributes set.
         *
-        * @param $attribs Array
-        * @param $element String
-        * @return Array
+        * @param $params String: raw attribute text of the tag
+        * @param $element String: lowercased tag name
         */
-       static function fixDeprecatedAttributes( $attribs, $element ) {
-               global $wgHtml5, $wgCleanupPresentationalAttributes;
-
-               // presentational attributes were removed from html5, we can leave them
-               // in when html5 is turned off
-               if ( !$wgHtml5 || !$wgCleanupPresentationalAttributes ) {
-                       return $attribs;
-               }
-
-               $table = array( 'table' );
-               $cells = array( 'td', 'th' );
-               $colls = array( 'col', 'colgroup' );
-               $tblocks = array( 'tbody', 'tfoot', 'thead' );
-               $h = array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' );
-
-               $presentationalAttribs = array(
-                       'align' => array( 'text-align', array_merge( array( 'caption', 'hr', 'div', 'p', 'tr' ), $table, $cells, $colls, $tblocks, $h ) ),
-                       'clear' => array( 'clear', array( 'br' ) ),
-                       'height' => array( 'height', $cells ),
-                       'nowrap' => array( 'white-space', $cells ),
-                       'size' => array( 'height', array( 'hr' ) ),
-                       'type' => array( 'list-style-type', array( 'li', 'ol', 'ul' ) ),
-                       'valign' => array( 'vertical-align', array_merge( $cells, $colls, $tblocks ) ),
-                       'width' => array( 'width', array_merge( array( 'hr', 'pre' ), $table, $cells, $colls ) ),
-               );
-
-               // Ensure that any upper case or mixed case attributes are converted to lowercase
-               foreach ( $attribs as $attribute => $value ) {
-                       if ( $attribute !== strtolower( $attribute ) && array_key_exists( strtolower( $attribute ), $presentationalAttribs ) ) {
-                               $attribs[strtolower( $attribute )] = $value;
-                               unset( $attribs[$attribute] );
-                       }
-               }
-
-               $style = "";
-               foreach ( $presentationalAttribs as $attribute => $info ) {
-                       list( $property, $elements ) = $info;
+       static function validateTag( $params, $element ) {
+               $params = Sanitizer::decodeTagAttributes( $params );
 
-                       // Skip if this attribute is not relevant to this element
-                       if ( !in_array( $element, $elements ) ) {
-                               continue;
-                       }
-
-                       // Skip if the attribute is not used
-                       if ( !array_key_exists( $attribute, $attribs ) ) {
-                               continue;
+               if ( $element == 'meta' || $element == 'link' ) {
+                       if ( !isset( $params['itemprop'] ) ) {
+                               // <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
+                               return false;
                        }
-
-                       $value = $attribs[$attribute];
-
-                       // For nowrap the value should be nowrap instead of whatever text is in the value
-                       if ( $attribute === 'nowrap' ) {
-                               $value = 'nowrap';
+                       if ( $element == 'meta' && !isset( $params['content'] ) ) {
+                               // <meta> must have a content="" for the itemprop
+                               return false;
                        }
-
-                       // clear="all" is clear: both; in css
-                       if ( $attribute === 'clear' && strtolower( $value ) === 'all' ) {
-                               $value = 'both';
+                       if ( $element == 'link' && !isset( $params['href'] ) ) {
+                               // <link> must have an associated href=""
+                               return false;
                        }
-
-                       // Size based properties should have px applied to them if they have no unit
-                       if ( in_array( $attribute, array( 'height', 'width', 'size' ) ) ) {
-                               if ( preg_match( '/^[\d.]+$/', $value ) ) {
-                                       $value = "{$value}px";
-                               }
-                       }
-
-                       // Table align is special, it's about block alignment instead of
-                       // content align (see also bug 40306)
-                       if ( $attribute === 'align' && in_array( $element, $table ) ) {
-                               if ( $value === 'center' ) {
-                                       $style .= ' margin-left: auto;';
-                                       $property = 'margin-right';
-                                       $value = 'auto';
-                               } else {
-                                       $property = 'float';
-                               }
-                       }
-
-                       $style .= " $property: $value;";
-
-                       unset( $attribs[$attribute] );
-               }
-
-               if ( $style ) {
-                       // Prepend our style rules so that they can be overridden by user css
-                       if ( isset($attribs['style']) ) {
-                               $style .= " " . $attribs['style'];
-                       }
-                       $attribs['style'] = trim($style);
                }
 
-               return $attribs;
+               return true;
        }
 
        /**
@@ -825,7 +770,7 @@ class Sanitizer {
                                unset( $out['itemid'] );
                                unset( $out['itemref'] );
                        }
-                       # TODO: Strip itemprop if we aren't descendants of an itemscope.
+                       # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
                }
                return $out;
        }
@@ -972,7 +917,6 @@ class Sanitizer {
                }
 
                $decoded = Sanitizer::decodeTagAttributes( $text );
-               $decoded = Sanitizer::fixDeprecatedAttributes( $decoded, $element );
                $stripped = Sanitizer::validateTagAttributes( $decoded, $element );
 
                $attribs = array();
@@ -1095,7 +1039,7 @@ class Sanitizer {
                $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
 
                if ( !preg_match( '/^[a-zA-Z]/', $id )
-               && !in_array( 'noninitial', $options ) )  {
+               && !in_array( 'noninitial', $options ) ) {
                        // Initial character must be a letter!
                        $id = "x$id";
                }
@@ -1451,10 +1395,7 @@ class Sanitizer {
         * @return Array
         */
        static function attributeWhitelist( $element ) {
-               static $list;
-               if( !isset( $list ) ) {
-                       $list = Sanitizer::setupAttributeWhitelist();
-               }
+               $list = Sanitizer::setupAttributeWhitelist();
                return isset( $list[$element] )
                        ? $list[$element]
                        : array();
@@ -1468,6 +1409,13 @@ class Sanitizer {
        static function setupAttributeWhitelist() {
                global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;
 
+               static $whitelist, $staticInitialised;
+               $globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgHtml5', 'wgAllowMicrodataAttributes' ) );
+
+               if ( isset( $whitelist ) && $staticInitialised == $globalContext ) {
+                       return $whitelist;
+               }
+
                $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 
                if ( $wgAllowRdfaAttributes ) {
@@ -1500,7 +1448,7 @@ class Sanitizer {
 
                # Numbers refer to sections in HTML 4.01 standard describing the element.
                # See: http://www.w3.org/TR/html4/
-               $whitelist = array (
+               $whitelist = array(
                        # 7.5.4
                        'div'        => $block,
                        'center'     => $common, # deprecated
@@ -1632,7 +1580,28 @@ class Sanitizer {
                        # HTML 5 section 4.6
                        'bdi' => $common,
 
+               );
+
+               if ( $wgHtml5 ) {
+                       # HTML5 elements, defined by:
+                       # http://www.whatwg.org/specs/web-apps/current-work/multipage/
+                       $whitelist += array(
+                               'data' => array_merge( $common, array( 'value' ) ),
+                               'time' => array_merge( $common, array( 'datetime' ) ),
+                               'mark' => $common,
+
+                               // meta and link are only permitted by removeHTMLtags when Microdata
+                               // is enabled so we don't bother adding a conditional to hide these
+                               // Also meta and link are only valid in WikiText as Microdata elements
+                               // (ie: validateTag rejects tags missing the attributes needed for Microdata)
+                               // So we don't bother including $common attributes that have no purpose.
+                               'meta' => array( 'itemprop', 'content' ),
+                               'link' => array( 'itemprop', 'href' ),
                        );
+               }
+
+               $staticInitialised = $globalContext;
+
                return $whitelist;
        }