Merge "Less wild whitespace"
[lhc/web/wiklou.git] / includes / Sanitizer.php
index 5aa0545..0034afe 100644 (file)
@@ -364,14 +364,17 @@ class Sanitizer {
         * @return string
         */
        static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
-               global $wgUseTidy;
+               global $wgUseTidy, $wgHtml5, $wgAllowMicrodataAttributes, $wgAllowImageTag;
 
                static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
                        $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
 
                wfProfileIn( __METHOD__ );
 
-               if ( !$staticInitialised ) {
+               // Base our staticInitialised variable off of the global config state so that if the globals
+               // are changed (like in the secrewed up test system) we will re-initialise the settings.
+               $globalContext = implode( '-', compact( 'wgHtml5', 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
+               if ( !$staticInitialised || $staticInitialised != $globalContext ) {
 
                        $htmlpairsStatic = array( # Tags that must be closed
                                'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
@@ -381,13 +384,20 @@ class Sanitizer {
                                'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'abbr', 'dfn',
                                'kbd', 'samp'
                        );
+                       if ( $wgHtml5 ) {
+                               $htmlpairsStatic = array_merge( $htmlpairsStatic, array( 'data', 'time', 'mark' ) );
+                       }
                        $htmlsingle = array(
                                'br', 'hr', 'li', 'dt', 'dd'
                        );
                        $htmlsingleonly = array( # Elements that cannot have close tags
                                'br', 'hr'
                        );
-                       $htmlnest = array( # Tags that can be nested directly or indirectly
+                       if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
+                               $htmlsingle[] = $htmlsingleonly[] = 'meta';
+                               $htmlsingle[] = $htmlsingleonly[] = 'link';
+                       }
+                       $htmlnest = array( # Tags that can be nested--??
                                'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
                                'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span'
                        );
@@ -401,7 +411,6 @@ class Sanitizer {
                                'li',
                        );
 
-                       global $wgAllowImageTag;
                        if ( $wgAllowImageTag ) {
                                $htmlsingle[] = 'img';
                                $htmlsingleonly[] = 'img';
@@ -416,7 +425,7 @@ class Sanitizer {
                        foreach ( $vars as $var ) {
                                $$var = array_flip( $$var );
                        }
-                       $staticInitialised = true;
+                       $staticInitialised = $globalContext;
                }
                # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
                $extratags = array_flip( $extratags );
@@ -532,6 +541,10 @@ class Sanitizer {
                                                        call_user_func_array( $processCallback, array( &$params, $args ) );
                                                }
 
+                                               if ( !Sanitizer::validateTag( $params, $t ) ) {
+                                                       $badtag = true;
+                                               }
+
                                                # Strip non-approved attributes from the tag
                                                $newparams = Sanitizer::fixTagAttributes( $params, $t );
                                        }
@@ -555,16 +568,24 @@ class Sanitizer {
                                preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
                                $x, $regs );
                                @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
+                               $badtag = false;
                                if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
                                        if( is_callable( $processCallback ) ) {
                                                call_user_func_array( $processCallback, array( &$params, $args ) );
                                        }
+
+                                       if ( !Sanitizer::validateTag( $params, $t ) ) {
+                                               $badtag = true;
+                                       }
+
                                        $newparams = Sanitizer::fixTagAttributes( $params, $t );
-                                       $rest = str_replace( '>', '&gt;', $rest );
-                                       $text .= "<$slash$t$newparams$brace$rest";
-                               } else {
-                                       $text .= '&lt;' . str_replace( '>', '&gt;', $x);
+                                       if ( !$badtag ) {
+                                               $rest = str_replace( '>', '&gt;', $rest );
+                                               $text .= "<$slash$t$newparams$brace$rest";
+                                               continue;
+                                       }
                                }
+                               $text .= '&lt;' . str_replace( '>', '&gt;', $x);
                        }
                }
                wfProfileOut( __METHOD__ );
@@ -617,111 +638,35 @@ class Sanitizer {
        }
 
        /**
-        * Take an array of attribute names and values and fix some deprecated values
-        * for the given element type.
-        * This does not validate properties, so you should ensure that you call
-        * validateTagAttributes AFTER this to ensure that the resulting style rule
-        * this may add is safe.
-        *
-        * - Converts most presentational attributes like align into inline css
+        * Takes attribute names and values for a tag and the tag name and
+        * validates that the tag is allowed to be present.
+        * This DOES NOT validate the attributes, nor does it validate the
+        * tags themselves. This method only handles the special circumstances
+        * where we may want to allow a tag within content but ONLY when it has
+        * specific attributes set.
         *
-        * @param $attribs Array
-        * @param $element String
-        * @return Array
+        * @param $params
+        * @param $element
         */
-       static function fixDeprecatedAttributes( $attribs, $element ) {
-               global $wgHtml5, $wgCleanupPresentationalAttributes;
-
-               // presentational attributes were removed from html5, we can leave them
-               // in when html5 is turned off
-               if ( !$wgHtml5 || !$wgCleanupPresentationalAttributes ) {
-                       return $attribs;
-               }
-
-               $table = array( 'table' );
-               $cells = array( 'td', 'th' );
-               $colls = array( 'col', 'colgroup' );
-               $tblocks = array( 'tbody', 'tfoot', 'thead' );
-               $h = array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' );
-
-               $presentationalAttribs = array(
-                       'align' => array( 'text-align', array_merge( array( 'caption', 'hr', 'div', 'p', 'tr' ), $table, $cells, $colls, $tblocks, $h ) ),
-                       'clear' => array( 'clear', array( 'br' ) ),
-                       'height' => array( 'height', $cells ),
-                       'nowrap' => array( 'white-space', $cells ),
-                       'size' => array( 'height', array( 'hr' ) ),
-                       'type' => array( 'list-style-type', array( 'li', 'ol', 'ul' ) ),
-                       'valign' => array( 'vertical-align', array_merge( $cells, $colls, $tblocks ) ),
-                       'width' => array( 'width', array_merge( array( 'hr', 'pre' ), $table, $cells, $colls ) ),
-               );
-
-               // Ensure that any upper case or mixed case attributes are converted to lowercase
-               foreach ( $attribs as $attribute => $value ) {
-                       if ( $attribute !== strtolower( $attribute ) && array_key_exists( strtolower( $attribute ), $presentationalAttribs ) ) {
-                               $attribs[strtolower( $attribute )] = $value;
-                               unset( $attribs[$attribute] );
-                       }
-               }
-
-               $style = "";
-               foreach ( $presentationalAttribs as $attribute => $info ) {
-                       list( $property, $elements ) = $info;
+       static function validateTag( $params, $element ) {
+               $params = Sanitizer::decodeTagAttributes( $params );
 
-                       // Skip if this attribute is not relevant to this element
-                       if ( !in_array( $element, $elements ) ) {
-                               continue;
-                       }
-
-                       // Skip if the attribute is not used
-                       if ( !array_key_exists( $attribute, $attribs ) ) {
-                               continue;
+               if ( $element == 'meta' || $element == 'link' ) {
+                       if ( !isset( $params['itemprop'] ) ) {
+                               // <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
+                               return false;
                        }
-
-                       $value = $attribs[$attribute];
-
-                       // For nowrap the value should be nowrap instead of whatever text is in the value
-                       if ( $attribute === 'nowrap' ) {
-                               $value = 'nowrap';
+                       if ( $element == 'meta' && !isset( $params['content'] ) ) {
+                               // <meta> must have a content="" for the itemprop
+                               return false;
                        }
-
-                       // clear="all" is clear: both; in css
-                       if ( $attribute === 'clear' && strtolower( $value ) === 'all' ) {
-                               $value = 'both';
+                       if ( $element == 'link' && !isset( $params['href'] ) ) {
+                               // <link> must have an associated href=""
+                               return false;
                        }
-
-                       // Size based properties should have px applied to them if they have no unit
-                       if ( in_array( $attribute, array( 'height', 'width', 'size' ) ) ) {
-                               if ( preg_match( '/^[\d.]+$/', $value ) ) {
-                                       $value = "{$value}px";
-                               }
-                       }
-
-                       // Table align is special, it's about block alignment instead of
-                       // content align (see also bug 40306)
-                       if ( $attribute === 'align' && in_array( $element, $table ) ) {
-                               if ( $value === 'center' ) {
-                                       $style .= ' margin-left: auto;';
-                                       $property = 'margin-right';
-                                       $value = 'auto';
-                               } else {
-                                       $property = 'float';
-                               }
-                       }
-
-                       $style .= " $property: $value;";
-
-                       unset( $attribs[$attribute] );
-               }
-
-               if ( $style ) {
-                       // Prepend our style rules so that they can be overridden by user css
-                       if ( isset($attribs['style']) ) {
-                               $style .= " " . $attribs['style'];
-                       }
-                       $attribs['style'] = trim($style);
                }
 
-               return $attribs;
+               return true;
        }
 
        /**
@@ -825,7 +770,7 @@ class Sanitizer {
                                unset( $out['itemid'] );
                                unset( $out['itemref'] );
                        }
-                       # TODO: Strip itemprop if we aren't descendants of an itemscope.
+                       # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
                }
                return $out;
        }
@@ -972,7 +917,6 @@ class Sanitizer {
                }
 
                $decoded = Sanitizer::decodeTagAttributes( $text );
-               $decoded = Sanitizer::fixDeprecatedAttributes( $decoded, $element );
                $stripped = Sanitizer::validateTagAttributes( $decoded, $element );
 
                $attribs = array();
@@ -1451,10 +1395,7 @@ class Sanitizer {
         * @return Array
         */
        static function attributeWhitelist( $element ) {
-               static $list;
-               if( !isset( $list ) ) {
-                       $list = Sanitizer::setupAttributeWhitelist();
-               }
+               $list = Sanitizer::setupAttributeWhitelist();
                return isset( $list[$element] )
                        ? $list[$element]
                        : array();
@@ -1468,6 +1409,13 @@ class Sanitizer {
        static function setupAttributeWhitelist() {
                global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;
 
+               static $whitelist, $staticInitialised;
+               $globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgHtml5', 'wgAllowMicrodataAttributes' ) );
+
+               if ( isset( $whitelist ) && $staticInitialised == $globalContext ) {
+                       return $whitelist;
+               }
+
                $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 
                if ( $wgAllowRdfaAttributes ) {
@@ -1500,7 +1448,7 @@ class Sanitizer {
 
                # Numbers refer to sections in HTML 4.01 standard describing the element.
                # See: http://www.w3.org/TR/html4/
-               $whitelist = array (
+               $whitelist = array(
                        # 7.5.4
                        'div'        => $block,
                        'center'     => $common, # deprecated
@@ -1632,7 +1580,28 @@ class Sanitizer {
                        # HTML 5 section 4.6
                        'bdi' => $common,
 
+               );
+
+               if ( $wgHtml5 ) {
+                       # HTML5 elements, defined by:
+                       # http://www.whatwg.org/specs/web-apps/current-work/multipage/
+                       $whitelist += array(
+                               'data' => array_merge( $common, array( 'value' ) ),
+                               'time' => array_merge( $common, array( 'datetime' ) ),
+                               'mark' => $common,
+
+                               // meta and link are only permitted by removeHTMLtags when Microdata
+                               // is enabled so we don't bother adding a conditional to hide these
+                               // Also meta and link are only valid in WikiText as Microdata elements
+                               // (ie: validateTag rejects tags missing the attributes needed for Microdata)
+                               // So we don't bother including $common attributes that have no purpose.
+                               'meta' => array( 'itemprop', 'content' ),
+                               'link' => array( 'itemprop', 'href' ),
                        );
+               }
+
+               $staticInitialised = $globalContext;
+
                return $whitelist;
        }