support for HTML5/Microdata elements
authorDaniel Friesen <pub-github@nadir-seen-fire.com>
Wed, 4 Apr 2012 06:54:39 +0000 (23:54 -0700)
committerAntoine Musso <hashar@free.fr>
Mon, 19 Nov 2012 08:00:50 +0000 (09:00 +0100)
Patch let us handle the <data>, <time>, <meta>, and <link> elements.

* handles one part of bug 32545 requesting us to support the <time>
  element in WikiText.
* Partially fix bug 28776 about whitelisting global HTML5 semantic
  attributes and inline meta element.
* <meta> and <link> are only permitted when Microdata is enabled using
* the global $wgAllowMicrodataAttributes. For for security reason, the
  links are only allowed to be actual elements when they have a
  strict set of attributes set.

Change-Id: Ica11be186bd62eb154e1ebc400acb515c10fb65f

RELEASE-NOTES-1.21
includes/Sanitizer.php
includes/parser/Tidy.php
includes/tidy.conf
tests/parser/parserTests.txt

index eda9f23..66b3927 100644 (file)
@@ -37,6 +37,8 @@ production.
 * (bug 40876) Icon for PSD (Adobe Photoshop) file types.
 * (bug 40641) Implemented Special:Version/Credits with a list of contributors.
 * (bug 7851) Implemented one-click AJAX patrolling.
+* The <data>, <time>, <meta>, and <link> elements are allowed within WikiText
+  for use with Microdata.
 
 === Bug fixes in 1.21 ===
 * (bug 40353) SpecialDoubleRedirect should support interwiki redirects.
index 5aa0545..c529417 100644 (file)
@@ -364,14 +364,17 @@ class Sanitizer {
         * @return string
         */
        static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) {
-               global $wgUseTidy;
+               global $wgUseTidy, $wgHtml5, $wgAllowMicrodataAttributes, $wgAllowImageTag;
 
                static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
                        $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;
 
                wfProfileIn( __METHOD__ );
 
-               if ( !$staticInitialised ) {
+               // Base our staticInitialised variable off of the global config state so that if the globals
+               // are changed (like in the secrewed up test system) we will re-initialise the settings.
+               $globalContext = implode( '-', compact( 'wgHtml5', 'wgAllowMicrodataAttributes', 'wgAllowImageTag' ) );
+               if ( !$staticInitialised || $staticInitialised != $globalContext ) {
 
                        $htmlpairsStatic = array( # Tags that must be closed
                                'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
@@ -381,13 +384,20 @@ class Sanitizer {
                                'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'abbr', 'dfn',
                                'kbd', 'samp'
                        );
+                       if ( $wgHtml5 ) {
+                               $htmlpairsStatic = array_merge( $htmlpairsStatic, array( 'data', 'time' ) );
+                       }
                        $htmlsingle = array(
                                'br', 'hr', 'li', 'dt', 'dd'
                        );
                        $htmlsingleonly = array( # Elements that cannot have close tags
                                'br', 'hr'
                        );
-                       $htmlnest = array( # Tags that can be nested directly or indirectly
+                       if ( $wgHtml5 && $wgAllowMicrodataAttributes ) {
+                               $htmlsingle[] = $htmlsingleonly[] = 'meta';
+                               $htmlsingle[] = $htmlsingleonly[] = 'link';
+                       }
+                       $htmlnest = array( # Tags that can be nested--??
                                'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
                                'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span'
                        );
@@ -401,7 +411,6 @@ class Sanitizer {
                                'li',
                        );
 
-                       global $wgAllowImageTag;
                        if ( $wgAllowImageTag ) {
                                $htmlsingle[] = 'img';
                                $htmlsingleonly[] = 'img';
@@ -416,7 +425,7 @@ class Sanitizer {
                        foreach ( $vars as $var ) {
                                $$var = array_flip( $$var );
                        }
-                       $staticInitialised = true;
+                       $staticInitialised = $globalContext;
                }
                # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
                $extratags = array_flip( $extratags );
@@ -532,6 +541,10 @@ class Sanitizer {
                                                        call_user_func_array( $processCallback, array( &$params, $args ) );
                                                }
 
+                                               if ( !Sanitizer::validateTag( $params, $t ) ) {
+                                                       $badtag = true;
+                                               }
+
                                                # Strip non-approved attributes from the tag
                                                $newparams = Sanitizer::fixTagAttributes( $params, $t );
                                        }
@@ -555,16 +568,24 @@ class Sanitizer {
                                preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
                                $x, $regs );
                                @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
+                               $badtag = false;
                                if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
                                        if( is_callable( $processCallback ) ) {
                                                call_user_func_array( $processCallback, array( &$params, $args ) );
                                        }
+
+                                       if ( !Sanitizer::validateTag( $params, $t ) ) {
+                                               $badtag = true;
+                                       }
+
                                        $newparams = Sanitizer::fixTagAttributes( $params, $t );
-                                       $rest = str_replace( '>', '&gt;', $rest );
-                                       $text .= "<$slash$t$newparams$brace$rest";
-                               } else {
-                                       $text .= '&lt;' . str_replace( '>', '&gt;', $x);
+                                       if ( !$badtag ) {
+                                               $rest = str_replace( '>', '&gt;', $rest );
+                                               $text .= "<$slash$t$newparams$brace$rest";
+                                               continue;
+                                       }
                                }
+                               $text .= '&lt;' . str_replace( '>', '&gt;', $x);
                        }
                }
                wfProfileOut( __METHOD__ );
@@ -724,6 +745,37 @@ class Sanitizer {
                return $attribs;
        }
 
+       /**
+        * Takes attribute names and values for a tag and the tah name and
+        * validates that the tag is allowed to be present.
+        * This DOES NOT validate the attributes, nor does it validate the
+        * tags themselves. This method only handles the special circumstances
+        * where we may want to allow a tag within content but ONLY when it has
+        * specific attributes set.
+        *
+        * @param $
+        */
+       static function validateTag( $params, $element ) {
+               $params = Sanitizer::decodeTagAttributes( $params );
+
+               if ( $element == 'meta' || $element == 'link' ) {
+                       if ( !isset( $params['itemprop'] ) ) {
+                               // <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
+                               return false;
+                       }
+                       if ( $element == 'meta' && !isset( $params['content'] ) ) {
+                               // <meta> must have a content="" for the itemprop
+                               return false;
+                       }
+                       if ( $element == 'link' && !isset( $params['href'] ) ) {
+                               // <link> must have an associated href=""
+                               return false;
+                       }
+               }
+
+               return true;
+       }
+
        /**
         * Take an array of attribute names and values and normalize or discard
         * illegal values for the given element type.
@@ -825,7 +877,7 @@ class Sanitizer {
                                unset( $out['itemid'] );
                                unset( $out['itemref'] );
                        }
-                       # TODO: Strip itemprop if we aren't descendants of an itemscope.
+                       # TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.
                }
                return $out;
        }
@@ -1451,10 +1503,7 @@ class Sanitizer {
         * @return Array
         */
        static function attributeWhitelist( $element ) {
-               static $list;
-               if( !isset( $list ) ) {
-                       $list = Sanitizer::setupAttributeWhitelist();
-               }
+               $list = Sanitizer::setupAttributeWhitelist();
                return isset( $list[$element] )
                        ? $list[$element]
                        : array();
@@ -1468,6 +1517,13 @@ class Sanitizer {
        static function setupAttributeWhitelist() {
                global $wgAllowRdfaAttributes, $wgHtml5, $wgAllowMicrodataAttributes;
 
+               static $whitelist, $staticInitialised;
+               $globalContext = implode( '-', compact( 'wgAllowRdfaAttributes', 'wgHtml5', 'wgAllowMicrodataAttributes' ) );
+
+               if ( isset( $whitelist ) && $staticInitialised == $globalContext ) {
+                       return $whitelist;
+               }
+
                $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
 
                if ( $wgAllowRdfaAttributes ) {
@@ -1500,7 +1556,7 @@ class Sanitizer {
 
                # Numbers refer to sections in HTML 4.01 standard describing the element.
                # See: http://www.w3.org/TR/html4/
-               $whitelist = array (
+               $whitelist = array(
                        # 7.5.4
                        'div'        => $block,
                        'center'     => $common, # deprecated
@@ -1632,7 +1688,27 @@ class Sanitizer {
                        # HTML 5 section 4.6
                        'bdi' => $common,
 
+               );
+
+               if ( $wgHtml5 ) {
+                       # HTML5 elements, defined by:
+                       # http://www.whatwg.org/specs/web-apps/current-work/multipage/
+                       $whitelist += array(
+                               'data' => array_merge( $common, array( 'value' ) ),
+                               'time' => array_merge( $common, array( 'datetime' ) ),
+
+                               // meta and link are only permitted by removeHTMLtags when Microdata
+                               // is enabled so we don't bother adding a conditional to hide these
+                               // Also meta and link are only valid in WikiText as Microdata elements
+                               // (ie: validateTag rejects tags missing the attributes needed for Microdata)
+                               // So we don't bother including $common attributes that have no purpose.
+                               'meta' => array( 'itemprop', 'content' ),
+                               'link' => array( 'itemprop', 'href' ),
                        );
+               }
+
+               $staticInitialised = $globalContext;
+
                return $whitelist;
        }
 
index ed2d436..5cc1b0f 100644 (file)
@@ -59,9 +59,15 @@ class MWTidyWrapper {
                        dechex( mt_rand( 0, 0x7fffffff ) ) . dechex( mt_rand( 0, 0x7fffffff ) );
                $this->mMarkerIndex = 0;
 
+               // Replace <mw:editsection> elements with placeholders
                $wrappedtext = preg_replace_callback( ParserOutput::EDITSECTION_REGEX,
                        array( &$this, 'replaceEditSectionLinksCallback' ), $text );
 
+               // Modify inline Microdata <link> and <meta> elements so they say <html-link> and <html-meta> so
+               // we can trick Tidy into not stripping them out by including them in tidy's new-empty-tags config
+               $wrappedtext = preg_replace( '!<(link|meta)([^>]*?)(/{0,1}>)!', '<html-$1$2$3', $wrappedtext );
+
+               // Wrap the whole thing in a doctype and body for Tidy.
                $wrappedtext = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"'.
                        ' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html>'.
                        '<head><title>test</title></head><body>'.$wrappedtext.'</body></html>';
@@ -86,7 +92,13 @@ class MWTidyWrapper {
         * @return string
         */
        public function postprocess( $text ) {
-               return $this->mTokens->replace( $text );
+               // Revert <html-{link,meta}> back to <{link,meta}>
+               $text = preg_replace( '!<html-(link|meta)([^>]*?)(/{0,1}>)!', '<$1$2$3', $text );
+
+               // Restore the contents of placeholder tokens
+               $text = $this->mTokens->replace( $text );
+
+               return $text;
        }
 
 }
index aa333fc..bced6fd 100644 (file)
@@ -16,4 +16,7 @@ quiet: yes
 quote-nbsp: yes
 fix-backslash: no
 fix-uri: no
-new-inline-tags: video,audio,source,track,bdi
+# Don't strip html5 elements we support
+# html-{meta,link} is a hack we use to prevent Tidy from stripping <meta> and <link> used in the body for Microdata
+new-empty-tags: html-meta, html-link
+new-inline-tags: video, audio, source, track, bdi, data, time
index 3255583..ac3113e 100644 (file)
@@ -7983,6 +7983,30 @@ disabled
 Something need to be done. foo-2 ? 
 !! end
 
+!! test
+Sanitizer: Validating that <meta> and <link> work, but only for Microdata
+!! input
+<div itemscope>
+       <meta itemprop="hello" content="world">
+       <meta http-equiv="refresh" content="5">
+       <meta itemprop="hello" http-equiv="refresh" content="5">
+       <link itemprop="hello" href="{{SERVER}}">
+       <link rel="stylesheet" href="{{SERVER}}">
+       <link rel="stylesheet" itemprop="hello" href="{{SERVER}}">
+</div>
+!! result
+<div itemscope="itemscope">
+<p>    <meta itemprop="hello" content="world" />
+       &lt;meta http-equiv="refresh" content="5"&gt;
+       <meta itemprop="hello" content="5" />
+</p>
+       <link itemprop="hello" href="http&#58;//Britney-Spears" />
+       &lt;link rel="stylesheet" href="<a rel="nofollow" class="external free" href="http://Britney-Spears">http://Britney-Spears</a>"&gt;
+       <link itemprop="hello" href="http&#58;//Britney-Spears" />
+</div>
+
+!! end
+
 !! test
 Language converter: output gets cut off unexpectedly (bug 5757)
 !! options