Made suppress link bold

[lhc/web/wiklou.git] / includes / Sanitizer.php
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php

index ffe10f9..5d58b03 100644 (file)
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -20,7 +20,8 @@
   * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
   * http://www.gnu.org/copyleft/gpl.html
   *
- * @addtogroup Parser
+ * @file
+ * @ingroup Parser
   */
  
  /**
@@ -327,7 +328,7 @@ $wgHtmlEntityAliases = array(
  
  /**
   * XHTML sanitizer for MediaWiki
- * @addtogroup Parser
+ * @ingroup Parser
   */
  class Sanitizer {
         /**
@@ -339,7 +340,7 @@ class Sanitizer {
          * @param array $args for the processing callback
          * @return string
          */
-       static function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
+       static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array() ) {
                 global $wgUseTidy;
  
                 static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
@@ -349,13 +350,13 @@ class Sanitizer {
  
                 if ( !$staticInitialised ) {
  
-                       $htmlpairs = array( # Tags that must be closed
+                       $htmlpairs = array_merge( $extratags, array( # Tags that must be closed
                                 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
                                 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
                                 'strike', 'strong', 'tt', 'var', 'div', 'center',
                                 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
                                 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
-                       );
+                       ) );
                         $htmlsingle = array(
                                 'br', 'hr', 'li', 'dt', 'dd'
                         );
@@ -380,7 +381,7 @@ class Sanitizer {
                         $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
  
                         # Convert them all to hashtables for faster lookup
-                       $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags', 
+                       $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
                                 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
                         foreach ( $vars as $var ) {
                                 $$var = array_flip( $$var );
@@ -416,7 +417,7 @@ class Sanitizer {
                                                                 $optstack = array();
                                                                 array_push ($optstack, $ot);
                                                                 while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
-                                                                               isset( $htmlsingleallowed[$ot] ) ) 
+                                                                               isset( $htmlsingleallowed[$ot] ) )
                                                                 {
                                                                         array_push ($optstack, $ot);
                                                                 }
@@ -579,7 +580,7 @@ class Sanitizer {
                 return Sanitizer::validateAttributes( $attribs,
                         Sanitizer::attributeWhitelist( $element ) );
         }
-       
+
         /**
          * Take an array of attribute names and values and normalize or discard
          * illegal values for the given whitelist.
@@ -612,8 +613,11 @@ class Sanitizer {
                                 }
                         }
  
-                       if ( $attribute === 'id' )
-                               $value = Sanitizer::escapeId( $value );
+                       if ( $attribute === 'id' ) {
+                               global $wgEnforceHtmlIds;
+                               $value = Sanitizer::escapeId( $value,
+                                       $wgEnforceHtmlIds ? 'noninitial' : 'xml' );
+                       }
  
                         // If this attribute was previously set, override it.
                         // Output should only have one attribute of each name.
@@ -621,12 +625,11 @@ class Sanitizer {
                 }
                 return $out;
         }
-       
+
         /**
-        * Merge two sets of HTML attributes.
-        * Conflicting items in the second set will override those
-        * in the first, except for 'class' attributes which will be
-        * combined.
+        * Merge two sets of HTML attributes.  Conflicting items in the second set
+        * will override those in the first, except for 'class' attributes which
+        * will be combined (if they're both strings).
          *
          * @todo implement merging for other attributes such as style
          * @param array $a
@@ -635,20 +638,16 @@ class Sanitizer {
          */
         static function mergeAttributes( $a, $b ) {
                 $out = array_merge( $a, $b );
-               if( isset( $a['class'] )
-                       && isset( $b['class'] )
-                       && $a['class'] !== $b['class'] ) {
-                       
-                       $out['class'] = implode( ' ',
-                               array_unique(
-                                       preg_split( '/\s+/',
-                                               $a['class'] . ' ' . $b['class'],
-                                               -1,
-                                               PREG_SPLIT_NO_EMPTY ) ) );
+               if( isset( $a['class'] ) && isset( $b['class'] )
+               && is_string( $a['class'] ) && is_string( $b['class'] )
+               && $a['class'] !== $b['class'] ) {
+                       $classes = preg_split( '/\s+/', "{$a['class']} {$b['class']}",
+                               -1, PREG_SPLIT_NO_EMPTY );
+                       $out['class'] = implode( ' ', array_unique( $classes ) );
                 }
                 return $out;
         }
-       
+
         /**
          * Pick apart some CSS and check it for forbidden or unsafe structures.
          * Returns a sanitized string, or false if it was just too evil.
@@ -663,7 +662,7 @@ class Sanitizer {
  
                 // Remove any comments; IE gets token splitting wrong
                 $stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );
-               
+
                 $value = $stripped;
  
                 // ... and continue checks
@@ -675,7 +674,7 @@ class Sanitizer {
                         # haxx0r
                         return false;
                 }
-               
+
                 return $value;
         }
  
@@ -722,7 +721,7 @@ class Sanitizer {
          * @return HTML-encoded text fragment
          */
         static function encodeAttribute( $text ) {
-               $encValue = htmlspecialchars( $text );
+               $encValue = htmlspecialchars( $text, ENT_QUOTES );
  
                 // Whitespace is normalized during attribute decoding,
                 // so if we've been passed non-spaces we must encode them
@@ -778,20 +777,56 @@ class Sanitizer {
          *                                                          name attributes
          * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
          *
-        * @static
-        *
-        * @param string $id
+        * @param string $id      Id to validate
+        * @param mixed  $options String or array of strings (default is array()):
+        *   'noninitial': This is a non-initial fragment of an id, not a full id,
+        *       so don't pay attention if the first character isn't valid at the
+        *       beginning of an id.
+        *   'xml': Don't restrict the id to be HTML4-compatible.  This option
+        *       allows any alphabetic character to be used, per the XML standard.
+        *       Therefore, it also completely changes the type of escaping: instead
+        *       of weird dot-encoding, runs of invalid characters (mostly
+        *       whitespace) are just compressed into a single underscore.
          * @return string
          */
-       static function escapeId( $id ) {
-               static $replace = array(
-                       '%3A' => ':',
-                       '%' => '.'
-               );
+       static function escapeId( $id, $options = array() ) {
+               $options = (array)$options;
+
+               if ( !in_array( 'xml', $options ) ) {
+                       # HTML4-style escaping
+                       static $replace = array(
+                               '%3A' => ':',
+                               '%' => '.'
+                       );
  
-               $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
+                       $id = urlencode( Sanitizer::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
+                       $id = str_replace( array_keys( $replace ), array_values( $replace ), $id );
+
+                       if ( !preg_match( '/^[a-zA-Z]/', $id )
+                       && !in_array( 'noninitial', $options ) )  {
+                               // Initial character must be a letter!
+                               $id = "x$id";
+                       }
+                       return $id;
+               }
  
-               return str_replace( array_keys( $replace ), array_values( $replace ), $id );
+               # XML-style escaping.  For the patterns used, see the XML 1.0 standard,
+               # 5th edition, NameStartChar and NameChar: <http://www.w3.org/TR/REC-xml/>
+               $nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}'
+                       . '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}'
+                       . '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}';
+               $nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}'
+                       . '\x{203F}-\x{2040}';
+               # Replace _ as well so we don't get multiple consecutive underscores
+               $id = preg_replace( "/([^$nameChar]|_)+/u", '_', $id );
+               $id = trim( $id, '_' );
+
+               if ( !preg_match( "/^[$nameStartChar]/u", $id )
+               && !in_array( 'noninitial', $options ) ) {
+                       $id = "_$id";
+               }
+
+               return $id;
         }
  
         /**
@@ -813,6 +848,22 @@ class Sanitizer {
                         $class ), '_');
         }
  
+       /**
+        * Given HTML input, escape with htmlspecialchars but un-escape entites.
+        * This allows (generally harmless) entities like &nbsp; to survive.
+        *
+        * @param  string $html String to escape
+        * @return string Escaped input
+        */
+       static function escapeHtmlAllowEntities( $html ) {
+               # It seems wise to escape ' as well as ", as a matter of course.  Can't
+               # hurt.
+               $html = htmlspecialchars( $html, ENT_QUOTES );
+               $html = str_replace( '&amp;', '&', $html );
+               $html = Sanitizer::normalizeCharReferences( $html );
+               return $html;
+       }
+
         /**
          * Regex replace callback for armoring links against further processing.
          * @param array $matches
@@ -831,7 +882,7 @@ class Sanitizer {
          * @param string
          * @return array
          */
-       static function decodeTagAttributes( $text ) {
+       public static function decodeTagAttributes( $text ) {
                 $attribs = array();
  
                 if( trim( $text ) == '' ) {
@@ -908,7 +959,7 @@ class Sanitizer {
                         self::normalizeWhitespace(
                                 Sanitizer::normalizeCharReferences( $text ) ) );
         }
-       
+
         private static function normalizeWhitespace( $text ) {
                 return preg_replace(
                         '/\r\n|[\x20\x0d\x0a\x09]/',
@@ -960,8 +1011,8 @@ class Sanitizer {
  
         /**
          * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
-        * return the named entity reference as is. If the entity is a 
-        * MediaWiki-specific alias, returns the HTML equivalent. Otherwise, 
+        * return the named entity reference as is. If the entity is a
+        * MediaWiki-specific alias, returns the HTML equivalent. Otherwise,
          * returns HTML-escaped text of pseudo-entity source (eg &amp;foo;)
          *
          * @param string $name
@@ -1098,7 +1149,8 @@ class Sanitizer {
         }
  
         /**
-        * @todo Document it a bit
+        * Foreach array key (an allowed HTML element), return an array
+        * of allowed attributes
          * @return array
          */
         static function setupAttributeWhitelist() {
@@ -1207,7 +1259,7 @@ class Sanitizer {
                         # 11.2.6
                         'td'         => array_merge( $common, $tablecell, $tablealign ),
                         'th'         => array_merge( $common, $tablecell, $tablealign ),
-                       
+
                         # 13.2
                         # Not usually allowed, but may be used for extension-style hooks
                         # such as <math> when it is rasterized
@@ -1238,7 +1290,7 @@ class Sanitizer {
                         'rb'         => $common,
                         'rt'         => $common, #array_merge( $common, array( 'rbspan' ) ),
                         'rp'         => $common,
-                       
+
                         # MathML root element, where used for extensions
                         # 'title' may not be 100% valid here; it's XHTML
                         # http://www.w3.org/TR/REC-MathML/
@@ -1288,7 +1340,7 @@ class Sanitizer {
                 return $out;
         }
  
-       static function cleanUrl( $url, $hostname=true ) {
+       static function cleanUrl( $url ) {
                 # Normalize any HTML entities in input. They will be
                 # re-escaped by makeExternalLink().
                 $url = Sanitizer::decodeCharReferences( $url );
@@ -1331,5 +1383,3 @@ class Sanitizer {
         }
  
  }
-
-