* Special:Import/importDump fixes: report XML parse errors, accept <minor/>
[lhc/web/wiklou.git] / includes / Sanitizer.php
index cac176b..e0217ba 100644 (file)
@@ -1,5 +1,4 @@
 <?php
-
 /**
  * (X)HTML sanitizer for MediaWiki
  *
@@ -317,6 +316,7 @@ $wgHtmlEntities = array(
        'zwj'      => 8205,
        'zwnj'     => 8204 );
 
+/** @package MediaWiki */
 class Sanitizer {
        /**
         * Cleans up HTML, removes dangerous tags and attributes, and
@@ -343,6 +343,9 @@ class Sanitizer {
                        $htmlsingle = array(
                                'br', 'hr', 'li', 'dt', 'dd'
                        );
+                       $htmlsingleonly = array( # Elements that cannot have close tags
+                               'br', 'hr'
+                       );
                        $htmlnest = array( # Tags that can be nested--??
                                'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
                                'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
@@ -369,7 +372,7 @@ class Sanitizer {
                        $tagstack = array(); $tablestack = array();
                        foreach ( $bits as $x ) {
                                $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
-                               preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
+                               preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
                                $x, $regs );
                                list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
                                error_reporting( $prev );
@@ -379,7 +382,9 @@ class Sanitizer {
                                        # Check our stack
                                        if ( $slash ) {
                                                # Closing a tag...
-                                               if ( ! in_array( $t, $htmlsingle ) &&
+                                               if( in_array( $t, $htmlsingleonly ) ) {
+                                                       $badtag = 1;
+                                               } elseif( !in_array( $t, $htmlsingle ) &&
                                                ( $ot = @array_pop( $tagstack ) ) != $t ) {
                                                        @array_push( $tagstack, $ot );
                                                        $badtag = 1;
@@ -397,6 +402,9 @@ class Sanitizer {
                                                } else if ( in_array( $t, $tagstack ) &&
                                                ! in_array ( $t , $htmlnest ) ) {
                                                        $badtag = 1 ;
+                                               } elseif( in_array( $t, $htmlsingleonly ) ) {
+                                                       # Hack to force empty tag for uncloseable elements
+                                                       $brace = '/>';
                                                } else if ( ! in_array( $t, $htmlsingle ) ) {
                                                        if ( $t == 'table' ) {
                                                                array_push( $tablestack, $tagstack );
@@ -416,7 +424,8 @@ class Sanitizer {
                                        }
                                        if ( ! $badtag ) {
                                                $rest = str_replace( '>', '&gt;', $rest );
-                                               $text .= "<$slash$t$newparams$brace$rest";
+                                               $close = ( $brace == '/>' ) ? ' /' : '';
+                                               $text .= "<$slash$t$newparams$close>$rest";
                                                continue;
                                        }
                                }
@@ -430,7 +439,7 @@ class Sanitizer {
                } else {
                        # this might be possible using tidy itself
                        foreach ( $bits as $x ) {
-                               preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
+                               preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
                                $x, $regs );
                                @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
                                if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
@@ -516,6 +525,7 @@ class Sanitizer {
         * @todo Check for unique id attribute :P
         */
        function fixTagAttributes( $text, $element ) {
+               global $wgUrlProtocols;
                if( trim( $text ) == '' ) {
                        return '';
                }
@@ -561,13 +571,16 @@ class Sanitizer {
                                'RFC'  => '&#82;FC',
                                'PMID' => '&#80;MID',
                        ) );
-                       $value = preg_replace(
-                               '/(' . URL_PROTOCOLS . '):/',
-                               '\\1&#58;', $value );
                        
-                       if( !isset( $attribs[$attribute] ) ) {
-                               $attribs[$attribute] = "$attribute=\"$value\"";
-                       }
+                       # Stupid hack
+                       $value = preg_replace_callback(
+                               '/(' . $wgUrlProtocols . ')/',
+                               array( 'Sanitizer', 'armorLinksCallback' ),
+                               $value );
+                       
+                       // If this attribute was previously set, override it.
+                       // Output should only have one attribute of each name.
+                       $attribs[$attribute] = "$attribute=\"$value\"";
                }
                if( empty( $attribs ) ) {
                        return '';
@@ -576,6 +589,16 @@ class Sanitizer {
                }
        }
        
+       /**
+        * Regex replace callback for armoring links against further processing: rewrites each ':' in the first capture group as the entity '&#58;'.
+        * @param array $matches Regex match array; only index 1 (the first capture group) is read.
+        * @return string The first capture group with every ':' replaced by '&#58;'.
+        * @access private
+        */
+       function armorLinksCallback( $matches ) {
+               return str_replace( ':', '&#58;', $matches[1] );
+       }
+       
        /**
         * Return an associative array of attribute names and values from
         * a partial tag string. Attribute names are forced to lowercase,
@@ -717,7 +740,7 @@ class Sanitizer {
        }
        
        function decCharReference( $codepoint ) {
-               $point = IntVal( $codepoint );
+               $point = intval( $codepoint );
                if( Sanitizer::validateCodepoint( $point ) ) {
                        return sprintf( '&#%d;', $point );
                } else {