X-Git-Url: https://git.heureux-cyclage.org/?a=blobdiff_plain;f=includes%2FSanitizer.php;h=47259fb5b3679276353e776b26380b61ebb4ec40;hb=ba9468ec2441599965484826a45d569b6f6c825c;hp=21ea4d9842fe1377d62ddf128e410693c0b2c29d;hpb=81f6aa1462ffd4bcd7892b68edda6880eb393773;p=lhc%2Fweb%2Fwiklou.git diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index 21ea4d9842..47259fb5b3 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -20,8 +20,8 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * http://www.gnu.org/copyleft/gpl.html * - * @package MediaWiki - * @subpackage Parser + * @file + * @ingroup Parser */ /** @@ -29,7 +29,7 @@ * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences */ define( 'MW_CHAR_REFS_REGEX', - '/&([A-Za-z0-9]+); + '/&([A-Za-z0-9\x80-\xff]+); |&\#([0-9]+); |&\#x([0-9A-Za-z]+); |&\#X([0-9A-Za-z]+); @@ -316,29 +316,48 @@ $wgHtmlEntities = array( 'zwj' => 8205, 'zwnj' => 8204 ); -/** @package MediaWiki */ +/** + * Character entity aliases accepted by MediaWiki + */ +global $wgHtmlEntityAliases; +$wgHtmlEntityAliases = array( + 'רלמ' => 'rlm', + 'رلم' => 'rlm', +); + + +/** + * XHTML sanitizer for MediaWiki + * @ingroup Parser + */ class Sanitizer { /** * Cleans up HTML, removes dangerous tags and attributes, and * removes HTML comments * @private - * @param string $text - * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values - * @param array $args for the processing callback + * @param $text String + * @param $processCallback Callback to do any variable or parameter replacements in HTML attribute values + * @param $args Array for the processing callback + * @param $extratags Array for any extra tags to include + * @param $removetags Array for any tags (default or extra) to exclude * @return string */ - function removeHTMLtags( $text, $processCallback = null, $args = array() ) { - global $wgUseTidy, $wgUserHtml; - $fname = 'Parser::removeHTMLtags'; 
- wfProfileIn( $fname ); + static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array(), $removetags = array() ) { + global $wgUseTidy; + + static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags, + $htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised; + + wfProfileIn( __METHOD__ ); - if( $wgUserHtml ) { - $htmlpairs = array( # Tags that must be closed + if ( !$staticInitialised ) { + + $htmlpairsStatic = array( # Tags that must be closed 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', 'strike', 'strong', 'tt', 'var', 'div', 'center', 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', - 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span' + 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u' ); $htmlsingle = array( 'br', 'hr', 'li', 'dt', 'dd' @@ -350,63 +369,107 @@ class Sanitizer { 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span' ); - $tabletags = array( # Can only appear inside table - 'td', 'th', 'tr' + $tabletags = array( # Can only appear inside table, we will close them + 'td', 'th', 'tr', + ); + $htmllist = array( # Tags used by list + 'ul','ol', + ); + $listtags = array( # Tags that can appear in a list + 'li', ); - } else { - $htmlpairs = array(); - $htmlsingle = array(); - $htmlnest = array(); - $tabletags = array(); - } - $htmlsingle = array_merge( $tabletags, $htmlsingle ); - $htmlelements = array_merge( $htmlsingle, $htmlpairs ); + $htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) ); + $htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) ); + + # Convert them all to hashtables for faster lookup + $vars = array( 'htmlpairsStatic', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags', + 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelementsStatic' ); + foreach ( 
$vars as $var ) { + $$var = array_flip( $$var ); + } + $staticInitialised = true; + } + # Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays + $extratags = array_flip( $extratags ); + $removetags = array_flip( $removetags ); + $htmlpairs = array_merge( $extratags, $htmlpairsStatic ); + $htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ) , $removetags ); # Remove HTML comments $text = Sanitizer::removeHTMLcomments( $text ); - $bits = explode( '<', $text ); - $text = array_shift( $bits ); + $text = str_replace( '>', '>', array_shift( $bits ) ); if(!$wgUseTidy) { - $tagstack = array(); $tablestack = array(); + $tagstack = $tablestack = array(); foreach ( $bits as $x ) { - $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) ); - preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/', - $x, $regs ); - list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs; - error_reporting( $prev ); + $regs = array(); + if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) { + list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs; + } else { + $slash = $t = $params = $brace = $rest = null; + } $badtag = 0 ; - if ( in_array( $t = strtolower( $t ), $htmlelements ) ) { + if ( isset( $htmlelements[$t = strtolower( $t )] ) ) { # Check our stack if ( $slash ) { # Closing a tag... - if( in_array( $t, $htmlsingleonly ) ) { + if( isset( $htmlsingleonly[$t] ) ) { $badtag = 1; } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) { - @array_push( $tagstack, $ot ); - $badtag = 1; + if ( isset( $htmlsingleallowed[$ot] ) ) { + # Pop all elements with an optional close tag + # and see if we find a match below them + $optstack = array(); + array_push ($optstack, $ot); + while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) && + isset( $htmlsingleallowed[$ot] ) ) + { + array_push ($optstack, $ot); + } + if ( $t != $ot ) { + # No match. 
Push the optional elements back again + $badtag = 1; + while ( $ot = @array_pop( $optstack ) ) { + array_push( $tagstack, $ot ); + } + } + } else { + @array_push( $tagstack, $ot ); + #
  • can be nested in