X-Git-Url: https://git.heureux-cyclage.org/?a=blobdiff_plain;f=includes%2FSanitizer.php;h=c1c8daf37affbb40f27444e115068ab392dee7c9;hb=7f6453944e7684185dcb37a6a966156df3c870a4;hp=d28ed93af0ab8bd77ac961c9d90d54e822ffe77f;hpb=fe40fe9da89278bc33549162f46490af91e60d11;p=lhc%2Fweb%2Fwiklou.git diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php index d28ed93af0..c1c8daf37a 100644 --- a/includes/Sanitizer.php +++ b/includes/Sanitizer.php @@ -1,14 +1,13 @@ et al * http://www.mediawiki.org/ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or + * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, @@ -18,11 +17,10 @@ * * You should have received a copy of the GNU General Public License along * with this program; if not, write to the Free Software Foundation, Inc., - * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * http://www.gnu.org/copyleft/gpl.html * - * @package MediaWiki - * @subpackage Parser + * @addtogroup Parser */ /** @@ -30,16 +28,37 @@ * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences */ define( 'MW_CHAR_REFS_REGEX', - '/&([A-Za-z0-9]+); + '/&([A-Za-z0-9\x80-\xff]+); |&\#([0-9]+); |&\#x([0-9A-Za-z]+); |&\#X([0-9A-Za-z]+); |(&)/x' ); +/** + * Regular expression to match HTML/XML attribute pairs within a tag. + * Allows some... latitude. + * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes + */ +$attrib = '[A-Za-z0-9]'; +$space = '[\x09\x0a\x0d\x20]'; +define( 'MW_ATTRIBS_REGEX', + "/(?:^|$space)($attrib+) + ($space*=$space* + (?: + # The attribute value: quoted or alone + \"([^<\"]*)\" + | '([^<']*)' + | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+) + | (\#[0-9a-fA-F]+) # Technically wrong, but lots of + # colors are specified like this. + # We'll be normalizing it. + ) + )?(?=$space|\$)/sx" ); + /** * List of all named character entities defined in HTML 4.01 * http://www.w3.org/TR/html4/sgml/entities.html - * @access private + * @private */ global $wgHtmlEntities; $wgHtmlEntities = array( @@ -296,97 +315,177 @@ $wgHtmlEntities = array( 'zwj' => 8205, 'zwnj' => 8204 ); +/** + * Character entity aliases accepted by MediaWiki + */ +global $wgHtmlEntityAliases; +$wgHtmlEntityAliases = array( + 'רלמ' => 'rlm', + 'رلم' => 'rlm', +); + + +/** + * XHTML sanitizer for MediaWiki + * @addtogroup Parser + */ class Sanitizer { + const NONE = 0; + const INITIAL_NONLETTER = 1; + /** * Cleans up HTML, removes dangerous tags and attributes, and * removes HTML comments - * @access private + * @private * @param string $text + * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values + * @param array $args for the processing callback * @return string */ - function removeHTMLtags( $text ) { - global $wgUseTidy, $wgUserHtml; - $fname = 'Parser::removeHTMLtags'; - wfProfileIn( $fname ); + static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array() ) { + global $wgUseTidy; + + static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags, + $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised; + + wfProfileIn( __METHOD__ ); - if( $wgUserHtml ) { - $htmlpairs = array( # Tags that must be closed + if ( !$staticInitialised ) { + + $htmlpairs = array_merge( $extratags, array( # Tags that must be closed 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's', 'strike', 'strong', 'tt', 'var', 'div', 'center', 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre', - 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span' - ); + 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u' + ) ); $htmlsingle = array( 'br', 'hr', 'li', 'dt', 'dd' ); + $htmlsingleonly = array( # Elements that cannot have close tags + 'br', 'hr' + ); $htmlnest = array( # Tags that can be nested--?? 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul', 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span' ); - $tabletags = array( # Can only appear inside table - 'td', 'th', 'tr' + $tabletags = array( # Can only appear inside table, we will close them + 'td', 'th', 'tr', + ); + $htmllist = array( # Tags used by list + 'ul','ol', + ); + $listtags = array( # Tags that can appear in a list + 'li', ); - } else { - $htmlpairs = array(); - $htmlsingle = array(); - $htmlnest = array(); - $tabletags = array(); - } - $htmlsingle = array_merge( $tabletags, $htmlsingle ); - $htmlelements = array_merge( $htmlsingle, $htmlpairs ); + $htmlsingleallowed = array_merge( $htmlsingle, $tabletags ); + $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest ); + + # Convert them all to hashtables for faster lookup + $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags', + 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' ); + foreach ( $vars as $var ) { + $$var = array_flip( $$var ); + } + $staticInitialised = true; + } # Remove HTML comments $text = Sanitizer::removeHTMLcomments( $text ); - $bits = explode( '<', $text ); - $text = array_shift( $bits ); + $text = str_replace( '>', '>', array_shift( $bits ) ); if(!$wgUseTidy) { - $tagstack = array(); $tablestack = array(); + $tagstack = $tablestack = array(); foreach ( $bits as $x ) { - $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) ); - preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/', - $x, $regs ); - list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs; - error_reporting( $prev ); + $regs = array(); + if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) { + list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs; + } else { + $slash = $t = $params = $brace = $rest = null; + } $badtag = 0 ; - if ( in_array( $t = strtolower( $t ), $htmlelements ) ) { + if ( isset( $htmlelements[$t = strtolower( $t )] ) ) { # Check our stack if ( $slash ) { # Closing a tag... - if ( ! in_array( $t, $htmlsingle ) && - ( $ot = @array_pop( $tagstack ) ) != $t ) { - @array_push( $tagstack, $ot ); + if( isset( $htmlsingleonly[$t] ) ) { $badtag = 1; + } elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) { + if ( isset( $htmlsingleallowed[$ot] ) ) { + # Pop all elements with an optional close tag + # and see if we find a match below them + $optstack = array(); + array_push ($optstack, $ot); + while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) && + isset( $htmlsingleallowed[$ot] ) ) + { + array_push ($optstack, $ot); + } + if ( $t != $ot ) { + # No match. Push the optinal elements back again + $badtag = 1; + while ( $ot = @array_pop( $optstack ) ) { + array_push( $tagstack, $ot ); + } + } + } else { + @array_push( $tagstack, $ot ); + #
  • can be nested in