40016d93b2d144f505ba1ec6e924615ed5da0e6c
4 * (X)HTML sanitizer for MediaWiki
6 * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
7 * http://www.mediawiki.org/
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
22 * http://www.gnu.org/copyleft/gpl.html
29 * Regular expression to match various types of character references in
30 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
32 define( 'MW_CHAR_REFS_REGEX',
40 * Regular expression to match HTML/XML attribute pairs within a tag.
41 * Allows some... latitude.
42 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
44 $attrib = '[A-Za-z0-9]';
45 $space = '[\x09\x0a\x0d\x20]';
46 define( 'MW_ATTRIBS_REGEX',
47 "/(?:^|$space)($attrib+)
50 # The attribute value: quoted or alone
53 | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
54 | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
55 # colors are specified like this.
56 # We'll be normalizing it.
58 )?(?=$space|\$)/sx" );
61 * List of all named character entities defined in HTML 4.01
62 * http://www.w3.org/TR/html4/sgml/entities.html
65 global $wgHtmlEntities;
66 $wgHtmlEntities = array(
322 * Cleans up HTML, removes dangerous tags and attributes, and
323 * removes HTML comments
325 * @param string $text
328 function removeHTMLtags( $text ) {
329 global $wgUseTidy, $wgUserHtml;
330 $fname = 'Parser::removeHTMLtags';
331 wfProfileIn( $fname );
334 $htmlpairs = array( # Tags that must be closed
335 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
336 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
337 'strike', 'strong', 'tt', 'var', 'div', 'center',
338 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
339 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
342 'br', 'hr', 'li', 'dt', 'dd'
344 $htmlnest = array( # Tags that can be nested--??
345 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
346 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
348 $tabletags = array( # Can only appear inside table
352 $htmlpairs = array();
353 $htmlsingle = array();
355 $tabletags = array();
358 $htmlsingle = array_merge( $tabletags, $htmlsingle );
359 $htmlelements = array_merge( $htmlsingle, $htmlpairs );
361 # Remove HTML comments
362 $text = Sanitizer
::removeHTMLcomments( $text );
364 $bits = explode( '<', $text );
365 $text = array_shift( $bits );
367 $tagstack = array(); $tablestack = array();
368 foreach ( $bits as $x ) {
369 $prev = error_reporting( E_ALL
& ~
( E_NOTICE | E_WARNING
) );
370 preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
372 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
373 error_reporting( $prev );
376 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
380 if ( ! in_array( $t, $htmlsingle ) &&
381 ( $ot = @array_pop
( $tagstack ) ) != $t ) {
382 @array_push
( $tagstack, $ot );
385 if ( $t == 'table' ) {
386 $tagstack = array_pop( $tablestack );
391 # Keep track for later
392 if ( in_array( $t, $tabletags ) &&
393 ! in_array( 'table', $tagstack ) ) {
395 } else if ( in_array( $t, $tagstack ) &&
396 ! in_array ( $t , $htmlnest ) ) {
398 } else if ( ! in_array( $t, $htmlsingle ) ) {
399 if ( $t == 'table' ) {
400 array_push( $tablestack, $tagstack );
403 array_push( $tagstack, $t );
405 # Strip non-approved attributes from the tag
406 $newparams = Sanitizer
::fixTagAttributes( $params, $t );
409 $rest = str_replace( '>', '&gt;', $rest ); # escape any stray '>' in the text following the tag; replacing '>' with itself was a no-op
410 $text .= "<$slash$t$newparams$brace$rest";
414 $text .= '&lt;' . str_replace( '>', '&gt;', $x); # NOTE(review): presumably the fallback for markup that failed the element check above -- emit it as escaped literal text, not as a live tag
416 # Close off any remaining tags
417 while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
419 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
422 # this might be possible using tidy itself
423 foreach ( $bits as $x ) {
424 preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
426 @list
( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
427 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
428 $newparams = Sanitizer
::fixTagAttributes( $params, $t );
429 $rest = str_replace( '>', '&gt;', $rest ); # escape any stray '>' in the text following the tag; replacing '>' with itself was a no-op
430 $text .= "<$slash$t$newparams$brace$rest";
432 $text .= '&lt;' . str_replace( '>', '&gt;', $x); # NOTE(review): presumably the fallback for markup that failed the element check above -- emit it as escaped literal text, not as a live tag
436 wfProfileOut( $fname );
441 * Remove '<!--', '-->', and everything between.
442 * To avoid leaving blank lines, when a comment is both preceded
443 * and followed by a newline (ignoring spaces), trim leading and
444 * trailing spaces and one of the newlines.
447 * @param string $text
450 function removeHTMLcomments( $text ) {
451 $fname='Parser::removeHTMLcomments';
452 wfProfileIn( $fname );
453 while (($start = strpos($text, '<!--')) !== false) {
454 $end = strpos($text, '-->', $start +
4);
455 if ($end === false) {
456 # Unterminated comment; bail out
462 # Trim space and newline if the comment is both
463 # preceded and followed by a newline
464 $spaceStart = max($start - 1, 0);
465 $spaceLen = $end - $spaceStart;
466 while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
470 while (substr($text, $spaceStart +
$spaceLen, 1) === ' ')
472 if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart +
$spaceLen, 1) === "\n") {
473 # Remove the comment, leading and trailing
474 # spaces, and leave only one newline.
475 $text = substr_replace($text, "\n", $spaceStart, $spaceLen +
1);
478 # Remove just the comment.
479 $text = substr_replace($text, '', $start, $end - $start);
482 wfProfileOut( $fname );
487 * Take a tag soup fragment listing an HTML element's attributes
488 * and normalize it to well-formed XML, discarding unwanted attributes.
490 * - Normalizes attribute names to lowercase
491 * - Discards attributes not on a whitelist for the given element
492 * - Turns broken or invalid entities into plaintext
493 * - Double-quotes all attribute values
494 * - Attributes without values are given the attribute name as their value
495 * - Double attributes are discarded
496 * - Unsafe style attributes are discarded
497 * - Prepends space if there are attributes.
499 * @param string $text
500 * @param string $element
503 * @todo Check for legal values where the DTD limits things.
504 * @todo Check for unique id attribute :P
506 function fixTagAttributes( $text, $element ) {
507 if( trim( $text ) == '' ) {
512 # Since we quote this later, this can be anything distinguishable
513 # from the end of the attribute
522 $whitelist = array_flip( Sanitizer
::attributeWhitelist( $element ) );
524 foreach( $pairs as $set ) {
525 $attribute = strtolower( $set[1] );
526 if( !isset( $whitelist[$attribute] ) ) {
530 $raw = Sanitizer
::getTagAttributeCallback( $set );
531 $value = Sanitizer
::normalizeAttributeValue( $raw );
533 # Strip javascript "expression" from stylesheets.
534 # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
535 if( $attribute == 'style' && preg_match(
536 '/(expression|tps*:\/\/|url\\s*\().*/is',
537 Sanitizer
::decodeCharReferences( $value ) ) ) {
542 if( !isset( $attribs[$attribute] ) ) {
543 $attribs[$attribute] = "$attribute=\"$value\"";
546 if( empty( $attribs ) ) {
549 return ' ' . implode( ' ', $attribs );
554 * Return an associative array of attribute names and values from
555 * a partial tag string. Attribute names are forced to lowercase,
556 * character references are decoded to UTF-8 text.
561 function decodeTagAttributes( $text ) {
564 if( trim( $text ) == '' ) {
576 foreach( $pairs as $set ) {
577 $attribute = strtolower( $set[1] );
578 $value = Sanitizer
::getTagAttributeCallback( $set );
579 $attribs[$attribute] = Sanitizer
::decodeCharReferences( $value );
585 * Pick the appropriate attribute value from a match set from the
586 * MW_ATTRIBS_REGEX matches.
592 function getTagAttributeCallback( $set ) {
593 if( isset( $set[6] ) ) {
594 # Illegal #XXXXXX color with no quotes.
596 } elseif( isset( $set[5] ) ) {
599 } elseif( isset( $set[4] ) ) {
602 } elseif( isset( $set[3] ) ) {
605 } elseif( !isset( $set[2] ) ) {
606 # In XHTML, attributes must have a value.
607 # For 'reduced' form, return explicitly the attribute name here.
610 wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
615 * Normalize whitespace and character references in an XML source-
616 * encoded text for an attribute value.
618 * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
619 * but note that we're not returning the value, but are returning
620 * XML source fragments that will be slapped into output.
622 * @param string $text
626 function normalizeAttributeValue( $text ) {
627 return str_replace( '"', '&quot;', # the result is later wrapped in double quotes, so a literal '"' must become an entity
629 '/\r\n|[\x20\x0d\x0a\x09]/',
631 Sanitizer
::normalizeCharReferences( $text ) ) );
635 * Ensure that any entities and character references are legal
636 * for XML and XHTML specifically. Any stray bits will be
637 * &-escaped to result in a valid text fragment.
639 * a. any named char refs must be known in XHTML
640 * b. any numeric char refs must be legal chars, not invalid or forbidden
641 * c. use &#x, not &#X
642 * d. fix or reject non-valid attributes
644 * @param string $text
648 function normalizeCharReferences( $text ) {
649 return preg_replace_callback(
651 array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
655 * @param array $matches
658 function normalizeCharReferencesCallback( $matches ) {
660 if( $matches[1] != '' ) {
661 $ret = Sanitizer
::normalizeEntity( $matches[1] );
662 } elseif( $matches[2] != '' ) {
663 $ret = Sanitizer
::decCharReference( $matches[2] );
664 } elseif( $matches[3] != '' ) {
665 $ret = Sanitizer
::hexCharReference( $matches[3] );
666 } elseif( $matches[4] != '' ) {
667 $ret = Sanitizer
::hexCharReference( $matches[4] );
669 if( is_null( $ret ) ) {
670 return htmlspecialchars( $matches[0] );
677 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
678 * return the named entity reference as is. Otherwise, returns
679 * HTML-escaped text of pseudo-entity source (eg &foo;)
681 * @param string $name
684 function normalizeEntity( $name ) {
685 global $wgHtmlEntities;
686 if( isset( $wgHtmlEntities[$name] ) ) {
689 return "&$name;";
693 function decCharReference( $codepoint ) {
694 $point = IntVal( $codepoint );
695 if( Sanitizer
::validateCodepoint( $point ) ) {
696 return sprintf( '&#%d;', $point );
702 function hexCharReference( $codepoint ) {
703 $point = hexdec( $codepoint );
704 if( Sanitizer
::validateCodepoint( $point ) ) {
705 return sprintf( '&#x%x;', $point );
712 * Returns true if a given Unicode codepoint is a valid character in XML.
713 * @param int $codepoint
716 function validateCodepoint( $codepoint ) {
717 return ($codepoint == 0x09)
718 ||
($codepoint == 0x0a)
719 ||
($codepoint == 0x0d)
720 ||
($codepoint >= 0x20 && $codepoint <= 0xd7ff)
721 ||
($codepoint >= 0xe000 && $codepoint <= 0xfffd)
722 ||
($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
726 * Decode any character references, numeric or named entities,
727 * in the text and return a UTF-8 string.
729 * @param string $text
733 function decodeCharReferences( $text ) {
734 return preg_replace_callback(
736 array( 'Sanitizer', 'decodeCharReferencesCallback' ),
741 * @param array $matches
744 function decodeCharReferencesCallback( $matches ) {
745 if( $matches[1] != '' ) {
746 return Sanitizer
::decodeEntity( $matches[1] );
747 } elseif( $matches[2] != '' ) {
748 return Sanitizer
::decodeChar( intval( $matches[2] ) );
749 } elseif( $matches[3] != '' ) {
750 return Sanitizer
::decodeChar( hexdec( $matches[3] ) );
751 } elseif( $matches[4] != '' ) {
752 return Sanitizer
::decodeChar( hexdec( $matches[4] ) );
754 # Last case should be an ampersand by itself
759 * Return UTF-8 string for a codepoint if that is a valid
760 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
761 * @param int $codepoint
765 function decodeChar( $codepoint ) {
766 if( Sanitizer
::validateCodepoint( $codepoint ) ) {
767 return codepointToUtf8( $codepoint );
769 return UTF8_REPLACEMENT
;
774 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
775 * return the UTF-8 encoding of that character. Otherwise, returns
776 * pseudo-entity source (eg &foo;)
778 * @param string $name
781 function decodeEntity( $name ) {
782 global $wgHtmlEntities;
783 if( isset( $wgHtmlEntities[$name] ) ) {
784 return codepointToUtf8( $wgHtmlEntities[$name] );
791 * Fetch the whitelist of acceptable attributes for a given
794 * @param string $element
797 function attributeWhitelist( $element ) {
799 if( !isset( $list ) ) {
800 $list = Sanitizer
::setupAttributeWhitelist();
802 return isset( $list[$element] )
810 function setupAttributeWhitelist() {
811 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
812 $block = array_merge( $common, array( 'align' ) );
813 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
814 $tablecell = array( 'abbr',
820 'nowrap', # deprecated
821 'width', # deprecated
822 'height', # deprecated
823 'bgcolor' # deprecated
826 # Numbers refer to sections in HTML 4.01 standard describing the element.
827 # See: http://www.w3.org/TR/html4/
831 'center' => $common, # deprecated
832 'span' => $block, # ??
861 'blockquote' => array_merge( $common, array( 'cite' ) ),
872 'br' => array( 'id', 'class', 'title', 'style', 'clear' ),
875 'pre' => array_merge( $common, array( 'width' ) ),
878 'ins' => array_merge( $common, array( 'cite', 'datetime' ) ),
879 'del' => array_merge( $common, array( 'cite', 'datetime' ) ),
882 'ul' => array_merge( $common, array( 'type' ) ),
883 'ol' => array_merge( $common, array( 'type', 'start' ) ),
884 'li' => array_merge( $common, array( 'type', 'value' ) ),
892 'table' => array_merge( $common,
893 array( 'summary', 'width', 'border', 'frame',
894 'rules', 'cellspacing', 'cellpadding',
895 'align', 'bgcolor', 'frame', 'rules',
899 'caption' => array_merge( $common, array( 'align' ) ),
902 'thead' => array_merge( $common, $tablealign ),
903 'tfoot' => array_merge( $common, $tablealign ),
904 'tbody' => array_merge( $common, $tablealign ),
907 'colgroup' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
908 'col' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
911 'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ),
914 'td' => array_merge( $common, $tablecell, $tablealign ),
915 'th' => array_merge( $common, $tablecell, $tablealign ),
928 'font' => array_merge( $common, array( 'size', 'color', 'face' ) ),
932 'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
934 # XHTML Ruby annotation text module, simple ruby only.
935 # http://www.w3c.org/TR/ruby/
940 'rt' => $common, #array_merge( $common, array( 'rbspan' ) ),
947 * Take a fragment of (potentially invalid) HTML and return
948 * a version with any tags removed, encoded suitably for literal
949 * inclusion in an attribute value.
951 * @param string $text HTML fragment
954 function stripAllTags( $text ) {
956 $text = preg_replace( '/<[^>]*>/', '', $text );
958 # Normalize &entities and whitespace
959 $text = Sanitizer
::normalizeAttributeValue( $text );
961 # Will be placed into "double-quoted" attributes,
962 # make sure remaining bits are safe.
964 array('<', '>', '"'), # characters that are unsafe inside a double-quoted attribute value
965 array('&lt;', '&gt;', '&quot;'), # their entity replacements, in the same order