4 * (X)HTML sanitizer for MediaWiki
6 * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
7 * http://www.mediawiki.org/
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
22 * http://www.gnu.org/copyleft/gpl.html
29 * Regular expression to match various types of character references in
30 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
32 define( 'MW_CHAR_REFS_REGEX',
40 * Regular expression to match HTML/XML attribute pairs within a tag.
41 * Allows some... latitude.
42 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
44 $attrib = '[A-Za-z0-9]';
45 $space = '[\x09\x0a\x0d\x20]';
46 define( 'MW_ATTRIBS_REGEX',
47 "/(?:^|$space)($attrib+)
50 # The attribute value: quoted or alone
53 | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
54 | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
55 # colors are specified like this.
56 # We'll be normalizing it.
58 )?(?=$space|\$)/sx" );
61 * List of all named character entities defined in HTML 4.01
62 * http://www.w3.org/TR/html4/sgml/entities.html
65 global $wgHtmlEntities;
66 $wgHtmlEntities = array(
322 * Cleans up HTML, removes dangerous tags and attributes, and
323 * removes HTML comments
325 * @param string $text
328 function removeHTMLtags( $text ) {
329 global $wgUseTidy, $wgUserHtml;
330 $fname = 'Parser::removeHTMLtags';
331 wfProfileIn( $fname );
334 $htmlpairs = array( # Tags that must be closed
335 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
336 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
337 'strike', 'strong', 'tt', 'var', 'div', 'center',
338 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
339 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
342 'br', 'hr', 'li', 'dt', 'dd'
344 $htmlnest = array( # Tags that can be nested--??
345 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
346 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
348 $tabletags = array( # Can only appear inside table
352 $htmlpairs = array();
353 $htmlsingle = array();
355 $tabletags = array();
358 $htmlsingle = array_merge( $tabletags, $htmlsingle );
359 $htmlelements = array_merge( $htmlsingle, $htmlpairs );
361 # Remove HTML comments
362 $text = Sanitizer
::removeHTMLcomments( $text );
364 $bits = explode( '<', $text );
365 $text = array_shift( $bits );
367 $tagstack = array(); $tablestack = array();
368 foreach ( $bits as $x ) {
369 $prev = error_reporting( E_ALL
& ~
( E_NOTICE | E_WARNING
) );
370 preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
372 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
373 error_reporting( $prev );
376 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
380 if ( ! in_array( $t, $htmlsingle ) &&
381 ( $ot = @array_pop
( $tagstack ) ) != $t ) {
382 @array_push
( $tagstack, $ot );
385 if ( $t == 'table' ) {
386 $tagstack = array_pop( $tablestack );
391 # Keep track for later
392 if ( in_array( $t, $tabletags ) &&
393 ! in_array( 'table', $tagstack ) ) {
395 } else if ( in_array( $t, $tagstack ) &&
396 ! in_array ( $t , $htmlnest ) ) {
398 } else if ( ! in_array( $t, $htmlsingle ) ) {
399 if ( $t == 'table' ) {
400 array_push( $tablestack, $tagstack );
403 array_push( $tagstack, $t );
405 # Strip non-approved attributes from the tag
406 $newparams = Sanitizer
::fixTagAttributes( $params, $t );
409 $rest = str_replace( '>', '>', $rest );
410 $text .= "<$slash$t$newparams$brace$rest";
414 $text .= '<' . str_replace( '>', '>', $x);
416 # Close off any remaining tags
417 while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
419 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
422 # this might be possible using tidy itself
423 foreach ( $bits as $x ) {
424 preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
426 @list
( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
427 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
428 $newparams = Sanitizer
::fixTagAttributes( $params, $t );
429 $rest = str_replace( '>', '>', $rest );
430 $text .= "<$slash$t$newparams$brace$rest";
432 $text .= '<' . str_replace( '>', '>', $x);
436 wfProfileOut( $fname );
441 * Remove '<!--', '-->', and everything between.
442 * To avoid leaving blank lines, when a comment is both preceded
443 * and followed by a newline (ignoring spaces), trim leading and
444 * trailing spaces and one of the newlines.
447 * @param string $text
450 function removeHTMLcomments( $text ) {
451 $fname='Parser::removeHTMLcomments';
452 wfProfileIn( $fname );
453 while (($start = strpos($text, '<!--')) !== false) {
454 $end = strpos($text, '-->', $start +
4);
455 if ($end === false) {
456 # Unterminated comment; bail out
462 # Trim space and newline if the comment is both
463 # preceded and followed by a newline
464 $spaceStart = max($start - 1, 0);
465 $spaceLen = $end - $spaceStart;
466 while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
470 while (substr($text, $spaceStart +
$spaceLen, 1) === ' ')
472 if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart +
$spaceLen, 1) === "\n") {
473 # Remove the comment, leading and trailing
474 # spaces, and leave only one newline.
475 $text = substr_replace($text, "\n", $spaceStart, $spaceLen +
1);
478 # Remove just the comment.
479 $text = substr_replace($text, '', $start, $end - $start);
482 wfProfileOut( $fname );
487 * Take a tag soup fragment listing an HTML element's attributes
488 * and normalize it to well-formed XML, discarding unwanted attributes.
490 * - Normalizes attribute names to lowercase
491 * - Discards attributes not on a whitelist for the given element
492 * - Turns broken or invalid entities into plaintext
493 * - Double-quotes all attribute values
494 * - Attributes without values are given the attribute name as their value
495 * - Double attributes are discarded
496 * - Unsafe style attributes are discarded
497 * - Prepends space if there are attributes.
499 * @param string $text
500 * @param string $element
503 * @todo Check for legal values where the DTD limits things.
504 * @todo Check for unique id attribute :P
506 function fixTagAttributes( $text, $element ) {
507 if( trim( $text ) == '' ) {
512 # Since we quote this later, this can be anything distinguishable
513 # from the end of the attribute
522 $whitelist = array_flip( Sanitizer
::attributeWhitelist( $element ) );
524 foreach( $pairs as $set ) {
525 $attribute = strtolower( $set[1] );
526 if( !isset( $whitelist[$attribute] ) ) {
530 $raw = Sanitizer
::getTagAttributeCallback( $set );
531 $value = Sanitizer
::normalizeAttributeValue( $raw );
533 # Strip javascript "expression" from stylesheets.
534 # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
535 if( $attribute == 'style' && preg_match(
536 '/(expression|tps*:\/\/|url\\s*\().*/is',
537 Sanitizer
::decodeCharReferences( $value ) ) ) {
542 # Templates and links may be expanded in later parsing,
543 # creating invalid or dangerous output. Suppress this.
544 $value = strtr( $value, array(
547 "''" => '''',
548 'ISBN' => 'ISBN',
550 'PMID' => 'PMID',
552 $value = preg_replace(
553 '/(' . URL_PROTOCOLS
. '):/',
554 '\\1:', $value );
556 if( !isset( $attribs[$attribute] ) ) {
557 $attribs[$attribute] = "$attribute=\"$value\"";
560 if( empty( $attribs ) ) {
563 return ' ' . implode( ' ', $attribs );
568 * Return an associative array of attribute names and values from
569 * a partial tag string. Attribute names are forced to lowercase,
570 * character references are decoded to UTF-8 text.
575 function decodeTagAttributes( $text ) {
578 if( trim( $text ) == '' ) {
590 foreach( $pairs as $set ) {
591 $attribute = strtolower( $set[1] );
592 $value = Sanitizer
::getTagAttributeCallback( $set );
593 $attribs[$attribute] = Sanitizer
::decodeCharReferences( $value );
599 * Pick the appropriate attribute value from a match set from the
600 * MW_ATTRIBS_REGEX matches.
606 function getTagAttributeCallback( $set ) {
607 if( isset( $set[6] ) ) {
608 # Illegal #XXXXXX color with no quotes.
610 } elseif( isset( $set[5] ) ) {
613 } elseif( isset( $set[4] ) ) {
616 } elseif( isset( $set[3] ) ) {
619 } elseif( !isset( $set[2] ) ) {
620 # In XHTML, attributes must have a value.
621 # For 'reduced' form, return explicitly the attribute name here.
624 wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
629 * Normalize whitespace and character references in an XML source-
630 * encoded text for an attribute value.
632 * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
633 * but note that we're not returning the value, but are returning
634 * XML source fragments that will be slapped into output.
636 * @param string $text
640 function normalizeAttributeValue( $text ) {
641 return str_replace( '"', '"',
643 '/\r\n|[\x20\x0d\x0a\x09]/',
645 Sanitizer
::normalizeCharReferences( $text ) ) );
649 * Ensure that any entities and character references are legal
650 * for XML and XHTML specifically. Any stray bits will be
651 * &-escaped to result in a valid text fragment.
653 * a. any named char refs must be known in XHTML
654 * b. any numeric char refs must be legal chars, not invalid or forbidden
655 * c. use &#x, not &#X
656 * d. fix or reject non-valid attributes
658 * @param string $text
662 function normalizeCharReferences( $text ) {
663 return preg_replace_callback(
665 array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
669 * @param array $matches
672 function normalizeCharReferencesCallback( $matches ) {
674 if( $matches[1] != '' ) {
675 $ret = Sanitizer
::normalizeEntity( $matches[1] );
676 } elseif( $matches[2] != '' ) {
677 $ret = Sanitizer
::decCharReference( $matches[2] );
678 } elseif( $matches[3] != '' ) {
679 $ret = Sanitizer
::hexCharReference( $matches[3] );
680 } elseif( $matches[4] != '' ) {
681 $ret = Sanitizer
::hexCharReference( $matches[4] );
683 if( is_null( $ret ) ) {
684 return htmlspecialchars( $matches[0] );
691 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
692 * return the named entity reference as is. Otherwise, returns
693 * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
695 * @param string $name
698 function normalizeEntity( $name ) {
699 global $wgHtmlEntities;
700 if( isset( $wgHtmlEntities[$name] ) ) {
703 return "&$name;";
707 function decCharReference( $codepoint ) {
708 $point = IntVal( $codepoint );
709 if( Sanitizer
::validateCodepoint( $point ) ) {
710 return sprintf( '&#%d;', $point );
716 function hexCharReference( $codepoint ) {
717 $point = hexdec( $codepoint );
718 if( Sanitizer
::validateCodepoint( $point ) ) {
719 return sprintf( '&#x%x;', $point );
726 * Returns true if a given Unicode codepoint is a valid character in XML.
727 * @param int $codepoint
730 function validateCodepoint( $codepoint ) {
731 return ($codepoint == 0x09)
732 ||
($codepoint == 0x0a)
733 ||
($codepoint == 0x0d)
734 ||
($codepoint >= 0x20 && $codepoint <= 0xd7ff)
735 ||
($codepoint >= 0xe000 && $codepoint <= 0xfffd)
736 ||
($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
740 * Decode any character references, numeric or named entities,
741 * in the text and return a UTF-8 string.
743 * @param string $text
747 function decodeCharReferences( $text ) {
748 return preg_replace_callback(
750 array( 'Sanitizer', 'decodeCharReferencesCallback' ),
755 * @param array $matches
758 function decodeCharReferencesCallback( $matches ) {
759 if( $matches[1] != '' ) {
760 return Sanitizer
::decodeEntity( $matches[1] );
761 } elseif( $matches[2] != '' ) {
762 return Sanitizer
::decodeChar( intval( $matches[2] ) );
763 } elseif( $matches[3] != '' ) {
764 return Sanitizer
::decodeChar( hexdec( $matches[3] ) );
765 } elseif( $matches[4] != '' ) {
766 return Sanitizer
::decodeChar( hexdec( $matches[4] ) );
768 # Last case should be an ampersand by itself
773 * Return UTF-8 string for a codepoint if that is a valid
774 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
775 * @param int $codepoint
779 function decodeChar( $codepoint ) {
780 if( Sanitizer
::validateCodepoint( $codepoint ) ) {
781 return codepointToUtf8( $codepoint );
783 return UTF8_REPLACEMENT
;
788 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
789 * return the UTF-8 encoding of that character. Otherwise, returns
790 * pseudo-entity source (eg &foo;)
792 * @param string $name
795 function decodeEntity( $name ) {
796 global $wgHtmlEntities;
797 if( isset( $wgHtmlEntities[$name] ) ) {
798 return codepointToUtf8( $wgHtmlEntities[$name] );
805 * Fetch the whitelist of acceptable attributes for a given element.
808 * @param string $element
811 function attributeWhitelist( $element ) {
813 if( !isset( $list ) ) {
814 $list = Sanitizer
::setupAttributeWhitelist();
816 return isset( $list[$element] )
824 function setupAttributeWhitelist() {
825 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
826 $block = array_merge( $common, array( 'align' ) );
827 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
828 $tablecell = array( 'abbr',
834 'nowrap', # deprecated
835 'width', # deprecated
836 'height', # deprecated
837 'bgcolor' # deprecated
840 # Numbers refer to sections in HTML 4.01 standard describing the element.
841 # See: http://www.w3.org/TR/html4/
845 'center' => $common, # deprecated
846 'span' => $block, # ??
875 'blockquote' => array_merge( $common, array( 'cite' ) ),
886 'br' => array( 'id', 'class', 'title', 'style', 'clear' ),
889 'pre' => array_merge( $common, array( 'width' ) ),
892 'ins' => array_merge( $common, array( 'cite', 'datetime' ) ),
893 'del' => array_merge( $common, array( 'cite', 'datetime' ) ),
896 'ul' => array_merge( $common, array( 'type' ) ),
897 'ol' => array_merge( $common, array( 'type', 'start' ) ),
898 'li' => array_merge( $common, array( 'type', 'value' ) ),
906 'table' => array_merge( $common,
907 array( 'summary', 'width', 'border', 'frame',
908 'rules', 'cellspacing', 'cellpadding',
909 'align', 'bgcolor', 'frame', 'rules',
913 'caption' => array_merge( $common, array( 'align' ) ),
916 'thead' => array_merge( $common, $tablealign ),
917 'tfoot' => array_merge( $common, $tablealign ),
918 'tbody' => array_merge( $common, $tablealign ),
921 'colgroup' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
922 'col' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
925 'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ),
928 'td' => array_merge( $common, $tablecell, $tablealign ),
929 'th' => array_merge( $common, $tablecell, $tablealign ),
942 'font' => array_merge( $common, array( 'size', 'color', 'face' ) ),
946 'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
948 # XHTML Ruby annotation text module, simple ruby only.
949 # http://www.w3c.org/TR/ruby/
954 'rt' => $common, #array_merge( $common, array( 'rbspan' ) ),
961 * Take a fragment of (potentially invalid) HTML and return
962 * a version with any tags removed, encoded suitably for literal
963 * inclusion in an attribute value.
965 * @param string $text HTML fragment
968 function stripAllTags( $text ) {
970 $text = preg_replace( '/<[^>]*>/', '', $text );
972 # Normalize &entities and whitespace
973 $text = Sanitizer
::normalizeAttributeValue( $text );
975 # Will be placed into "double-quoted" attributes,
976 # make sure remaining bits are safe.
978 array('<', '>', '"'),
979 array('<', '>', '"'),