3 * XHTML sanitizer for MediaWiki
5 * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
6 * http://www.mediawiki.org/
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
28 * Regular expression to match various types of character references in
29 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
31 define( 'MW_CHAR_REFS_REGEX',
39 * Regular expression to match HTML/XML attribute pairs within a tag.
40 * Allows some... latitude.
41 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
43 $attrib = '[A-Za-z0-9]';
44 $space = '[\x09\x0a\x0d\x20]';
45 define( 'MW_ATTRIBS_REGEX',
46 "/(?:^|$space)($attrib+)
49 # The attribute value: quoted or alone
52 | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
53 | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
54 # colors are specified like this.
55 # We'll be normalizing it.
57 )?(?=$space|\$)/sx" );
60 * List of all named character entities defined in HTML 4.01
61 * http://www.w3.org/TR/html4/sgml/entities.html
64 global $wgHtmlEntities;
65 $wgHtmlEntities = array(
319 /** @package MediaWiki */
322 * Cleans up HTML, removes dangerous tags and attributes, and
323 * removes HTML comments
325 * @param string $text
326 * @param callback $processCallback to do any variable or parameter replacements in HTML attribute values
327 * @param array $args for the processing callback
330 function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
331 global $wgUseTidy, $wgUserHtml;
332 $fname = 'Parser::removeHTMLtags';
333 wfProfileIn( $fname );
336 $htmlpairs = array( # Tags that must be closed
337 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
338 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
339 'strike', 'strong', 'tt', 'var', 'div', 'center',
340 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
341 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
344 'br', 'hr', 'li', 'dt', 'dd'
346 $htmlsingleonly = array( # Elements that cannot have close tags
349 $htmlnest = array( # Tags that can be nested--??
350 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
351 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
353 $tabletags = array( # Can only appear inside table
357 $htmlpairs = array();
358 $htmlsingle = array();
360 $tabletags = array();
363 $htmlsingle = array_merge( $tabletags, $htmlsingle );
364 $htmlelements = array_merge( $htmlsingle, $htmlpairs );
366 # Remove HTML comments
367 $text = Sanitizer
::removeHTMLcomments( $text );
368 $bits = explode( '<', $text );
369 $text = array_shift( $bits );
371 $tagstack = array(); $tablestack = array();
372 foreach ( $bits as $x ) {
373 $prev = error_reporting( E_ALL
& ~
( E_NOTICE | E_WARNING
) );
374 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
376 list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
377 error_reporting( $prev );
380 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
384 if( in_array( $t, $htmlsingleonly ) ) {
386 } elseif ( ( $ot = @array_pop
( $tagstack ) ) != $t ) {
387 @array_push
( $tagstack, $ot );
390 if ( $t == 'table' ) {
391 $tagstack = array_pop( $tablestack );
396 # Keep track for later
397 if ( in_array( $t, $tabletags ) &&
398 ! in_array( 'table', $tagstack ) ) {
400 } else if ( in_array( $t, $tagstack ) &&
401 ! in_array ( $t , $htmlnest ) ) {
403 # Is it a self closed htmlpair ? (bug 5487)
404 } else if( $brace == '/>' &&
405 in_array($t, $htmlpairs) ) {
407 } elseif( in_array( $t, $htmlsingleonly ) ) {
408 # Hack to force empty tag for uncloseable elements
410 } else if( in_array( $t, $htmlsingle ) ) {
411 # Hack to not close $htmlsingle tags
414 if ( $t == 'table' ) {
415 array_push( $tablestack, $tagstack );
418 array_push( $tagstack, $t );
421 # Replace any variables or template parameters with
423 if( is_callable( $processCallback ) ) {
424 call_user_func_array( $processCallback, array( &$params, $args ) );
427 # Strip non-approved attributes from the tag
428 $newparams = Sanitizer
::fixTagAttributes( $params, $t );
431 $rest = str_replace( '>', '>', $rest );
432 $close = ( $brace == '/>' ) ?
' /' : '';
433 $text .= "<$slash$t$newparams$close>$rest";
437 $text .= '<' . str_replace( '>', '>', $x);
439 # Close off any remaining tags
440 while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
442 if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
445 # this might be possible using tidy itself
446 foreach ( $bits as $x ) {
447 preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
449 @list
( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
450 if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
451 if( is_callable( $processCallback ) ) {
452 call_user_func_array( $processCallback, array( &$params, $args ) );
454 $newparams = Sanitizer
::fixTagAttributes( $params, $t );
455 $rest = str_replace( '>', '>', $rest );
456 $text .= "<$slash$t$newparams$brace$rest";
458 $text .= '<' . str_replace( '>', '>', $x);
462 wfProfileOut( $fname );
467 * Remove '<!--', '-->', and everything between.
468 * To avoid leaving blank lines, when a comment is both preceded
469 * and followed by a newline (ignoring spaces), trim leading and
470 * trailing spaces and one of the newlines.
473 * @param string $text
476 function removeHTMLcomments( $text ) {
477 $fname='Parser::removeHTMLcomments';
478 wfProfileIn( $fname );
479 while (($start = strpos($text, '<!--')) !== false) {
480 $end = strpos($text, '-->', $start +
4);
481 if ($end === false) {
482 # Unterminated comment; bail out
488 # Trim space and newline if the comment is both
489 # preceded and followed by a newline
490 $spaceStart = max($start - 1, 0);
491 $spaceLen = $end - $spaceStart;
492 while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
496 while (substr($text, $spaceStart +
$spaceLen, 1) === ' ')
498 if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart +
$spaceLen, 1) === "\n") {
499 # Remove the comment, leading and trailing
500 # spaces, and leave only one newline.
501 $text = substr_replace($text, "\n", $spaceStart, $spaceLen +
1);
504 # Remove just the comment.
505 $text = substr_replace($text, '', $start, $end - $start);
508 wfProfileOut( $fname );
513 * Take a tag soup fragment listing an HTML element's attributes
514 * and normalize it to well-formed XML, discarding unwanted attributes.
516 * - Normalizes attribute names to lowercase
517 * - Discards attributes not on a whitelist for the given element
518 * - Turns broken or invalid entities into plaintext
519 * - Double-quotes all attribute values
520 * - Attributes without values are given the name as attribute
521 * - Double attributes are discarded
522 * - Unsafe style attributes are discarded
523 * - Prepends space if there are attributes.
525 * @param string $text
526 * @param string $element
529 * @todo Check for legal values where the DTD limits things.
530 * @todo Check for unique id attribute :P
532 function fixTagAttributes( $text, $element ) {
533 if( trim( $text ) == '' ) {
538 # Since we quote this later, this can be anything distinguishable
539 # from the end of the attribute
549 $whitelist = array_flip( Sanitizer
::attributeWhitelist( $element ) );
551 foreach( $pairs as $set ) {
552 $attribute = strtolower( $set[1] );
553 if( !isset( $whitelist[$attribute] ) ) {
557 $raw = Sanitizer
::getTagAttributeCallback( $set );
558 $value = Sanitizer
::normalizeAttributeValue( $raw );
560 # Strip javascript "expression" from stylesheets.
561 # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
562 if( $attribute == 'style' ) {
563 $stripped = Sanitizer
::decodeCharReferences( $value );
565 // Remove any comments; IE gets token splitting wrong
566 $stripped = preg_replace( '!/\\*.*?\\*/!S', ' ', $stripped );
567 $value = htmlspecialchars( $stripped );
569 // ... and continue checks
570 $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
571 'codepointToUtf8(hexdec("$1"))', $stripped );
572 $stripped = str_replace( '\\', '', $stripped );
573 if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
580 if ( $attribute === 'id' )
581 $value = Sanitizer
::escapeId( $value );
583 # Templates and links may be expanded in later parsing,
584 # creating invalid or dangerous output. Suppress this.
585 $value = strtr( $value, array(
586 '<' => '<', // This should never happen,
587 '>' => '>', // we've received invalid input
588 '"' => '"', // which should have been escaped.
591 "''" => '''',
592 'ISBN' => 'ISBN',
594 'PMID' => 'PMID',
598 $value = preg_replace_callback(
599 '/(' . wfUrlProtocols() . ')/',
600 array( 'Sanitizer', 'armorLinksCallback' ),
603 // If this attribute was previously set, override it.
604 // Output should only have one attribute of each name.
605 $attribs[$attribute] = "$attribute=\"$value\"";
608 return count( $attribs ) ?
' ' . implode( ' ', $attribs ) : '';
612 * Given a value, escape it so that it can be used in an id attribute and
613 * return it. This does not validate the value, however (see first link).
615 * @link http://www.w3.org/TR/html401/types.html#type-name Valid characters
618 * @link http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with the id attribute
627 function escapeId( $id ) {
628 static $replace = array(
633 $id = urlencode( Sanitizer
::decodeCharReferences( strtr( $id, ' ', '_' ) ) );
635 return str_replace( array_keys( $replace ), array_values( $replace ), $id );
/**
 * Regex replace callback for armoring links against further processing.
 *
 * Every colon in the captured text is replaced by the numeric character
 * reference &#58; so the text no longer contains a literal "protocol:"
 * sequence.
 *
 * @param array $matches Match set handed over by preg_replace_callback();
 *                       only the first capture group, $matches[1], is read
 * @return string $matches[1] with each ':' replaced by '&#58;'
 */
function armorLinksCallback( $matches ) {
	# The replacement has to be the character reference, not a literal
	# colon: replacing ':' with ':' would leave the input unchanged.
	return str_replace( ':', '&#58;', $matches[1] );
}
649 * Return an associative array of attribute names and values from
650 * a partial tag string. Attribute names are forced to lowercase,
651 * character references are decoded to UTF-8 text.
656 function decodeTagAttributes( $text ) {
659 if( trim( $text ) == '' ) {
672 foreach( $pairs as $set ) {
673 $attribute = strtolower( $set[1] );
674 $value = Sanitizer
::getTagAttributeCallback( $set );
675 $attribs[$attribute] = Sanitizer
::decodeCharReferences( $value );
681 * Pick the appropriate attribute value from a match set from the
682 * MW_ATTRIBS_REGEX matches.
688 function getTagAttributeCallback( $set ) {
689 if( isset( $set[6] ) ) {
690 # Illegal #XXXXXX color with no quotes.
692 } elseif( isset( $set[5] ) ) {
695 } elseif( isset( $set[4] ) ) {
698 } elseif( isset( $set[3] ) ) {
701 } elseif( !isset( $set[2] ) ) {
702 # In XHTML, attributes must have a value.
703 # For 'reduced' form, return explicitly the attribute name here.
706 wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
711 * Normalize whitespace and character references in an XML source-
712 * encoded text for an attribute value.
714 * See http://www.w3.org/TR/REC-xml/#AVNormalize for background,
715 * but note that we're not returning the value, but are returning
716 * XML source fragments that will be slapped into output.
718 * @param string $text
722 function normalizeAttributeValue( $text ) {
723 return str_replace( '"', '"',
725 '/\r\n|[\x20\x0d\x0a\x09]/',
727 Sanitizer
::normalizeCharReferences( $text ) ) );
731 * Ensure that any entities and character references are legal
732 * for XML and XHTML specifically. Any stray bits will be
733 * &-escaped to result in a valid text fragment.
735 * a. any named char refs must be known in XHTML
736 * b. any numeric char refs must be legal chars, not invalid or forbidden
737 * c. use &#x, not &#X
738 * d. fix or reject non-valid attributes
740 * @param string $text
744 function normalizeCharReferences( $text ) {
745 return preg_replace_callback(
747 array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
/**
 * Callback that normalizes a single character-reference match.
 *
 * The capture groups are tried in order:
 *  - $matches[1]: named entity, passed to Sanitizer::normalizeEntity()
 *  - $matches[2]: decimal reference, passed to Sanitizer::decCharReference()
 *  - $matches[3], $matches[4]: hexadecimal reference, passed to
 *    Sanitizer::hexCharReference()
 * If none of the groups is non-empty, or the chosen helper returns null,
 * the whole match is returned HTML-escaped instead.
 *
 * @param array $matches Match set; indexes 0 through 4 are read
 * @return string Normalized reference, or htmlspecialchars( $matches[0] )
 */
function normalizeCharReferencesCallback( $matches ) {
	# Stays null when no group applies or the reference is rejected.
	$ret = null;
	if( $matches[1] != '' ) {
		$ret = Sanitizer::normalizeEntity( $matches[1] );
	} elseif( $matches[2] != '' ) {
		$ret = Sanitizer::decCharReference( $matches[2] );
	} elseif( $matches[3] != '' ) {
		$ret = Sanitizer::hexCharReference( $matches[3] );
	} elseif( $matches[4] != '' ) {
		$ret = Sanitizer::hexCharReference( $matches[4] );
	}
	if( is_null( $ret ) ) {
		# Nothing usable: emit the original source text, escaped.
		return htmlspecialchars( $matches[0] );
	} else {
		return $ret;
	}
}
/**
 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 * return the named entity reference as is. Otherwise, returns
 * HTML-escaped text of pseudo-entity source (eg &amp;foo;)
 *
 * An entity counts as defined when its name is a key of the global
 * $wgHtmlEntities array.
 *
 * @param string $name Entity name without the leading '&' and trailing ';'
 * @return string "&$name;" for a known entity, "&amp;$name;" otherwise
 */
function normalizeEntity( $name ) {
	global $wgHtmlEntities;
	if( isset( $wgHtmlEntities[$name] ) ) {
		return "&$name;";
	} else {
		# Unknown name: escape the ampersand so the text is output
		# literally instead of as an undefined entity reference.
		return "&amp;$name;";
	}
}
/**
 * Normalize a decimal numeric character reference.
 *
 * @param string $codepoint Digits of the reference; converted with intval()
 * @return string|null '&#N;' with N in decimal when
 *                     Sanitizer::validateCodepoint() accepts the value,
 *                     null otherwise
 */
function decCharReference( $codepoint ) {
	$point = intval( $codepoint );
	if( Sanitizer::validateCodepoint( $point ) ) {
		return sprintf( '&#%d;', $point );
	} else {
		# Rejected code point; callers test the result with is_null().
		return null;
	}
}
/**
 * Normalize a hexadecimal numeric character reference.
 *
 * The output always uses the lower-case '&#x' prefix and lower-case
 * hexadecimal digits.
 *
 * @param string $codepoint Hex digits of the reference; converted with hexdec()
 * @return string|null '&#xN;' with N in lower-case hex when
 *                     Sanitizer::validateCodepoint() accepts the value,
 *                     null otherwise
 */
function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );
	if( Sanitizer::validateCodepoint( $point ) ) {
		return sprintf( '&#x%x;', $point );
	} else {
		# Rejected code point; callers test the result with is_null().
		return null;
	}
}
/**
 * Returns true if a given Unicode codepoint is a valid character in XML.
 *
 * Accepted values are U+0009, U+000A, U+000D and the ranges
 * U+0020..U+D7FF, U+E000..U+FFFD and U+10000..U+10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
function validateCodepoint( $codepoint ) {
	return ($codepoint == 0x09)
		|| ($codepoint == 0x0a)
		|| ($codepoint == 0x0d)
		|| ($codepoint >= 0x20 && $codepoint <= 0xd7ff)
		|| ($codepoint >= 0xe000 && $codepoint <= 0xfffd)
		|| ($codepoint >= 0x10000 && $codepoint <= 0x10ffff);
}
822 * Decode any character references, numeric or named entities,
823 * in the text and return a UTF-8 string.
825 * @param string $text
829 function decodeCharReferences( $text ) {
830 return preg_replace_callback(
832 array( 'Sanitizer', 'decodeCharReferencesCallback' ),
/**
 * Callback that decodes a single character-reference match.
 *
 * The capture groups are tried in order:
 *  - $matches[1]: named entity, passed to Sanitizer::decodeEntity()
 *  - $matches[2]: decimal reference, converted with intval()
 *  - $matches[3], $matches[4]: hexadecimal reference, converted with hexdec()
 * Numeric values are passed to Sanitizer::decodeChar().
 *
 * @param array $matches Match set; indexes 0 through 4 are read
 * @return string Decoded text, or $matches[0] unchanged when none of the
 *                four groups is non-empty
 */
function decodeCharReferencesCallback( $matches ) {
	if( $matches[1] != '' ) {
		return Sanitizer::decodeEntity( $matches[1] );
	} elseif( $matches[2] != '' ) {
		return Sanitizer::decodeChar( intval( $matches[2] ) );
	} elseif( $matches[3] != '' ) {
		return Sanitizer::decodeChar( hexdec( $matches[3] ) );
	} elseif( $matches[4] != '' ) {
		return Sanitizer::decodeChar( hexdec( $matches[4] ) );
	} else {
		# Last case should be an ampersand by itself
		return $matches[0];
	}
}
/**
 * Return UTF-8 string for a codepoint if that is a valid
 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 *
 * Validity is decided by Sanitizer::validateCodepoint(); the encoding is
 * done by codepointToUtf8(), and UTF8_REPLACEMENT is the fallback value.
 * Both of those names are defined outside this section of the file.
 *
 * @param int $codepoint
 * @return string
 */
function decodeChar( $codepoint ) {
	if( Sanitizer::validateCodepoint( $codepoint ) ) {
		return codepointToUtf8( $codepoint );
	} else {
		return UTF8_REPLACEMENT;
	}
}
/**
 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 * return the UTF-8 encoding of that character. Otherwise, returns
 * pseudo-entity source (eg &foo;)
 *
 * An entity counts as defined when its name is a key of the global
 * $wgHtmlEntities array; the stored value is handed to codepointToUtf8().
 *
 * @param string $name Entity name without the leading '&' and trailing ';'
 * @return string
 */
function decodeEntity( $name ) {
	global $wgHtmlEntities;
	if( isset( $wgHtmlEntities[$name] ) ) {
		return codepointToUtf8( $wgHtmlEntities[$name] );
	} else {
		# Unknown name: hand back the source text untouched.
		return "&$name;";
	}
}
887 * Fetch the whitelist of acceptable attributes for a given
890 * @param string $element
893 function attributeWhitelist( $element ) {
895 if( !isset( $list ) ) {
896 $list = Sanitizer
::setupAttributeWhitelist();
898 return isset( $list[$element] )
906 function setupAttributeWhitelist() {
907 $common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
908 $block = array_merge( $common, array( 'align' ) );
909 $tablealign = array( 'align', 'char', 'charoff', 'valign' );
910 $tablecell = array( 'abbr',
916 'nowrap', # deprecated
917 'width', # deprecated
918 'height', # deprecated
919 'bgcolor' # deprecated
922 # Numbers refer to sections in HTML 4.01 standard describing the element.
923 # See: http://www.w3.org/TR/html4/
927 'center' => $common, # deprecated
928 'span' => $block, # ??
957 'blockquote' => array_merge( $common, array( 'cite' ) ),
968 'br' => array( 'id', 'class', 'title', 'style', 'clear' ),
971 'pre' => array_merge( $common, array( 'width' ) ),
974 'ins' => array_merge( $common, array( 'cite', 'datetime' ) ),
975 'del' => array_merge( $common, array( 'cite', 'datetime' ) ),
978 'ul' => array_merge( $common, array( 'type' ) ),
979 'ol' => array_merge( $common, array( 'type', 'start' ) ),
980 'li' => array_merge( $common, array( 'type', 'value' ) ),
988 'table' => array_merge( $common,
989 array( 'summary', 'width', 'border', 'frame',
990 'rules', 'cellspacing', 'cellpadding',
991 'align', 'bgcolor', 'frame', 'rules',
995 'caption' => array_merge( $common, array( 'align' ) ),
998 'thead' => array_merge( $common, $tablealign ),
999 'tfoot' => array_merge( $common, $tablealign ),
1000 'tbody' => array_merge( $common, $tablealign ),
1003 'colgroup' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1004 'col' => array_merge( $common, array( 'span', 'width' ), $tablealign ),
1007 'tr' => array_merge( $common, array( 'bgcolor' ), $tablealign ),
1010 'td' => array_merge( $common, $tablecell, $tablealign ),
1011 'th' => array_merge( $common, $tablecell, $tablealign ),
1019 'strike' => $common,
1024 'font' => array_merge( $common, array( 'size', 'color', 'face' ) ),
1028 'hr' => array_merge( $common, array( 'noshade', 'size', 'width' ) ),
1030 # XHTML Ruby annotation text module, simple ruby only.
1031 # http://www.w3c.org/TR/ruby/
1036 'rt' => $common, #array_merge( $common, array( 'rbspan' ) ),
1043 * Take a fragment of (potentially invalid) HTML and return
1044 * a version with any tags removed, encoded suitably for literal
1045 * inclusion in an attribute value.
1047 * @param string $text HTML fragment
1050 function stripAllTags( $text ) {
1052 $text = preg_replace( '/ < .*? > /x', '', $text );
1054 # Normalize &entities and whitespace
1055 $text = Sanitizer
::normalizeAttributeValue( $text );
1057 # Will be placed into "double-quoted" attributes,
1058 # make sure remaining bits are safe.
1059 $text = str_replace(
1060 array('<', '>', '"'),
1061 array('<', '>', '"'),
1068 * Hack up a private DOCTYPE with HTML's standard entity declarations.
1069 * PHP 4 seemed to know these if you gave it an HTML doctype, but
1072 * Use for passing XHTML fragments to PHP's XML parsing functions
1077 function hackDocType() {
1078 global $wgHtmlEntities;
1079 $out = "<!DOCTYPE html [\n";
1080 foreach( $wgHtmlEntities as $entity => $codepoint ) {
1081 $out .= "<!ENTITY $entity \"&#$codepoint;\">";