/**
 * (X)HTML sanitizer for MediaWiki
 *
 * Copyright (C) 2002-2005 Brion Vibber <brion@pobox.com> et al
 * http://www.mediawiki.org/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 * http://www.gnu.org/copyleft/gpl.html
 */
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments.
 *
 * The text is split on '<'. Each fragment that parses as an allowed tag is
 * re-emitted with its attributes filtered through
 * Sanitizer::fixTagAttributes(); every other fragment has its '<' and '>'
 * escaped to '&lt;' / '&gt;' so it renders as literal text.
 *
 * Globals read:
 *  - $wgUserHtml: when false the allow-lists are empty, so every tag is escaped.
 *  - $wgUseTidy:  when false, tag nesting is tracked here and any tags still
 *                 open at the end are closed; when true, only the allow-list
 *                 and attribute filtering are applied.
 *
 * @param string $text Text that may contain HTML markup
 * @return string Sanitized text
 */
function removeHTMLtags( $text ) {
	global $wgUseTidy, $wgUserHtml;
	$fname = 'Parser::removeHTMLtags';
	wfProfileIn( $fname );

	if ( $wgUserHtml ) {
		$htmlpairs = array( # Tags that must be closed
			'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
		);
		$htmlsingle = array( # Tags that need no closing tag
			'br', 'hr', 'li', 'dt', 'dd'
		);
		$htmlnest = array( # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
		);
		$tabletags = array( # Can only appear inside table
			'td', 'th', 'tr'
		);
	} else {
		$htmlpairs = array();
		$htmlsingle = array();
		$htmlnest = array();
		$tabletags = array();
	}

	# Table cells/rows are never pushed on the tag stack, so treat them as
	# "single" tags for the purposes of nesting checks.
	$htmlsingle = array_merge( $tabletags, $htmlsingle );
	$htmlelements = array_merge( $htmlsingle, $htmlpairs );

	# Remove HTML comments
	$text = Sanitizer::removeHTMLcomments( $text );

	$bits = explode( '<', $text );
	$text = array_shift( $bits ); # Everything before the first '<' is plain text

	# 1: optional '/', 2: tag name, 3: attributes, 4: '>' or '/>', 5: trailing text
	$tagPattern = '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/';

	if ( !$wgUseTidy ) {
		$tagstack = array();
		$tablestack = array();
		foreach ( $bits as $x ) {
			$regs = array();
			if ( preg_match( $tagPattern, $x, $regs ) ) {
				list( , $slash, $t, $params, $brace, $rest ) = $regs;
				$t = strtolower( $t );

				if ( in_array( $t, $htmlelements ) ) {
					$badtag = false;
					$newparams = '';
					# Check our stack
					if ( $slash ) {
						# Closing a tag...
						if ( !in_array( $t, $htmlsingle ) &&
							( $ot = array_pop( $tagstack ) ) != $t ) {
							# Mismatched close: restore whatever we popped
							if ( $ot !== null ) {
								array_push( $tagstack, $ot );
							}
							$badtag = true;
						} elseif ( $t == 'table' ) {
							# Leaving a table: resume the enclosing context
							$tagstack = array_pop( $tablestack );
						}
					} else {
						# Keep track for later
						if ( in_array( $t, $tabletags ) &&
							!in_array( 'table', $tagstack ) ) {
							$badtag = true;
						} elseif ( in_array( $t, $tagstack ) &&
							!in_array( $t, $htmlnest ) ) {
							$badtag = true;
						} elseif ( !in_array( $t, $htmlsingle ) ) {
							if ( $t == 'table' ) {
								# Each table gets a fresh nesting context
								array_push( $tablestack, $tagstack );
								$tagstack = array();
							}
							array_push( $tagstack, $t );
						}
						# Strip non-approved attributes from the tag
						$newparams = Sanitizer::fixTagAttributes( $params );
					}
					if ( !$badtag ) {
						$rest = str_replace( '>', '&gt;', $rest );
						$text .= "<$slash$t $newparams$brace$rest";
						continue;
					}
				}
			}
			# Not a tag, not allowed, or badly nested: show it as literal text
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
		# Close off any remaining tags
		while ( is_array( $tagstack ) && ( $open = array_pop( $tagstack ) ) ) {
			$text .= "</$open>\n";
			if ( $open == 'table' ) {
				$tagstack = array_pop( $tablestack );
			}
		}
	} else {
		# this might be possible using tidy itself
		foreach ( $bits as $x ) {
			$regs = array();
			if ( preg_match( $tagPattern, $x, $regs ) ) {
				list( , $slash, $t, $params, $brace, $rest ) = $regs;
				$t = strtolower( $t );
				if ( in_array( $t, $htmlelements ) ) {
					$newparams = Sanitizer::fixTagAttributes( $params );
					$rest = str_replace( '>', '&gt;', $rest );
					$text .= "<$slash$t $newparams$brace$rest";
					continue;
				}
			}
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
	}
	wfProfileOut( $fname );
	return $text;
}
/**
 * Remove '<!--', '-->', and everything between.
 * To avoid leaving blank lines, when a comment is both preceded
 * and followed by a newline (ignoring spaces), trim leading and
 * trailing spaces and one of the newlines.
 *
 * An unterminated comment (a '<!--' with no later '-->') is left in
 * place, together with everything after it.
 *
 * @param string $text
 * @return string Text with HTML comments removed
 */
function removeHTMLcomments( $text ) {
	$fname = 'Parser::removeHTMLcomments';
	wfProfileIn( $fname );
	while ( ( $start = strpos( $text, '<!--' ) ) !== false ) {
		$end = strpos( $text, '-->', $start + 4 );
		if ( $end === false ) {
			# Unterminated comment; bail out
			break;
		}

		# Point just past the closing '-->'
		$end += 3;

		# Trim space and newline if the comment is both
		# preceded and followed by a newline
		$spaceStart = max( $start - 1, 0 );
		$spaceLen = $end - $spaceStart;
		# Walk left over spaces in front of the comment
		while ( substr( $text, $spaceStart, 1 ) === ' ' && $spaceStart > 0 ) {
			$spaceStart--;
			$spaceLen++;
		}
		# Walk right over spaces behind the comment
		while ( substr( $text, $spaceStart + $spaceLen, 1 ) === ' ' ) {
			$spaceLen++;
		}
		if ( substr( $text, $spaceStart, 1 ) === "\n"
			&& substr( $text, $spaceStart + $spaceLen, 1 ) === "\n" ) {
			# Remove the comment, leading and trailing
			# spaces, and leave only one newline.
			$text = substr_replace( $text, "\n", $spaceStart, $spaceLen + 1 );
		} else {
			# Remove just the comment.
			$text = substr_replace( $text, '', $start, $end - $start );
		}
	}
	wfProfileOut( $fname );
	return $text;
}
/**
 * Return allowed HTML attributes
 *
 * The list is an allow-list of attribute names; anything not named here
 * (event handlers such as onclick, for example) is not included.
 *
 * @return array List of lower-case attribute names
 */
function getHTMLattrs () {
	$htmlattrs = array( # Allowed attributes--no scripting, etc.
		'title', 'align', 'lang', 'dir', 'width', 'height',
		'bgcolor', 'clear', /* BR */ 'noshade', /* HR */
		'cite', /* BLOCKQUOTE, Q */ 'size', 'face', 'color',
		/* FONT */ 'type', 'start', 'value', 'compact',
		/* For various lists, mostly deprecated but safe */
		# 'width' appears a second time below; harmless for membership tests
		'summary', 'width', 'border', 'frame', 'rules',
		'cellspacing', 'cellpadding', 'valign', 'char',
		'charoff', 'colgroup', 'col', 'span', 'abbr', 'axis',
		'headers', 'scope', 'rowspan', 'colspan', /* Tables */
		'id', 'class', 'name', 'style' /* For CSS */
	);
	return $htmlattrs;
}
/**
 * Remove non approved attributes and javascript in css
 *
 * Attributes whose (case-insensitive) name is not in
 * Sanitizer::getHTMLattrs() are dropped. If the remaining text contains a
 * style attribute that looks like it carries script or an external
 * reference, all attributes are dropped.
 *
 * @param string $t Raw attribute text of a tag (everything after the tag name)
 * @return string Filtered attribute text, trimmed; '' if nothing survives
 */
function fixTagAttributes ( $t ) {
	if ( trim( $t ) == '' ) return ''; # Saves runtime ;-)
	$htmlattrs = Sanitizer::getHTMLattrs();

	# Strip non-approved attributes from the tag.
	# 1: attribute name, 3: bare or double-quoted value (absent for
	# valueless attributes). A callback is used rather than the /e
	# modifier, which evaluated attribute text as PHP code and no longer
	# exists as of PHP 7.
	$t = preg_replace_callback(
		'/(\\w+)(\\s*=\\s*([^\\s\">]+|\"[^\">]*\"))?/',
		function ( $m ) use ( $htmlattrs ) {
			if ( !in_array( strtolower( $m[1] ), $htmlattrs ) ) {
				return '';
			}
			$value = isset( $m[3] ) ? $m[3] : '';
			return ( $value !== '' ) ? $m[1] . '=' . $value : $m[1];
		},
		$t );

	$t = str_replace ( '<></>' , '' , $t ) ; # This should fix bug 980557

	# Strip javascript "expression" from stylesheets. Brute force approach:
	# If anything offensive is found, all attributes of the HTML tag are dropped.
	# "tps*://" matches the tail of URL schemes such as http://, https:// and ftp://.
	if ( preg_match(
		'/style\\s*=.*(expression|tps*:\/\/|url\\s*\().*/is',
		wfMungeToUtf8( $t ) ) )
	{
		$t = '';
	}

	return trim( $t );
}