* Reordered wiki table handling and __TOC__ extraction in the parser to better handle...
[lhc/web/wiklou.git] / maintenance / wiki-mangleme.php
1 <?php
2 /**
3
4 Author : Nick Jenkins, http://nickj.org/
5 Date : 18 May 2006.
6 License: GPL v 2.
7
8 Desc:
9 Performs fuzz-style testing of MediaWiki's parser.
10 The script feeds the parser some randomized malformed wiki-text, and stores
11 the HTML output.
12
13 Checks the HTML output for:
14 - unclosed tags
15 - errors in Tidy
16 both can indicate potential security issues.
17
18 Can optionally W3C-validate the HTML output (a validation failure indicates
19 malformed HTML output).
20
21 Background:
22 Contains a PHP port of a "shameless" Python port of lcamtuf's mangleme:
23 http://www.securiteam.com/tools/6Z00N1PBFK.html
24
25 Requirements:
26 You need PHP4 or PHP5, with PHP-curl enabled, and Tidy installed.
27
28 Usage:
29 Update the "Configuration" section, especially the "WIKI_URL" to point
30 to a local wiki you can test stuff on. You can optionally set
31 "VALIDATE_ON_WEB" to true, although at the moment very few generated pages
32 will validate. Then run "php wiki-mangleme.php".
33
34 This will print a list of HTML output that had unclosed tags, and/or that
35 caused tidy errors. It will keep running until you press Ctrl-C. All output
36 files are stored in the "mangleme" subdirectory.
37 */
38
// Command-line script: load the MediaWiki environment first.
// NOTE(review): $wgServer and $wgScriptPath used below are presumably set by
// this include -- confirm against commandLine.inc.
include 'commandLine.inc';

// ---------------------------------------------------------------------------
// Configuration
// ---------------------------------------------------------------------------

// Directory in which every generated input/output file is stored.
// On Windows use something like "c:\\temp\\mangleme".
define('DIRECTORY', '/tmp/mangleme');

// Address of the wiki page that receives the preview requests.
define('WIKI_URL', "{$wgServer}{$wgScriptPath}/index.php?title=WIKIMANGLE");

// When true, random filler characters are drawn from the full 0-255 byte range.
define('INCLUDE_BINARY', false);

// When true, each preview is additionally posted to VALIDATOR_URL.
define('VALIDATE_ON_WEB', false);
define('VALIDATOR_URL', 'http://validator.w3.org/check');

// Surface every notice, warning and error: if something goes wrong we want
// to know about it.
error_reporting(E_ALL);
62
///////////////////// DEFINE THE DATA THAT WILL BE USED //////////////////////
/* Each HTML tag maps to the attribute names the fuzzer may attach to it.
   Every tag also receives the two attributes "onLoad" and "STYLE", so the
   table below lists only the tag-specific attributes and the shared pair is
   appended afterwards.

   Note: Only some HTML tags are understood by MediaWiki, the rest is ignored.
   The tags that are ignored have been commented out below. */

$tagAttributes = array(
	// "A"        => array("NAME", "HREF", "REF", "REV", "TITLE", "TARGET", "SHAPE"),
	// "APPLET"   => array("CODEBASE", "CODE", "NAME", "ALIGN", "ALT", "HEIGHT", "WIDTH", "HSPACE", "VSPACE", "DOWNLOAD", "HEIGHT", "NAME", "TITLE"),
	// "AREA"     => array("SHAPE", "ALT", "CO-ORDS", "HREF"),
	"B"        => array(),
	// "BANNER"   => array(),
	// "BASE"     => array("HREF", "TARGET"),
	// "BASEFONT" => array("SIZE"),
	// "BGSOUND"  => array("SRC", "LOOP"),
	// "BQ"       => array("CLEAR", "NOWRAP"),
	// "BODY"     => array("BACKGROUND", "BGCOLOR", "TEXT", "LINK", "ALINK", "VLINK", "LEFTMARGIN", "TOPMARGIN", "BGPROPERTIES"),
	"CAPTION"  => array("ALIGN", "VALIGN"),
	"CENTER"   => array(),
	// "COL"      => array("ALIGN", "SPAN"),
	// "COLGROUP" => array("ALIGN", "VALIGN", "HALIGN", "WIDTH", "SPAN"),
	"DIV"      => array("ALIGN", "CLASS", "LANG"),
	// "EMBED"    => array("SRC", "HEIGHT", "WIDTH", "UNITS", "NAME", "PALETTE"),
	// "FIG"      => array("SRC", "ALIGN", "HEIGHT", "WIDTH", "UNITS", "IMAGEMAP"),
	// "FN"       => array("ID"),
	"FONT"     => array("SIZE", "COLOR", "FACE"),
	// "FORM"     => array("ACTION", "METHOD", "ENCTYPE", "TARGET", "SCRIPT"),
	// "FRAME"    => array("SRC", "NAME", "MARGINWIDTH", "MARGINHEIGHT", "SCROLLING", "FRAMESPACING"),
	// "FRAMESET" => array("ROWS", "COLS"),
	"H1"       => array("SRC", "DINGBAT"),
	// "HEAD"     => array(),
	"HR"       => array("SRC", "SIZE", "WIDTH", "ALIGN", "COLOR"),
	// "HTML"     => array(),
	// "IFRAME"   => array("ALIGN", "FRAMEBORDER", "HEIGHT", "MARGINHEIGHT", "MARGINWIDTH", "NAME", "SCROLLING", "SRC", "ADDRESS", "WIDTH"),
	// "IMG"      => array("ALIGN", "ALT", "SRC", "BORDER", "DYNSRC", "HEIGHT", "HSPACE", "ISMAP", "LOOP", "LOWSRC", "START", "UNITS", "USEMAP", "WIDTH", "VSPACE"),
	// "INPUT"    => array("TYPE", "NAME", "VALUE"),
	// "ISINDEX"  => array("HREF", "PROMPT"),
	"LI"       => array("SRC", "DINGBAT", "SKIP", "TYPE", "VALUE"),
	// "LINK"     => array("REL", "REV", "HREF", "TITLE"),
	// "MAP"      => array("NAME"),
	// "MARQUEE"  => array("ALIGN", "BEHAVIOR", "BGCOLOR", "DIRECTION", "HEIGHT", "HSPACE", "LOOP", "SCROLLAMOUNT", "SCROLLDELAY", "WIDTH", "VSPACE"),
	// "MENU"     => array(),
	// "META"     => array("HTTP-EQUIV", "CONTENT", "NAME"),
	// "MULTICOL" => array("COLS", "GUTTER", "WIDTH"),
	// "NOFRAMES" => array(),
	// "NOTE"     => array("CLASS", "SRC"),
	// "OVERLAY"  => array("SRC", "X", "Y", "HEIGHT", "WIDTH", "UNITS", "IMAGEMAP"),
	// "PARAM"    => array("NAME", "VALUE"),
	// "RANGE"    => array("FROM", "UNTIL"),
	// "SCRIPT"   => array("LANGUAGE"),
	// "SELECT"   => array("NAME", "SIZE", "MULTIPLE", "WIDTH", "HEIGHT", "UNITS"),
	// "OPTION"   => array("VALUE", "SHAPE"),
	// "SPACER"   => array("TYPE", "SIZE", "WIDTH", "HEIGHT", "ALIGN"),
	// "SPOT"     => array("ID"),
	// "TAB"      => array("INDENT", "TO", "ALIGN", "DP"),
	"TABLE"    => array("ALIGN", "WIDTH", "BORDER", "CELLPADDING", "CELLSPACING", "BGCOLOR", "VALIGN", "COLSPEC", "UNITS", "DP"),
	// "TBODY"    => array("CLASS", "ID"),
	"TD"       => array("COLSPAN", "ROWSPAN", "ALIGN", "VALIGN", "BGCOLOR"),
	// "TEXTAREA" => array("NAME", "COLS", "ROWS"),
	// "TEXTFLOW" => array("CLASS", "ID"),
	// "TFOOT"    => array("COLSPAN", "ROWSPAN", "ALIGN", "VALIGN", "BGCOLOR"),
	"TH"       => array("ALIGN", "CLASS", "ID"),
	// "TITLE"    => array(),
	"TR"       => array("ALIGN", "VALIGN", "BGCOLOR", "CLASS"),
	"UL"       => array("SRC", "DINGBAT", "SKIP", "TYPE", "VALUE"),

	// A few that were not in the original list, but which MediaWiki
	// understands, even with extraneous attributes:
	"gallery"    => array("CLASS", "ID"),
	"pre"        => array("CLASS", "ID"),
	"nowiki"     => array("CLASS", "ID"),
	"blockquote" => array("CLASS", "ID"),
	"span"       => array("CLASS", "ID"),
	"code"       => array("CLASS", "ID"),
	"tt"         => array("CLASS", "ID"),
	"small"      => array("CLASS", "ID"),
	"big"        => array("CLASS", "ID"),
	"s"          => array("CLASS", "ID"),
	"u"          => array("CLASS", "ID"),
	"del"        => array("CLASS", "ID"),
	"ins"        => array("CLASS", "ID"),
	"sub"        => array("CLASS", "ID")
);

// Build the final tag => attribute-list map, keeping the table's tag order.
$data = array();
foreach ($tagAttributes as $tagName => $ownAttributes) {
	$data[$tagName] = array_merge($ownAttributes, array("onLoad", "STYLE"));
}
// The helpers above are only needed to build $data; do not leave them behind.
unset($tagAttributes, $tagName, $ownAttributes);

// The tag names that will be tested are exactly the keys defined above.
$types = array_keys($data);

// Candidate attribute values: awkward punctuation, then integer-like strings.
$other = array("&","=",":","?","\"","\n","%n%n%n%n%n%n%n%n%n%n%n%n","\\");
$ints = array("0","-1","127","7897","89000","808080","90928345","74326794236234","0xfffffff","ffff");
151
///////////////////////////////// WIKI-SYNTAX ///////////////////////////
/* Note: Defines various wiki-related bits of syntax, that can potentially cause
   MediaWiki to do something other than just print that literal text.

   Every entry is one independent token that htmler::randstring() may splice
   into the generated text, so each entry must be separated from the next by a
   comma: a "." between two literals would silently concatenate them into a
   single token. Repeated entries (e.g. "\n|-") simply make that token more
   likely to be picked. */
$ext = array(
	"[[", "]]", "\n{|", "|}", "{{", "}}", "|", "[[image:", "[", "]",
	"=", "==", "===", "====", "=====", "======", "\n*", "*", "\n:", ":",
	"{{{", "}}}",
	"\n", "\n#", "#", "\n;", ";", "\n ",
	"----", "\n----",
	"|]]", "~~~", "#REDIRECT [[", "'''", "''",
	"ISBN 2", "\n|-", "| ", "\n| ",
	"<!--", "-->",
	"\"",
	">",
	"http://","https://","url://","ftp://","file://","irc://","javascript:",
	"!",
	"\n! ",
	"!!",
	"||",
	".gif",
	".png",
	".jpg",
	".jpeg",
	"<!--()()",
	'%08X',
	'/',
	":x{|",
	"\n|-",
	"\n|+",
	"<noinclude>",
	"</noinclude>",
	"\n-----",
	"UNIQ25f46b0524f13e67NOPARSE",
	" \302\273",
	" :",
	" !",
	" ;",
	"\302\253",
	"RFC 000",
	"PMID 000",
	"?=",
	"(",
	")",
	"]]]",
	"../",
	"{{{{",
	"}}}}",
	"{{subst:",
	'__NOTOC__',
	'__FORCETOC__',
	'__NOEDITSECTION__',
	'__START__',
	'{{PAGENAME}}',
	'{{PAGENAMEE}}',
	'{{NAMESPACE}}',
	'{{MSG:',
	'{{MSGNW:',
	'__END__',
	'{{INT:',
	'{{SITENAME}}',
	'{{NS:',
	'{{LOCALURL:',
	'{{LOCALURLE:',
	'{{SCRIPTPATH}}',
	'{{GRAMMAR:',
	'__NOTITLECONVERT__',
	'__NOCONTENTCONVERT__',
	"<!--MWTEMPLATESECTION=",
	"<!--LINK 987-->",
	"<!--IWLINK 987-->",
	"Image:",
	"[[category:",
	"{{REVISIONID}}",
	"{{SUBPAGENAME}}",
	"{{SUBPAGENAMEE}}",
	"{{ns:0}}",
	"[[:Image",
	"[[Special:",
	"{{fullurl:}}",
	'__TOC__',
	"<includeonly>",
	"</includeonly>",
	"<math>",
	"</math>"
);
237
238
239 ///////////////////// A CLASS THAT GENERATES RANDOM STRINGS OF DATA //////////////////////
240
/**
 ** @desc: Generates pages of random, malformed HTML-ish wiki markup.
 ** Reads the global tables $types, $data, $ints, $other and $ext, and the
 ** INCLUDE_BINARY constant. All randomness comes from mt_rand().
 */
class htmler {
	// Number of attribute=value pairs emitted per tag.
	var $maxparams = 4;
	// Number of tags emitted per generated page.
	var $maxtypes = 40;

	/**
	 ** @desc: Random integer between $start and $finish, both inclusive.
	 ** Note the argument order: upper bound first, lower bound second.
	 */
	function randnum($finish, $start = 0) {
		return mt_rand($start, $finish);
	}

	/**
	 ** @desc: Builds a string out of 40 random pieces. Each piece is either a
	 ** random wiki-syntax token from $ext, or one random character repeated
	 ** 0 to 8 times.
	 */
	function randstring() {
		global $ext;

		$lastToken = count($ext) - 1;
		$pieces = array();

		for ($remaining = 40; $remaining > 0; $remaining--) {
			if ($this->randnum(1) == 0) {
				// a random bit of wiki syntax
				$pieces[] = $ext[$this->randnum($lastToken)];
				continue;
			}

			// a run of one random character
			if (INCLUDE_BINARY) {
				$code = $this->randnum(255);
			} else {
				$code = $this->randnum(126, 32);
			}
			$char = chr($code);
			if ($char === "<") {
				// we don't want the '<' character, it stuffs us up.
				$char = "";
			}
			$pieces[] = str_repeat($char, $this->randnum(8));
		}

		return implode("", $pieces);
	}

	/**
	 ** @desc: Returns one random attribute value: a random string, an entry
	 ** of $ints, or an entry of $other, each chosen with equal probability.
	 */
	function makestring() {
		global $ints, $other;

		switch ($this->randnum(2)) {
			case 0:
				return $this->randstring();
			case 1:
				return $ints[$this->randnum(count($ints) - 1)];
			default:
				return $other[$this->randnum(count($other) - 1)];
		}
	}

	/**
	 ** @desc: Returns one opening tag of a random type from $types, carrying
	 ** $maxparams randomly chosen attributes with random unquoted values,
	 ** followed by a newline.
	 */
	function loop() {
		global $types, $data;

		$tag = $types[$this->randnum(count($types) - 1)];
		$attributes = $data[$tag];
		$lastAttribute = count($attributes) - 1;

		$markup = "<" . $tag . " ";
		$emitted = 0;
		while ($emitted < $this->maxparams) {
			// pick the attribute name first, then its value
			$attribute = $attributes[$this->randnum($lastAttribute)];
			$markup .= $attribute . "=" . $this->makestring() . " ";
			$emitted++;
		}

		return $markup . ">\n";
	}

	/**
	 ** @desc: Returns a whole test page: $maxtypes random tags concatenated.
	 */
	function main() {
		$tags = array();
		for ($left = $this->maxtypes; $left > 0; $left--) {
			$tags[] = $this->loop();
		}
		return implode("", $tags);
	}
}
308
309
310 //////////////////// SAVING OUTPUT /////////////////////////
311
312
/**
 ** @desc: Utility function for saving a file into the DIRECTORY folder.
 ** @param $string The contents to write.
 ** @param $name   File name, relative to DIRECTORY.
 ** @return true when the whole string was written; false (after raising an
 **         E_USER_WARNING) when the file could not be opened or fully written.
 */
function saveFile($string, $name) {
	$path = DIRECTORY . "/" . $name;
	$string = (string) $string;

	$fp = fopen($path, "w");
	if ($fp === false) {
		// Never hand a failed handle to fwrite()/fclose().
		trigger_error("Could not open $path for writing", E_USER_WARNING);
		return false;
	}

	$written = fwrite($fp, $string);
	fclose($fp);

	// fwrite() returns false on error and may write fewer bytes than asked.
	if ($written === false || $written < strlen($string)) {
		trigger_error("Could not write all of $path", E_USER_WARNING);
		return false;
	}

	return true;
}
321
322
323 //////////////////// MEDIAWIKI PREVIEW /////////////////////////
324
/*
 ** @desc: Posts $text to WIKI_URL as an edit-form preview request and returns
 ** whatever curl_exec() hands back for it. Terminates the script when the
 ** curl extension is missing or the transfer reports an error.
 */
function wikiPreview($text) {

	// Without the curl extension there is nothing we can do.
	if ( !function_exists('curl_init') ) {
		die("Could not found 'curl_init' function. Is curl extension enabled ?\n");
	}

	// The edit-form fields sent along with the preview request.
	$fields = array();
	$fields["action"]      = "submit";
	$fields["wpMinoredit"] = "1";
	$fields["wpPreview"]   = "Show preview";
	$fields["wpSection"]   = "new";
	$fields["wpEdittime"]  = "";
	$fields["wpSummary"]   = "This is a test";
	$fields["wpTextbox1"]  = $text;

	$curl = curl_init();
	curl_setopt($curl, CURLOPT_URL, WIKI_URL);        // where to post to
	curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);    // hand the response back as a value
	curl_setopt($curl, CURLOPT_POST, 1);              // submit the form with a POST
	curl_setopt($curl, CURLOPT_POSTFIELDS, $fields);  // the POST variables

	$html = curl_exec($curl);

	// Any transfer error is logged, reported on stdout, and ends the run.
	if (curl_error($curl)) {
		$message = "Curl error #: " . curl_errno($curl) . " - " . curl_error($curl);
		trigger_error($message);
		print $message . " - exiting.\n";
		exit();
	}

	curl_close($curl);

	return $html;
}
364
365
366 //////////////////// HTML VALIDATION /////////////////////////
367
/*
 ** @desc: Posts $text to VALIDATOR_URL as the "fragment" field.
 ** Returns array($valid, $result): $result is the validator's response and
 ** $valid is true unless that response contains "Failed validation".
 ** Terminates the script when the transfer reports an error.
 */
function validateHTML($text) {

	$curl = curl_init();
	curl_setopt($curl, CURLOPT_URL, VALIDATOR_URL);   // where to post to
	curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);    // hand the response back as a value
	curl_setopt($curl, CURLOPT_POST, 1);              // submit the form with a POST
	curl_setopt($curl, CURLOPT_POSTFIELDS, array("fragment" => $text));

	$report = curl_exec($curl);

	// Any transfer error is logged, reported on stdout, and ends the run.
	if (curl_error($curl)) {
		$message = "Curl error #: " . curl_errno($curl) . " - " . curl_error($curl);
		trigger_error($message);
		print $message . " - exiting.\n";
		exit();
	}

	curl_close($curl);

	// The page counts as valid as long as the failure marker is absent.
	$passed = (strpos($report, "Failed validation") === false);

	return array($passed, $report);
}
397
398
399
/**
 ** @desc: checks the string to see if tags are balanced.
 ** A line is reported as an unclosed tag when it contains more "<" than ">"
 ** characters. Lines known to be unbalanced in MediaWiki's own page output
 ** are skipped.
 ** @param $string   The HTML to check.
 ** @param $filename Name (relative to DIRECTORY) shown in the report.
 ** @return true when no unclosed tag was found, false otherwise; every
 **         offending line is also printed.
 */
function checkOpenCloseTags($string, $filename) {
	// Exact lines of MediaWiki's own output that are unbalanced on purpose.
	$skip_lines = array(
		"\t\t<style type=\"text/css\">/*<![CDATA[*/",
		"<textarea tabindex='1' accesskey=\",\" name=\"wpTextbox1\" id=\"wpTextbox1\" rows='25'",
		"/*<![CDATA[*/",
		"/*]]>*/",
		"<form id=\"editform\" name=\"editform\" method=\"post\" action=\"/wiki/index.php?title=Slkdnfl&amp;action=submit\"",
		"<textarea tabindex='1' accesskey=\",\" name=\"wpTextbox1\" rows='25'"
	);

	// Lines whose position and tail vary, recognised by how they start.
	// Plain prefix comparison is used rather than the POSIX ereg() family,
	// which no longer exists as of PHP 7.
	$skip_prefixes = array(
		"enctype=\"multipart/form-data\"><input type=\"hidden\" name=\"wikidb_session\" value=\"",
		"cols='80'>"
	);

	// Lines skipped by their distance from the end of the page.
	// NOTE(review): presumably fixed positions in the skin's footer markup
	// -- confirm against the skin in use.
	$skip_from_end = array(246, 65, 62, 52, 50, 29, 28, 27, 23);

	$valid = true;
	$lines = explode("\n", $string);
	$num_lines = count($lines);

	foreach ($lines as $line_num => $line) {

		// skip mediawiki's own unbalanced lines.
		if ($line_num == 15) continue;
		if (in_array($line, $skip_lines, true)) continue;
		if (in_array($num_lines - $line_num, $skip_from_end)) continue;

		$known_prefix = false;
		foreach ($skip_prefixes as $prefix) {
			if (strncmp($line, $prefix, strlen($prefix)) == 0) {
				$known_prefix = true;
				break;
			}
		}
		if ($known_prefix) continue;

		if (substr_count($line, "<") > substr_count($line, ">")) {
			print "\nUnclosed tag in " . DIRECTORY . "/" . $filename . " on line: " . ($line_num + 1) . " \n$line\n";
			$valid = false;
		}
	}
	return $valid;
}
442
443
/**
 ** @desc: Get tidy to check for no HTML errors in the output file (e.g. unescaped strings).
 ** Runs the external "tidy" command on DIRECTORY/$name and treats any output
 ** at all (stdout or stderr) as a failure.
 ** @param $name File name, relative to DIRECTORY.
 ** @return true when tidy printed nothing, false otherwise; the tidy output
 **         is also printed.
 */
function tidyCheckFile($name) {
	$file = DIRECTORY . "/" . $name;

	// Quote the path so that spaces or shell metacharacters in DIRECTORY or
	// $name cannot split or alter the command line.
	$command = "tidy -errors -quiet --show-warnings false " . escapeshellarg($file) . " 2>&1";

	// shell_exec() yields null when the command prints nothing; normalise to
	// a string before trimming.
	$x = (string) shell_exec($command);

	if (trim($x) != "") {
		print "Tidy errors found in $file:\n$x";
		return false;
	}
	return true;
}
457
458
459 ////////////////////// MAIN FUNCTION ////////////////////////
460
// Create the output directory on first use.
if (is_dir(DIRECTORY) === false) {
	mkdir(DIRECTORY, 0700);
}

// Seed the random number generator from the current time.
mt_srand(crc32(microtime()));

$h = new htmler();

// Progress indicator frames, one per iteration, cycling every four tests.
$spinner = array("/", "-", "\\", "|");

print "Beginning main loop. Press CTRL+C to stop testing.\n";

// Runs until interrupted: there is deliberately no loop condition.
for ($count = 0; ; $count++) {
	print "\r" . $spinner[$count % 4] . " $count";

	$raw_file     = $count . ".raw_markup.txt";
	$preview_file = $count . ".wiki_preview.html";

	// Generate the test text and ask the MediaWiki install to preview it.
	$raw_markup   = $h->main();
	$wiki_preview = wikiPreview($raw_markup);

	// Keep both the input and the output on disk for the checks below.
	saveFile($raw_markup, $raw_file);
	saveFile($wiki_preview, $preview_file);

	// Run the checks in order; the first failure skips the remaining ones.
	$valid = true;
	if (VALIDATE_ON_WEB) {
		list($valid, $validator_output) = validateHTML($wiki_preview);
	}
	if ($valid) {
		$valid = checkOpenCloseTags($wiki_preview, $preview_file);
	}
	if ($valid) {
		$valid = tidyCheckFile($preview_file);
	}

	if ($valid) {
		// Nothing interesting happened: remove this test's files.
		unlink(DIRECTORY . "/" . $raw_file);
		unlink(DIRECTORY . "/" . $preview_file);
	} elseif (VALIDATE_ON_WEB) {
		// Keep the validator's report next to the failing files.
		saveFile($validator_output, $count . ".validator_output.html");
	}
}
print 'End of wiki-mangleme. Results are in the '.DIRECTORY." directory.\n";
508 ?>