Printable mode cleanup. Now done through stylesheets, <link>ed so that the
[lhc/web/wiklou.git] / includes / Parser.php
1 <?php
2
3 include_once('Tokenizer.php');
4
5 # PHP Parser
6 #
7 # Processes wiki markup
8 #
9 # There are two main entry points into the Parser class: parse() and preSaveTransform().
10 # The parse() function produces HTML output, preSaveTransform() produces altered wiki markup.
11 #
12 # Globals used:
13 # objects: $wgLang, $wgDateFormatter, $wgLinkCache, $wgCurParser
14 #
15 # NOT $wgArticle, $wgUser or $wgTitle. Keep them away!
16 #
17 # settings: $wgUseTex*, $wgUseCategoryMagic*, $wgUseDynamicDates*, $wgInterwikiMagic*,
18 # $wgNamespacesWithSubpages, $wgLanguageCode, $wgAllowExternalImages*,
19 # $wgLocaltimezone
20 #
21 # * only within ParserOptions
22 #
23 #
24 #----------------------------------------
25 # Variable substitution O(N^2) attack
26 #-----------------------------------------
27 # Without countermeasures, it would be possible to attack the parser by saving a page
28 # filled with a large number of inclusions of large pages. The size of the generated
29 # page would be proportional to the square of the input size. Hence, we limit the number
30 # of inclusions of any given page, thus bringing any attack back to O(N).
31 #
# Upper bound on how often any single page may be included
define( 'MAX_INCLUDE_REPEAT', 5 );

# Recursion depth of variable/inclusion evaluation
define( 'MAX_INCLUDE_PASSES', 3 );

# Allowed values for $mOutputType:
#   OT_HTML - render to HTML, OT_WIKI - produce altered wiki markup
define( 'OT_HTML', 1 );
define( 'OT_WIKI', 2 );
40
41 class Parser
42 {
43 # Cleared with clearState():
44 var $mOutput, $mAutonumber, $mLastSection, $mDTopen, $mStripState;
45 var $mVariables, $mIncludeCount;
46
47 # Temporary:
48 var $mOptions, $mTitle, $mOutputType;
49
# Constructor (PHP 4 style, named after the class).
# A new parser starts out in the same state clearState() produces.
function Parser()
{
	$this->clearState();
}
54
# Resets all members listed under "Cleared with clearState()" to their
# initial values. parse() calls this unless told not to.
function clearState()
{
	# Counters and block-level bookkeeping
	$this->mAutonumber = 0;
	$this->mLastSection = "";
	$this->mDTopen = false;

	# Not built yet: strip() fills the strip state, and the variable
	# table is created on first use by replaceVariables()
	$this->mStripState = false;
	$this->mVariables = false;
	$this->mIncludeCount = array();

	# Fresh, empty result object
	$this->mOutput = new ParserOutput;
}
65
66 # First pass--just handle <nowiki> sections, pass the rest off
67 # to doWikiPass2() which does all the real work.
68 #
69 # Returns a ParserOutput
70 #
# Renders wiki markup to HTML.
#
# Parameters:
#   $text       - the wiki markup to render
#   $title      - title object of the page; kept by reference in $mTitle
#   $options    - parser options for this run; kept in $mOptions
#   $linestart  - handed to doWikiPass2() unchanged
#   $clearState - when true the parser is reset with clearState() first
#
# Returns $mOutput with its text set to the rendered result.
function parse( $text, &$title, $options, $linestart = true, $clearState = true )
{
	$fname = "Parser::parse";
	wfProfileIn( $fname );

	if ( $clearState ) {
		$this->clearState();
	}

	$this->mOptions = $options;
	$this->mTitle =& $title;
	$this->mOutputType = OT_HTML;

	# Protected sections are swapped for placeholders, the remaining
	# markup is rendered, then the protected sections are put back.
	$text = $this->strip( $text, $this->mStripState );
	$text = $this->doWikiPass2( $text, $linestart );
	$text = $this->unstrip( $text, $this->mStripState );

	$this->mOutput->setText( $text );
	wfProfileOut( $fname );
	return $this->mOutput;
}
93
# Returns a lowercase hexadecimal string built from two independent
# mt_rand() draws in the range 0..0x7fffffff. strip() uses it to build
# placeholder markers.
/* static */ function getRandomString()
{
	$first = dechex( mt_rand( 0, 0x7fffffff ) );
	$second = dechex( mt_rand( 0, 0x7fffffff ) );
	return $first . $second;
}
98
99 # Strips <nowiki>, <pre> and <math>
100 # Returns the text, and fills an array with data needed in unstrip()
101 #
# Replaces every <nowiki>, <math> (only when TeX is enabled in the options)
# and <pre> section of $text with a random marker followed by an 8-digit
# hex sequence number.
#
# $state is overwritten with the data unstrip() needs: for each kind the
# marker ('..unq'), the section count ('..secs') and the replacement text
# per section ('..list', indexed from 1).
#
# When rendering HTML the stored replacement is the escaped/rendered
# content; otherwise the original tagged markup is stored so that it
# survives unchanged.
#
# Returns the text with all sections replaced by placeholders.
function strip( $text, &$state )
{
	$state = array(
		'nwlist' => array(),
		'nwsecs' => 0,
		'nwunq' => Parser::getRandomString(),
		'mathlist' => array(),
		'mathsecs' => 0,
		'mathunq' => Parser::getRandomString(),
		'prelist' => array(),
		'presecs' => 0,
		'preunq' => Parser::getRandomString()
	);
	$render = ($this->mOutputType == OT_HTML);
	$stripped = "";
	$stripped2 = "";
	$stripped3 = "";

	# Replace any instances of the placeholders already present in the
	# input, so user text cannot be mistaken for a marker
	$text = str_replace( $state['nwunq'], wfHtmlEscapeFirst( $state['nwunq'] ), $text );
	$text = str_replace( $state['mathunq'], wfHtmlEscapeFirst( $state['mathunq'] ), $text );
	$text = str_replace( $state['preunq'], wfHtmlEscapeFirst( $state['preunq'] ), $text );

	# Pass 1: <nowiki>
	while ( "" != $text ) {
		$p = preg_split( "/<\\s*nowiki\\s*>/i", $text, 2 );
		$stripped .= $p[0];
		if ( ( count( $p ) < 2 ) || ( "" == $p[1] ) ) {
			$text = "";
		} else {
			$q = preg_split( "/<\\/\\s*nowiki\\s*>/i", $p[1], 2 );
			++$state['nwsecs'];

			if ( $render ) {
				$state['nwlist'][$state['nwsecs']] = wfEscapeHTMLTagsOnly($q[0]);
			} else {
				$state['nwlist'][$state['nwsecs']] = "<nowiki>{$q[0]}</nowiki>";
			}

			$stripped .= $state['nwunq'] . sprintf("%08X", $state['nwsecs']);
			# Without a closing tag the section runs to the end of the text
			$text = ( count( $q ) > 1 ) ? $q[1] : "";
		}
	}

	# Pass 2: <math>, only if TeX support is switched on
	if( $this->mOptions->getUseTeX() ) {
		while ( "" != $stripped ) {
			$p = preg_split( "/<\\s*math\\s*>/i", $stripped, 2 );
			$stripped2 .= $p[0];
			if ( ( count( $p ) < 2 ) || ( "" == $p[1] ) ) {
				$stripped = "";
			} else {
				$q = preg_split( "/<\\/\\s*math\\s*>/i", $p[1], 2 );
				++$state['mathsecs'];

				if ( $render ) {
					$state['mathlist'][$state['mathsecs']] = renderMath($q[0]);
				} else {
					$state['mathlist'][$state['mathsecs']] = "<math>{$q[0]}</math>";
				}

				$stripped2 .= $state['mathunq'] . sprintf("%08X", $state['mathsecs']);
				$stripped = ( count( $q ) > 1 ) ? $q[1] : "";
			}
		}
	} else {
		$stripped2 = $stripped;
	}

	# Pass 3: <pre>
	while ( "" != $stripped2 ) {
		$p = preg_split( "/<\\s*pre\\s*>/i", $stripped2, 2 );
		$stripped3 .= $p[0];
		if ( ( count( $p ) < 2 ) || ( "" == $p[1] ) ) {
			$stripped2 = "";
		} else {
			$q = preg_split( "/<\\/\\s*pre\\s*>/i", $p[1], 2 );
			++$state['presecs'];

			if ( $render ) {
				$state['prelist'][$state['presecs']] = "<pre>". wfEscapeHTMLTagsOnly($q[0]). "</pre>\n";
			} else {
				$state['prelist'][$state['presecs']] = "<pre>{$q[0]}</pre>";
			}

			$stripped3 .= $state['preunq'] . sprintf("%08X", $state['presecs']);
			$stripped2 = ( count( $q ) > 1 ) ? $q[1] : "";
		}
	}
	return $stripped3;
}
190
# Puts back the sections that strip() replaced with placeholders.
#
# $state is the array strip() filled. Each placeholder is the kind's
# marker followed by the section number as 8 upper-case hex digits.
# Kinds are restored in the order pre, math, nowiki.
#
# Returns the text with the stored replacements substituted in.
function unstrip( $text, &$state )
{
	foreach ( array( 'pre', 'math', 'nw' ) as $kind ) {
		$n = 1;
		while ( $n <= $state[$kind . 'secs'] ) {
			$placeholder = $state[$kind . 'unq'] . sprintf( "%08X", $n );
			$text = str_replace( $placeholder, $state[$kind . 'list'][$n], $text );
			++$n;
		}
	}
	return $text;
}
206
# Builds the HTML listing appended to a category page: the sub-categories
# and the articles that link to the current title.
#
# Returns "" when category magic is disabled in the options or when the
# current title is not in the category namespace; otherwise an HTML
# fragment starting with a clearing <br>.
function categoryMagic ()
{
	global $wgLang ;
	if ( !$this->mOptions->getUseCategoryMagic() ) return "" ;
	# Cast defensively: the value is interpolated into SQL below
	$id = intval ( $this->mTitle->getArticleID() ) ;
	$cat = $wgLang->ucfirst ( wfMsg ( "category" ) ) ;
	$ti = $this->mTitle->getText() ;
	$ti = explode ( ":" , $ti , 2 ) ;
	if ( $cat != $ti[0] ) return "" ;
	# The part after "Category:"; empty if the title has no colon at all
	$catname = isset ( $ti[1] ) ? $ti[1] : "" ;
	$r = "<br clear=all>\n" ;

	$articles = array() ;
	$children = array() ;

	# Take the skin from the parser options, like the rest of this class
	$sk =& $this->mOptions->getSkin() ;

	# NOTE(review): $doesexist is hard-wired to false, so only the
	# brokenlinks query is ever used -- confirm whether that is intended
	$doesexist = false ;
	if ( $doesexist ) {
		$sql = "SELECT cur_title,cur_namespace FROM cur,links WHERE l_to={$id} AND l_from=cur_id";
	} else {
		$sql = "SELECT cur_title,cur_namespace FROM cur,brokenlinks WHERE bl_to={$id} AND bl_from=cur_id" ;
	}

	$res = wfQuery ( $sql, DB_READ ) ;
	while ( $x = wfFetchObject ( $res ) )
	{
		# Rebuild the prefixed title text from namespace and title
		$t = $wgLang->getNsText ( $x->cur_namespace ) ;
		if ( $t != "" ) $t .= ":" ;
		$t .= $x->cur_title ;

		# Pages in the category namespace are sub-categories,
		# everything else is a member article
		$y = explode ( ":" , $t , 2 ) ;
		if ( count ( $y ) == 2 && $y[0] == $cat ) {
			array_push ( $children , $sk->makeLink ( $t , $y[1] ) ) ;
		} else {
			array_push ( $articles , $sk->makeLink ( $t ) ) ;
		}
	}
	wfFreeResult ( $res ) ;

	# Children
	if ( count ( $children ) > 0 )
	{
		asort ( $children ) ;
		$r .= "<h2>".wfMsg("subcategories")."</h2>\n" ;
		$r .= implode ( ", " , $children ) ;
	}

	# Articles
	if ( count ( $articles ) > 0 )
	{
		asort ( $articles ) ;
		$h = wfMsg( "category_header", $catname );
		$r .= "<h2>{$h}</h2>\n" ;
		$r .= implode ( ", " , $articles ) ;
	}

	return $r ;
}
272
# Returns the list of attribute names that fixTagAttributes() lets
# through -- no scripting attributes. The list is built in groups but the
# resulting order (and the repeated "width") matches the flat list.
function getHTMLattrs ()
{
	$general = array( "title", "align", "lang", "dir", "width", "height", "bgcolor" ) ;
	$brHrQuote = array(
		"clear",    # BR
		"noshade",  # HR
		"cite"      # BLOCKQUOTE, Q
	) ;
	$font = array( "size", "face", "color" ) ;
	# For various lists, mostly deprecated but safe
	$lists = array( "type", "start", "value", "compact" ) ;
	$tables = array(
		"summary", "width", "border", "frame", "rules",
		"cellspacing", "cellpadding", "valign", "char",
		"charoff", "colgroup", "col", "span", "abbr", "axis",
		"headers", "scope", "rowspan", "colspan"
	) ;
	# For CSS
	$css = array( "id", "class", "name", "style" ) ;

	return array_merge ( $general , $brHrQuote , $font , $lists , $tables , $css ) ;
}
289
# Sanitises the attribute part of an HTML tag.
#
# Every name[=value] pair whose name is not in getHTMLattrs() is removed.
# If what remains contains a style attribute with "expression", a URL
# scheme or url(...), all attributes are dropped.
#
# The filtering used to be a preg_replace() with the /e modifier, which
# evaluates the matched attribute value as part of a PHP string and is
# therefore open to code injection; it now uses a plain callback.
#
# Returns the trimmed, filtered attribute string ("" if nothing is left).
function fixTagAttributes ( $t )
{
	if ( trim ( $t ) == "" ) return "" ; # Saves runtime ;-)

	# Strip non-approved attributes from the tag
	$t = preg_replace_callback (
		"/(\\w+)(\\s*=\\s*([^\\s\">]+|\"[^\">]*\"))?/" ,
		array ( $this , "fixTagAttributesCallback" ) ,
		$t ) ;

	# Strip javascript "expression" from stylesheets. Brute force approach:
	# If anything offensive is found, all attributes of the HTML tag are dropped
	if( preg_match(
		"/style\\s*=.*(expression|tps*:\/\/|url\\s*\().*/is",
		wfMungeToUtf8( $t ) ) )
	{
		$t="";
	}

	return trim ( $t ) ;
}

# Callback for fixTagAttributes(): $m[1] is the attribute name and $m[3],
# when present and non-empty, its value (possibly still quoted).
# Returns the attribute unchanged if its name is allowed, "" otherwise.
/* private */ function fixTagAttributesCallback ( $m )
{
	if ( !in_array ( strtolower ( $m[1] ) , $this->getHTMLattrs() ) ) {
		return "" ;
	}
	if ( isset ( $m[3] ) && $m[3] !== "" ) {
		return $m[1] . "=" . $m[3] ;
	}
	return $m[1] ;
}
312
# Converts wiki table markup to HTML, line by line:
#   {|  opens a table      |}  closes it
#   |-  starts a new row   |+  caption
#   |   data cell(s)       !   header cell(s)
# Several cells may share a line, separated by || (or !! on header lines).
# Text before a single | inside a cell is treated as its attributes.
#
# The four stacks hold one entry per currently open (nested) table.
# Tables, rows and cells still open at the end of the text are closed.
#
# Returns the converted text.
function doTableStuff ( $t )
{
	$t = explode ( "\n" , $t ) ;
	$td = array () ; # Is currently a td tag open?
	$ltd = array () ; # Was it TD or TH?
	$tr = array () ; # Is currently a tr tag open?
	$ltr = array () ; # tr attributes
	foreach ( $t AS $k => $x )
	{
		$x = rtrim ( $x ) ;
		$fc = substr ( $x , 0 , 1 ) ;
		if ( "{|" == substr ( $x , 0 , 2 ) )
		{
			# Everything after "{|" is attributes; fixTagAttributes() trims
			# it, so attributes directly following the marker stay intact
			$t[$k] = "<table " . $this->fixTagAttributes ( substr ( $x , 2 ) ) . ">" ;
			array_push ( $td , false ) ;
			array_push ( $ltd , "" ) ;
			array_push ( $tr , false ) ;
			array_push ( $ltr , "" ) ;
		}
		else if ( count ( $td ) == 0 ) { } # Don't do any of the following
		else if ( "|}" == substr ( $x , 0 , 2 ) )
		{
			$z = "</table>\n" ;
			$l = array_pop ( $ltd ) ;
			if ( array_pop ( $tr ) ) $z = "</tr>" . $z ;
			if ( array_pop ( $td ) ) $z = "</{$l}>" . $z ;
			array_pop ( $ltr ) ;
			$t[$k] = $z ;
		}
		else if ( "|-" == substr ( $x , 0 , 2 ) ) # Allows for |---------------
		{
			$x = substr ( $x , 1 ) ;
			while ( $x != "" && substr ( $x , 0 , 1 ) == '-' ) $x = substr ( $x , 1 ) ;
			$z = "" ;
			$l = array_pop ( $ltd ) ;
			if ( array_pop ( $tr ) ) $z = "</tr>" . $z ;
			if ( array_pop ( $td ) ) $z = "</{$l}>" . $z ;
			array_pop ( $ltr ) ;
			$t[$k] = $z ;
			# The <tr> itself is emitted lazily with the row's first cell;
			# only its attributes are remembered here
			array_push ( $tr , false ) ;
			array_push ( $td , false ) ;
			array_push ( $ltd , "" ) ;
			array_push ( $ltr , $this->fixTagAttributes ( $x ) ) ;
		}
		else if ( "|" == $fc || "!" == $fc || "|+" == substr ( $x , 0 , 2 ) ) # Caption
		{
			if ( "|+" == substr ( $x , 0 , 2 ) )
			{
				$fc = "+" ;
				$x = substr ( $x , 1 ) ;
			}
			$after = substr ( $x , 1 ) ;
			if ( $fc == "!" ) $after = str_replace ( "!!" , "||" , $after ) ;
			$after = explode ( "||" , $after ) ;
			$t[$k] = "" ;
			foreach ( $after AS $theline )
			{
				$z = "" ;
				if ( $fc != "+" )
				{
					# Open the row if this is its first cell
					$tra = array_pop ( $ltr ) ;
					if ( !array_pop ( $tr ) ) $z = "<tr {$tra}>\n" ;
					array_push ( $tr , true ) ;
					array_push ( $ltr , "" ) ;
				}

				# Close the previous cell, if any
				$l = array_pop ( $ltd ) ;
				if ( array_pop ( $td ) ) $z = "</{$l}>" . $z ;
				if ( $fc == "|" ) $l = "TD" ;
				else if ( $fc == "!" ) $l = "TH" ;
				else if ( $fc == "+" ) $l = "CAPTION" ;
				else $l = "" ;
				array_push ( $ltd , $l ) ;
				$y = explode ( "|" , $theline , 2 ) ;
				if ( count ( $y ) == 1 ) $cell = "{$z}<{$l}>{$y[0]}" ;
				else $cell = "{$z}<{$l} ".$this->fixTagAttributes($y[0]).">{$y[1]}" ;
				$t[$k] .= $cell ;
				array_push ( $td , true ) ;
			}
		}
	}

	# Closing open td, tr && table
	while ( count ( $td ) > 0 )
	{
		# Close the cell with the tag that actually opened it (TD, TH or CAPTION)
		$l = array_pop ( $ltd ) ;
		if ( array_pop ( $td ) ) $t[] = "</{$l}>" ;
		if ( array_pop ( $tr ) ) $t[] = "</tr>" ;
		$t[] = "</table>" ;
	}

	$t = implode ( "\n" , $t ) ;
	return $t ;
}
412
413 # Well, OK, it's actually about 14 passes. But since all the
414 # hard lifting is done inside PHP's regex code, it probably
415 # wouldn't speed things up much to add a real parser.
416 #
# Runs the individual rendering steps over $text in a fixed order and
# returns the resulting HTML. $linestart is passed on to doBlockLevels().
# The order of the steps matters; do not rearrange them.
function doWikiPass2( $text, $linestart )
{
	$profileKey = "Parser::doWikiPass2";
	wfProfileIn( $profileKey );

	# Sanitise the HTML first, then expand variables
	$text = $this->replaceVariables( $this->removeHTMLtags( $text ) );

	$text = str_replace ( "<HR>", "<hr>", $text );

	# Headings, then paragraphs/lists/pre
	$text = $this->doBlockLevels( $this->doHeadings( $text ), $linestart );

	if ( $this->mOptions->getUseDynamicDates() ) {
		global $wgDateFormatter;
		$dateFormat = $this->mOptions->getDateFormat();
		$text = $wgDateFormatter->reformat( $dateFormat, $text );
	}

	# External links have to be done before internal ones
	$text = $this->replaceExternalLinks( $text );
	$text = $this->replaceInternalLinks ( $text );
	$text = $this->doTableStuff ( $text ) ;

	$text = $this->formatHeadings( $text );

	# Skin-specific post-processing, followed by the category listing
	$skin =& $this->mOptions->getSkin();
	$text = $skin->transformContent( $text ) . $this->categoryMagic () ;

	wfProfileOut( $profileKey );
	return $text;
}
449
450
# Turns lines of the form ==Heading== into <hN>Heading</hN>, where N is
# the number of equals signs (1 to 6). Deeper levels are handled first so
# that, e.g., === is not consumed by the == pattern.
/* private */ function doHeadings( $text )
{
	$level = 6;
	while ( $level >= 1 ) {
		$marks = str_repeat( "=", $level );
		$pattern = "/^{$marks}(.+){$marks}(\\s|$)/m";
		$replacement = "<h{$level}>\\1</h{$level}>\\2";
		$text = preg_replace( $pattern, $replacement, $text );
		--$level;
	}
	return $text;
}
460
461 # Note: we have to do external links before the internal ones,
462 # and otherwise take great care in the order of things here, so
463 # that we don't end up interpreting some URLs twice.
464
# Applies subReplaceExternalLinks() once per supported URL scheme, in a
# fixed order. The flag says whether unlabeled bracketed links of that
# scheme are auto-numbered.
/* private */ function replaceExternalLinks( $text )
{
	$profileKey = "Parser::replaceExternalLinks";
	wfProfileIn( $profileKey );

	$schemes = array(
		"http" => true,
		"https" => true,
		"ftp" => false,
		"irc" => false,
		"gopher" => false,
		"news" => false,
		"mailto" => false
	);
	foreach ( $schemes as $scheme => $autonumber ) {
		$text = $this->subReplaceExternalLinks( $text, $scheme, $autonumber );
	}

	wfProfileOut( $profileKey );
	return $text;
}
479
# Converts the external links of one URL scheme in $s to HTML.
#
# Parameters:
#   $s          - text to process
#   $protocol   - scheme without the colon, e.g. "http"
#   $autonumber - when true, bracketed links without a label are shown
#                 as [n] using the running $mAutonumber counter; it also
#                 gates the inline-image handling
#
# Step 1 handles bare URLs (and image URLs, if external images are
# allowed). While doing so the scheme is temporarily replaced by $unique
# so that an already converted URL is not matched a second time.
# Step 2 handles the bracketed forms [url] and [url label].
#
# Returns the converted text.
/* private */ function subReplaceExternalLinks( $s, $protocol, $autonumber )
{
	$unique = "4jzAfzB8hNvf4sqyO9Edd8pSmk9rE2in0Tgw3";
	$uc = "A-Za-z0-9_\\/~%\\-+&*#?!=()@\\x80-\\xFF";

	# this is the list of separators that should be ignored if they
	# are the last character of an URL but that should be included
	# if they occur within the URL, e.g. "go to www.foo.com, where .."
	# in this case, the last comma should not become part of the URL,
	# but in "www.foo.com/123,2342,32.htm" it should.
	$sep = ",;\.:";
	$fnc = "A-Za-z0-9_.,~%\\-+&;#*?!=()@\\x80-\\xFF";
	$images = "gif|png|jpg|jpeg";

	# PLEASE NOTE: The curly braces { } are not part of the regex,
	# they are interpreted as part of the string (used to tell PHP
	# that the content of the string should be inserted there).
	$e1 = "/(^|[^\\[])({$protocol}:)([{$uc}{$sep}]+)\\/([{$fnc}]+)\\." .
		"((?i){$images})([^{$uc}]|$)/";

	$e2 = "/(^|[^\\[])({$protocol}:)(([".$uc."]|[".$sep."][".$uc."])+)([^". $uc . $sep. "]|[".$sep."]|$)/";
	$sk =& $this->mOptions->getSkin();

	if ( $autonumber and $this->mOptions->getAllowExternalImages() ) { # Use img tags only for HTTP urls
		$s = preg_replace( $e1, "\\1" . $sk->makeImage( "{$unique}:\\3" .
			"/\\4.\\5", "\\4.\\5" ) . "\\6", $s );
	}
	$s = preg_replace( $e2, "\\1" . "<a href=\"{$unique}:\\3\"" .
		$sk->getExternalLinkAttributes( "{$unique}:\\3", wfEscapeHTML(
		"{$unique}:\\3" ) ) . ">" . wfEscapeHTML( "{$unique}:\\3" ) .
		"</a>\\5", $s );
	$s = str_replace( $unique, $protocol, $s );

	# Bracketed links: split at every "[protocol:"; the leading blank
	# guarantees a first chunk even if the text starts with a link
	$a = explode( "[{$protocol}:", " " . $s );
	$s = array_shift( $a );
	$s = substr( $s, 1 );

	$e1 = "/^([{$uc}"."{$sep}]+)](.*)\$/sD";
	$e2 = "/^([{$uc}"."{$sep}]+)\\s+([^\\]]+)](.*)\$/sD";

	foreach ( $a as $line ) {
		if ( preg_match( $e1, $line, $m ) ) {
			# [url]
			$link = "{$protocol}:{$m[1]}";
			$trail = $m[2];
			if ( $autonumber ) { $text = "[" . ++$this->mAutonumber . "]"; }
			else { $text = wfEscapeHTML( $link ); }
		} else if ( preg_match( $e2, $line, $m ) ) {
			# [url label]
			$link = "{$protocol}:{$m[1]}";
			$text = $m[2];
			$trail = $m[3];
		} else {
			# Not a valid link; put the text back untouched
			$s .= "[{$protocol}:" . $line;
			continue;
		}
		# The pattern below is delimited by "!", so that is the delimiter
		# preg_quote() has to escape in the label
		if( $link == $text || preg_match( "!$protocol://" . preg_quote( $text, "!" ) . "/?$!", $link ) ) {
			$paren = "";
		} else {
			# Expand the URL for printable version
			$paren = "<span class='urlexpansion'> (<i>" . htmlspecialchars ( $link ) . "</i>)</span>";
		}
		$la = $sk->getExternalLinkAttributes( $link, $text );
		$s .= "<a href='{$link}'{$la}>{$text}</a>{$paren}{$trail}";

	}
	return $s;
}
546
# Handles a ''' token: opens <strong> if none is open, closes it otherwise.
#
# $state["strong"] / $state["em"] hold the position of the token that
# opened the element, or FALSE when it is not open. If <em> was opened
# after the <strong> being closed, it is closed and reopened around the
# </strong> to keep the tags properly nested.
#
# Returns the HTML to emit and updates $state.
/* private */ function handle3Quotes( &$state, $token )
{
	if ( !$state["strong"] ) {
		$state["strong"] = $token["pos"];
		return "<strong>";
	}

	# ''' lala ''lala '''
	$emOpenedInside = ( $state["em"] && $state["em"] > $state["strong"] );
	$state["strong"] = FALSE;

	return $emOpenedInside ? "</em></strong><em>" : "</strong>";
}
564
# Handles a '' token: opens <em> if none is open, closes it otherwise.
#
# $state["em"] / $state["strong"] hold the position of the token that
# opened the element, or FALSE when it is not open. If <strong> was
# opened after the <em> being closed, it is closed and reopened around
# the </em> to keep the tags properly nested.
#
# Returns the HTML to emit and updates $state.
/* private */ function handle2Quotes( &$state, $token )
{
	if ( !$state["em"] ) {
		$state["em"] = $token["pos"];
		return "<em>";
	}

	# ''lala'''lala'' ....'''
	$strongOpenedInside = ( $state["strong"] && $state["strong"] > $state["em"] );
	$state["em"] = FALSE;

	return $strongOpenedInside ? "</strong></em><strong>" : "</em>";
}
582
# Handles a ''''' token, which toggles both emphasis levels at once.
#
# $state["em"] / $state["strong"] hold the position of the token that
# opened the element, or FALSE when it is not open:
#   both open   - close both, innermost (most recently opened) first
#   only em     - close em, open strong
#   only strong - close strong, open em
#   none open   - open strong, then em
#
# Returns the HTML to emit and updates $state.
/* private */ function handle5Quotes( &$state, $token )
{
	if ( $state["em"] && $state["strong"] ) {
		if ( $state["em"] < $state["strong"] ) {
			$s = "</strong></em>";
		} else {
			$s = "</em></strong>";
		}
		$state["strong"] = $state["em"] = FALSE;
	} elseif ( $state["em"] ) {
		$s = "</em><strong>";
		$state["em"] = FALSE;
		$state["strong"] = $token["pos"];
	} elseif ( $state["strong"] ) {
		$s = "</strong><em>";
		$state["strong"] = FALSE;
		$state["em"] = $token["pos"];
	} else { # not $em and not $strong
		$s = "<strong><em>";
		$state["strong"] = $state["em"] = $token["pos"];
	}
	return $s;
}
606
# Token-driven pass over $str that renders internal links, quote markup
# ('' ''' '''''), ---- rules and the RFC / ISBN magic words.
#
# While a [[ is open, everything that is produced is pushed onto
# $tokenStack instead of being appended to the output; the matching ]]
# pops the pieces, glues them together and passes them to
# handleInternalLink(). Anything still on the stack at the end of the
# text is emitted literally.
#
# Returns the converted text.
/* private */ function replaceInternalLinks( $str )
{
	global $wgLang; # for language specific parser hook

	$tokenizer=Tokenizer::newFromString( $str );
	$tokenStack = array();

	$s="";
	$state = array( "em" => FALSE, "strong" => FALSE );
	$tagIsOpen = FALSE;

	# The tokenizer splits the text into tokens and returns them one by one.
	# Every call to the tokenizer returns a new token.
	while ( $token = $tokenizer->nextToken() )
	{
		switch ( $token["type"] )
		{
			case "text":
				# simple text with no further markup
				$txt = $token["text"];
				break;
			case "[[[":
				# opened with 3 [; the token type on the stack records
				# this until the closing tag is found
			case "[[":
				# link opening tag.
				# FIXME : Treat orphaned open tags (stack not empty when text is over)
				$tagIsOpen = TRUE;
				array_push( $tokenStack, $token );
				$txt="";
				break;

			case "]]]":
			case "]]":
				# link close tag.
				# get text from stack, glue it together, and call the code to handle a
				# link

				if ( count( $tokenStack ) == 0 )
				{
					# stack empty. Found a ]] without an opening [[
					$txt = "]]";
				} else {
					$linkText = "";
					$lastToken = array_pop( $tokenStack );
					while ( !(($lastToken["type"] == "[[[") or ($lastToken["type"] == "[[")) )
					{
						if( !empty( $lastToken["text"] ) ) {
							$linkText = $lastToken["text"] . $linkText;
						}
						$lastToken = array_pop( $tokenStack );
					}

					$txt = $linkText ."]]";

					if( isset( $lastToken["text"] ) ) {
						$prefix = $lastToken["text"];
					} else {
						$prefix = "";
					}
					$nextToken = $tokenizer->previewToken();
					if ( $nextToken["type"] == "text" )
					{
						# Preview just looks at it. Now we have to fetch it.
						$nextToken = $tokenizer->nextToken();
						$txt .= $nextToken["text"];
					}
					$txt = $this->handleInternalLink( $txt, $prefix );

					# did the tag start with 3 [ ? Ask the opener that was
					# just popped; a flag set when the opener was read would
					# not survive the tokens in between.
					if ( $lastToken["type"] == "[[[" ) {
						# show the first as text
						$txt = "[".$txt;
					}

				}
				$tagIsOpen = (count( $tokenStack ) != 0);
				break;
			case "----":
				$txt = "\n<hr>\n";
				break;
			case "'''":
				# This and the three next ones handle quotes
				$txt = $this->handle3Quotes( $state, $token );
				break;
			case "''":
				$txt = $this->handle2Quotes( $state, $token );
				break;
			case "'''''":
				$txt = $this->handle5Quotes( $state, $token );
				break;
			case "":
				# empty token
				$txt="";
				break;
			case "RFC ":
				if ( $tagIsOpen ) {
					$txt = "RFC ";
				} else {
					$txt = $this->doMagicRFC( $tokenizer );
				}
				break;
			case "ISBN ":
				if ( $tagIsOpen ) {
					$txt = "ISBN ";
				} else {
					$txt = $this->doMagicISBN( $tokenizer );
				}
				break;
			default:
				# Call language specific Hook.
				$txt = $wgLang->processToken( $token, $tokenStack );
				if ( NULL == $txt ) {
					# An unknown token. Highlight.
					$txt = "<font color=\"#FF0000\"><b>".$token["type"]."</b></font>";
					$txt .= "<font color=\"#FFFF00\"><b>".$token["text"]."</b></font>";
				}
				break;
		}
		# If we're parsing the interior of a link, don't append the interior to $s,
		# but push it to the stack so it can be processed when a ]] token is found.
		if ( $tagIsOpen && $txt != "" ) {
			$token["type"] = "text";
			$token["text"] = $txt;
			array_push( $tokenStack, $token );
		} else {
			$s .= $txt;
		}
	} #end while
	if ( count( $tokenStack ) != 0 )
	{
		# still objects on stack. opened [[ tag without closing ]] tag.
		$txt = "";
		while ( $lastToken = array_pop( $tokenStack ) )
		{
			if ( $lastToken["type"] == "text" )
			{
				$txt = $lastToken["text"] . $txt;
			} else {
				$txt = $lastToken["type"] . $txt;
			}
		}
		$s .= $txt;
	}
	return $s;
}
757
# Renders one internal link.
#
# Parameters:
#   $line   - the link without its opening brackets, i.e.
#             "target|label]]trail" (label optional)
#   $prefix - text passed in by the caller that belongs in front of the link
#
# Depending on the target this yields an ordinary link, an image, a media
# or special-page link, bold text for a link to the current page, or
# nothing visible (interlanguage and category links are recorded in
# $mOutput instead). Input that does not have the expected form is
# returned literally with "[[" put back in front.
#
# Returns the HTML for the link including prefix and trail.
/* private */ function handleInternalLink( $line, $prefix )
{
	global $wgLang, $wgLinkCache;
	global $wgNamespacesWithSubpages;
	# NOTE(review): the profiling key names replaceInternalLinks, not this
	# function; kept as is because profiling output may rely on it
	static $fname = "Parser::replaceInternalLinks" ;
	wfProfileIn( $fname );

	wfProfileIn( "$fname-setup" );
	static $tc = FALSE;
	if ( !$tc ) { $tc = Title::legalChars() . "#"; }
	$sk =& $this->mOptions->getSkin();

	# Match a link having the form [[namespace:link|alternate]]trail
	static $e1 = FALSE;
	if ( !$e1 ) { $e1 = "/^([{$tc}]+)(?:\\|([^]]+))?]](.*)\$/sD"; }

	# Special and Media are pseudo-namespaces; no pages actually exist in them
	static $image = FALSE;
	static $special = FALSE;
	static $media = FALSE;
	static $category = FALSE;
	if ( !$image ) { $image = Namespace::getImage(); }
	if ( !$special ) { $special = Namespace::getSpecial(); }
	if ( !$media ) { $media = Namespace::getMedia(); }
	if ( !$category ) { $category = wfMsg ( "category" ) ; }

	$curNs = $this->mTitle->getNamespace();
	$nottalk = !Namespace::isTalk( $curNs );

	wfProfileOut( "$fname-setup" );
	$s = "";

	if ( preg_match( $e1, $line, $m ) ) { # page with normal text or alt
		$text = $m[2];
		$trail = $m[3];
	} else { # Invalid form; output directly
		$s .= $prefix . "[[" . $line ;
		wfProfileOut( $fname );
		return $s;
	}

	/* Valid link forms:
	Foobar -- normal
	:Foobar -- override special treatment of prefix (images, language links)
	/Foobar -- convert to CurrentPage/Foobar
	/Foobar/ -- convert to CurrentPage/Foobar, strip the initial / from text
	*/
	$c = substr($m[1],0,1);
	$noforce = ($c != ":");
	if( $c == "/" ) { # subpage
		if(substr($m[1],-1,1)=="/") { # / at end means we don't want the slash to be shown
			$m[1]=substr($m[1],1,strlen($m[1])-2);
			$noslash=$m[1];
		} else {
			$noslash=substr($m[1],1);
		}
		if( !empty( $wgNamespacesWithSubpages[$curNs] ) ) { # subpages allowed here
			$link = $this->mTitle->getPrefixedText(). "/" . trim($noslash);
			if( "" == $text ) {
				$text= $m[1];
			} # this might be changed for ugliness reasons
		} else {
			$link = $noslash; # no subpage allowed, use standard link
		}
	} elseif( $noforce ) { # no subpage
		$link = $m[1];
	} else {
		$link = substr( $m[1], 1 );
	}
	if( "" == $text )
		$text = $link;

	$nt = Title::newFromText( $link );
	if( !$nt ) {
		$s .= $prefix . "[[" . $line;
		wfProfileOut( $fname );
		return $s;
	}
	$ns = $nt->getNamespace();
	$iw = $nt->getInterWiki();
	if( $noforce ) {
		if( $iw && $this->mOptions->getInterwikiMagic() && $nottalk && $wgLang->getLanguageName( $iw ) ) {
			# Interlanguage link: recorded, not shown inline
			array_push( $this->mOutput->mLanguageLinks, $nt->getPrefixedText() );
			$s .= $prefix . $trail;
			wfProfileOut( $fname );
			return $s;
		}
		if( $ns == $image ) {
			$s .= $prefix . $sk->makeImageLinkObj( $nt, $text ) . $trail;
			$wgLinkCache->addImageLinkObj( $nt );
			wfProfileOut( $fname );
			return $s;
		}
	}
	# A link to the current page itself is shown in bold, unless it carries
	# a fragment. Strict comparison: a "#" at offset 0 is still a fragment.
	if( ( $nt->getPrefixedText() == $this->mTitle->getPrefixedText() ) &&
		( strpos( $link, "#" ) === FALSE ) ) {
		$s .= $prefix . "<strong>" . $text . "</strong>" . $trail;
		wfProfileOut( $fname );
		return $s;
	}

	# Category feature
	$catns = strtoupper ( $nt->getDBkey () ) ;
	$catns = explode ( ":" , $catns ) ;
	if ( count ( $catns ) > 1 ) $catns = array_shift ( $catns ) ;
	else $catns = "" ;
	if ( $catns == strtoupper($category) && $this->mOptions->getUseCategoryMagic() ) {
		$t = explode ( ":" , $nt->getText() ) ;
		array_shift ( $t ) ;
		$t = implode ( ":" , $t ) ;
		$t = $wgLang->ucFirst ( $t ) ;
		$nnt = Title::newFromText ( $category.":".$t ) ;
		$t = $sk->makeLinkObj( $nnt, $t, "", $trail , $prefix );
		# Category link: recorded, not shown inline
		$this->mOutput->mCategoryLinks[] = $t ;
		$s .= $prefix . $trail ;
		wfProfileOut( $fname );
		return $s ;
	}
	if( $ns == $media ) {
		$s .= $prefix . $sk->makeMediaLinkObj( $nt, $text ) . $trail;
		$wgLinkCache->addImageLinkObj( $nt );
		wfProfileOut( $fname );
		return $s;
	} elseif( $ns == $special ) {
		$s .= $prefix . $sk->makeKnownLinkObj( $nt, $text, "", $trail );
		wfProfileOut( $fname );
		return $s;
	}
	$s .= $sk->makeLinkObj( $nt, $text, "", $trail , $prefix );

	wfProfileOut( $fname );
	return $s;
}
888
889 # Some functions here used by doBlockLevels()
890 #
# Ends the block recorded in $mLastSection and clears that record.
# A closing tag is produced for every section type except "p" and "";
# the returned string always ends with a newline.
/* private */ function closeParagraph()
{
	$section = $this->mLastSection;
	$this->mLastSection = "";

	$isParagraph = ( 0 == strcmp( "p", $section ) );
	$isNone = ( 0 == strcmp( "", $section ) );

	$closing = "";
	if ( !$isParagraph && !$isNone ) {
		$closing = "</" . $section . ">";
	}
	return $closing . "\n";
}
# getCommon() returns the length of the longest common prefix of its two
# arguments, i.e. the number of leading characters they share.
903 #
# Counts how many leading characters $st1 and $st2 have in common.
# Returns 0 if either string is empty or the first characters differ.
# Uses [] string offsets; the {} form is no longer accepted by PHP 8.
/* private */ function getCommon( $st1, $st2 )
{
	$fl = strlen( $st1 );
	$shorter = strlen( $st2 );
	if ( $fl < $shorter ) { $shorter = $fl; }

	for ( $i = 0; $i < $shorter; ++$i ) {
		if ( $st1[$i] != $st2[$i] ) { break; }
	}
	return $i;
}
915 # These next three functions open, continue, and close the list
916 # element appropriate to the prefix character passed into them.
917 #
# Opens the list belonging to prefix character $char and its first item:
#   *  <ul><li>    #  <ol><li>    :  <dl><dd>    ;  <dl><dt>
# Any open paragraph is closed first. For ';' the definition-term flag
# $mDTopen is set. An unknown character yields only an error comment.
/* private */ function openList( $char )
{
	$lead = $this->closeParagraph();

	switch ( $char ) {
		case "*":
			return $lead . "<ul><li>";
		case "#":
			return $lead . "<ol><li>";
		case ":":
			return $lead . "<dl><dd>";
		case ";":
			$this->mDTopen = true;
			return $lead . "<dl><dt>";
	}
	return "<!-- ERR 1 -->";
}
933
# Closes the current list item and opens the next one for prefix $char.
# For definition lists the closing tag depends on whether a <dt> is
# currently open ($mDTopen); the flag is then set for ';' and cleared
# for ':'. An unknown character yields an error comment.
/* private */ function nextItem( $char )
{
	switch ( $char ) {
		case "*":
		case "#":
			return "</li><li>";

		case ":":
		case ";":
			$close = $this->mDTopen ? "</dt>" : "</dd>";
			$isTerm = ( ";" == $char );
			$this->mDTopen = $isTerm;
			return $close . ( $isTerm ? "<dt>" : "<dd>" );
	}
	return "<!-- ERR 2 -->";
}
950
# Closes the last item and the list belonging to prefix character $char
# ('*', '#' or ':'), followed by a newline. For ':' an open <dt> is
# closed instead of a <dd> and $mDTopen is cleared. Any other character
# yields an error comment without a newline.
/* private */function closeList( $char )
{
	switch ( $char ) {
		case "*":
			$closing = "</li></ul>";
			break;
		case "#":
			$closing = "</li></ol>";
			break;
		case ":":
			if ( $this->mDTopen ) {
				$this->mDTopen = false;
				$closing = "</dt></dl>";
			} else {
				$closing = "</dd></dl>";
			}
			break;
		default:
			return "<!-- ERR 3 -->";
	}
	return $closing . "\n";
}
966
# Line-by-line pass that produces the block-level structure: paragraphs,
# <pre> for lines starting with a blank, and lists for lines starting
# with any run of * # : ; characters.
#
# Parameters:
#   $text      - text to process
#   $linestart - when false, the first line is copied to the output
#                without any block-level processing
#
# $lastPref holds the list prefix of the previous line (with ';' mapped
# to ':'); comparing it with the current prefix decides which lists have
# to be closed, continued or opened. Inside table, blockquote and heading
# elements no paragraph handling takes place.
#
# Returns the converted text.
/* private */ function doBlockLevels( $text, $linestart )
{
	$fname = "Parser::doBlockLevels";
	wfProfileIn( $fname );
	# Parsing through the text line by line. The main thing
	# happening here is handling of block-level elements p, pre,
	# and making lists from lines starting with * # : etc.
	#
	$a = explode( "\n", $text );
	$text = $lastPref = "";
	$this->mDTopen = $inBlockElem = false;
	# Defined up front so the clean-up after the loop is safe even if
	# the loop body never runs
	$npl = 0;
	$pref2 = "";

	if ( ! $linestart ) { $text .= array_shift( $a ); }
	foreach ( $a as $t ) {
		if ( "" != $text ) { $text .= "\n"; }

		$oLine = $t;
		$opl = strlen( $lastPref );
		$npl = strspn( $t, "*#:;" );
		$pref = substr( $t, 0, $npl );
		$pref2 = str_replace( ";", ":", $pref );
		$t = substr( $t, $npl );

		if ( 0 != $npl && 0 == strcmp( $lastPref, $pref2 ) ) {
			# Same list nesting as the previous line: just a new item
			$text .= $this->nextItem( substr( $pref, -1 ) );

			if ( ";" == substr( $pref, -1 ) ) {
				# "; term : definition" on a single line
				$cpos = strpos( $t, ":" );
				if ( ! ( false === $cpos ) ) {
					$term = substr( $t, 0, $cpos );
					$text .= $term . $this->nextItem( ":" );
					$t = substr( $t, $cpos + 1 );
				}
			}
		} else if (0 != $npl || 0 != $opl) {
			# Nesting changed: close what no longer matches, open what is new
			$cpl = $this->getCommon( $pref, $lastPref );

			while ( $cpl < $opl ) {
				$text .= $this->closeList( $lastPref[$opl-1] );
				--$opl;
			}
			if ( $npl <= $cpl && $cpl > 0 ) {
				$text .= $this->nextItem( $pref[$cpl-1] );
			}
			while ( $npl > $cpl ) {
				$char = substr( $pref, $cpl, 1 );
				$text .= $this->openList( $char );

				if ( ";" == $char ) {
					$cpos = strpos( $t, ":" );
					if ( ! ( false === $cpos ) ) {
						$term = substr( $t, 0, $cpos );
						$text .= $term . $this->nextItem( ":" );
						$t = substr( $t, $cpos + 1 );
					}
				}
				++$cpl;
			}
			$lastPref = $pref2;
		}
		if ( 0 == $npl ) { # No prefix--go to paragraph mode
			if ( preg_match(
				"/(<table|<blockquote|<h1|<h2|<h3|<h4|<h5|<h6)/i", $t ) ) {
				$text .= $this->closeParagraph();
				$inBlockElem = true;
			}
			if ( ! $inBlockElem ) {
				# An empty line has no first character to inspect
				if ( "" != $t && " " == $t[0] ) {
					$newSection = "pre";
				}
				else { $newSection = "p"; }

				if ( 0 == strcmp( "", trim( $oLine ) ) ) {
					$text .= $this->closeParagraph();
					$text .= "<" . $newSection . ">";
				} else if ( 0 != strcmp( $this->mLastSection,
					$newSection ) ) {
					$text .= $this->closeParagraph();
					if ( 0 != strcmp( "p", $newSection ) ) {
						$text .= "<" . $newSection . ">";
					}
				}
				$this->mLastSection = $newSection;
			}
			if ( $inBlockElem &&
				preg_match( "/(<\\/table|<\\/blockquote|<\\/h1|<\\/h2|<\\/h3|<\\/h4|<\\/h5|<\\/h6)/i", $t ) ) {
				$inBlockElem = false;
			}
		}
		$text .= $t;
	}
	# Close the lists left open by the last line
	while ( $npl ) {
		$text .= $this->closeList( $pref2[$npl-1] );
		--$npl;
	}
	if ( "" != $this->mLastSection ) {
		if ( "p" != $this->mLastSection ) {
			$text .= "</" . $this->mLastSection . ">";
		}
		$this->mLastSection = "";
	}
	wfProfileOut( $fname );
	return $text;
}
1072
# Returns the current value of the built-in variable identified by $index
# (one of the MAG_* ids handled below), or NULL for any other id.
# Date parts come from date(); localised names and the time string are
# produced by the global $wgLang object.
function getVariableValue( $index ) {
	global $wgLang;

	if ( $index == MAG_CURRENTMONTH ) {
		return date( "m" );
	}
	if ( $index == MAG_CURRENTMONTHNAME ) {
		return $wgLang->getMonthName( date("n") );
	}
	if ( $index == MAG_CURRENTMONTHNAMEGEN ) {
		return $wgLang->getMonthNameGen( date("n") );
	}
	if ( $index == MAG_CURRENTDAY ) {
		return date("j");
	}
	if ( $index == MAG_CURRENTDAYNAME ) {
		# date("w") is 0-based; the weekday lookup is called with a 1-based index
		return $wgLang->getWeekdayName( date("w")+1 );
	}
	if ( $index == MAG_CURRENTYEAR ) {
		return date( "Y" );
	}
	if ( $index == MAG_CURRENTTIME ) {
		return $wgLang->time( wfTimestampNow(), false );
	}
	if ( $index == MAG_NUMBEROFARTICLES ) {
		return wfNumberOfArticles();
	}

	# Not a variable this function knows about
	return NULL;
}
1097
# Rebuilds $this->mVariables from scratch. For every id listed in the global
# $wgVariableIDs, the matching MagicWord object is asked to add itself to
# the table together with the value getVariableValue() reports for that id.
function initialiseVariables()
{
	global $wgVariableIDs;

	$this->mVariables = array();

	$ids = array_values( $wgVariableIDs );
	$total = count( $ids );
	for ( $n = 0; $n < $total; $n++ ) {
		$word =& MagicWord::get( $ids[$n] );
		$value = $this->getVariableValue( $ids[$n] );
		$word->addToArray( $this->mVariables, $value );
	}
}
1108
# Expands {{...}} constructs in $text and returns the result.
#
# Each pass runs preg_replace_callback() with the global function
# wfBraceSubstitution(), which forwards to $wgCurParser->braceSubstitution().
# Passes are repeated until the text stops changing or the pass limit is hit,
# so that text pulled in by one pass can itself be expanded by the next.
/* private */ function replaceVariables( $text )
{
	global $wgCurParser;

	$fname = "Parser::replaceVariables";
	wfProfileIn( $fname );

	$bail = false;
	if ( !$this->mVariables ) {
		$this->initialiseVariables();
	}
	$titleChars = Title::legalChars();
	$regex = "/{{([$titleChars]*?)}}/s";

	# "Recursive" variable expansion: run it through a couple of passes
	# NOTE(review): the pass limit here is MAX_INCLUDE_REPEAT, although
	# MAX_INCLUDE_PASSES is the constant described as the recursion depth;
	# left unchanged because switching would alter how deep nesting expands.
	for ( $i=0; $i<MAX_INCLUDE_REPEAT && !$bail; $i++ ) {
		$oldText = $text;

		# It's impossible to rebind a global in PHP
		# Instead, we run the substitution on a copy, then merge the changed fields back in
		$wgCurParser = $this->fork();

		$text = preg_replace_callback( $regex, "wfBraceSubstitution", $text );
		if ( $oldText == $text ) {
			# Nothing was substituted in this pass, so further passes are pointless
			$bail = true;
		}
		$this->merge( $wgCurParser );
	}

	# Balance the wfProfileIn() above; previously the section was never closed
	wfProfileOut( $fname );
	return $text;
}
1141
1142 # Returns a copy of this object except with various variables cleared
1143 # This copy can be re-merged with the parent after operations on the copy
function fork()
{
	# The child starts out as this parser with a fresh, empty ParserOutput;
	# every other field is carried over unchanged.
	# NOTE(review): this relies on "=" copying the object (PHP 4 by-value
	# semantics). With object handles the child would alias $this and the
	# assignment below would replace this parser's own output - confirm the
	# target runtime.
	$child = $this;
	$child->mOutput = new ParserOutput;

	return $child;
}
1150
1151 # Merges a copy split off with fork()
function merge( &$copy )
{
	# Fold the copy's parser output into ours
	$this->mOutput->merge( $copy->mOutput );

	# Add the copy's per-page inclusion counters to ours, creating entries
	# for pages we have not counted yet.
	# NOTE(review): fork() does not reset mIncludeCount, so the copy's
	# counters appear to already contain ours - verify this is not
	# double counting.
	foreach( $copy->mIncludeCount as $pageKey => $hits ) {
		if ( !array_key_exists( $pageKey, $this->mIncludeCount ) ) {
			$this->mIncludeCount[$pageKey] = 0;
		}
		$this->mIncludeCount[$pageKey] += $hits;
	}
}
1165
# Callback body for one {{...}} match found by replaceVariables().
#
# $matches[0] is the whole construct including braces, $matches[1] the text
# between them. Returns the replacement text, or $matches[0] unchanged when
# nothing applies. Resolution order:
#   1. SUBST: handling, which depends on $this->mOutputType
#   2. MSGNW: / MSG: / INT: prefixes
#   3. built-in variables from $this->mVariables
#   4. the text of the page with that title (default namespace NS_TEMPLATE),
#      limited to MAX_INCLUDE_REPEAT inclusions per page
function braceSubstitution( $matches )
{
	global $wgLinkCache;
	$fname = "Parser::braceSubstitution";
	$found = false;
	$nowiki = false;

	$text = $matches[1];

	# SUBST
	$mwSubst =& MagicWord::get( MAG_SUBST );
	if ( $mwSubst->matchStartAndRemove( $text ) ) {
		if ( $this->mOutputType == OT_HTML ) {
			# Invalid SUBST not replaced at PST time
			# Return without further processing
			$text = $matches[0];
			$found = true;
		}
	} elseif ( $this->mOutputType == OT_WIKI ) {
		# SUBST not found in PST pass, do nothing
		$text = $matches[0];
		$found = true;
	}

	# Various prefixes
	if ( !$found ) {
		# Check for MSGNW:
		$mwMsgnw =& MagicWord::get( MAG_MSGNW );
		if ( $mwMsgnw->matchStartAndRemove( $text ) ) {
			$nowiki = true;
		} else {
			# Remove obsolete MSG:
			$mwMsg =& MagicWord::get( MAG_MSG );
			$mwMsg->matchStartAndRemove( $text );
		}

		# Check if it is an internal message
		$mwInt =& MagicWord::get( MAG_INT );
		if ( $mwInt->matchStartAndRemove( $text ) ) {
			$text = wfMsg( $text );
			$found = true;
		}
	}

	# Check for a match against internal variables
	if ( !$found && array_key_exists( $text, $this->mVariables ) ) {
		$text = $this->mVariables[$text];
		$found = true;
		$this->mOutput->mContainsOldMagic = true;
	}

	# Load from database
	if ( !$found ) {
		$title = Title::newFromText( $text, NS_TEMPLATE );
		# Guard on the title object itself: the old check tested $text, so a
		# NULL title went straight on to $title->isExternal().
		if ( !is_null( $title ) && !$title->isExternal() ) {
			# Check for excessive inclusion
			$dbk = $title->getPrefixedDBkey();
			if ( !array_key_exists( $dbk, $this->mIncludeCount ) ) {
				$this->mIncludeCount[$dbk] = 0;
			}
			if ( ++$this->mIncludeCount[$dbk] <= MAX_INCLUDE_REPEAT ) {
				$row = wfGetArray( "cur", array("cur_text"), array(
				  "cur_namespace" => $title->getNamespace(),
				  "cur_title" => $title->getDBkey() ), $fname );
				if ( $row !== false ) {
					$found = true;
					$text = $row->cur_text;

					# Escaping and link table handling
					# Not required for preSaveTransform()
					if ( $this->mOutputType == OT_HTML ) {
						if ( $nowiki ) {
							$text = wfEscapeWikiText( $text );
						} else {
							$text = $this->removeHTMLtags( $text );
						}
						# Links inside the included text are resolved with the
						# link cache suspended; only the included page itself
						# is registered afterwards.
						$wgLinkCache->suspend();
						$text = $this->replaceInternalLinks( $text );
						$wgLinkCache->resume();
						$wgLinkCache->addLinkObj( $title );
					}
				}
			}

			# If the title is valid but undisplayable, make a link to it
			if ( $this->mOutputType == OT_HTML && !$found ) {
				$text = "[[" . $title->getPrefixedText() . "]]";
				$found = true;
			}
		}
	}

	# Leave the construct untouched when nothing matched
	return $found ? $text : $matches[0];
}
1265
1266 # Cleans up HTML, removes dangerous tags and attributes
/* private */ function removeHTMLtags( $text )
{
	# Splits $text on "<" and examines every fragment. Fragments that form a
	# whitelisted tag in a legal position are kept, with their attributes
	# passed through fixTagAttributes(); everything else has its "<" and ">"
	# turned into &lt; / &gt;. Paired tags still open at the end are closed.
	# Returns the cleaned text.
	$fname = "Parser::removeHTMLtags";
	wfProfileIn( $fname );
	$htmlpairs = array( # Tags that must be closed
		"b", "i", "u", "font", "big", "small", "sub", "sup", "h1",
		"h2", "h3", "h4", "h5", "h6", "cite", "code", "em", "s",
		"strike", "strong", "tt", "var", "div", "center",
		"blockquote", "ol", "ul", "dl", "table", "caption", "pre",
		"ruby", "rt" , "rb" , "rp"
	);
	$htmlsingle = array(
		"br", "p", "hr", "li", "dt", "dd"
	);
	$htmlnest = array( # Tags that can be nested--??
		"table", "tr", "td", "th", "div", "blockquote", "ol", "ul",
		"dl", "font", "big", "small", "sub", "sup"
	);
	$tabletags = array( # Can only appear inside table
		"td", "th", "tr"
	);

	$htmlsingle = array_merge( $tabletags, $htmlsingle );
	$htmlelements = array_merge( $htmlsingle, $htmlpairs );

	# NOTE(review): the result is not used in this function; the call is kept
	# in case getHTMLattrs() has side effects - confirm and drop.
	$htmlattrs = $this->getHTMLattrs();

	# Remove HTML comments
	$text = preg_replace( "/<!--.*-->/sU", "", $text );

	$bits = explode( "<", $text );
	$text = array_shift( $bits );
	$tagstack = array(); $tablestack = array();

	foreach ( $bits as $x ) {
		$badtag = 0;

		# Split the fragment into: optional "/", tag name, attribute text,
		# closing bracket and trailing text. A fragment that does not have
		# this shape is not a tag and is escaped below. (The match result is
		# checked explicitly instead of silencing notices via
		# error_reporting().)
		$isTag = preg_match( "/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/",
		  $x, $regs );
		if ( $isTag ) {
			$slash = $regs[1];
			$t = strtolower( $regs[2] );
			$params = $regs[3];
			$brace = $regs[4];
			$rest = $regs[5];
		}

		if ( $isTag && in_array( $t, $htmlelements ) ) {
			# Check our stack
			if ( $slash ) {
				# Closing a tag...
				if ( ! in_array( $t, $htmlsingle ) &&
				  ( $ot = array_pop( $tagstack ) ) != $t ) {
					# Not the innermost open tag: undo the pop. When the stack
					# was empty there is nothing to put back (previously a
					# NULL entry was pushed).
					if ( !is_null( $ot ) ) {
						array_push( $tagstack, $ot );
					}
					$badtag = 1;
				} else {
					if ( $t == "table" ) {
						# Leaving a table: return to the stack saved on entry
						$tagstack = array_pop( $tablestack );
						if ( !is_array( $tagstack ) ) {
							$tagstack = array();
						}
					}
					$newparams = "";
				}
			} else {
				# Keep track for later
				if ( in_array( $t, $tabletags ) &&
				  ! in_array( "table", $tagstack ) ) {
					$badtag = 1;
				} else if ( in_array( $t, $tagstack ) &&
				  ! in_array ( $t , $htmlnest ) ) {
					$badtag = 1 ;
				} else if ( ! in_array( $t, $htmlsingle ) ) {
					if ( $t == "table" ) {
						# Each table gets its own tag stack; the outer one is
						# saved until the table is closed.
						array_push( $tablestack, $tagstack );
						$tagstack = array();
					}
					array_push( $tagstack, $t );
				}
				# Strip non-approved attributes from the tag
				$newparams = $this->fixTagAttributes($params);
			}
			if ( ! $badtag ) {
				$rest = str_replace( ">", "&gt;", $rest );
				$text .= "<$slash$t $newparams$brace$rest";
				continue;
			}
		}
		# Unknown, malformed or misplaced tag: show it literally
		$text .= "&lt;" . str_replace( ">", "&gt;", $x);
	}
	# Close off any remaining tags
	while ( $t = array_pop( $tagstack ) ) {
		$text .= "</$t>\n";
		if ( $t == "table" ) {
			$tagstack = array_pop( $tablestack );
			if ( !is_array( $tagstack ) ) {
				$tagstack = array();
			}
		}
	}
	wfProfileOut( $fname );
	return $text;
}
1358
1359 /*
1360 *
1361 * This function accomplishes several tasks:
1362 * 1) Auto-number headings if that option is enabled
1363 * 2) Add an [edit] link to sections for logged in users who have enabled the option
1364 * 3) Add a Table of contents on the top for users who have enabled the option
1365 * 4) Auto-anchor headings
1366 *
1367 * It loops through all headlines, collects the necessary data, then splits up the
1368 * string and re-inserts the newly formatted headlines.
1369 *
1370 */
1371
/* private */ function formatHeadings( $text )
{
	# $text: HTML in which headings already appear as <h1>..<h6> elements.
	# Returns the text with every heading rebuilt (anchor, optional number,
	# optional edit link) and, when enabled, a table of contents inserted
	# after the first block of text.
	$doNumberHeadings = $this->mOptions->getNumberHeadings();
	$doShowToc = $this->mOptions->getShowToc();
	if( !$this->mTitle->userCanEdit() ) {
		$showEditLink = 0;
		$rightClickHack = 0;
	} else {
		$showEditLink = $this->mOptions->getEditSection();
		$rightClickHack = $this->mOptions->getEditSectionOnRightClick();
	}

	# Inhibit editsection links if requested in the page
	$esw =& MagicWord::get( MAG_NOEDITSECTION );
	if( $esw->matchAndRemove( $text ) ) {
		$showEditLink = 0;
	}
	# if the string __NOTOC__ (not case-sensitive) occurs in the HTML,
	# do not add TOC
	$mw =& MagicWord::get( MAG_NOTOC );
	if( $mw->matchAndRemove( $text ) ) {
		$doShowToc = 0;
	}

	# never add the TOC to the Main Page. This is an entry page that should not
	# be more than 1-2 screens large anyway
	if( $this->mTitle->getPrefixedText() == wfMsg("mainpage") ) {
		$doShowToc = 0;
	}

	# We need this to perform operations on the HTML
	$sk =& $this->mOptions->getSkin();

	# Get all headlines for numbering them and adding funky stuff like [edit]
	# links
	# $matches[1] = heading level, $matches[2] = rest of the opening tag
	# including its ">", $matches[3] = heading content
	preg_match_all( "/<H([1-6])(.*?" . ">)(.*?)<\/H[1-6]>/i", $text, $matches );

	# headline counter
	$headlineCount = 0;

	# Ugh .. the TOC should have neat indentation levels which can be
	# passed to the skin functions. These are determined here
	$toclevel = 0;
	$toc = "";
	$full = "";
	$head = array();
	$sublevelCount = array();

	# State that used to be read before ever being assigned
	$level = 0;
	$prevlevel = 0;
	$numbering = "";
	$dot = 0;
	$refer = array();
	$refers = array();
	$refcount = array();
	$toclines = 0;

	foreach( $matches[3] as $headline ) {
		$head[$headlineCount] = "";

		if( $level ) {
			$prevlevel = $level;
		}
		$level = $matches[1][$headlineCount];
		if( ( $doNumberHeadings || $doShowToc ) && $prevlevel && $level > $prevlevel ) {
			# reset when we enter a new level
			$sublevelCount[$level] = 0;
			$toc .= $sk->tocIndent( $level - $prevlevel );
			$toclevel += $level - $prevlevel;
		}
		if( ( $doNumberHeadings || $doShowToc ) && $level < $prevlevel ) {
			# reset when we step back a level
			$sublevelCount[$level+1]=0;
			$toc .= $sk->tocUnindent( $prevlevel - $level );
			$toclevel -= $prevlevel - $level;
		}
		# count number of headlines for each level
		if( !isset( $sublevelCount[$level] ) ) {
			$sublevelCount[$level] = 0;
		}
		$sublevelCount[$level]++;

		if( $doNumberHeadings || $doShowToc ) {
			# Build the dotted section number from the counters of all
			# levels up to this one, skipping levels that were never used
			for( $i = 1; $i <= $level; $i++ ) {
				if( !empty( $sublevelCount[$i] ) ) {
					if( $dot ) {
						$numbering .= ".";
					}
					$numbering .= $sublevelCount[$i];
					$dot = 1;
				}
			}
		}

		# The canonized header is a version of the header text safe to use for links
		# Avoid insertion of weird stuff like <math> by expanding the relevant sections
		$canonized_headline = Parser::unstrip( $headline, $this->mStripState );

		# strip out HTML
		$canonized_headline = preg_replace( "/<.*?" . ">/","",$canonized_headline );

		$tocline = trim( $canonized_headline );
		$canonized_headline = str_replace( '"', "", $canonized_headline );
		$canonized_headline = str_replace( " ", "_", trim( $canonized_headline) );
		$refer[$headlineCount] = $canonized_headline;

		# count how many in assoc. array so we can track dupes in anchors
		if( !isset( $refers[$canonized_headline] ) ) {
			$refers[$canonized_headline] = 0;
		}
		$refers[$canonized_headline]++;
		$refcount[$headlineCount]=$refers[$canonized_headline];

		# Prepend the number to the heading text

		if( $doNumberHeadings || $doShowToc ) {
			$tocline = $numbering . " " . $tocline;

			# Don't number the heading if it is the only one (looks silly)
			if( $doNumberHeadings && count( $matches[3] ) > 1) {
				# the two are different if the line contains a link
				$headline=$numbering . " " . $headline;
			}
		}

		# Create the anchor for linking from the TOC to the section
		$anchor = $canonized_headline;
		if($refcount[$headlineCount] > 1 ) {
			$anchor .= "_" . $refcount[$headlineCount];
		}
		if( $doShowToc ) {
			$toc .= $sk->tocLine($anchor,$tocline,$toclevel);
		}
		if( $showEditLink ) {
			$head[$headlineCount] .= $sk->editSectionLink($headlineCount+1);
		}


		# the headline might have a link
		if( preg_match( "/(.*)<a(.*)/", $headline, $headlinematches ) ) {
			# if so give an anchor name to the already existent link
			$headline = $headlinematches[1]
				. "<a name=\"$anchor\" " . $headlinematches[2];
		} else {
			# else create an anchor link for the headline
			# The name must be exactly $anchor, as passed to tocLine() above
			# and used in the branch above; a stray "." used to be appended.
			$headline = "<a name=\"$anchor\">$headline</a>";
		}

		# give headline the correct <h#> tag
		$head[$headlineCount] .= "<h".$level.$matches[2][$headlineCount] .$headline."</h".$level.">";

		# Add the edit section link
		if( $rightClickHack ) {
			$head[$headlineCount] = $sk->editSectionScript($headlineCount+1,$head[$headlineCount]);
		}

		# Reset per-heading state for the next round
		$numbering = "";
		$headlineCount++;
		$dot = 0;
	}

	if( $doShowToc ) {
		$toclines = $headlineCount;
		$toc .= $sk->tocUnindent( $toclevel );
		$toc = $sk->tocTable( $toc );
	}

	# split up and insert constructed headlines

	$blocks = preg_split( "/<H[1-6].*?" . ">.*?<\/H[1-6]>/i", $text );
	$i = 0;

	foreach( $blocks as $block ) {
		if( $showEditLink && $headlineCount > 0 && $i == 0 ) {
			# This is the [edit] link that appears for the top block of text when
			# section editing is enabled
			$full .= $sk->editSectionLink(0);
		}
		$full .= $block;
		if( $doShowToc && $toclines>3 && !$i) {
			# Let's add a top anchor just in case we want to link to the top of the page
			$full = "<a name=\"top\"></a>".$full.$toc;
		}

		# Block $i is followed by rebuilt heading $i, if there is one
		if( !empty( $head[$i] ) ) {
			$full .= $head[$i];
		}
		$i++;
	}

	return $full;
}
1546
# Builds the replacement for an "ISBN" magic word.
#
# Tokens with an empty type are skipped. If the next token is a text token
# it is consumed: leading blanks are set aside, then the longest run of
# characters from "0-9", "-" and "A-Z" is taken as the ISBN. A non-empty
# ISBN becomes a link to Special:Booksources followed by the remaining text;
# otherwise the token text is returned behind a plain "ISBN ". If the next
# token is not text, only "ISBN " is returned and nothing more is consumed.
/* private */ function doMagicISBN( &$tokenizer )
{
	# Check whether next token is a text token
	$token = $tokenizer->previewToken();
	while ( $token["type"] == "" )
	{
		$tokenizer->nextToken();
		$token = $tokenizer->previewToken();
	}
	if ( $token["type"] != "text" ) {
		return "ISBN ";
	}

	$token = $tokenizer->nextToken();
	$x = (string)$token["text"];
	$valid = "0123456789-ABCDEFGHIJKLMNOPQRSTUVWXYZ";

	# strspn() measures each prefix in one go. The old per-character loops
	# read offset 0 of an already exhausted string and relied on strstr()
	# failing for an empty needle.
	$lead = strspn( $x, " " );
	$blank = str_repeat( " ", $lead );
	$x = (string)substr( $x, $lead );

	$len = strspn( $x, $valid );
	$isbn = (string)substr( $x, 0, $len );
	$x = (string)substr( $x, $len );

	# The number handed to the book sources page has no separators
	$num = str_replace( "-", "", $isbn );
	$num = str_replace( " ", "", $num );

	if ( "" == $num ) {
		$text = "ISBN $blank$x";
	} else {
		$titleObj = Title::makeTitle( NS_SPECIAL, "Booksources" );
		$text = "<a href=\"" .
		  $titleObj->escapeLocalUrl( "isbn={$num}" ) .
		  "\" class=\"internal\">ISBN $isbn</a>";
		$text .= $x;
	}
	return $text;
}
# Builds the replacement for an "RFC" magic word.
#
# Tokens with an empty type are skipped. If the next token is a text token
# it is consumed: leading blanks are set aside, then the leading run of
# digits is taken as the RFC number. A non-empty number becomes an external
# link whose URL is the "rfcurl" message with "$1" replaced by the number,
# followed by the remaining text; otherwise the token text is returned
# behind a plain "RFC ". If the next token is not text, only "RFC " is
# returned and nothing more is consumed.
/* private */ function doMagicRFC( &$tokenizer )
{
	# Check whether next token is a text token
	$token = $tokenizer->previewToken();
	while ( $token["type"] == "" )
	{
		$tokenizer->nextToken();
		$token = $tokenizer->previewToken();
	}
	if ( $token["type"] != "text" ) {
		return "RFC ";
	}

	$token = $tokenizer->nextToken();
	$x = (string)$token["text"];
	$valid = "0123456789";

	# strspn() measures each prefix in one go. The old per-character loops
	# read offset 0 of an already exhausted string and relied on strstr()
	# failing for an empty needle.
	$lead = strspn( $x, " " );
	$blank = str_repeat( " ", $lead );
	$x = (string)substr( $x, $lead );

	$len = strspn( $x, $valid );
	$rfc = (string)substr( $x, 0, $len );
	$x = (string)substr( $x, $len );

	if ( "" == $rfc ) {
		# Plain assignment: $text has no earlier value to append to
		$text = "RFC $blank$x";
	} else {
		$url = wfMsg( "rfcurl" );
		$url = str_replace( "$1", $rfc, $url);
		$sk =& $this->mOptions->getSkin();
		$la = $sk->getExternalLinkAttributes( $url, "RFC {$rfc}" );
		$text = "<a href='{$url}'{$la}>RFC {$rfc}</a>{$x}";
	}
	return $text;
}
1635
# Entry point producing altered wiki markup (output type OT_WIKI) rather
# than HTML. Stores $options and $title on the parser, optionally resets the
# parser state, normalises CRLF line ends to LF, and runs pstPass2() on the
# text between strip() and unstrip(). $user is handed on to pstPass2().
# Returns the transformed wikitext.
function preSaveTransform( $text, &$title, &$user, $options, $clearState = true )
{
	$this->mOptions = $options;
	$this->mTitle = $title;
	$this->mOutputType = OT_WIKI;

	if ( $clearState ) {
		$this->clearState();
	}

	# Local strip state shared by the strip()/unstrip() pair below
	$state = false;

	$wikitext = str_replace("\r\n", "\n", $text);
	$wikitext = $this->strip( $wikitext, $state, false );
	$wikitext = $this->pstPass2( $wikitext, $user );

	return $this->unstrip( $wikitext, $state );
}
1653
# Second pre-save pass over wikitext. In order:
#   - {{subst:...}} expansion via replaceVariables()
#   - signatures: ~~~~~ becomes the timestamp, ~~~~ a user link plus
#     timestamp, ~~~ a user link only
#   - "pipe trick" links such as [[page (context)|]] and [[|page]]
#   - trailing whitespace is trimmed and the MAG_END magic word removed
# $user supplies the name and the "nickname" option used in signatures.
# Returns the transformed text.
/* private */ function pstPass2( $text, &$user )
{
	global $wgLang, $wgLocaltimezone;

	# Variable replacement
	# Because mOutputType is OT_WIKI, this will only process {{subst:xxx}} type tags
	$text = $this->replaceVariables( $text );

	# Signatures
	#
	$n = $user->getName();
	$k = $user->getOption( "nickname" );
	if ( "" == $k ) { $k = $n; }
	if(isset($wgLocaltimezone)) {
		$oldtz = getenv("TZ"); putenv("TZ=$wgLocaltimezone");
	}
	/* Note: this is an ugly timezone hack for the European wikis */
	$d = $wgLang->timeanddate( date( "YmdHis" ), false ) .
	  " (" . date( "T" ) . ")";
	if(isset($wgLocaltimezone)) putenv("TZ=$oldtz");

	# The tilde patterns are literal, so str_replace() is used: with
	# preg_replace() a name or nickname containing "$1" or "\1" would have
	# been treated as a backreference. Longest pattern first, so ~~~~~ is
	# not consumed by the shorter ones.
	$userLink = "[[" . $wgLang->getNsText( Namespace::getUser() ) . ":$n|$k]]";
	$text = str_replace( "~~~~~", $d, $text );
	$text = str_replace( "~~~~", "$userLink $d", $text );
	$text = str_replace( "~~~", $userLink, $text );

	# Context links: [[|name]] and [[name (context)|]]
	#
	$tc = "[&;%\\-,.\\(\\)' _0-9A-Za-z\\/:\\x80-\\xff]";
	$np = "[&;%\\-,.' _0-9A-Za-z\\/:\\x80-\\xff]"; # No parens
	$namespacechar = '[ _0-9A-Za-z\x80-\xff]'; # Namespaces can use non-ascii!
	$conpat = "/^({$np}+) \\(({$tc}+)\\)$/";

	$p1 = "/\[\[({$np}+) \\(({$np}+)\\)\\|]]/"; # [[page (context)|]]
	$p2 = "/\[\[\\|({$tc}+)]]/"; # [[|page]]
	$p3 = "/\[\[($namespacechar+):({$np}+)\\|]]/"; # [[namespace:page|]]
	$p4 = "/\[\[($namespacechar+):({$np}+) \\(({$np}+)\\)\\|]]/";
	# [[ns:page (cont)|]]

	# If the current title itself ends in " (context)", that context is
	# reused when expanding [[|page]]
	$context = "";
	$t = $this->mTitle->getText();
	if ( preg_match( $conpat, $t, $m ) ) {
		$context = $m[2];
	}
	$text = preg_replace( $p4, "[[\\1:\\2 (\\3)|\\2]]", $text );
	$text = preg_replace( $p1, "[[\\1 (\\2)|\\1]]", $text );
	$text = preg_replace( $p3, "[[\\1:\\2|\\2]]", $text );

	if ( "" == $context ) {
		$text = preg_replace( $p2, "[[\\1]]", $text );
	} else {
		$text = preg_replace( $p2, "[[\\1 ({$context})|\\1]]", $text );
	}

	# Trim trailing whitespace
	# MAG_END (__END__) tag allows for trailing
	# whitespace to be deliberately included
	$text = rtrim( $text );
	$mw =& MagicWord::get( MAG_END );
	$mw->matchAndRemove( $text );

	return $text;
}
1724
1725
1726 }
1727
# Holds what a parse produces besides side effects: the output text, the
# language links and category links collected on the way, and a flag
# recording whether built-in variables were used.
class ParserOutput
{
	var $mText, $mLanguageLinks, $mCategoryLinks, $mContainsOldMagic;

	function ParserOutput( $text = "", $languageLinks = array(), $categoryLinks = array(),
		$containsOldMagic = false )
	{
		$this->mText = $text;
		$this->mLanguageLinks = $languageLinks;
		$this->mCategoryLinks = $categoryLinks;
		$this->mContainsOldMagic = $containsOldMagic;
	}

	# Accessors
	function getText() { return $this->mText; }
	function getLanguageLinks() { return $this->mLanguageLinks; }
	function getCategoryLinks() { return $this->mCategoryLinks; }
	function containsOldMagic() { return $this->mContainsOldMagic; }

	# Mutators; each returns whatever wfSetVar() returns (defined elsewhere,
	# presumably the previous value - confirm)
	function setText( $text ) { return wfSetVar( $this->mText, $text ); }
	function setLanguageLinks( $ll ) { return wfSetVar( $this->mLanguageLinks, $ll ); }
	function setCategoryLinks( $cl ) { return wfSetVar( $this->mCategoryLinks, $cl ); }
	function setContainsOldMagic( $com ) { return wfSetVar( $this->mContainsOldMagic, $com ); }

	# Folds another ParserOutput into this one: both link lists are appended
	# and the old-magic flag is OR-ed. mText is not touched.
	function merge( $other ) {
		$this->mLanguageLinks = array_merge( $this->mLanguageLinks, $other->mLanguageLinks );
		# Take the category links from $other; previously this object's own
		# language links were appended here by mistake.
		$this->mCategoryLinks = array_merge( $this->mCategoryLinks, $other->mCategoryLinks );
		$this->mContainsOldMagic = $this->mContainsOldMagic || $other->mContainsOldMagic;
	}

}
1757
# Bundle of settings that control a parse. Values come partly from site-wide
# globals and partly from a user's preferences; see initialiseFromUser().
class ParserOptions
{
	# All variables are private
	var $mUseTeX;                  # Use texvc to expand <math> tags
	var $mUseCategoryMagic;        # Treat [[Category:xxxx]] tags specially
	var $mUseDynamicDates;         # Use $wgDateFormatter to format dates
	var $mInterwikiMagic;          # Interlanguage links are removed and returned in an array
	var $mAllowExternalImages;     # Allow external images inline
	var $mSkin;                    # Reference to the preferred skin
	var $mDateFormat;              # Date format index
	var $mEditSection;             # Create "edit section" links
	var $mEditSectionOnRightClick; # Generate JavaScript to edit section on right click
	var $mNumberHeadings;          # Automatically number headings
	var $mShowToc;                 # Show table of contents

	# Accessors
	function getUseTeX() { return $this->mUseTeX; }
	function getUseCategoryMagic() { return $this->mUseCategoryMagic; }
	function getUseDynamicDates() { return $this->mUseDynamicDates; }
	function getInterwikiMagic() { return $this->mInterwikiMagic; }
	function getAllowExternalImages() { return $this->mAllowExternalImages; }
	function getSkin() { return $this->mSkin; }
	function getDateFormat() { return $this->mDateFormat; }
	function getEditSection() { return $this->mEditSection; }
	function getEditSectionOnRightClick() { return $this->mEditSectionOnRightClick; }
	function getNumberHeadings() { return $this->mNumberHeadings; }
	function getShowToc() { return $this->mShowToc; }

	# Mutators; each returns whatever wfSetVar() / wfSetRef() returns
	# (defined elsewhere, presumably the previous value - confirm)
	function setUseTeX( $x ) { return wfSetVar( $this->mUseTeX, $x ); }
	function setUseCategoryMagic( $x ) { return wfSetVar( $this->mUseCategoryMagic, $x ); }
	function setUseDynamicDates( $x ) { return wfSetVar( $this->mUseDynamicDates, $x ); }
	function setInterwikiMagic( $x ) { return wfSetVar( $this->mInterwikiMagic, $x ); }
	function setAllowExternalImages( $x ) { return wfSetVar( $this->mAllowExternalImages, $x ); }
	function setSkin( $x ) { return wfSetRef( $this->mSkin, $x ); }
	function setDateFormat( $x ) { return wfSetVar( $this->mDateFormat, $x ); }
	function setEditSection( $x ) { return wfSetVar( $this->mEditSection, $x ); }
	function setEditSectionOnRightClick( $x ) { return wfSetVar( $this->mEditSectionOnRightClick, $x ); }
	function setNumberHeadings( $x ) { return wfSetVar( $this->mNumberHeadings, $x ); }
	function setShowToc( $x ) { return wfSetVar( $this->mShowToc, $x ); }

	# Creates a ParserOptions object initialised from $user
	/* static */ function newFromUser( &$user )
	{
		$popts = new ParserOptions;
		# initialiseFromUser() already declares its parameter by reference,
		# so no "&" is needed (or, on newer PHP versions, allowed) at the
		# call site.
		$popts->initialiseFromUser( $user );
		return $popts;
	}

	# Fills every option. Site-wide switches are copied from the $wg*
	# globals; skin and preferences are read from $userInput, or from a
	# freshly constructed User object when $userInput is empty.
	function initialiseFromUser( &$userInput )
	{
		global $wgUseTeX, $wgUseCategoryMagic, $wgUseDynamicDates, $wgInterwikiMagic, $wgAllowExternalImages;

		if ( !$userInput ) {
			$user = new User;
		} else {
			$user =& $userInput;
		}

		$this->mUseTeX = $wgUseTeX;
		$this->mUseCategoryMagic = $wgUseCategoryMagic;
		$this->mUseDynamicDates = $wgUseDynamicDates;
		$this->mInterwikiMagic = $wgInterwikiMagic;
		$this->mAllowExternalImages = $wgAllowExternalImages;
		$this->mSkin =& $user->getSkin();
		$this->mDateFormat = $user->getOption( "date" );
		$this->mEditSection = $user->getOption( "editsection" );
		$this->mEditSectionOnRightClick = $user->getOption( "editsectiononrightclick" );
		$this->mNumberHeadings = $user->getOption( "numberheadings" );
		$this->mShowToc = $user->getOption( "showtoc" );
	}


}
1829
1830 # Regex callbacks, used in Parser::replaceVariables
function wfBraceSubstitution( $matches )
{
	# Plain-function adapter: forwards the match array to whichever parser
	# is currently stored in the global $wgCurParser and returns its answer.
	$parser =& $GLOBALS['wgCurParser'];
	$replacement = $parser->braceSubstitution( $matches );

	return $replacement;
}
1836
1837 ?>