Fix Special:Import for new schema; make it create page records as needed and hook...
[lhc/web/wiklou.git] / includes / Parser.php
index 0ad37a0..fbc7b28 100644 (file)
@@ -1,11 +1,13 @@
 <?php
-
 /**
  * File for Parser and related classes
  *
  * @package MediaWiki
  */
 
+/** */
+require_once( 'Sanitizer.php' );
+
 /**
  * Update this version number when the ParserOutput format
  * changes in an incompatible way, so the parser cache
@@ -51,7 +53,7 @@ define( 'EXT_LINK_URL_CLASS', '[^]<>"\\x00-\\x20\\x7F]' );
 define( 'EXT_LINK_TEXT_CLASS', '[^\]\\x00-\\x1F\\x7F]' );
 define( 'EXT_IMAGE_FNAME_CLASS', '[A-Za-z0-9_.,~%\\-+&;#*?!=()@\\x80-\\xFF]' );
 define( 'EXT_IMAGE_EXTENSIONS', 'gif|png|jpg|jpeg' );
-define( 'EXT_LINK_BRACKETED',  '/\[(('.URL_PROTOCOLS.'):'.EXT_LINK_URL_CLASS.'+) *('.EXT_LINK_TEXT_CLASS.'*?)\]/S' );
+define( 'EXT_LINK_BRACKETED',  '/\[(\b('.URL_PROTOCOLS.'):'.EXT_LINK_URL_CLASS.'+) *('.EXT_LINK_TEXT_CLASS.'*?)\]/S' );
 define( 'EXT_IMAGE_REGEX',
        '/^('.HTTP_PROTOCOLS.':)'.  # Protocol
        '('.EXT_LINK_URL_CLASS.'+)\\/'.  # Hostname and path
@@ -128,7 +130,7 @@ class Parser
        function clearState() {
                $this->mOutput = new ParserOutput;
                $this->mAutonumber = 0;
-               $this->mLastSection = "";
+               $this->mLastSection = '';
                $this->mDTopen = false;
                $this->mVariables = false;
                $this->mIncludeCount = array();
@@ -142,6 +144,11 @@ class Parser
         * to internalParse() which does all the real work.
         *
         * @access private
+        * @param string $text Text we want to parse
+        * @param Title &$title A title object
+        * @param array $options
+        * @param boolean $linestart
+        * @param boolean $clearState
         * @return ParserOutput a ParserOutput
         */
        function parse( $text, &$title, $options, $linestart = true, $clearState = true ) {
@@ -158,7 +165,11 @@ class Parser
                $this->mOutputType = OT_HTML;
 
                $stripState = NULL;
-               $text = $this->strip( $text, $this->mStripState );
+               global $fnord; $fnord = 1;
+               //$text = $this->strip( $text, $this->mStripState );
+               // Workaround: the direct call above sometimes segfaults in PHP5, so pass mStripState to strip() through a local reference instead.
+               $x =& $this->mStripState;
+               $text = $this->strip( $text, $x );
 
                $text = $this->internalParse( $text, $linestart );
                $text = $this->unstrip( $text, $this->mStripState );
@@ -169,16 +180,14 @@ class Parser
                                # only if there is something before the space
                                '/(.) (?=\\?|:|;|!|\\302\\273)/i' => '\\1&nbsp;\\2',
                                # french spaces, Guillemet-right
-                               "/(\\302\\253) /i"=>"\\1&nbsp;",
+                               '/(\\302\\253) /i' => '\\1&nbsp;',
                                '/<hr *>/i' => '<hr />',
                                '/<br *>/i' => '<br />',
                                '/<center *>/i' => '<div class="center">',
                                '/<\\/center *>/i' => '</div>',
-                               # Clean up spare ampersands; note that we probably ought to be
-                               # more careful about named entities.
-                               '/&(?!:amp;|#[Xx][0-9A-fa-f]+;|#[0-9]+;|[a-zA-Z0-9]+;)/' => '&amp;'
                        );
                        $text = preg_replace( array_keys($fixtags), array_values($fixtags), $text );
+                       $text = Sanitizer::normalizeCharReferences( $text );
                } else {
                        $fixtags = array(
                                # french spaces, last one Guillemet-left
@@ -240,7 +249,7 @@ class Parser
 
                while ( '' != $text ) {
                        if($tag==STRIP_COMMENTS) {
-                               $p = preg_split( '/<!--/i', $text, 2 );
+                               $p = preg_split( '/<!--/', $text, 2 );
                        } else {
                                $p = preg_split( "/<\\s*$tag\\s*>/i", $text, 2 );
                        }
@@ -458,57 +467,6 @@ class Parser
                return $rnd;
        }
 
-       /**
-        * Return allowed HTML attributes
-        *
-        * @access private
-        */
-       function getHTMLattrs () {
-               $htmlattrs = array( # Allowed attributes--no scripting, etc.
-                               'title', 'align', 'lang', 'dir', 'width', 'height',
-                               'bgcolor', 'clear', /* BR */ 'noshade', /* HR */
-                               'cite', /* BLOCKQUOTE, Q */ 'size', 'face', 'color',
-                               /* FONT */ 'type', 'start', 'value', 'compact',
-                               /* For various lists, mostly deprecated but safe */
-                               'summary', 'width', 'border', 'frame', 'rules',
-                               'cellspacing', 'cellpadding', 'valign', 'char',
-                               'charoff', 'colgroup', 'col', 'span', 'abbr', 'axis',
-                               'headers', 'scope', 'rowspan', 'colspan', /* Tables */
-                               'id', 'class', 'name', 'style' /* For CSS */
-                               );
-               return $htmlattrs ;
-       }
-
-       /**
-        * Remove non approved attributes and javascript in css
-        *
-        * @access private
-        */
-       function fixTagAttributes ( $t ) {
-               if ( trim ( $t ) == '' ) return '' ; # Saves runtime ;-)
-               $htmlattrs = $this->getHTMLattrs() ;
-
-               # Strip non-approved attributes from the tag
-               $t = preg_replace(
-                       '/(\\w+)(\\s*=\\s*([^\\s\">]+|\"[^\">]*\"))?/e',
-                       "(in_array(strtolower(\"\$1\"),\$htmlattrs)?(\"\$1\".((\"x\$3\" != \"x\")?\"=\$3\":'')):'')",
-                       $t);
-
-               $t = str_replace ( '<></>' , '' , $t ) ; # This should fix bug 980557
-
-               # Strip javascript "expression" from stylesheets. Brute force approach:
-               # If anythin offensive is found, all attributes of the HTML tag are dropped
-
-               if( preg_match(
-                       '/style\\s*=.*(expression|tps*:\/\/|url\\s*\().*/is',
-                       wfMungeToUtf8( $t ) ) )
-               {
-                       $t='';
-               }
-
-               return trim ( $t ) ;
-       }
-
        /**
         * interface with html tidy, used if $wgUseTidy = true
         *
@@ -586,7 +544,7 @@ class Parser
                                $indent_level = strlen( $matches[1] );
                                $t[$k] = "\n" .
                                        str_repeat( '<dl><dd>', $indent_level ) .
-                                       '<table ' . $this->fixTagAttributes ( $matches[2] ) . '>' ;
+                                       '<table' . Sanitizer::fixTagAttributes ( $matches[2], 'table' ) . '>' ;
                                array_push ( $td , false ) ;
                                array_push ( $ltd , '' ) ;
                                array_push ( $tr , false ) ;
@@ -594,7 +552,7 @@ class Parser
                        }
                        else if ( count ( $td ) == 0 ) { } # Don't do any of the following
                        else if ( '|}' == substr ( $x , 0 , 2 ) ) {
-                               $z = "</table>\n" ;
+                               $z = "</table>" . substr ( $x , 2) . "\n";
                                $l = array_pop ( $ltd ) ;
                                if ( array_pop ( $tr ) ) $z = '</tr>' . $z ;
                                if ( array_pop ( $td ) ) $z = '</'.$l.'>' . $z ;
@@ -613,7 +571,7 @@ class Parser
                                array_push ( $tr , false ) ;
                                array_push ( $td , false ) ;
                                array_push ( $ltd , '' ) ;
-                               array_push ( $ltr , $this->fixTagAttributes ( $x ) ) ;
+                               array_push ( $ltr , Sanitizer::fixTagAttributes ( $x, 'tr' ) ) ;
                        }
                        else if ( '|' == $fc || '!' == $fc || '|+' == substr ( $x , 0 , 2 ) ) { # Caption
                                # $x is a table row
@@ -633,7 +591,7 @@ class Parser
                                        if ( $fc != '+' )
                                        {
                                                $tra = array_pop ( $ltr ) ;
-                                               if ( !array_pop ( $tr ) ) $z = '<tr '.$tra.">\n" ;
+                                               if ( !array_pop ( $tr ) ) $z = '<tr'.$tra.">\n" ;
                                                array_push ( $tr , true ) ;
                                                array_push ( $ltr , '' ) ;
                                        }
@@ -655,7 +613,7 @@ class Parser
                                        }
                                        if ( count ( $y ) == 1 )
                                                $y = "{$z}<{$l}>{$y[0]}" ;
-                                       else $y = $y = "{$z}<{$l} ".$this->fixTagAttributes($y[0]).">{$y[1]}" ;
+                                       else $y = $y = "{$z}<{$l}".Sanitizer::fixTagAttributes($y[0], $l).">{$y[1]}" ;
                                        $t[$k] .= $y ;
                                        array_push ( $td , true ) ;
                                }
@@ -671,7 +629,7 @@ class Parser
                }
 
                $t = implode ( "\n" , $t ) ;
-               #               $t = $this->removeHTMLtags( $t );
+               #               $t = Sanitizer::removeHTMLtags( $t );
                wfProfileOut( $fname );
                return $t ;
        }
@@ -688,7 +646,7 @@ class Parser
                $fname = 'Parser::internalParse';
                wfProfileIn( $fname );
 
-               $text = $this->removeHTMLtags( $text );
+               $text = Sanitizer::removeHTMLtags( $text );
                $text = $this->replaceVariables( $text, $args );
 
                $text = preg_replace( '/(^|\n)-----*/', '\\1<hr />', $text );
@@ -709,8 +667,6 @@ class Parser
                $text = $this->doMagicLinks( $text );
                $text = $this->doTableStuff( $text );
                $text = $this->formatHeadings( $text, $isMain );
-               $sk =& $this->mOptions->getSkin();
-               $text = $sk->transformContent( $text );
 
                wfProfileOut( $fname );
                return $text;
@@ -723,11 +679,7 @@ class Parser
         * @access private
         */
        function &doMagicLinks( &$text ) {
-               global $wgUseGeoMode;
                $text = $this->magicISBN( $text );
-               if ( isset( $wgUseGeoMode ) && $wgUseGeoMode ) {
-                       $text = $this->magicGEO( $text );
-               }
                $text = $this->magicRFC( $text, 'RFC ', 'rfcurl' );
                $text = $this->magicRFC( $text, 'PMID ', 'pubmedurl' );
                return $text;
@@ -959,7 +911,9 @@ class Parser
                wfProfileIn( $fname );
 
                $sk =& $this->mOptions->getSkin();
-               $linktrail = wfMsgForContent('linktrail');
+               global $wgContLang;
+               $linktrail = $wgContLang->linkTrail();
+               
                $bits = preg_split( EXT_LINK_BRACKETED, $text, -1, PREG_SPLIT_DELIM_CAPTURE );
 
                $s = $this->replaceFreeExternalLinks( array_shift( $bits ) );
@@ -988,14 +942,19 @@ class Parser
 
                        $dtrail = '';
 
+                       # Set linktype for CSS - if URL==text, link is essentially free
+                       $linktype = ($text == $url) ? 'free' : 'text';
+
                        # No link text, e.g. [http://domain.tld/some.link]
                        if ( $text == '' ) {
                                # Autonumber if allowed
                                if ( strpos( HTTP_PROTOCOLS, $protocol ) !== false ) {
                                        $text = '[' . ++$this->mAutonumber . ']';
+                                       $linktype = 'autonumber';
                                } else {
                                        # Otherwise just use the URL
                                        $text = htmlspecialchars( $url );
+                                       $linktype = 'free';
                                }
                        } else {
                                # Have link text, e.g. [http://domain.tld/some.link text]s
@@ -1006,18 +965,10 @@ class Parser
                                }
                        }
 
-                       $encUrl = htmlspecialchars( $url );
-                       # Bit in parentheses showing the URL for the printable version
-                       if( $url == $text || preg_match( "!$protocol://" . preg_quote( $text, '/' ) . "/?$!", $url ) ) {
-                               $paren = '';
-                       } else {
-                               # Expand the URL for printable version
-                               if ( ! $sk->suppressUrlExpansion() ) {
-                                       $paren = "<span class='urlexpansion'> (<i>" . htmlspecialchars ( $encUrl ) . "</i>)</span>";
-                               } else {
-                                       $paren = '';
-                               }
-                       }
+                       # Replace &amp; from obsolete syntax with &.
+                       # All HTML entities will be escaped by makeExternalLink()
+                       # or maybeMakeImageLink()
+                       $url = str_replace( '&amp;', '&', $url );
 
                        # Process the trail (i.e. everything after this link up until start of the next link),
                        # replacing any non-bracketed links
@@ -1027,7 +978,7 @@ class Parser
                        # This means that users can paste URLs directly into the text
                        # Funny characters like &ouml; aren't valid in URLs anyway
                        # This was changed in August 2004
-                       $s .= $sk->makeExternalLink( $url, $text, false ) . $dtrail. $paren . $trail;
+                       $s .= $sk->makeExternalLink( $url, $text, false, $linktype ) . $dtrail . $trail;
                }
 
                wfProfileOut( $fname );
@@ -1042,7 +993,7 @@ class Parser
                $fname = 'Parser::replaceFreeExternalLinks';
                wfProfileIn( $fname );
                
-               $bits = preg_split( '/((?:'.URL_PROTOCOLS.'):)/S', $text, -1, PREG_SPLIT_DELIM_CAPTURE );
+               $bits = preg_split( '/(\b(?:'.URL_PROTOCOLS.'):)/S', $text, -1, PREG_SPLIT_DELIM_CAPTURE );
                $s = array_shift( $bits );
                $i = 0;
 
@@ -1087,7 +1038,7 @@ class Parser
                                $text = $this->maybeMakeImageLink( $url );
                                if ( $text === false ) {
                                        # Not an image, make a link
-                                       $text = $sk->makeExternalLink( $url, $url );
+                                       $text = $sk->makeExternalLink( $url, $url, true, 'free' );
                                }
                                $s .= $text . $trail;
                        } else {
@@ -1119,7 +1070,6 @@ class Parser
         *
         * @access private
         */
-
        function replaceInternalLinks( $s ) {
                global $wgLang, $wgContLang, $wgLinkCache;
                global $wgDisableLangConversion;
@@ -1148,7 +1098,7 @@ class Parser
 
                # Match a link having the form [[namespace:link|alternate]]trail
                static $e1 = FALSE;
-               if ( !$e1 ) { $e1 = "/^([{$tc}]+)(?:\\|([^]]+))?]](.*)\$/sD"; }
+               if ( !$e1 ) { $e1 = "/^([{$tc}]+)(?:\\|(.+?))?]](.*)\$/sD"; }
                # Match cases where there is no "]]", which might still be images
                static $e1_img = FALSE;
                if ( !$e1_img ) { $e1_img = "/^([{$tc}]+)\\|(.*)\$/sD"; }
@@ -1158,7 +1108,10 @@ class Parser
 
                $useLinkPrefixExtension = $wgContLang->linkPrefixExtension();
 
-               $nottalk = !Namespace::isTalk( $this->mTitle->getNamespace() );
+               if( is_null( $this->mTitle ) ) {
+                       wfDebugDieBacktrace( 'nooo' );
+               }
+               $nottalk = !$this->mTitle->isTalkPage();
 
                if ( $useLinkPrefixExtension ) {
                        if ( preg_match( $e2, $s, $m ) ) {
@@ -1216,7 +1169,7 @@ class Parser
                        # Don't allow internal links to pages containing
                        # PROTO: where PROTO is a valid URL protocol; these
                        # should be external links.
-                       if (preg_match('/^((?:'.URL_PROTOCOLS.'):)/', $m[1])) {
+                       if (preg_match('/^(\b(?:'.URL_PROTOCOLS.'):)/', $m[1])) {
                                $s .= $prefix . '[[' . $line ;
                                continue;
                        }
@@ -1310,8 +1263,8 @@ class Parser
                                        $text = $this->replaceExternalLinks($text);
                                        $text = $this->replaceInternalLinks($text);
                                        
-                                       # replace the image with a link-holder so that replaceExternalLinks() can't mess with it
-                                       $s .= $prefix . $this->insertStripItem( $sk->makeImageLinkObj( $nt, $text ), $this->mStripState ) . $trail;
+                                       # cloak any absolute URLs inside the image markup, so replaceExternalLinks() won't touch them
+                                       $s .= $prefix . str_replace('http://', 'http-noparse://', $sk->makeImageLinkObj( $nt, $text ) ) . $trail;
                                        $wgLinkCache->addImageLinkObj( $nt );
                                        
                                        wfProfileOut( "$fname-image" );
@@ -1393,33 +1346,56 @@ class Parser
                # :Foobar -- override special treatment of prefix (images, language links)
                # /Foobar -- convert to CurrentPage/Foobar
                # /Foobar/ -- convert to CurrentPage/Foobar, strip the initial / from text
+               # ../ -- convert to CurrentPage, from CurrentPage/CurrentSubPage
+               # ../Foobar -- convert to CurrentPage/Foobar, from CurrentPage/CurrentSubPage
 
                $fname = 'Parser::maybeDoSubpageLink';
                wfProfileIn( $fname );
-               # Look at the first character
-               if( $target != '' && $target{0} == '/' ) {
-                       # / at end means we don't want the slash to be shown
-                       if(substr($target,-1,1)=='/') {
-                               $target=substr($target,1,-1);
-                               $noslash=$target;
-                       } else {
-                               $noslash=substr($target,1);
-                       }
+               $ret = $target; # default return value is no change
+                       
+               # Some namespaces don't allow subpages, 
+               # so only perform processing if subpages are allowed
+               if( $this->areSubpagesAllowed() ) {             
+                       # Look at the first character
+                       if( $target != '' && $target{0} == '/' ) {
+                               # / at end means we don't want the slash to be shown
+                               if( substr( $target, -1, 1 ) == '/' ) {
+                                       $target = substr( $target, 1, -1 );
+                                       $noslash = $target;
+                               } else {
+                                       $noslash = substr( $target, 1 );
+                               }
                                
-                       # Some namespaces don't allow subpages
-                       if( $this->areSubpagesAllowed() ) {
-                               # subpages allowed here
                                $ret = $this->mTitle->getPrefixedText(). '/' . trim($noslash);
                                if( '' === $text ) {
                                        $text = $target;
                                } # this might be changed for ugliness reasons
                        } else {
-                               # no subpage allowed, use standard link
-                               $ret = $target;
+                               # check for .. subpage backlinks
+                               $dotdotcount = 0;
+                               $nodotdot = $target;
+                               while( strncmp( $nodotdot, "../", 3 ) == 0 ) {
+                                       ++$dotdotcount;
+                                       $nodotdot = substr( $nodotdot, 3 );
+                               }
+                               if($dotdotcount > 0) {
+                                       $exploded = explode( '/', $this->mTitle->GetPrefixedText() );
+                                       if( count( $exploded ) > $dotdotcount ) { # not allowed to go below top level page
+                                               $ret = implode( '/', array_slice( $exploded, 0, -$dotdotcount ) );
+                                               # / at the end means don't show full path
+                                               if( substr( $nodotdot, -1, 1 ) == '/' ) {
+                                                       $nodotdot = substr( $nodotdot, 0, -1 );
+                                                       if( '' === $text ) {
+                                                               $text = $nodotdot;
+                                                       }
+                                               }
+                                               $nodotdot = trim( $nodotdot );
+                                               if( $nodotdot != '' ) {
+                                                       $ret .= '/' . $nodotdot;
+                                               }
+                                       }
+                               }
                        }
-               } else {
-                       # no subpage
-                       $ret = $target;
                }
 
                wfProfileOut( $fname );
@@ -1742,6 +1718,10 @@ class Parser
                                return $varCache[$index] = $wgContLang->formatNum( date( 'Y' ) );
                        case MAG_CURRENTTIME:
                                return $varCache[$index] = $wgContLang->time( wfTimestampNow(), false );
+                       case MAG_CURRENTWEEK:
+                               return $varCache[$index] = $wgContLang->formatNum( date('W') );
+                       case MAG_CURRENTDOW:
+                               return $varCache[$index] = $wgContLang->formatNum( date('w') );
                        case MAG_NUMBEROFARTICLES:
                                return $varCache[$index] = $wgContLang->formatNum( wfNumberOfArticles() );
                        case MAG_SITENAME:
@@ -1765,7 +1745,7 @@ class Parser
                $this->mVariables = array();
                foreach ( $wgVariableIDs as $id ) {
                        $mw =& MagicWord::get( $id );
-                       $mw->addToArray( $this->mVariables, $this->getVariableValue( $id ) );
+                       $mw->addToArray( $this->mVariables, $id );
                }
                wfProfileOut( $fname );
        }
@@ -1823,6 +1803,7 @@ class Parser
         */
        function variableSubstitution( $matches ) {
                $fname = 'parser::variableSubstitution';
+               $varname = $matches[1];
                wfProfileIn( $fname );
                if ( !$this->mVariables ) {
                        $this->initialiseVariables();
@@ -1831,14 +1812,15 @@ class Parser
                if ( $this->mOutputType == OT_WIKI ) {
                        # Do only magic variables prefixed by SUBST
                        $mwSubst =& MagicWord::get( MAG_SUBST );
-                       if (!$mwSubst->matchStartAndRemove( $matches[1] ))
+                       if (!$mwSubst->matchStartAndRemove( $varname ))
                                $skip = true;
                        # Note that if we don't substitute the variable below,
                        # we don't remove the {{subst:}} magic word, in case
                        # it is a template rather than a magic variable.
                }
-               if ( !$skip && array_key_exists( $matches[1], $this->mVariables ) ) {
-                       $text = $this->mVariables[$matches[1]];
+               if ( !$skip && array_key_exists( $varname, $this->mVariables ) ) {
+                       $id = $this->mVariables[$varname];
+                       $text = $this->getVariableValue( $id );
                        $this->mOutput->mContainsOldMagic = true;
                } else {
                        $text = $matches[0];
@@ -2008,20 +1990,25 @@ class Parser
                # Did we encounter this template already? If yes, it is in the cache
                # and we need to check for loops.
                if ( !$found && isset( $this->mTemplates[$part1] ) ) {
-                       # set $text to cached message.
-                       $text = $linestart . $this->mTemplates[$part1];
                        $found = true;
 
                        # Infinite loop test
                        if ( isset( $this->mTemplatePath[$part1] ) ) {
                                $noparse = true;
                                $found = true;
-                               $text .= '<!-- WARNING: template loop detected -->';
+                               $text = $linestart .
+                                       "\{\{$part1}}" .
+                                       '<!-- WARNING: template loop detected -->';
+                               wfDebug( "$fname: template loop broken at '$part1'\n" );
+                       } else {
+                               # set $text to cached message.
+                               $text = $linestart . $this->mTemplates[$part1];
                        }
                }
 
                # Load from database
                $itcamefromthedatabase = false;
+               $lastPathLevel = $this->mTemplatePath;
                if ( !$found ) {
                        $ns = NS_TEMPLATE;
                        $part1 = $this->maybeDoSubpageLink( $part1, $subpage='' );
@@ -2050,7 +2037,9 @@ class Parser
                                }
 
                                # Template cache array insertion
-                               $this->mTemplates[$part1] = $text;
+                               if( $found ) {
+                                       $this->mTemplates[$part1] = $text;
+                               }
                        }
                }
 
@@ -2082,7 +2071,7 @@ class Parser
                        $this->mTemplatePath[$part1] = 1;
 
                        $text = $this->strip( $text, $this->mStripState );
-                       $text = $this->removeHTMLtags( $text );
+                       $text = Sanitizer::removeHTMLtags( $text );
                        $text = $this->replaceVariables( $text, $assocArgs );
 
                        # Resume the link cache and register the inclusion as a link
@@ -2096,9 +2085,9 @@ class Parser
                                $text = "\n" . $text;
                        }
                }
-
-               # Empties the template path
-               $this->mTemplatePath = array();
+               # Prune lower levels off the recursion check path
+               $this->mTemplatePath = $lastPathLevel;
+               
                if ( !$found ) {
                        wfProfileOut( $fname );
                        return $matches[0];
@@ -2130,9 +2119,8 @@ class Parser
                                }
                        }
                }
-
-               # Empties the template path
-               $this->mTemplatePath = array();
+               # Prune lower levels off the recursion check path
+               $this->mTemplatePath = $lastPathLevel;
                
                if ( !$found ) {
                        wfProfileOut( $fname );
@@ -2174,171 +2162,6 @@ class Parser
                }
        }
 
-
-       /**
-        * Cleans up HTML, removes dangerous tags and attributes, and
-        * removes HTML comments
-        * @access private
-        */
-       function removeHTMLtags( $text ) {
-               global $wgUseTidy, $wgUserHtml;
-               $fname = 'Parser::removeHTMLtags';
-               wfProfileIn( $fname );
-
-               if( $wgUserHtml ) {
-                       $htmlpairs = array( # Tags that must be closed
-                               'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
-                               'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
-                               'strike', 'strong', 'tt', 'var', 'div', 'center',
-                               'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
-                               'ruby', 'rt' , 'rb' , 'rp', 'p', 'abbr', 'acronym'
-                       );
-                       $htmlsingle = array(
-                               'br', 'hr', 'li', 'dt', 'dd'
-                       );
-                       $htmlnest = array( # Tags that can be nested--??
-                               'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
-                               'dl', 'font', 'big', 'small', 'sub', 'sup'
-                       );
-                       $tabletags = array( # Can only appear inside table
-                               'td', 'th', 'tr'
-                       );
-               } else {
-                       $htmlpairs = array();
-                       $htmlsingle = array();
-                       $htmlnest = array();
-                       $tabletags = array();
-               }
-
-               $htmlsingle = array_merge( $tabletags, $htmlsingle );
-               $htmlelements = array_merge( $htmlsingle, $htmlpairs );
-
-               $htmlattrs = $this->getHTMLattrs () ;
-
-               # Remove HTML comments
-               $text = $this->removeHTMLcomments( $text );
-
-               $bits = explode( '<', $text );
-               $text = array_shift( $bits );
-               if(!$wgUseTidy) {
-                       $tagstack = array(); $tablestack = array();
-                       foreach ( $bits as $x ) {
-                               $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
-                               preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
-                               $x, $regs );
-                               list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
-                               error_reporting( $prev );
-
-                               $badtag = 0 ;
-                               if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
-                                       # Check our stack
-                                       if ( $slash ) {
-                                               # Closing a tag...
-                                               if ( ! in_array( $t, $htmlsingle ) &&
-                                               ( $ot = @array_pop( $tagstack ) ) != $t ) {
-                                                       @array_push( $tagstack, $ot );
-                                                       $badtag = 1;
-                                               } else {
-                                                       if ( $t == 'table' ) {
-                                                               $tagstack = array_pop( $tablestack );
-                                                       }
-                                                       $newparams = '';
-                                               }
-                                       } else {
-                                               # Keep track for later
-                                               if ( in_array( $t, $tabletags ) &&
-                                               ! in_array( 'table', $tagstack ) ) {
-                                                       $badtag = 1;
-                                               } else if ( in_array( $t, $tagstack ) &&
-                                               ! in_array ( $t , $htmlnest ) ) {
-                                                       $badtag = 1 ;
-                                               } else if ( ! in_array( $t, $htmlsingle ) ) {
-                                                       if ( $t == 'table' ) {
-                                                               array_push( $tablestack, $tagstack );
-                                                               $tagstack = array();
-                                                       }
-                                                       array_push( $tagstack, $t );
-                                               }
-                                               # Strip non-approved attributes from the tag
-                                               $newparams = $this->fixTagAttributes($params);
-
-                                       }
-                                       if ( ! $badtag ) {
-                                               $rest = str_replace( '>', '&gt;', $rest );
-                                               $text .= "<$slash$t $newparams$brace$rest";
-                                               continue;
-                                       }
-                               }
-                               $text .= '&lt;' . str_replace( '>', '&gt;', $x);
-                       }
-                       # Close off any remaining tags
-                       while ( is_array( $tagstack ) && ($t = array_pop( $tagstack )) ) {
-                               $text .= "</$t>\n";
-                               if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
-                       }
-               } else {
-                       # this might be possible using tidy itself
-                       foreach ( $bits as $x ) {
-                               preg_match( '/^(\\/?)(\\w+)([^>]*)(\\/{0,1}>)([^<]*)$/',
-                               $x, $regs );
-                               @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
-                               if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
-                                       $newparams = $this->fixTagAttributes($params);
-                                       $rest = str_replace( '>', '&gt;', $rest );
-                                       $text .= "<$slash$t $newparams$brace$rest";
-                               } else {
-                                       $text .= '&lt;' . str_replace( '>', '&gt;', $x);
-                               }
-                       }
-               }
-               wfProfileOut( $fname );
-               return $text;
-       }
-
-       /**
-        * Remove '<!--', '-->', and everything between.
-        * To avoid leaving blank lines, when a comment is both preceded
-        * and followed by a newline (ignoring spaces), trim leading and
-        * trailing spaces and one of the newlines.
-        * 
-        * @access private
-        */
-       function removeHTMLcomments( $text ) {
-               $fname='Parser::removeHTMLcomments';
-               wfProfileIn( $fname );
-               while (($start = strpos($text, '<!--')) !== false) {
-                       $end = strpos($text, '-->', $start + 4);
-                       if ($end === false) {
-                               # Unterminated comment; bail out
-                               break;
-                       }
-
-                       $end += 3;
-
-                       # Trim space and newline if the comment is both
-                       # preceded and followed by a newline
-                       $spaceStart = max($start - 1, 0);
-                       $spaceLen = $end - $spaceStart;
-                       while (substr($text, $spaceStart, 1) === ' ' && $spaceStart > 0) {
-                               $spaceStart--;
-                               $spaceLen++;
-                       }
-                       while (substr($text, $spaceStart + $spaceLen, 1) === ' ')
-                               $spaceLen++;
-                       if (substr($text, $spaceStart, 1) === "\n" and substr($text, $spaceStart + $spaceLen, 1) === "\n") {
-                               # Remove the comment, leading and trailing
-                               # spaces, and leave only one newline.
-                               $text = substr_replace($text, "\n", $spaceStart, $spaceLen + 1);
-                       }
-                       else {
-                               # Remove just the comment.
-                               $text = substr_replace($text, '', $start, $end - $start);
-                       }
-               }
-               wfProfileOut( $fname );
-               return $text;
-       }
-
        /**
         * This function accomplishes several tasks:
         * 1) Auto-number headings if that option is enabled
@@ -2348,10 +2171,13 @@ class Parser
         *      
         * It loops through all headlines, collects the necessary data, then splits up the
         * string and re-inserts the newly formatted headlines.
+        * 
+        * @param string $text
+        * @param boolean $isMain
         * @access private
         */
-       /* private */ function formatHeadings( $text, $isMain=true ) {
-               global $wgInputEncoding, $wgMaxTocLevel, $wgContLang, $wgLinkHolders;
+       function formatHeadings( $text, $isMain=true ) {
+               global $wgInputEncoding, $wgMaxTocLevel, $wgContLang, $wgLinkHolders, $wgInterwikiLinkHolders;
 
                $doNumberHeadings = $this->mOptions->getNumberHeadings();
                $doShowToc = $this->mOptions->getShowToc();
@@ -2384,7 +2210,7 @@ class Parser
 
                # Get all headlines for numbering them and adding funky stuff like [edit]
                # links - this is for later, but we need the number of headlines right now
-               $numMatches = preg_match_all( '/<H([1-6])(.*?' . '>)(.*?)<\/H[1-6]>/i', $text, $matches );
+               $numMatches = preg_match_all( '/<H([1-6])(.*?'.'>)(.*?)<\/H[1-6] *>/i', $text, $matches );
 
                # if there are fewer than 4 headlines in the article, do not show TOC
                if( $numMatches < 4 ) {
@@ -2393,8 +2219,9 @@ class Parser
 
                # if the string __TOC__ (not case-sensitive) occurs in the HTML,
                # override above conditions and always show TOC at that place
+
                $mw =& MagicWord::get( MAG_TOC );
-               if ($mw->match( $text ) ) {
+               if($mw->match( $text ) ) {
                        $doShowToc = 1;
                        $forceTocHere = true;
                } else {
@@ -2406,7 +2233,10 @@ class Parser
                        }
                }
 
-
+               # Never show a TOC if there are no headings
+               if( $numMatches < 1 ) {
+                       $doShowToc = 0;
+               }
 
                # We need this to perform operations on the HTML
                $sk =& $this->mOptions->getSkin();
@@ -2417,17 +2247,22 @@ class Parser
 
                # Ugh .. the TOC should have neat indentation levels which can be
                # passed to the skin functions. These are determined here
-               $toclevel = 0;
                $toc = '';
                $full = '';
                $head = array();
                $sublevelCount = array();
+               $levelCount = array();
+               $toclevel = 0;
                $level = 0;
                $prevlevel = 0;
+               $toclevel = 0;
+               $prevtoclevel = 0;
+
                foreach( $matches[3] as $headline ) {
                        $istemplate = 0;
-                       $templatetitle = "";
+                       $templatetitle = '';
                        $templatesection = 0;
+                       $numbering = '';
 
                        if (preg_match("/<!--MWTEMPLATESECTION=([^&]+)&([^_]+)-->/", $headline, $mat)) {
                                $istemplate = 1;
@@ -2436,28 +2271,54 @@ class Parser
                                $headline = preg_replace("/<!--MWTEMPLATESECTION=([^&]+)&([^_]+)-->/", "", $headline);
                        }
 
-                       $numbering = '';
-                       if( $level ) {
+                       if( $toclevel ) {
                                $prevlevel = $level;
+                               $prevtoclevel = $toclevel;
                        }
                        $level = $matches[1][$headlineCount];
-                       if( ( $doNumberHeadings || $doShowToc ) && $prevlevel && $level > $prevlevel ) {
-                               # reset when we enter a new level
-                               $sublevelCount[$level] = 0;
-                               $toc .= $sk->tocIndent( $level - $prevlevel );
-                               $toclevel += $level - $prevlevel;
-                       }
-                       if( ( $doNumberHeadings || $doShowToc ) && $level < $prevlevel ) {
-                               # reset when we step back a level
-                               $sublevelCount[$level+1]=0;
-                               $toc .= $sk->tocUnindent( $prevlevel - $level );
-                               $toclevel -= $prevlevel - $level;
-                       }
-                       # count number of headlines for each level
-                       @$sublevelCount[$level]++;
+                       
                        if( $doNumberHeadings || $doShowToc ) {
+                               
+                               if ( $level > $prevlevel ) {
+                                       # Increase TOC level
+                                       $toclevel++;
+                                       $sublevelCount[$toclevel] = 0;
+                                       $toc .= $sk->tocIndent();
+                               }
+                               elseif ( $level < $prevlevel && $toclevel > 1 ) {
+                                       # Decrease TOC level, find level to jump to
+
+                                       if ( $toclevel == 2 && $level <= $levelCount[1] ) {
+                                               # Can only go down to level 1
+                                               $toclevel = 1;
+                                       } else {
+                                               for ($i = $toclevel; $i > 0; $i--) {
+                                                       if ( $levelCount[$i] == $level ) {
+                                                               # Found last matching level
+                                                               $toclevel = $i;
+                                                               break;
+                                                       }
+                                                       elseif ( $levelCount[$i] < $level ) {
+                                                               # Found first matching level below current level
+                                                               $toclevel = $i + 1;
+                                                               break;
+                                                       }
+                                               }
+                                       }
+
+                                       $toc .= $sk->tocUnindent( $prevtoclevel - $toclevel );
+                               }
+                               else {
+                                       # No change in level, end TOC line
+                                       $toc .= $sk->tocLineEnd();
+                               }
+                               
+                               $levelCount[$toclevel] = $level;
+
+                               # count number of headlines for each level
+                               @$sublevelCount[$toclevel]++;
                                $dot = 0;
-                               for( $i = 1; $i <= $level; $i++ ) {
+                               for( $i = 1; $i <= $toclevel; $i++ ) {
                                        if( !empty( $sublevelCount[$i] ) ) {
                                                if( $dot ) {
                                                        $numbering .= '.';
@@ -2471,7 +2332,7 @@ class Parser
                        # The canonized header is a version of the header text safe to use for links
                        # Avoid insertion of weird stuff like <math> by expanding the relevant sections
                        $canonized_headline = $this->unstrip( $headline, $this->mStripState );
-                       $canonized_headline = $this->unstripNoWiki( $headline, $this->mStripState );
+                       $canonized_headline = $this->unstripNoWiki( $canonized_headline, $this->mStripState );
 
                        # Remove link placeholders by the link text.
                        #     <!--LINK number-->
@@ -2480,6 +2341,9 @@ class Parser
                        $canonized_headline = preg_replace( '/<!--LINK ([0-9]*)-->/e',
                                                            "\$wgLinkHolders['texts'][\$1]",
                                                            $canonized_headline );
+                       $canonized_headline = preg_replace( '/<!--IWLINK ([0-9]*)-->/e',
+                                                           "\$wgInterwikiLinkHolders[\$1]",
+                                                           $canonized_headline );
 
                        # strip out HTML
                        $canonized_headline = preg_replace( '/<.*?' . '>/','',$canonized_headline );
@@ -2496,16 +2360,10 @@ class Parser
                        @$refers[$canonized_headline]++;
                        $refcount[$headlineCount]=$refers[$canonized_headline];
 
-                       # Prepend the number to the heading text
-
-                       if( $doNumberHeadings || $doShowToc ) {
-                               $tocline = $numbering . ' ' . $tocline;
-
-                               # Don't number the heading if it is the only one (looks silly)
-                               if( $doNumberHeadings && count( $matches[3] ) > 1) {
-                                       # the two are different if the line contains a link
-                                       $headline=$numbering . ' ' . $headline;
-                               }
+                       # Don't number the heading if it is the only one (looks silly)
+                       if( $doNumberHeadings && count( $matches[3] ) > 1) {
+                               # $headline and $tocline differ if the line contains a link
+                               $headline=$numbering . ' ' . $headline;
                        }
 
                        # Create the anchor for linking from the TOC to the section
@@ -2514,7 +2372,7 @@ class Parser
                                $anchor .= '_' . $refcount[$headlineCount];
                        }
                        if( $doShowToc && ( !isset($wgMaxTocLevel) || $toclevel<$wgMaxTocLevel ) ) {
-                               $toc .= $sk->tocLine($anchor,$tocline,$toclevel);
+                               $toc .= $sk->tocLine($anchor, $tocline, $numbering, $toclevel);
                        }
                        if( $showEditLink && ( !$istemplate || $templatetitle !== "" ) ) {
                                if ( empty( $head[$headlineCount] ) ) {
@@ -2544,8 +2402,8 @@ class Parser
 
                if( $doShowToc ) {
                        $toclines = $headlineCount;
-                       $toc .= $sk->tocUnindent( $toclevel );
-                       $toc = $sk->tocTable( $toc );
+                       $toc .= $sk->tocUnindent( $toclevel - 1 );
+                       $toc = $sk->tocList( $toc );
                }
 
                # split up and insert constructed headlines
@@ -2629,57 +2487,6 @@ class Parser
                return $text;
        }
 
-       /**
-        * Return an HTML link for the "GEO ..." text
-        * @access private
-        */
-       function magicGEO( $text ) {
-               global $wgLang, $wgUseGeoMode;
-               $fname = 'Parser::magicGEO';
-               wfProfileIn( $fname );
-
-               # These next five lines are only for the ~35000 U.S. Census Rambot pages...
-               $directions = array ( 'N' => 'North' , 'S' => 'South' , 'E' => 'East' , 'W' => 'West' ) ;
-               $text = preg_replace ( "/(\d+)&deg;(\d+)'(\d+)\" {$directions['N']}, (\d+)&deg;(\d+)'(\d+)\" {$directions['W']}/" , "(GEO +\$1.\$2.\$3:-\$4.\$5.\$6)" , $text ) ;
-               $text = preg_replace ( "/(\d+)&deg;(\d+)'(\d+)\" {$directions['N']}, (\d+)&deg;(\d+)'(\d+)\" {$directions['E']}/" , "(GEO +\$1.\$2.\$3:+\$4.\$5.\$6)" , $text ) ;
-               $text = preg_replace ( "/(\d+)&deg;(\d+)'(\d+)\" {$directions['S']}, (\d+)&deg;(\d+)'(\d+)\" {$directions['W']}/" , "(GEO +\$1.\$2.\$3:-\$4.\$5.\$6)" , $text ) ;
-               $text = preg_replace ( "/(\d+)&deg;(\d+)'(\d+)\" {$directions['S']}, (\d+)&deg;(\d+)'(\d+)\" {$directions['E']}/" , "(GEO +\$1.\$2.\$3:+\$4.\$5.\$6)" , $text ) ;
-
-               $a = split( 'GEO ', ' '.$text );
-               if ( count ( $a ) < 2 ) {
-                       wfProfileOut( $fname );
-                       return $text;
-               }
-               $text = substr( array_shift( $a ), 1);
-               $valid = '0123456789.+-:';
-
-               foreach ( $a as $x ) {
-                       $geo = $blank = '' ;
-                       while ( ' ' == $x{0} ) {
-                               $blank .= ' ';
-                               $x = substr( $x, 1 );
-                       }
-                       while ( strstr( $valid, $x{0} ) != false ) {
-                               $geo .= $x{0};
-                               $x = substr( $x, 1 );
-                       }
-                       $num = str_replace( '+', '', $geo );
-                       $num = str_replace( ' ', '', $num );
-
-                       if ( '' == $num || count ( explode ( ':' , $num , 3 ) ) < 2 ) {
-                               $text .= "GEO $blank$x";
-                       } else {
-                               $titleObj = Title::makeTitle( NS_SPECIAL, 'Geo' );
-                               $text .= '<a href="' .
-                               $titleObj->escapeLocalUrl( 'coordinates='.$num ) .
-                                       "\" class=\"internal\">GEO $geo</a>";
-                               $text .= $x;
-                       }
-               }
-               wfProfileOut( $fname );
-               return $text;
-       }
-
        /**
         * Return an HTML link for the "RFC 1234" text
         * @access private
@@ -2807,9 +2614,14 @@ class Parser
                        putenv( 'TZ='.$oldtzs );
                }
 
-               $text = preg_replace( '/~~~~~~/', $d, $text );
-               $text = preg_replace( '/~~~~/', '[[' . $wgContLang->getNsText( NS_USER ) . ":$n|$k]] $d", $text );
-               $text = preg_replace( '/~~~/', '[[' . $wgContLang->getNsText( NS_USER ) . ":$n|$k]]", $text );
+               if( $user->getOption( 'fancysig' ) ) {
+                       $sigText = $k;
+               } else {
+                       $sigText = '[[' . $wgContLang->getNsText( NS_USER ) . ":$n|$k]]";
+               }
+               $text = preg_replace( '/~~~~~/', $d, $text );
+               $text = preg_replace( '/~~~~/', "$sigText $d", $text );
+               $text = preg_replace( '/~~~/', $sigText, $text );
 
                # Context links: [[|name]] and [[name (context)|]]
                #
@@ -2929,7 +2741,7 @@ class Parser
                if ( !empty( $wgLinkHolders['namespaces'] ) ) {
                        wfProfileIn( $fname.'-check' );
                        $dbr =& wfGetDB( DB_SLAVE );
-                       $cur = $dbr->tableName( 'cur' );
+                       $page = $dbr->tableName( 'page' );
                        $sk = $wgUser->getSkin();
                        $threshold = $wgUser->getOption('stubthreshold');
                        
@@ -2958,14 +2770,14 @@ class Parser
                                        # Not in the link cache, add it to the query
                                        if ( !isset( $current ) ) {
                                                $current = $val;
-                                               $query =  "SELECT cur_id, cur_namespace, cur_title";
+                                               $query =  "SELECT page_id, page_namespace, page_title";
                                                if ( $threshold > 0 ) {
-                                                       $query .= ", LENGTH(cur_text) AS cur_len, cur_is_redirect";
-                                               } 
-                                               $query .= " FROM $cur WHERE (cur_namespace=$val AND cur_title IN(";
+                                                       $query .= ', page_len, page_is_redirect';
+                                               }
+                                               $query .= " FROM $page WHERE (page_namespace=$val AND page_title IN(";
                                        } elseif ( $current != $val ) {
                                                $current = $val;
-                                               $query .= ")) OR (cur_namespace=$val AND cur_title IN(";
+                                               $query .= ")) OR (page_namespace=$val AND page_title IN(";
                                        } else {
                                                $query .= ', ';
                                        }
@@ -2986,13 +2798,13 @@ class Parser
                                # 1 = known
                                # 2 = stub
                                while ( $s = $dbr->fetchObject($res) ) {
-                                       $title = Title::makeTitle( $s->cur_namespace, $s->cur_title );
+                                       $title = Title::makeTitle( $s->page_namespace, $s->page_title );
                                        $pdbk = $title->getPrefixedDBkey();
-                                       $wgLinkCache->addGoodLink( $s->cur_id, $pdbk );
+                                       $wgLinkCache->addGoodLink( $s->page_id, $pdbk );
                                        
                                        if ( $threshold >  0 ) {
-                                               $size = $s->cur_len;
-                                               if ( $s->cur_is_redirect || $s->cur_namespace != 0 || $length < $threshold ) {
+                                               $size = $s->page_len;
+                                               if ( $s->page_is_redirect || $s->page_namespace != 0 || $size >= $threshold ) {
                                                        $colours[$pdbk] = 1;
                                                } else {
                                                        $colours[$pdbk] = 2;
@@ -3078,12 +2890,21 @@ class Parser
                                continue;
                        }
                        $nt = Title::newFromURL( $matches[1] );
+                       if( is_null( $nt ) ) {
+                               # Bogus title. Ignore these so we don't bomb out later.
+                               continue;
+                       }
                        if ( isset( $matches[3] ) ) {
                                $label = $matches[3];
                        } else {
                                $label = '';
                        }
-                       $ig->add( Image::newFromTitle( $nt ), $label );
+                       
+                       # FIXME: Use the full wiki parser and add its links
+                       # to the page's links.
+                       $html = $this->mOptions->mSkin->formatComment( $label );
+                       
+                       $ig->add( Image::newFromTitle( $nt ), $html );
                        $wgLinkCache->addImageLinkObj( $nt );
                }
                return $ig->toHTML();
@@ -3189,14 +3010,17 @@ class ParserOptions
 
        function setSkin( &$x ) { $this->mSkin =& $x; }
 
-       # Get parser options
-       /* static */ function newFromUser( &$user ) {
+       /**
+        * Get parser options
+        * @static
+        */
+       function newFromUser( &$user ) {
                $popts = new ParserOptions;
                $popts->initialiseFromUser( $user );
                return $popts;
        }
 
-       # Get user options
+       /** Get user options */
        function initialiseFromUser( &$userInput ) {
                global $wgUseTeX, $wgUseDynamicDates, $wgInterwikiMagic, $wgAllowExternalImages;
                $fname = 'ParserOptions::initialiseFromUser';
@@ -3222,8 +3046,6 @@ class ParserOptions
                $this->mShowToc = $user->getOption( 'showtoc' );
                wfProfileOut( $fname );
        }
-
-
 }
 
 /**
@@ -3269,6 +3091,13 @@ function wfLoadSiteStats() {
        }
 }
 
+/**
+ * Escape html tags
+ * Basically replaces " > and < with HTML entities ( &quot;, &gt;, &lt;)
+ *  
+ * @param string $in Text that might contain HTML tags
+ * @return string Escaped string
+ */
 function wfEscapeHTMLTagsOnly( $in ) {
        return str_replace(
                array( '"', '>', '<' ),