* Reordered wiki table handling and __TOC__ extraction in the parser to better handle some overlapping tag cases.
author Brion Vibber <brion@users.mediawiki.org>
Tue, 23 May 2006 07:19:01 +0000 (07:19 +0000)
committer Brion Vibber <brion@users.mediawiki.org>
Tue, 23 May 2006 07:19:01 +0000 (07:19 +0000)
* Only the first __TOC__ is now turned into a TOC.

The table change doesn't disrupt either the parser tests or the en.wikipedia main page. Hopefully it won't break other real content...

RELEASE-NOTES
includes/MagicWord.php
includes/Parser.php
includes/Sanitizer.php
maintenance/parserTests.txt

index 9fa2822..8ba8452 100644 (file)
@@ -315,7 +315,10 @@ it from source control: http://www.mediawiki.org/wiki/Download_from_SVN
 * (bug 6046) Update to Indonesian localisation (id) #15
 * (bug 5523) $wgNoFollowNsExceptions to allow disabling rel="nofollow" in
   specially-selected namespaces.
-* Fix for HTML/JS injection bug in variable handler (found by Nick Jenkins)
+* (bug 6055) Fix for HTML/JS injection bug in variable handler (found by Nick Jenkins)
+* Reordered wiki table handling and __TOC__ extraction in the parser to better
+  handle some overlapping tag cases.
+* Only the first __TOC__ is now turned into a TOC.
 
 
 == Compatibility ==
index c747b7f..895c0e0 100644 (file)
@@ -329,8 +329,8 @@ class MagicWord {
        /**
         * Replaces the word with something else
         */
-       function replace( $replacement, $subject ) {
-               $res = preg_replace( $this->getRegex(), wfRegexReplacement( $replacement ), $subject );
+       function replace( $replacement, $subject, $limit=-1 ) {
+               $res = preg_replace( $this->getRegex(), wfRegexReplacement( $replacement ), $subject, $limit );
                $this->mModified = !($res === $subject);
                return $res;
        }
index cb6d692..b71b275 100644 (file)
@@ -157,6 +157,9 @@ class Parser
                $this->mTemplates = array();
                $this->mTemplatePath = array();
 
+               $this->mShowToc = true;
+               $this->mForceTocPosition = false;
+               
                wfRunHooks( 'ParserClearState', array( &$this ) );
        }
 
@@ -873,12 +876,20 @@ class Parser
                $text = strtr( $text, array( '<onlyinclude>' => '' , '</onlyinclude>' => '' ) );
                $text = strtr( $text, array( '<noinclude>' => '', '</noinclude>' => '') );
                $text = preg_replace( '/<includeonly>.*?<\/includeonly>/s', '', $text );
-
+               
                $text = Sanitizer::removeHTMLtags( $text, array( &$this, 'attributeStripCallback' ) );
+
                $text = $this->replaceVariables( $text, $args );
 
+               // Tables need to come after variable replacement for things to work
+               // properly; putting them before other transformations should keep
+               // exciting things like link expansions from showing up in surprising
+               // places.
+               $text = $this->doTableStuff( $text );
+
                $text = preg_replace( '/(^|\n)-----*/', '\\1<hr />', $text );
 
+               $text = $this->stripToc( $text );
                $text = $this->doHeadings( $text );
                if($this->mOptions->getUseDynamicDates()) {
                        $df =& DateFormatter::getInstance();
@@ -893,7 +904,6 @@ class Parser
                $text = str_replace($this->mUniqPrefix."NOPARSE", "", $text);
 
                $text = $this->doMagicLinks( $text );
-               $text = $this->doTableStuff( $text );
                $text = $this->formatHeadings( $text, $isMain );
 
                wfProfileOut( $fname );
@@ -2411,7 +2421,7 @@ class Parser
                wfProfileOut( $fname );
                return $text;
        }
-
+       
        /**
         * Replace magic variables
         * @private
@@ -3049,6 +3059,31 @@ class Parser
                }
        }
 
+       /**
+        * Detect __TOC__ magic word and set a placeholder
+        */
+       function stripToc( $text ) {
+               # if the string __NOTOC__ (not case-sensitive) occurs in the HTML,
+               # do not add TOC
+               $mw = MagicWord::get( MAG_NOTOC );
+               if( $mw->matchAndRemove( $text ) ) {
+                       $this->mShowToc = false;
+               }
+               
+               $mw = MagicWord::get( MAG_TOC );
+               if( $mw->match( $text ) ) {
+                       $this->mShowToc = true;
+                       $this->mForceTocPosition = true;
+                       
+                       // Set a placeholder. At the end we'll fill it in with the TOC.
+                       $text = $mw->replace( '<!--MWTOC-->', $text, 1 );
+                       
+                       // Only keep the first one.
+                       $text = $mw->replace( '', $text );
+               }
+               return $text;
+       }
+
        /**
         * This function accomplishes several tasks:
         * 1) Auto-number headings if that option is enabled
@@ -3067,8 +3102,6 @@ class Parser
                global $wgMaxTocLevel, $wgContLang;
 
                $doNumberHeadings = $this->mOptions->getNumberHeadings();
-               $doShowToc = true;
-               $forceTocHere = false;
                if( !$this->mTitle->userCanEdit() ) {
                        $showEditLink = 0;
                } else {
@@ -3080,12 +3113,6 @@ class Parser
                if( $esw->matchAndRemove( $text ) ) {
                        $showEditLink = 0;
                }
-               # if the string __NOTOC__ (not case-sensitive) occurs in the HTML,
-               # do not add TOC
-               $mw =& MagicWord::get( MAG_NOTOC );
-               if( $mw->matchAndRemove( $text ) ) {
-                       $doShowToc = false;
-               }
 
                # Get all headlines for numbering them and adding funky stuff like [edit]
                # links - this is for later, but we need the number of headlines right now
@@ -3093,7 +3120,7 @@ class Parser
 
                # if there are fewer than 4 headlines in the article, do not show TOC
                if( $numMatches < 4 ) {
-                       $doShowToc = false;
+                       $this->mShowToc = false;
                }
 
                # Allow user to stipulate that a page should have a "new section"
@@ -3107,20 +3134,20 @@ class Parser
 
                $mw =& MagicWord::get( MAG_TOC );
                if($mw->match( $text ) ) {
-                       $doShowToc = true;
-                       $forceTocHere = true;
+                       $this->mShowToc = true;
+                       $this->mForceTocPosition = true;
                } else {
                        # if the string __FORCETOC__ (not case-sensitive) occurs in the HTML,
                        # override above conditions and always show TOC above first header
                        $mw =& MagicWord::get( MAG_FORCETOC );
                        if ($mw->matchAndRemove( $text ) ) {
-                               $doShowToc = true;
+                               $this->mShowToc = true;
                        }
                }
 
                # Never ever show TOC if no headers
                if( $numMatches < 1 ) {
-                       $doShowToc = false;
+                       $this->mShowToc = false;
                }
 
                # We need this to perform operations on the HTML
@@ -3162,7 +3189,7 @@ class Parser
                        }
                        $level = $matches[1][$headlineCount];
 
-                       if( $doNumberHeadings || $doShowToc ) {
+                       if( $doNumberHeadings || $this->mShowToc ) {
 
                                if ( $level > $prevlevel ) {
                                        # Increase TOC level
@@ -3258,7 +3285,7 @@ class Parser
                        if($refcount[$headlineCount] > 1 ) {
                                $anchor .= '_' . $refcount[$headlineCount];
                        }
-                       if( $doShowToc && ( !isset($wgMaxTocLevel) || $toclevel<$wgMaxTocLevel ) ) {
+                       if( $this->mShowToc && ( !isset($wgMaxTocLevel) || $toclevel<$wgMaxTocLevel ) ) {
                                $toc .= $sk->tocLine($anchor, $tocline, $numbering, $toclevel);
                        }
                        if( $showEditLink && ( !$istemplate || $templatetitle !== "" ) ) {
@@ -3279,7 +3306,7 @@ class Parser
                                $sectionCount++;
                }
 
-               if( $doShowToc ) {
+               if( $this->mShowToc ) {
                        if( $toclevel<$wgMaxTocLevel ) {
                                $toc .= $sk->tocUnindent( $toclevel - 1 );
                        }
@@ -3301,8 +3328,8 @@ class Parser
                                # $full .= $sk->editSectionLink(0);
                        }
                        $full .= $block;
-                       if( $doShowToc && !$i && $isMain && !$forceTocHere) {
-                       # Top anchor now in skin
+                       if( $this->mShowToc && !$i && $isMain && !$this->mForceTocPosition ) {
+                               # Top anchor now in skin
                                $full = $full.$toc;
                        }
 
@@ -3311,9 +3338,8 @@ class Parser
                        }
                        $i++;
                }
-               if($forceTocHere) {
-                       $mw =& MagicWord::get( MAG_TOC );
-                       return $mw->replace( $toc, $full );
+               if( $this->mForceTocPosition ) {
+                       return str_replace( '<!--MWTOC-->', $toc, $full );
                } else {
                        return $full;
                }
index 93745c7..25f4d57 100644 (file)
@@ -603,6 +603,7 @@ class Sanitizer {
                                'RFC'  => '&#82;FC',
                                'PMID' => '&#80;MID',
                                '|'    => '&#124;',
+                               '__'   => '&#95;_',
                        ) );
 
                        # Stupid hack
index ee6e133..22a2505 100644 (file)
@@ -3970,6 +3970,147 @@ mailto:inline@mail.tld
 </p>
 !! end
 
+
+#
+# Security and HTML correctness
+# From Nick Jenkins' fuzz testing
+#
+
+!! test
+Fuzz testing: Parser13
+!! input
+{| 
+| http://a|
+!! result
+<table>
+<tr>
+<td>
+</td>
+</tr>
+</table>
+
+!! end
+
+!! test
+Fuzz testing: Parser14
+!! input
+== onmouseover= ==
+http://__TOC__
+!! result
+<div class="editsection" style="float:right;margin-left:5px;">[<a href="/index.php?title=Parser_test&amp;action=edit&amp;section=1" title="Edit section: onmouseover=">edit</a>]</div><a name="onmouseover.3D"></a><h2> onmouseover= </h2>
+<p>http://
+</p>
+!! end
+
+!! test
+Fuzz testing: Parser14-table
+!! input
+==a==
+{| STYLE=__TOC__
+!! result
+<div class="editsection" style="float:right;margin-left:5px;">[<a href="/index.php?title=Parser_test&amp;action=edit&amp;section=1" title="Edit section: a">edit</a>]</div><a name="a"></a><h2>a</h2>
+<table style="&#95;_TOC&#95;_">
+<tr><td></td></tr>
+</table>
+
+!! end
+
+# Known to produce bogus xml (extra </td>)
+!! test
+Fuzz testing: Parser16
+!! options
+noxml
+!! input
+{|
+!https://||||||
+!! result
+<table>
+<tr>
+<th>https://</th><th></th><th></th><th>
+</td>
+</tr>
+</table>
+
+!! end
+
+!! test
+Fuzz testing: Parser21
+!! input
+{|
+! irc://{{ftp://a" onmouseover="alert('hello world');"
+|
+!! result
+<table>
+<tr>
+<th> <a href="irc://{{ftp://a" class='external free' title="irc://{{ftp://a" rel="nofollow">irc://{{ftp://a</a>" onmouseover="alert('hello world');"
+</th><td>
+</td>
+</tr>
+</table>
+
+!! end
+
+!! test
+Fuzz testing: Parser22
+!! input
+http://===r:::https://b
+
+{|
+!!result
+<p><a href="http://===r:::https://b" class='external free' title="http://===r:::https://b" rel="nofollow">http://===r:::https://b</a>
+</p>
+<table>
+<tr><td></td></tr>
+</table>
+
+!! end
+
+# Known to produce bad XML for now
+!! test
+Fuzz testing: Parser24
+!! options
+noxml
+!! input
+{|
+{{{|
+<u CLASS=
+| {{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx--><noinclude>}}}} >
+<br style="onmouseover='alert(document.cookie);' " />
+
+MOVE YOUR MOUSE CURSOR OVER THIS TEXT
+|
+!! result
+<table>
+
+<u class="&#124;">} &gt;
+<br style="onmouseover='alert(document.cookie);' " />
+
+MOVE YOUR MOUSE CURSOR OVER THIS TEXT
+<tr>
+<td></u>
+</td>
+</tr>
+</table>
+
+!! end
+
+# Known to produce bad XML for now
+!!test
+Fuzz testing: Parser25 (bug 6055)
+!! options
+noxml
+!! input
+{{{
+| 
+<LI CLASS=||
+ >
+}}}blah" onmouseover="alert('hello world');" align="left"'''MOVE MOUSE CURSOR OVER HERE
+!! result
+<li class="&#124;&#124;">
+blah" onmouseover="alert('hello world');" align="left"<b>MOVE MOUSE CURSOR OVER HERE</b>
+
+!! end
+
 #
 #
 #