Protect -{...}- variant constructs in definition lists.
author C. Scott Ananian <cscott@cscott.net>
Tue, 13 Dec 2016 20:37:04 +0000 (15:37 -0500)
committer C. Scott Ananian <cscott@cscott.net>
Fri, 17 Feb 2017 20:52:44 +0000 (15:52 -0500)
Given the wikitext:

;-{zh-cn:AAA;zh-tw:BBB}-

Prevent `doBlockLevels` from trying to split the definition list at the
embedded colon and using `AAA;zh-tw:BBB}-` as the `<dd>` portion.

Bug: T153135
Change-Id: I3a4d02f1fbd0d0fe8278d6b7c66005f0dd3dd36b

includes/parser/BlockLevelPass.php
tests/parser/parserTests.txt

index cbacd34..e16cfd4 100644 (file)
@@ -38,6 +38,7 @@ class BlockLevelPass {
        const COLON_STATE_COMMENT = 5;
        const COLON_STATE_COMMENTDASH = 6;
        const COLON_STATE_COMMENTDASHDASH = 7;
+       const COLON_STATE_LC = 8;
 
        /**
         * Make lists from lines starting with ':', '*', '#', etc.
@@ -389,15 +390,14 @@ class BlockLevelPass {
         * @return string The position of the ':', or false if none found
         */
        private function findColonNoLinks( $str, &$before, &$after ) {
-               $colonPos = strpos( $str, ':' );
-               if ( $colonPos === false ) {
+               if ( !preg_match( '/:|<|-\{/', $str, $m, PREG_OFFSET_CAPTURE ) ) {
                        # Nothing to find!
                        return false;
                }
 
-               $ltPos = strpos( $str, '<' );
-               if ( $ltPos === false || $ltPos > $colonPos ) {
+               if ( $m[0][0] === ':' ) {
                        # Easy; no tag nesting to worry about
+                       $colonPos = $m[0][1];
                        $before = substr( $str, 0, $colonPos );
                        $after = substr( $str, $colonPos + 1 );
                        return $colonPos;
@@ -405,9 +405,10 @@ class BlockLevelPass {
 
                # Ugly state machine to walk through avoiding tags.
                $state = self::COLON_STATE_TEXT;
-               $level = 0;
+               $ltLevel = 0;
+               $lcLevel = 0;
                $len = strlen( $str );
-               for ( $i = 0; $i < $len; $i++ ) {
+               for ( $i = $m[0][1]; $i < $len; $i++ ) {
                        $c = $str[$i];
 
                        switch ( $state ) {
@@ -418,7 +419,7 @@ class BlockLevelPass {
                                        $state = self::COLON_STATE_TAGSTART;
                                        break;
                                case ":":
-                                       if ( $level === 0 ) {
+                                       if ( $ltLevel === 0 ) {
                                                # We found it!
                                                $before = substr( $str, 0, $i );
                                                $after = substr( $str, $i + 1 );
@@ -428,35 +429,44 @@ class BlockLevelPass {
                                        break;
                                default:
                                        # Skip ahead looking for something interesting
-                                       $colonPos = strpos( $str, ':', $i );
-                                       if ( $colonPos === false ) {
+                                       if ( !preg_match( '/:|<|-\{/', $str, $m, PREG_OFFSET_CAPTURE, $i ) ) {
                                                # Nothing else interesting
                                                return false;
                                        }
-                                       $ltPos = strpos( $str, '<', $i );
-                                       if ( $level === 0 ) {
-                                               if ( $ltPos === false || $colonPos < $ltPos ) {
-                                                       # We found it!
-                                                       $before = substr( $str, 0, $colonPos );
-                                                       $after = substr( $str, $colonPos + 1 );
-                                                       return $i;
-                                               }
+                                       if ( $m[0][0] === '-{' ) {
+                                               $state = self::COLON_STATE_LC;
+                                               $lcLevel++;
+                                               $i = $m[0][1] + 1;
+                                       } else {
+                                               # Skip ahead to next interesting character.
+                                               $i = $m[0][1] - 1;
                                        }
-                                       if ( $ltPos === false ) {
-                                               # Nothing else interesting to find; abort!
-                                               # We're nested, but there's no close tags left. Abort!
-                                               break 2;
+                                       break;
+                               }
+                               break;
+                       case self::COLON_STATE_LC:
+                               # In language converter markup -{ ... }-
+                               if ( !preg_match( '/-\{|\}-/', $str, $m, PREG_OFFSET_CAPTURE, $i ) ) {
+                                       # Nothing else interesting to find; abort!
+                                       # We're nested in language converter markup, but there
+                                       # are no close tags left.  Abort!
+                                       break 2;
+                               } elseif ( $m[0][0] === '-{' ) {
+                                       $i = $m[0][1] + 1;
+                                       $lcLevel++;
+                               } elseif ( $m[0][0] === '}-' ) {
+                                       $i = $m[0][1] + 1;
+                                       $lcLevel--;
+                                       if ( $lcLevel === 0 ) {
+                                               $state = self::COLON_STATE_TEXT;
                                        }
-                                       # Skip ahead to next tag start
-                                       $i = $ltPos;
-                                       $state = self::COLON_STATE_TAGSTART;
                                }
                                break;
                        case self::COLON_STATE_TAG:
                                # In a <tag>
                                switch ( $c ) {
                                case ">":
-                                       $level++;
+                                       $ltLevel++;
                                        $state = self::COLON_STATE_TEXT;
                                        break;
                                case "/":
@@ -486,8 +496,8 @@ class BlockLevelPass {
                        case self::COLON_STATE_CLOSETAG:
                                # In a </tag>
                                if ( $c === ">" ) {
-                                       $level--;
-                                       if ( $level < 0 ) {
+                                       $ltLevel--;
+                                       if ( $ltLevel < 0 ) {
                                                wfDebug( __METHOD__ . ": Invalid input; too many close tags\n" );
                                                return false;
                                        }
@@ -526,8 +536,11 @@ class BlockLevelPass {
                                throw new MWException( "State machine error in " . __METHOD__ );
                        }
                }
-               if ( $level > 0 ) {
-                       wfDebug( __METHOD__ . ": Invalid input; not enough close tags (level $level, state $state)\n" );
+               if ( $ltLevel > 0 || $lcLevel > 0 ) {
+                       wfDebug(
+                               __METHOD__ . ": Invalid input; not enough close tags " .
+                               "(level $ltLevel/$lcLevel, state $state)\n"
+                       );
                        return false;
                }
                return false;
index 46e1012..a94b276 100644 (file)
@@ -20893,19 +20893,61 @@ File:foobar.jpg|{{Test|unamedParam|alt=-{R|param}-}}|alt=galleryalt
 
 !! end
 
-# FIXME: This test is currently broken in the PHP parser (bug 52661)
 !! test
-Don't break list handling if language converter markup is in the item.
+T153135: Don't break list handling if language converter markup is in the item.
 !! options
 language=zh variant=zh-cn
 !! wikitext
 ;-{zh-cn:AAA;zh-tw:BBB}-
+;-{R|foo:bar}-
 !! html/php
-<dl><dt><span class="error">在手动语言转换规则中检测到错误</span></dd></dl>
+<dl><dt>AAA</dt>
+<dt>foo:bar</dt></dl>
 
 !! html/parsoid
-<dl><dt>AAA
-</dt></dl>
+<dl>
+<dt><span typeof="mw:LanguageVariant" data-mw='{"bidir":[{"l":"zh-cn","t":"AAA"},{"l":"zh-tw","t":"BBB"}],"show":true}'></span></dt>
+<dt><span typeof="mw:LanguageVariant" data-mw='{"disabled":true,"show":true,"text":"foo:bar"}'></span></dt>
+</dl>
+!! end
+
+// Note that parsoid does not protect colons unless language converter
+// markup is properly nested, because it is a backtracking parser.
+!! test
+T153135: Unclosed markup in definition list (code coverage)
+!! options
+language=zh variant=zh-cn
+!! wikitext
+;<b>foo:bar
+;-{zh-cn:AAA
+!! html/php
+<dl><dt><b>foo:bar</dt>
+<dt>-{zh-cn:AAA</b></dt></dl>
+
+!! html/parsoid
+<dl>
+<dt><b>foo:bar</b></dt>
+<b>
+<dt>-{zh-cn</dt>
+<dd>AAA</dd>
+</b></dl>
+!! end
+
+!! test
+T153135: Nested language converter markup in definition list (code coverage)
+!! options
+language=zh variant=zh-cn
+!! wikitext
+;-{zh-cn:AAA -{zh-hans|foo:bar}- -{R|bat:baz}-}-:def
+!! html/php
+<dl><dt>AAA foo:bar bat:baz</dt>
+<dd>def</dd></dl>
+
+!! html/parsoid
+<dl>
+<dt><span typeof="mw:LanguageVariant" data-mw='{"bidir":[{"l":"zh-cn","t":"AAA &lt;span typeof=\"mw:LanguageVariant\" data-parsoid=&#39;{\"fl\":[\"zh-hans\"],\"dsr\":[13,32,null,2]}&#39; data-mw=&#39;{\"filter\":[\"zh-hans\"],\"text\":\"bar\"}&#39;>&lt;/span> &lt;span typeof=\"mw:LanguageVariant\" data-parsoid=&#39;{\"fl\":[\"R\"],\"dsr\":[33,46,null,2]}&#39; data-mw=&#39;{\"disabled\":true,\"show\":true,\"text\":\"bat:baz\"}&#39;>&lt;/span>"}],"show":true}'></span></dt>
+<dd>def</dd>
+</dl>
 !! end
 
 !! test