From 186a182a150f20475f0887cbc261d9be01dbfd98 Mon Sep 17 00:00:00 2001 From: "C. Scott Ananian" Date: Thu, 19 Jan 2017 14:58:05 -0500 Subject: [PATCH] Protect language converter markup in the preprocessor (take 2). This revises 28774022769d2273be16c6c6e1cca710a1fd97ef, which was reverted in master due to unexpected issues with `-{{...}} ` markup on translatewiki and enwiki. Test cases are added to ensure that this is parsed as a template, not as language converter markup. https://www.mediawiki.org/wiki/Preprocessor_ABNF is the canonical documentation for the preprocessor; this will be updated after this patch is merged. The basic principles described in that page are maintained in this patch: * Rightmost opening structure has precedence: `-{{` is parsed as a dash followed by template opening. * `{{{` has precedence over `{{` and `-{`: `-{{{{` is parsed as `-{` `{{{` since we first grab the rightmost `{{{`. A bunch of test cases were added to verify the "ideal precedence" order described on that wiki page. This patch introduced some minor incompatibilities in existing markup, in particular with chemical formulae in templates. Fixes for these are being tracked at https://www.mediawiki.org/wiki/Parsoid/Language_conversion/Preprocessor_fixups Bug: T146304 Bug: T153761 Change-Id: I2f0c186c75e392c95e1a3d89266cae2586349150 --- RELEASE-NOTES-1.30 | 4 + includes/parser/Preprocessor.php | 6 +- includes/parser/Preprocessor_DOM.php | 62 +++-- includes/parser/Preprocessor_Hash.php | 63 +++-- tests/parser/parserTests.txt | 344 +++++++++++++++++++++++++- 5 files changed, 444 insertions(+), 35 deletions(-) diff --git a/RELEASE-NOTES-1.30 b/RELEASE-NOTES-1.30 index aa583b8579..97356fd409 100644 --- a/RELEASE-NOTES-1.30 +++ b/RELEASE-NOTES-1.30 @@ -79,6 +79,10 @@ changes to languages because of Phabricator reports. deprecated in 1.24) were removed. * wfMemcKey() and wfGlobalCacheKey() were deprecated. ObjectCache::makeKey() and ObjectCache::makeGlobalKey() should be used instead. +* (T146304) Preprocessor handling of LanguageConverter markup has been improved. + As a result of the new uniform handling, '-{' may need to be escaped + (for example, as '-{') where it occurs inside template arguments + or wikilinks. == Compatibility == MediaWiki 1.30 requires PHP 5.5.9 or later. There is experimental support for diff --git a/includes/parser/Preprocessor.php b/includes/parser/Preprocessor.php index 426b5507a4..cb8e3a7f72 100644 --- a/includes/parser/Preprocessor.php +++ b/includes/parser/Preprocessor.php @@ -51,9 +51,9 @@ abstract class Preprocessor { ], '-{' => [ 'end' => '}-', - 'names' => [ 1 => null ], - 'min' => 1, - 'max' => 1, + 'names' => [ 2 => null ], + 'min' => 2, + 'max' => 2, ], ]; diff --git a/includes/parser/Preprocessor_DOM.php b/includes/parser/Preprocessor_DOM.php index b93c6173ea..753930735d 100644 --- a/includes/parser/Preprocessor_DOM.php +++ b/includes/parser/Preprocessor_DOM.php @@ -223,8 +223,7 @@ class Preprocessor_DOM extends Preprocessor { $searchBase = "[{<\n"; # } if ( !$wgDisableLangConversion ) { - // FIXME: disabled due to T153761 - // $searchBase .= '-'; + $searchBase .= '-'; } // For fast reverse searches @@ -277,6 +276,13 @@ class Preprocessor_DOM extends Preprocessor { $search = $searchBase; if ( $stack->top === false ) { $currentClosing = ''; + } elseif ( + $stack->top->close === '}-' && + $stack->top->count > 2 + ) { + # adjust closing for -{{{...{{ + $currentClosing = '}'; + $search .= $currentClosing; } else { $currentClosing = $stack->top->close; $search .= $currentClosing; @@ -333,11 +339,15 @@ class Preprocessor_DOM extends Preprocessor { } elseif ( isset( $this->rules[$curChar] ) ) { $found = 'open'; $rule = $this->rules[$curChar]; - } elseif ( $curChar == '-' ) { - $found = 'dash'; } else { - # Some versions of PHP have a strcspn which stops on null characters - # Ignore and continue + # Some versions of PHP have a strcspn which stops on + # null characters; ignore these and continue. + # We also may get '-' and '}' characters here which + # don't match -{ or $currentClosing. Add these to + # output and continue. + if ( $curChar == '-' || $curChar == '}' ) { + $accum .= $curChar; + } ++$i; continue; } @@ -615,7 +625,10 @@ class Preprocessor_DOM extends Preprocessor { } elseif ( $found == 'open' ) { # count opening brace characters $curLen = strlen( $curChar ); - $count = ( $curLen > 1 ) ? 1 : strspn( $text, $curChar, $i ); + $count = ( $curLen > 1 ) ? + # allow the final character to repeat + strspn( $text, $curChar[$curLen-1], $i+1 ) + 1 : + strspn( $text, $curChar, $i ); # we need to add to stack only if opening brace count is enough for one of the rules if ( $count >= $rule['min'] ) { @@ -635,17 +648,25 @@ class Preprocessor_DOM extends Preprocessor { # Add literal brace(s) $accum .= htmlspecialchars( str_repeat( $curChar, $count ) ); } - $i += $curLen * $count; + $i += $count; } elseif ( $found == 'close' ) { $piece = $stack->top; # lets check if there are enough characters for closing brace $maxCount = $piece->count; + if ( $piece->close === '}-' && $curChar === '}' ) { + $maxCount--; # don't try to match closing '-' as a '}' + } $curLen = strlen( $curChar ); - $count = ( $curLen > 1 ) ? 1 : strspn( $text, $curChar, $i, $maxCount ); + $count = ( $curLen > 1 ) ? $curLen : + strspn( $text, $curChar, $i, $maxCount ); # check for maximum matching characters (if there are 5 closing # characters, we will probably need only 3 - depending on the rules) $rule = $this->rules[$piece->open]; + if ( $piece->close === '}-' && $piece->count > 2 ) { + # tweak for -{..{{ }}..}- + $rule = $this->rules['{']; + } if ( $count > $rule['max'] ) { # The specified maximum exists in the callback array, unless the caller # has made an error @@ -663,14 +684,16 @@ class Preprocessor_DOM extends Preprocessor { if ( $matchingCount <= 0 ) { # No matching element found in callback array # Output a literal closing brace and continue - $accum .= htmlspecialchars( str_repeat( $curChar, $count ) ); - $i += $curLen * $count; + $endText = substr( $text, $i, $count ); + $accum .= htmlspecialchars( $endText ); + $i += $count; continue; } $name = $rule['names'][$matchingCount]; if ( $name === null ) { // No element, just literal text - $element = $piece->breakSyntax( $matchingCount ) . str_repeat( $rule['end'], $matchingCount ); + $endText = substr( $text, $i, $matchingCount ); + $element = $piece->breakSyntax( $matchingCount ) . $endText; } else { # Create XML element # Note: $parts is already XML, does not need to be encoded further @@ -703,7 +726,7 @@ class Preprocessor_DOM extends Preprocessor { } # Advance input pointer - $i += $curLen * $matchingCount; + $i += $matchingCount; # Unwind the stack $stack->pop(); @@ -719,7 +742,12 @@ class Preprocessor_DOM extends Preprocessor { $stack->push( $piece ); $accum =& $stack->getAccum(); } else { - $accum .= str_repeat( $piece->open, $piece->count ); + $s = substr( $piece->open, 0, -1 ); + $s .= str_repeat( + substr( $piece->open, -1 ), + $piece->count - strlen( $s ) + ); + $accum .= $s; } } $flags = $stack->getFlags(); @@ -924,7 +952,11 @@ class PPDStackElement { if ( $openingCount === false ) { $openingCount = $this->count; } - $s = str_repeat( $this->open, $openingCount ); + $s = substr( $this->open, 0, -1 ); + $s .= str_repeat( + substr( $this->open, -1 ), + $openingCount - strlen( $s ) + ); $first = true; foreach ( $this->parts as $part ) { if ( $first ) { diff --git a/includes/parser/Preprocessor_Hash.php b/includes/parser/Preprocessor_Hash.php index b2e9531ddd..597d1f231c 100644 --- a/includes/parser/Preprocessor_Hash.php +++ b/includes/parser/Preprocessor_Hash.php @@ -155,8 +155,7 @@ class Preprocessor_Hash extends Preprocessor { $searchBase = "[{<\n"; if ( !$wgDisableLangConversion ) { - // FIXME: disabled due to T153761 - // $searchBase .= '-'; + $searchBase .= '-'; } // For fast reverse searches @@ -208,6 +207,13 @@ class Preprocessor_Hash extends Preprocessor { $search = $searchBase; if ( $stack->top === false ) { $currentClosing = ''; + } elseif ( + $stack->top->close === '}-' && + $stack->top->count > 2 + ) { + # adjust closing for -{{{...{{ + $currentClosing = '}'; + $search .= $currentClosing; } else { $currentClosing = $stack->top->close; $search .= $currentClosing; @@ -264,11 +270,15 @@ class Preprocessor_Hash extends Preprocessor { } elseif ( isset( $this->rules[$curChar] ) ) { $found = 'open'; $rule = $this->rules[$curChar]; - } elseif ( $curChar == '-' ) { - $found = 'dash'; } else { - # Some versions of PHP have a strcspn which stops on null characters - # Ignore and continue + # Some versions of PHP have a strcspn which stops on + # null characters; ignore these and continue. + # We also may get '-' and '}' characters here which + # don't match -{ or $currentClosing. Add these to + # output and continue. + if ( $curChar == '-' || $curChar == '}' ) { + self::addLiteral( $accum, $curChar ); + } ++$i; continue; } @@ -558,7 +568,10 @@ class Preprocessor_Hash extends Preprocessor { } elseif ( $found == 'open' ) { # count opening brace characters $curLen = strlen( $curChar ); - $count = ( $curLen > 1 ) ? 1 : strspn( $text, $curChar, $i ); + $count = ( $curLen > 1 ) ? + # allow the final character to repeat + strspn( $text, $curChar[$curLen-1], $i+1 ) + 1 : + strspn( $text, $curChar, $i ); # we need to add to stack only if opening brace count is enough for one of the rules if ( $count >= $rule['min'] ) { @@ -577,17 +590,25 @@ class Preprocessor_Hash extends Preprocessor { # Add literal brace(s) self::addLiteral( $accum, str_repeat( $curChar, $count ) ); } - $i += $curLen * $count; + $i += $count; } elseif ( $found == 'close' ) { $piece = $stack->top; # lets check if there are enough characters for closing brace $maxCount = $piece->count; + if ( $piece->close === '}-' && $curChar === '}' ) { + $maxCount--; # don't try to match closing '-' as a '}' + } $curLen = strlen( $curChar ); - $count = ( $curLen > 1 ) ? 1 : strspn( $text, $curChar, $i, $maxCount ); + $count = ( $curLen > 1 ) ? $curLen : + strspn( $text, $curChar, $i, $maxCount ); # check for maximum matching characters (if there are 5 closing # characters, we will probably need only 3 - depending on the rules) $rule = $this->rules[$piece->open]; + if ( $piece->close === '}-' && $piece->count > 2 ) { + # tweak for -{..{{ }}..}- + $rule = $this->rules['{']; + } if ( $count > $rule['max'] ) { # The specified maximum exists in the callback array, unless the caller # has made an error @@ -605,15 +626,17 @@ class Preprocessor_Hash extends Preprocessor { if ( $matchingCount <= 0 ) { # No matching element found in callback array # Output a literal closing brace and continue - self::addLiteral( $accum, str_repeat( $curChar, $count ) ); - $i += $curLen * $count; + $endText = substr( $text, $i, $count ); + self::addLiteral( $accum, $endText ); + $i += $count; continue; } $name = $rule['names'][$matchingCount]; if ( $name === null ) { // No element, just literal text + $endText = substr( $text, $i, $matchingCount ); $element = $piece->breakSyntax( $matchingCount ); - self::addLiteral( $element, str_repeat( $rule['end'], $matchingCount ) ); + self::addLiteral( $element, $endText ); } else { # Create XML element $parts = $piece->parts; @@ -648,7 +671,7 @@ class Preprocessor_Hash extends Preprocessor { } # Advance input pointer - $i += $curLen * $matchingCount; + $i += $matchingCount; # Unwind the stack $stack->pop(); @@ -664,7 +687,12 @@ class Preprocessor_Hash extends Preprocessor { $stack->push( $piece ); $accum =& $stack->getAccum(); } else { - self::addLiteral( $accum, str_repeat( $piece->open, $piece->count ) ); + $s = substr( $piece->open, 0, -1 ); + $s .= str_repeat( + substr( $piece->open, -1 ), + $piece->count - strlen( $s ) + ); + self::addLiteral( $accum, $s ); } } @@ -762,7 +790,12 @@ class PPDStackElement_Hash extends PPDStackElement { if ( $openingCount === false ) { $openingCount = $this->count; } - $accum = [ str_repeat( $this->open, $openingCount ) ]; + $s = substr( $this->open, 0, -1 ); + $s .= str_repeat( + substr( $this->open, -1 ), + $openingCount - strlen( $s ) + ); + $accum = [ $s ]; $lastIndex = 0; $first = true; foreach ( $this->parts as $part ) { diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt index 368dc0d213..54a8c61985 100644 --- a/tests/parser/parserTests.txt +++ b/tests/parser/parserTests.txt @@ -11840,6 +11840,326 @@ parsoid !!end +### +### Preprocessor precedence tests +### See: https://www.mediawiki.org/wiki/Preprocessor_ABNF +### +##{{[[-{{{{{{[[Foo|bar}}]]}-}}}}}]] +!! test +Preprocessor precedence 1: link is rightmost opening +!! wikitext +{{[[Foo|bar}}]] + +But close-brace is not a valid character in a link title: +{{[[Foo}}|bar]] + +However, we can still tell this was handled as a link in the preprocessor: +{{echo|[[Foo}}|bar]]|bat}} +!! html +

{{bar}} +

But close-brace is not a valid character in a link title: +{{[[Foo}}|bar]] +

However, we can still tell this was handled as a link in the preprocessor: +[[Foo}}|bar]] +

+!! end + +!! test +Preprocessor precedence 2: template is rightmost opening +!! options +language=zh +!! wikitext +-{{echo|foo}-}}- +!! html +

-foo}-- +

+!! end + +!! test +Preprocessor precedence 3: language converter is rightmost opening +!! options +language=zh +!! wikitext +{{echo|hi}} + +{{-{R|echo|hi}}}- + +[[-{R|raw]]}- +!! html +

hi +

{{echo|hi}} +

[[raw]] +

+!! end + +!! test +Preprocessor precedence 4: left-most angle bracket +!! options +language=zh +!! wikitext + +!! html +!! end + +!! article +Template:Precedence5 +!! text +{{{{{1}}}}} +!! endarticle + +!! test +Preprocessor precedence 5: tplarg takes precedence over template +!! wikitext +{{Precedence5|Bullet}} +!! html + + +!! end + +!! test +Preprocessor precedence 6: broken link is rightmost opening +!! wikitext +{{echo|[[Foo}} + +{{echo|[[Foo|bar|bat=baz}} +!! html +

{{echo|[[Foo}} +

{{echo|[[Foo|bar|bat=baz}} +

+!! end + +# This next test exposes a difference between PHP and Parsoid: +# Given [[Foo|{{echo|Bar]]x}}y]]z: +# 1) Both PHP and Parsoid ignore the `]]` inside the `echo` in the +# "preprocessor" stage. The `{{echo` extends until the `x}}`, and the +# outer `[[Foo` extends until the `y]]` +# 2a) But then the PHP preprocessor emits `[[Foo|Bar]]xy]]z` as an +# intermediate result (after template expansion), and link processing +# happens on this intermediate result, which moves the wikilink +# boundary leftward to `[[Foo|Bar]]` +# 2b) Parsoid works in a single step, so it's going to keep the +# wikilink as extending to the `y]]` +# 3a) Then PHP does linktrail processing which slurps up the trailing +# `xy` inside the link. +# 3b) Parsoid will do linktrail processing to slurp up the trailing +# `z` inside the link. +# This is "correct" behavior. Parsoid's basic worldview is that the +# `]]` inside the template shouldn't be allowed to leak out to affect +# the surrounding wikilink. PHP may match Parsoid (in the future) +# if you use {{#balance}} (T114445). + +!! test +Preprocessor precedence 7: broken template is rightmost opening +!! wikitext +[[Foo|{{echo|Bar]] + +[[Foo|{{echo|Bar]]-x}}-y]]-z + +Careful: linktrails can move the end of the wikilink: +[[Foo|{{echo|y']]a}}l]]l +!! html +

{{echo|Bar +

Bar-x-y]]-z +

Careful: linktrails can move the end of the wikilink: +y'al]]l +

+!! end + +!! test +Preprocessor precedence 8: broken language converter is rightmost opening +!! options +language=zh +!! wikitext +[[Foo-{R|raw]] +!! html +

[[Foo-{R|raw]] +

+!! end + +!! article +Template:Preprocessor_precedence_9 +!! text +;4: {{{{1}}}} +;5: {{{{{2}}}}} +;6: {{{{{{3}}}}}} +;7: {{{{{{{4}}}}}}} +!! endarticle + +!! test +Preprocessor precedence 9: groups of braces +!! wikitext +{{Preprocessor precedence 9|Four|Bullet|1|2}} +!! html +
4
+
{Four}
+
5
+
+ +
6
+
Four
+
7
+
{Bullet}
+ +!! end + +!! article +Template:Preprocessor_precedence_10 +!! text +;1: -{R|raw}- +;2: -{{Bullet}}- +;3: -{{{1}}}- +;4: -{{{{2}}}}- +;5: -{{{{{3}}}}}- +;6: -{{{{{{4}}}}}}- +;7: -{{{{{{{5}}}}}}}- +!! endarticle + +!! test +Preprocessor precedence 10: groups of braces with leading dash +!! options +language=zh +!! wikitext +{{Preprocessor precedence 10|Three|raw2|Bullet|1|2}} +!! html +
1
+
raw
+
2
+
-
+ +
3
+
-Three-
+
4
+
raw2
+
5
+
-
+ +
6
+
-Three-
+
7
+
raw2
+ +!! end + +!! test +Preprocessor precedence 11: found during visual diff testing +!! wikitext +{{#tag:span|-{{#tag:span|-{{echo|x}}}}}} + +{{echo|-{{echo|-{{echo|x}}}}}} + +{{echo|-{{echo|x}}}} +!! html +

--x +

--x +

-x +

+!! end + +!! test +Preprocessor precedence 12: broken language converter closed by brace. +!! wikitext +This form breaks the template, which is unfortunate: +* {{echo|foo-{bar}bat}} + +But if the broken language converter markup is inside an extension +tag, nothing bad happens: +* foo-{bar}bat +* {{echo|foo-{bar}bat}} +*
foo-{bar}bat
+* {{echo|
foo-{bar}bat
}} + +foo-{bar}bat +{{echo|foo-{bar}bat}} + +!! html+tidy +

This form breaks the template, which is unfortunate:

+ +

But if the broken language converter markup is inside an extension tag, nothing bad happens:

+ +
+'foo-{bar}bat'
+array (
+)
+
+
+'foo-{bar}bat'
+array (
+)
+
+!! end + +!! test +Preprocessor precedence, 13: broken language converter in external link +!! wikitext +* [http://example.com/-{foo Example in URL] +* [http://example.com Example in -{link} description] +* {{echo|[http://example.com/-{foo Breaks template, however]}} +!! html+tidy + +!! end + +!! test +Preprocessor precedence, 14: broken language converter in comment +!! wikitext +* ...should be ok +* ...extra dashes +* {{echo|foobat}} ...should be ok +!! html+tidy + +!! end + +!! test +Preprocessor precedence, 15: broken brace markup in headings +!! wikitext +__NOTOC__ __NOEDITSECTION__ +===1 foo[bar 1=== +1 +===2 foo[[bar 2=== +2 +===3 foo{bar 3=== +3 +===4 foo{{bar 4=== +4 +===5 foo{{{bar 5=== +5 +===6 foo-{bar 6=== +6 +!! html+tidy +

1 foo[bar 1

+

1

+

2 foo[[bar 2

+

2

+

3 foo{bar 3

+

3

+

4 foo{{bar 4

+

4

+

5 foo{{{bar 5

+

5

+

6 foo-{bar 6

+

6

+!! end + ### ### Token Stream Patcher tests ### @@ -20946,6 +21266,28 @@ Raw: -{R|zh:China;zh-tw:Taiwan}-

!! end +!! test +Nested markup inside raw output of variant escape tags (R flag) +!! options +language=zh variant=zh-tw +!! wikitext +Nested raw: -{R|nested -{zh:China;zh-tw:Taiwan}- nested}- +!! html +

Nested raw: nested Taiwan nested +

+!! end + +!! test +Templates inside raw output of variant escape tags (R flag) +!! options +language=zh variant=zh-tw +!! wikitext +Nested raw: -{R|nested {{echo|hi}} templates}- +!! html +

Nested raw: nested hi templates +

+!! end + !! test Strings evaluating false shouldn't be ignored by Language converter (T51072) !! options @@ -21113,12 +21455,10 @@ language=sr variant=sr-ec

!! end -# FIXME: This test is currently broken in the PHP parser T153761 !! test T146304: Don't break template parsing if language converter markup is in the parameter. !! options language=sr variant=sr-ec -disabled !! wikitext {{echo|-{R|foo}-}} !! html/php -- 2.20.1