Fix langauge converter parser test with self-close tags
authorBrian Wolff <bawolff+wn@gmail.com>
Wed, 15 Nov 2017 05:34:10 +0000 (05:34 +0000)
committerBrian Wolff <bawolff+wn@gmail.com>
Wed, 15 Nov 2017 06:03:22 +0000 (06:03 +0000)
This fixes an issue in f21f3942 where if there was an html
element with an alt or title attribute containing an &lt;
entity, an ascii EOT control character (0x04) may become
inserted into the text if language converter was enabled.

Due to a really old bug in language converter, self-closed tags
got turned into non-self closed tags. However due a different
bug which was fixed in f21f3942 this code path was rarely taken
so nobody noticed until now.

Follow-up Idbc45cac12

Bug: T180552
Change-Id: I077d30c50fcb419837fef937d27caca307153d2d

languages/LanguageConverter.php
tests/parser/parserTests.txt

index 8e98abd..1d78402 100644 (file)
@@ -353,7 +353,6 @@ class LanguageConverter {
                if ( $this->guessVariant( $text, $toVariant ) ) {
                        return $text;
                }
-
                /* we convert everything except:
                   1. HTML markups (anything between < and >)
                   2. HTML entities
@@ -389,6 +388,7 @@ class LanguageConverter {
                // Guard against delimiter nulls in the input
                // (should never happen: see T159174)
                $text = str_replace( "\000", '', $text );
+               $text = str_replace( "\004", '', $text );
 
                $markupMatches = null;
                $elementMatches = null;
@@ -403,6 +403,13 @@ class LanguageConverter {
                                        // We hit the end.
                                        $elementPos = strlen( $text );
                                        $element = '';
+                               } elseif( substr( $element, -1 ) === "\004" ) {
+                                       // This can sometimes happen if we have
+                                       // unclosed html tags (For example
+                                       // when converting a title attribute
+                                       // during a recursive call that contains
+                                       // a &lt; e.g. <div title="&lt;">.
+                                       $element = substr( $element, 0, -1 );
                                }
                        } else {
                                // If we hit here, then Language Converter could be tricked
@@ -430,7 +437,14 @@ class LanguageConverter {
                        if ( $element !== ''
                                && preg_match( '/^(<[^>\s]*+)\s([^>]*+)(.*+)$/', $element, $elementMatches )
                        ) {
+                               // FIXME, this decodes entities, so if you have something
+                               // like <div title="foo&lt;bar"> the bar won't get
+                               // translated since after entity decoding it looks like
+                               // unclosed html and we call this method recursively
+                               // on attributes.
                                $attrs = Sanitizer::decodeTagAttributes( $elementMatches[2] );
+                               // Ensure self-closing tags stay self-closing.
+                               $close = substr( $elementMatches[2], -1 ) === '/' ? ' /' : '';
                                $changed = false;
                                foreach ( [ 'title', 'alt' ] as $attrName ) {
                                        if ( !isset( $attrs[$attrName] ) ) {
@@ -449,7 +463,7 @@ class LanguageConverter {
                                }
                                if ( $changed ) {
                                        $element = $elementMatches[1] . Html::expandAttributes( $attrs ) .
-                                               $elementMatches[3];
+                                               $close . $elementMatches[3];
                                }
                        }
                        $literalBlob .= $element . "\000";
index 7e678d8..56879ff 100644 (file)
@@ -18515,7 +18515,7 @@ language=sr variant=sr-el
 [[File:Foobar.jpg|alt=-{}-foAjrjvi-{}-]]
 !! html
 <p>
-</p><p><a href="/wiki/%D0%94%D0%B0%D1%82%D0%BE%D1%82%D0%B5%D0%BA%D0%B0:Foobar.jpg" class="image"><img alt="&quot; onload=&quot;alert(1)&quot; data-foo=&quot;" src="http://example.com/images/3/3a/Foobar.jpg" width="1941" height="220"></a>
+</p><p><a href="/wiki/%D0%94%D0%B0%D1%82%D0%BE%D1%82%D0%B5%D0%BA%D0%B0:Foobar.jpg" class="image"><img alt="&quot; onload=&quot;alert(1)&quot; data-foo=&quot;" src="http://example.com/images/3/3a/Foobar.jpg" width="1941" height="220" /></a>
 </p>
 !! end