Merge "Parse wikitext in gallery caption"
[lhc/web/wiklou.git] / includes / parser / Parser.php
index 3509200..81e23ad 100644 (file)
@@ -1452,6 +1452,8 @@ class Parser {
                } else {
                        # attempt to sanitize at least some nesting problems
                        # (T4702 and quite a few others)
+                       # This code path is buggy and deprecated!
+                       wfDeprecated( 'disabling tidy', '1.33' );
                        $tidyregs = [
                                # ''Something [http://www.cool.com cool''] -->
                                # <i>Something</i><a href="http://www.cool.com"..><i>cool></i></a>
@@ -2026,7 +2028,19 @@ class Parser {
         * @return string
         */
        public static function normalizeLinkUrl( $url ) {
-               # First, make sure unsafe characters are encoded
+               # Test for RFC 3986 IPv6 syntax
+               $scheme = '[a-z][a-z0-9+.-]*:';
+               $userinfo = '(?:[a-z0-9\-._~!$&\'()*+,;=:]|%[0-9a-f]{2})*';
+               $ipv6Host = '\\[((?:[0-9a-f:]|%3[0-A]|%[46][1-6])+)\\]';
+               if ( preg_match( "<^(?:{$scheme})?//(?:{$userinfo}@)?{$ipv6Host}(?:[:/?#].*|)$>i", $url, $m ) &&
+                       IP::isValid( rawurldecode( $m[1] ) )
+               ) {
+                       $isIPv6 = rawurldecode( $m[1] );
+               } else {
+                       $isIPv6 = false;
+               }
+
+               # Make sure unsafe characters are encoded
                $url = preg_replace_callback( '/[\x00-\x20"<>\[\\\\\]^`{|}\x7F-\xFF]/',
                        function ( $m ) {
                                return rawurlencode( $m[0] );
@@ -2058,6 +2072,16 @@ class Parser {
                $ret = self::normalizeUrlComponent(
                        substr( $url, 0, $end ), '"#%<>[\]^`{|}/?' ) . $ret;
 
+               # Fix IPv6 syntax
+               if ( $isIPv6 !== false ) {
+                       $ipv6Host = "%5B({$isIPv6})%5D";
+                       $ret = preg_replace(
+                               "<^((?:{$scheme})?//(?:{$userinfo}@)?){$ipv6Host}(?=[:/?#]|$)>i",
+                               "$1[$2]",
+                               $ret
+                       );
+               }
+
                return $ret;
        }
 
@@ -5042,9 +5066,10 @@ class Parser {
                        $ig->setShowFilename( false );
                }
                if ( isset( $params['caption'] ) ) {
-                       $caption = $params['caption'];
-                       $caption = htmlspecialchars( $caption );
-                       $caption = $this->replaceInternalLinks( $caption );
+                       // NOTE: We aren't passing a frame here or below.  Frame info
+                       // is currently opaque to Parsoid, which acts on OT_PREPROCESS.
+                       // See T107332#4030581
+                       $caption = $this->recursiveTagParse( $params['caption'] );
                        $ig->setCaptionHtml( $caption );
                }
                if ( isset( $params['perrow'] ) ) {
@@ -5133,7 +5158,7 @@ class Parser {
                                                                $alt = $this->stripAltText( $match, false );
                                                                break;
                                                        case 'gallery-internal-link':
-                                                               $linkValue = strip_tags( $this->replaceLinkHoldersText( $match ) );
+                                                               $linkValue = $this->stripAltText( $match, false );
                                                                if ( preg_match( '/^-{R|(.*)}-$/', $linkValue ) ) {
                                                                        // Result of LanguageConverter::markNoConversion
                                                                        // invoked on an external link.
@@ -5258,6 +5283,8 @@ class Parser {
                #  * bottom
                #  * text-bottom
 
+               global $wgMediaInTargetLanguage;
+
                # Protect LanguageConverter markup when splitting into parts
                $parts = StringUtils::delimiterExplode(
                        '-{', '}-', '|', $options, true /* allow nesting */
@@ -5327,7 +5354,10 @@ class Parser {
                                                                $value = $this->stripAltText( $value, $holders );
                                                                break;
                                                        case 'link':
-                                                               list( $paramName, $value ) = $this->parseLinkParameter( $value );
+                                                               list( $paramName, $value ) =
+                                                                       $this->parseLinkParameter(
+                                                                               $this->stripAltText( $value, $holders )
+                                                                       );
                                                                if ( $paramName ) {
                                                                        $validated = true;
                                                                        if ( $paramName === 'no-link' ) {
@@ -5415,6 +5445,9 @@ class Parser {
                        # Use the "caption" for the tooltip text
                        $params['frame']['title'] = $this->stripAltText( $caption, $holders );
                }
+               if ( $wgMediaInTargetLanguage ) {
+                       $params['handler']['targetlang'] = $this->getTargetLanguage()->getCode();
+               }
 
                Hooks::run( 'ParserMakeImageParams', [ $title, $file, &$params, $this ] );
 
@@ -5492,6 +5525,40 @@ class Parser {
                # that are later expanded to html- so expand them now and
                # remove the tags
                $tooltip = $this->mStripState->unstripBoth( $tooltip );
+               # Compatibility hack!  In HTML certain entity references not terminated
+               # by a semicolon are decoded (but not if we're in an attribute; that's
+               # how link URLs get away without properly escaping & in queries).
+               # But wikitext has always required semicolon-termination of entities,
+               # so encode & where needed to avoid decode of semicolon-less entities.
+               # See T209236 and
+               # https://www.w3.org/TR/html5/syntax.html#named-character-references
+               # T210437 discusses moving this workaround to Sanitizer::stripAllTags.
+               $tooltip = preg_replace( "/
+                       &                       # 1. entity prefix
+                       (?=                     # 2. followed by:
+                       (?:                     #  a. one of the legacy semicolon-less named entities
+                               A(?:Elig|MP|acute|circ|grave|ring|tilde|uml)|
+                               C(?:OPY|cedil)|E(?:TH|acute|circ|grave|uml)|
+                               GT|I(?:acute|circ|grave|uml)|LT|Ntilde|
+                               O(?:acute|circ|grave|slash|tilde|uml)|QUOT|REG|THORN|
+                               U(?:acute|circ|grave|uml)|Yacute|
+                               a(?:acute|c(?:irc|ute)|elig|grave|mp|ring|tilde|uml)|brvbar|
+                               c(?:cedil|edil|urren)|cent(?!erdot;)|copy(?!sr;)|deg|
+                               divide(?!ontimes;)|e(?:acute|circ|grave|th|uml)|
+                               frac(?:1(?:2|4)|34)|
+                               gt(?!c(?:c|ir)|dot|lPar|quest|r(?:a(?:pprox|rr)|dot|eq(?:less|qless)|less|sim);)|
+                               i(?:acute|circ|excl|grave|quest|uml)|laquo|
+                               lt(?!c(?:c|ir)|dot|hree|imes|larr|quest|r(?:Par|i(?:e|f|));)|
+                               m(?:acr|i(?:cro|ddot))|n(?:bsp|tilde)|
+                               not(?!in(?:E|dot|v(?:a|b|c)|)|ni(?:v(?:a|b|c)|);)|
+                               o(?:acute|circ|grave|rd(?:f|m)|slash|tilde|uml)|
+                               p(?:lusmn|ound)|para(?!llel;)|quot|r(?:aquo|eg)|
+                               s(?:ect|hy|up(?:1|2|3)|zlig)|thorn|times(?!b(?:ar|)|d;)|
+                               u(?:acute|circ|grave|ml|uml)|y(?:acute|en|uml)
+                       )
+                       (?:[^;]|$))     #  b. and not followed by a semicolon
+                       # S = study, for efficiency
+                       /Sx", '&amp;', $tooltip );
                $tooltip = Sanitizer::stripAllTags( $tooltip );
 
                return $tooltip;