Merge "Parse wikitext in gallery caption"

[lhc/web/wiklou.git] / includes / parser / Parser.php
diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php

index 3509200..81e23ad 100644 (file)
--- a/includes/parser/Parser.php
+++ b/includes/parser/Parser.php
@@ -1452,6 +1452,8 @@ class Parser {
                 } else {
                         # attempt to sanitize at least some nesting problems
                         # (T4702 and quite a few others)
+                       # This code path is buggy and deprecated!
+                       wfDeprecated( 'disabling tidy', '1.33' );
                         $tidyregs = [
                                 # ''Something [http://www.cool.com cool''] -->
                                 # <i>Something</i><a href="http://www.cool.com"..><i>cool></i></a>
@@ -2026,7 +2028,19 @@ class Parser {
          * @return string
          */
         public static function normalizeLinkUrl( $url ) {
-               # First, make sure unsafe characters are encoded
+               # Test for RFC 3986 IPv6 syntax
+               $scheme = '[a-z][a-z0-9+.-]*:';
+               $userinfo = '(?:[a-z0-9\-._~!$&\'()*+,;=:]|%[0-9a-f]{2})*';
+               $ipv6Host = '\\[((?:[0-9a-f:]|%3[0-A]|%[46][1-6])+)\\]';
+               if ( preg_match( "<^(?:{$scheme})?//(?:{$userinfo}@)?{$ipv6Host}(?:[:/?#].*|)$>i", $url, $m ) &&
+                       IP::isValid( rawurldecode( $m[1] ) )
+               ) {
+                       $isIPv6 = rawurldecode( $m[1] );
+               } else {
+                       $isIPv6 = false;
+               }
+
+               # Make sure unsafe characters are encoded
                 $url = preg_replace_callback( '/[\x00-\x20"<>\[\\\\\]^`{|}\x7F-\xFF]/',
                         function ( $m ) {
                                 return rawurlencode( $m[0] );
@@ -2058,6 +2072,16 @@ class Parser {
                 $ret = self::normalizeUrlComponent(
                         substr( $url, 0, $end ), '"#%<>[\]^`{|}/?' ) . $ret;
  
+               # Fix IPv6 syntax
+               if ( $isIPv6 !== false ) {
+                       $ipv6Host = "%5B({$isIPv6})%5D";
+                       $ret = preg_replace(
+                               "<^((?:{$scheme})?//(?:{$userinfo}@)?){$ipv6Host}(?=[:/?#]|$)>i",
+                               "$1[$2]",
+                               $ret
+                       );
+               }
+
                 return $ret;
         }
  
@@ -5042,9 +5066,10 @@ class Parser {
                         $ig->setShowFilename( false );
                 }
                 if ( isset( $params['caption'] ) ) {
-                       $caption = $params['caption'];
-                       $caption = htmlspecialchars( $caption );
-                       $caption = $this->replaceInternalLinks( $caption );
+                       // NOTE: We aren't passing a frame here or below.  Frame info
+                       // is currently opaque to Parsoid, which acts on OT_PREPROCESS.
+                       // See T107332#4030581
+                       $caption = $this->recursiveTagParse( $params['caption'] );
                         $ig->setCaptionHtml( $caption );
                 }
                 if ( isset( $params['perrow'] ) ) {
@@ -5133,7 +5158,7 @@ class Parser {
                                                                 $alt = $this->stripAltText( $match, false );
                                                                 break;
                                                         case 'gallery-internal-link':
-                                                               $linkValue = strip_tags( $this->replaceLinkHoldersText( $match ) );
+                                                               $linkValue = $this->stripAltText( $match, false );
                                                                 if ( preg_match( '/^-{R|(.*)}-$/', $linkValue ) ) {
                                                                         // Result of LanguageConverter::markNoConversion
                                                                         // invoked on an external link.
@@ -5258,6 +5283,8 @@ class Parser {
                 #  * bottom
                 #  * text-bottom
  
+               global $wgMediaInTargetLanguage;
+
                 # Protect LanguageConverter markup when splitting into parts
                 $parts = StringUtils::delimiterExplode(
                         '-{', '}-', '|', $options, true /* allow nesting */
@@ -5327,7 +5354,10 @@ class Parser {
                                                                 $value = $this->stripAltText( $value, $holders );
                                                                 break;
                                                         case 'link':
-                                                               list( $paramName, $value ) = $this->parseLinkParameter( $value );
+                                                               list( $paramName, $value ) =
+                                                                       $this->parseLinkParameter(
+                                                                               $this->stripAltText( $value, $holders )
+                                                                       );
                                                                 if ( $paramName ) {
                                                                         $validated = true;
                                                                         if ( $paramName === 'no-link' ) {
@@ -5415,6 +5445,9 @@ class Parser {
                         # Use the "caption" for the tooltip text
                         $params['frame']['title'] = $this->stripAltText( $caption, $holders );
                 }
+               if ( $wgMediaInTargetLanguage ) {
+                       $params['handler']['targetlang'] = $this->getTargetLanguage()->getCode();
+               }
  
                 Hooks::run( 'ParserMakeImageParams', [ $title, $file, &$params, $this ] );
  
@@ -5492,6 +5525,40 @@ class Parser {
                 # that are later expanded to html- so expand them now and
                 # remove the tags
                 $tooltip = $this->mStripState->unstripBoth( $tooltip );
+               # Compatibility hack!  In HTML certain entity references not terminated
+               # by a semicolon are decoded (but not if we're in an attribute; that's
+               # how link URLs get away without properly escaping & in queries).
+               # But wikitext has always required semicolon-termination of entities,
+               # so encode & where needed to avoid decoding semicolon-less entities.
+               # See T209236 and
+               # https://www.w3.org/TR/html5/syntax.html#named-character-references
+               # T210437 discusses moving this workaround to Sanitizer::stripAllTags.
+               $tooltip = preg_replace( "/
+                       &                       # 1. entity prefix
+                       (?=                     # 2. followed by:
+                       (?:                     #  a. one of the legacy semicolon-less named entities
+                               A(?:Elig|MP|acute|circ|grave|ring|tilde|uml)|
+                               C(?:OPY|cedil)|E(?:TH|acute|circ|grave|uml)|
+                               GT|I(?:acute|circ|grave|uml)|LT|Ntilde|
+                               O(?:acute|circ|grave|slash|tilde|uml)|QUOT|REG|THORN|
+                               U(?:acute|circ|grave|uml)|Yacute|
+                               a(?:acute|c(?:irc|ute)|elig|grave|mp|ring|tilde|uml)|brvbar|
+                               c(?:cedil|edil|urren)|cent(?!erdot;)|copy(?!sr;)|deg|
+                               divide(?!ontimes;)|e(?:acute|circ|grave|th|uml)|
+                               frac(?:1(?:2|4)|34)|
+                               gt(?!c(?:c|ir)|dot|lPar|quest|r(?:a(?:pprox|rr)|dot|eq(?:less|qless)|less|sim);)|
+                               i(?:acute|circ|excl|grave|quest|uml)|laquo|
+                               lt(?!c(?:c|ir)|dot|hree|imes|larr|quest|r(?:Par|i(?:e|f|));)|
+                               m(?:acr|i(?:cro|ddot))|n(?:bsp|tilde)|
+                               not(?!in(?:E|dot|v(?:a|b|c)|)|ni(?:v(?:a|b|c)|);)|
+                               o(?:acute|circ|grave|rd(?:f|m)|slash|tilde|uml)|
+                               p(?:lusmn|ound)|para(?!llel;)|quot|r(?:aquo|eg)|
+                               s(?:ect|hy|up(?:1|2|3)|zlig)|thorn|times(?!b(?:ar|)|d;)|
+                               u(?:acute|circ|grave|ml|uml)|y(?:acute|en|uml)
+                       )
+                       (?:[^;]|$))     #  b. and not followed by a semicolon
+                       # S = study, for efficiency
+                       /Sx", '&amp;', $tooltip );
                 $tooltip = Sanitizer::stripAllTags( $tooltip );
  
                 return $tooltip;