Merge "Parse wikitext in gallery caption"
[lhc/web/wiklou.git] / includes / parser / Parser.php
index dcb2c89..81e23ad 100644 (file)
@@ -273,25 +273,30 @@ class Parser {
        /** @var SpecialPageFactory */
        private $specialPageFactory;
 
+       /** @var Config */
+       private $siteConfig;
+
        /**
-        * @param array $conf See $wgParserConf documentation
+        * @param array $parserConf See $wgParserConf documentation
         * @param MagicWordFactory|null $magicWordFactory
         * @param Language|null $contLang Content language
         * @param ParserFactory|null $factory
         * @param string|null $urlProtocols As returned from wfUrlProtocols()
         * @param SpecialPageFactory|null $spFactory
+        * @param Config|null $siteConfig Site configuration; falls back to the main config when null
         */
        public function __construct(
-               array $conf = [], MagicWordFactory $magicWordFactory = null, Language $contLang = null,
-               ParserFactory $factory = null, $urlProtocols = null, SpecialPageFactory $spFactory = null
+               array $parserConf = [], MagicWordFactory $magicWordFactory = null,
+               Language $contLang = null, ParserFactory $factory = null, $urlProtocols = null,
+               SpecialPageFactory $spFactory = null, Config $siteConfig = null
        ) {
-               $this->mConf = $conf;
+               $this->mConf = $parserConf;
                $this->mUrlProtocols = $urlProtocols ?? wfUrlProtocols();
                $this->mExtLinkBracketedRegex = '/\[(((?i)' . $this->mUrlProtocols . ')' .
                        self::EXT_LINK_ADDR .
                        self::EXT_LINK_URL_CLASS . '*)\p{Zs}*([^\]\\x00-\\x08\\x0a-\\x1F\\x{FFFD}]*?)\]/Su';
-               if ( isset( $conf['preprocessorClass'] ) ) {
-                       $this->mPreprocessorClass = $conf['preprocessorClass'];
+               if ( isset( $parserConf['preprocessorClass'] ) ) {
+                       $this->mPreprocessorClass = $parserConf['preprocessorClass'];
                } elseif ( wfIsHHVM() ) {
                        # Under HHVM Preprocessor_Hash is much faster than Preprocessor_DOM
                        $this->mPreprocessorClass = Preprocessor_Hash::class;
@@ -314,6 +319,7 @@ class Parser {
 
                $this->factory = $factory ?? $services->getParserFactory();
                $this->specialPageFactory = $spFactory ?? $services->getSpecialPageFactory();
+               $this->siteConfig = $siteConfig ?? MediaWikiServices::getInstance()->getMainConfig();
        }
 
        /**
@@ -542,8 +548,6 @@ class Parser {
         * @return string
         */
        protected function makeLimitReport() {
-               global $wgShowHostnames;
-
                $maxIncludeSize = $this->mOptions->getMaxIncludeSize();
 
                $cpuTime = $this->mOutput->getTimeSinceStart( 'cpu' );
@@ -584,7 +588,7 @@ class Parser {
                Hooks::run( 'ParserLimitReportPrepare', [ $this, $this->mOutput ] );
 
                $limitReport = "NewPP limit report\n";
-               if ( $wgShowHostnames ) {
+               if ( $this->siteConfig->get( 'ShowHostnames' ) ) {
                        $limitReport .= 'Parsed by ' . wfHostname() . "\n";
                }
                $limitReport .= 'Cached time: ' . $this->mOutput->getCacheTime() . "\n";
@@ -635,7 +639,7 @@ class Parser {
                $this->mOutput->setLimitReportData( 'limitreport-timingprofile', $profileReport );
 
                // Add other cache related metadata
-               if ( $wgShowHostnames ) {
+               if ( $this->siteConfig->get( 'ShowHostnames' ) ) {
                        $this->mOutput->setLimitReportData( 'cachereport-origin', wfHostname() );
                }
                $this->mOutput->setLimitReportData( 'cachereport-timestamp',
@@ -1448,6 +1452,8 @@ class Parser {
                } else {
                        # attempt to sanitize at least some nesting problems
                        # (T4702 and quite a few others)
+                       # This code path is buggy and deprecated!
+                       wfDeprecated( 'disabling tidy', '1.33' );
                        $tidyregs = [
                                # ''Something [http://www.cool.com cool''] -->
                                # <i>Something</i><a href="http://www.cool.com"..><i>cool></i></a>
@@ -2022,7 +2028,19 @@ class Parser {
         * @return string
         */
        public static function normalizeLinkUrl( $url ) {
-               # First, make sure unsafe characters are encoded
+               # Test for RFC 3986 IPv6 syntax
+               $scheme = '[a-z][a-z0-9+.-]*:';
+               $userinfo = '(?:[a-z0-9\-._~!$&\'()*+,;=:]|%[0-9a-f]{2})*';
+               $ipv6Host = '\\[((?:[0-9a-f:]|%3[0-A]|%[46][1-6])+)\\]';
+               if ( preg_match( "<^(?:{$scheme})?//(?:{$userinfo}@)?{$ipv6Host}(?:[:/?#].*|)$>i", $url, $m ) &&
+                       IP::isValid( rawurldecode( $m[1] ) )
+               ) {
+                       $isIPv6 = rawurldecode( $m[1] );
+               } else {
+                       $isIPv6 = false;
+               }
+
+               # Make sure unsafe characters are encoded
                $url = preg_replace_callback( '/[\x00-\x20"<>\[\\\\\]^`{|}\x7F-\xFF]/',
                        function ( $m ) {
                                return rawurlencode( $m[0] );
@@ -2054,6 +2072,16 @@ class Parser {
                $ret = self::normalizeUrlComponent(
                        substr( $url, 0, $end ), '"#%<>[\]^`{|}/?' ) . $ret;
 
+               # Fix IPv6 syntax
+               if ( $isIPv6 !== false ) {
+                       $ipv6Host = "%5B({$isIPv6})%5D";
+                       $ret = preg_replace(
+                               "<^((?:{$scheme})?//(?:{$userinfo}@)?){$ipv6Host}(?=[:/?#]|$)>i",
+                               "$1[$2]",
+                               $ret
+                       );
+               }
+
                return $ret;
        }
 
@@ -2153,8 +2181,6 @@ class Parser {
         * @private
         */
        public function replaceInternalLinks2( &$s ) {
-               global $wgExtraInterlanguageLinkPrefixes;
-
                static $tc = false, $e1, $e1_img;
                # the % is needed to support urlencoded titles as well
                if ( !$tc ) {
@@ -2359,7 +2385,7 @@ class Parser {
                                if (
                                        $iw && $this->mOptions->getInterwikiMagic() && $nottalk && (
                                                Language::fetchLanguageName( $iw, null, 'mw' ) ||
-                                               in_array( $iw, $wgExtraInterlanguageLinkPrefixes )
+                                               in_array( $iw, $this->siteConfig->get( 'ExtraInterlanguageLinkPrefixes' ) )
                                        )
                                ) {
                                        # T26502: filter duplicates
@@ -2541,9 +2567,6 @@ class Parser {
         * @return string
         */
        public function getVariableValue( $index, $frame = false ) {
-               global $wgSitename, $wgServer, $wgServerName;
-               global $wgArticlePath, $wgScriptPath, $wgStylePath;
-
                if ( is_null( $this->mTitle ) ) {
                        // If no title set, bad things are going to happen
                        // later. Title should always be set since this
@@ -2845,22 +2868,21 @@ class Parser {
                                $value = SpecialVersion::getVersion();
                                break;
                        case 'articlepath':
-                               return $wgArticlePath;
+                               return $this->siteConfig->get( 'ArticlePath' );
                        case 'sitename':
-                               return $wgSitename;
+                               return $this->siteConfig->get( 'Sitename' );
                        case 'server':
-                               return $wgServer;
+                               return $this->siteConfig->get( 'Server' );
                        case 'servername':
-                               return $wgServerName;
+                               return $this->siteConfig->get( 'ServerName' );
                        case 'scriptpath':
-                               return $wgScriptPath;
+                               return $this->siteConfig->get( 'ScriptPath' );
                        case 'stylepath':
-                               return $wgStylePath;
+                               return $this->siteConfig->get( 'StylePath' );
                        case 'directionmark':
                                return $pageLang->getDirMark();
                        case 'contentlanguage':
-                               global $wgLanguageCode;
-                               return $wgLanguageCode;
+                               return $this->siteConfig->get( 'LanguageCode' );
                        case 'pagelanguage':
                                $value = $pageLang->getCode();
                                break;
@@ -3803,9 +3825,7 @@ class Parser {
         * @return string
         */
        public function interwikiTransclude( $title, $action ) {
-               global $wgEnableScaryTranscluding, $wgTranscludeCacheExpiry;
-
-               if ( !$wgEnableScaryTranscluding ) {
+               if ( !$this->siteConfig->get( 'EnableScaryTranscluding' ) ) {
                        return wfMessage( 'scarytranscludedisabled' )->inContentLanguage()->text();
                }
 
@@ -3825,7 +3845,7 @@ class Parser {
                                ( $wikiId !== false ) ? $wikiId : 'external',
                                sha1( $url )
                        ),
-                       $wgTranscludeCacheExpiry,
+                       $this->siteConfig->get( 'TranscludeCacheExpiry' ),
                        function ( $oldValue, &$ttl ) use ( $url, $fname, $cache ) {
                                $req = MWHttpRequest::factory( $url, [], $fname );
 
@@ -4127,8 +4147,6 @@ class Parser {
         * @private
         */
        public function formatHeadings( $text, $origText, $isMain = true ) {
-               global $wgMaxTocLevel;
-
                # Inhibit editsection links if requested in the page
                if ( isset( $this->mDoubleUnderscores['noeditsection'] ) ) {
                        $maybeShowEditLink = false;
@@ -4199,6 +4217,7 @@ class Parser {
 
                $headlines = $numMatches !== false ? $matches[3] : [];
 
+               $maxTocLevel = $this->siteConfig->get( 'MaxTocLevel' );
                foreach ( $headlines as $headline ) {
                        $isTemplate = false;
                        $titleText = false;
@@ -4221,7 +4240,7 @@ class Parser {
                                # Increase TOC level
                                $toclevel++;
                                $sublevelCount[$toclevel] = 0;
-                               if ( $toclevel < $wgMaxTocLevel ) {
+                               if ( $toclevel < $maxTocLevel ) {
                                        $prevtoclevel = $toclevel;
                                        $toc .= Linker::tocIndent();
                                        $numVisible++;
@@ -4243,8 +4262,8 @@ class Parser {
                                if ( $i == 0 ) {
                                        $toclevel = 1;
                                }
-                               if ( $toclevel < $wgMaxTocLevel ) {
-                                       if ( $prevtoclevel < $wgMaxTocLevel ) {
+                               if ( $toclevel < $maxTocLevel ) {
+                                       if ( $prevtoclevel < $maxTocLevel ) {
                                                # Unindent only if the previous toc level was shown :p
                                                $toc .= Linker::tocUnindent( $prevtoclevel - $toclevel );
                                                $prevtoclevel = $toclevel;
@@ -4254,7 +4273,7 @@ class Parser {
                                }
                        } else {
                                # No change in level, end TOC line
-                               if ( $toclevel < $wgMaxTocLevel ) {
+                               if ( $toclevel < $maxTocLevel ) {
                                        $toc .= Linker::tocLineEnd();
                                }
                        }
@@ -4379,7 +4398,7 @@ class Parser {
                                ) . ' ' . $headline;
                        }
 
-                       if ( $enoughToc && ( !isset( $wgMaxTocLevel ) || $toclevel < $wgMaxTocLevel ) ) {
+                       if ( $enoughToc && ( !isset( $maxTocLevel ) || $toclevel < $maxTocLevel ) ) {
                                $toc .= Linker::tocLine( $linkAnchor, $tocline,
                                        $numbering, $toclevel, ( $isTemplate ? false : $sectionIndex ) );
                        }
@@ -4460,7 +4479,7 @@ class Parser {
                }
 
                if ( $enoughToc ) {
-                       if ( $prevtoclevel > 0 && $prevtoclevel < $wgMaxTocLevel ) {
+                       if ( $prevtoclevel > 0 && $prevtoclevel < $maxTocLevel ) {
                                $toc .= Linker::tocUnindent( $prevtoclevel - 1 );
                        }
                        $toc = Linker::tocList( $toc, $this->mOptions->getUserLangObj() );
@@ -4639,8 +4658,6 @@ class Parser {
         * @return string
         */
        public function getUserSig( &$user, $nickname = false, $fancySig = null ) {
-               global $wgMaxSigChars;
-
                $username = $user->getName();
 
                # If not given, retrieve from the user object.
@@ -4654,7 +4671,7 @@ class Parser {
 
                $nickname = $nickname == null ? $username : $nickname;
 
-               if ( mb_strlen( $nickname ) > $wgMaxSigChars ) {
+               if ( mb_strlen( $nickname ) > $this->siteConfig->get( 'MaxSigChars' ) ) {
                        $nickname = $username;
                        wfDebug( __METHOD__ . ": $username has overlong signature.\n" );
                } elseif ( $fancySig !== false ) {
@@ -5049,9 +5066,10 @@ class Parser {
                        $ig->setShowFilename( false );
                }
                if ( isset( $params['caption'] ) ) {
-                       $caption = $params['caption'];
-                       $caption = htmlspecialchars( $caption );
-                       $caption = $this->replaceInternalLinks( $caption );
+                       // NOTE: We aren't passing a frame here or below.  Frame info
+                       // is currently opaque to Parsoid, which acts on OT_PREPROCESS.
+                       // See T107332#4030581
+                       $caption = $this->recursiveTagParse( $params['caption'] );
                        $ig->setCaptionHtml( $caption );
                }
                if ( isset( $params['perrow'] ) ) {
@@ -5140,7 +5158,7 @@ class Parser {
                                                                $alt = $this->stripAltText( $match, false );
                                                                break;
                                                        case 'gallery-internal-link':
-                                                               $linkValue = strip_tags( $this->replaceLinkHoldersText( $match ) );
+                                                               $linkValue = $this->stripAltText( $match, false );
                                                                if ( preg_match( '/^-{R|(.*)}-$/', $linkValue ) ) {
                                                                        // Result of LanguageConverter::markNoConversion
                                                                        // invoked on an external link.
@@ -5265,6 +5283,8 @@ class Parser {
                #  * bottom
                #  * text-bottom
 
+               global $wgMediaInTargetLanguage;
+
                # Protect LanguageConverter markup when splitting into parts
                $parts = StringUtils::delimiterExplode(
                        '-{', '}-', '|', $options, true /* allow nesting */
@@ -5334,7 +5354,10 @@ class Parser {
                                                                $value = $this->stripAltText( $value, $holders );
                                                                break;
                                                        case 'link':
-                                                               list( $paramName, $value ) = $this->parseLinkParameter( $value );
+                                                               list( $paramName, $value ) =
+                                                                       $this->parseLinkParameter(
+                                                                               $this->stripAltText( $value, $holders )
+                                                                       );
                                                                if ( $paramName ) {
                                                                        $validated = true;
                                                                        if ( $paramName === 'no-link' ) {
@@ -5422,6 +5445,9 @@ class Parser {
                        # Use the "caption" for the tooltip text
                        $params['frame']['title'] = $this->stripAltText( $caption, $holders );
                }
+               if ( $wgMediaInTargetLanguage ) {
+                       $params['handler']['targetlang'] = $this->getTargetLanguage()->getCode();
+               }
 
                Hooks::run( 'ParserMakeImageParams', [ $title, $file, &$params, $this ] );
 
@@ -5499,6 +5525,40 @@ class Parser {
                # that are later expanded to html- so expand them now and
                # remove the tags
                $tooltip = $this->mStripState->unstripBoth( $tooltip );
+               # Compatibility hack!  In HTML certain entity references not terminated
+               # by a semicolon are decoded (but not if we're in an attribute; that's
+               # how link URLs get away without properly escaping & in queries).
+               # But wikitext has always required semicolon-termination of entities,
+               # so encode & where needed to avoid decoding semicolon-less entities.
+               # See T209236 and
+               # https://www.w3.org/TR/html5/syntax.html#named-character-references
+               # T210437 discusses moving this workaround to Sanitizer::stripAllTags.
+               $tooltip = preg_replace( "/
+                       &                       # 1. entity prefix
+                       (?=                     # 2. followed by:
+                       (?:                     #  a. one of the legacy semicolon-less named entities
+                               A(?:Elig|MP|acute|circ|grave|ring|tilde|uml)|
+                               C(?:OPY|cedil)|E(?:TH|acute|circ|grave|uml)|
+                               GT|I(?:acute|circ|grave|uml)|LT|Ntilde|
+                               O(?:acute|circ|grave|slash|tilde|uml)|QUOT|REG|THORN|
+                               U(?:acute|circ|grave|uml)|Yacute|
+                               a(?:acute|c(?:irc|ute)|elig|grave|mp|ring|tilde|uml)|brvbar|
+                               c(?:cedil|edil|urren)|cent(?!erdot;)|copy(?!sr;)|deg|
+                               divide(?!ontimes;)|e(?:acute|circ|grave|th|uml)|
+                               frac(?:1(?:2|4)|34)|
+                               gt(?!c(?:c|ir)|dot|lPar|quest|r(?:a(?:pprox|rr)|dot|eq(?:less|qless)|less|sim);)|
+                               i(?:acute|circ|excl|grave|quest|uml)|laquo|
+                               lt(?!c(?:c|ir)|dot|hree|imes|larr|quest|r(?:Par|i(?:e|f|));)|
+                               m(?:acr|i(?:cro|ddot))|n(?:bsp|tilde)|
+                               not(?!in(?:E|dot|v(?:a|b|c)|)|ni(?:v(?:a|b|c)|);)|
+                               o(?:acute|circ|grave|rd(?:f|m)|slash|tilde|uml)|
+                               p(?:lusmn|ound)|para(?!llel;)|quot|r(?:aquo|eg)|
+                               s(?:ect|hy|up(?:1|2|3)|zlig)|thorn|times(?!b(?:ar|)|d;)|
+                               u(?:acute|circ|grave|ml|uml)|y(?:acute|en|uml)
+                       )
+                       (?:[^;]|$))     #  b. and not followed by a semicolon
+                       # S = study, for efficiency
+                       /Sx", '&amp;', $tooltip );
                $tooltip = Sanitizer::stripAllTags( $tooltip );
 
                return $tooltip;
@@ -5931,9 +5991,9 @@ class Parser {
                return '#' . Sanitizer::escapeIdForLink( $sectionName );
        }
 
-       private static function makeLegacyAnchor( $sectionName ) {
-               global $wgFragmentMode;
-               if ( isset( $wgFragmentMode[1] ) && $wgFragmentMode[1] === 'legacy' ) {
+       private function makeLegacyAnchor( $sectionName ) {
+               $fragmentMode = $this->siteConfig->get( 'FragmentMode' );
+               if ( isset( $fragmentMode[1] ) && $fragmentMode[1] === 'legacy' ) {
                        // ForAttribute() and ForLink() are the same for legacy encoding
                        $id = Sanitizer::escapeIdForAttribute( $sectionName, Sanitizer::ID_FALLBACK );
                } else {
@@ -5971,7 +6031,7 @@ class Parser {
                # Strip out wikitext links(they break the anchor)
                $text = $this->stripSectionName( $text );
                $sectionName = self::getSectionNameFromStrippedText( $text );
-               return self::makeLegacyAnchor( $sectionName );
+               return $this->makeLegacyAnchor( $sectionName );
        }
 
        /**