X-Git-Url: https://git.heureux-cyclage.org/?p=lhc%2Fweb%2Fwiklou.git;a=blobdiff_plain;f=languages%2FLanguageConverter.php;h=24b1d0c07662d8ac7a94ba962b5c50fe1de6b8ab;hp=00bc02db4582e597a5f37f91ee6c9f11a422587b;hb=942728ab20e01bba200031b01eb606ee59279bd2;hpb=fbe78cfa094645b907d0fd2885c5797321f794eb diff --git a/languages/LanguageConverter.php b/languages/LanguageConverter.php index 00bc02db45..24b1d0c076 100644 --- a/languages/LanguageConverter.php +++ b/languages/LanguageConverter.php @@ -39,6 +39,7 @@ class LanguageConverter { */ static public $languagesWithVariants = [ 'en', + 'crh', 'gan', 'iu', 'kk', @@ -162,6 +163,8 @@ class LanguageConverter { $req = $this->getURLVariant(); + Hooks::run( 'GetLangPreferredVariant', [ &$req ] ); + if ( $wgUser->isSafeToLoad() && $wgUser->isLoggedIn() && !$req ) { $req = $this->getUserVariant(); } elseif ( !$req ) { @@ -353,7 +356,6 @@ class LanguageConverter { if ( $this->guessVariant( $text, $toVariant ) ) { return $text; } - /* we convert everything except: 1. HTML markups (anything between < and >) 2. HTML entities @@ -376,9 +378,12 @@ class LanguageConverter { $scriptfix = ']*+>[^<]*+(?:(?:(?!<\/script>).)[^<]*+)*+<\/script>|'; // disable conversion of
 <pre> tags
 		$prefix = ']*+>[^<]*+(?:(?:(?!<\/pre>).)[^<]*+)*+<\/pre>|';
+		// The "|.*+)" at the end is a fallback in case we missed some part of HTML
+		// syntax: we (hopefully) fail securely by matching the rest of the string.
+		$htmlFullTag = '<(?:[^>=]*+(?>[^>=]*+=\s*+(?:"[^"]*"|\'[^\']*\'|[^\'">\s]*+))*+[^>=]*+>|.*+)|';
 
-		$reg = '/' . $codefix . $scriptfix . $prefix .
-			'<[^>]++>|&[a-zA-Z#][a-z0-9]++;' . $marker . $htmlfix . '|\004$/s';
+		$reg = '/' . $codefix . $scriptfix . $prefix . $htmlFullTag .
+			'&[a-zA-Z#][a-z0-9]++;' . $marker . $htmlfix . '|\004$/s';
 		$startPos = 0;
 		$sourceBlob = '';
 		$literalBlob = '';
@@ -386,6 +391,7 @@ class LanguageConverter {
 		// Guard against delimiter nulls in the input
 		// (should never happen: see T159174)
 		$text = str_replace( "\000", '', $text );
+		$text = str_replace( "\004", '', $text );
 
 		$markupMatches = null;
 		$elementMatches = null;
@@ -400,6 +406,13 @@ class LanguageConverter {
 					// We hit the end.
 					$elementPos = strlen( $text );
 					$element = '';
+				} elseif ( substr( $element, -1 ) === "\004" ) {
+					// This can sometimes happen if we have
+					// unclosed HTML tags (for example,
+					// when converting a title attribute
+					// during a recursive call that contains
+					// a &lt; e.g. <div title="&lt;">)
. + $element = substr( $element, 0, -1 ); } } else { // If we hit here, then Language Converter could be tricked @@ -409,11 +422,11 @@ class LanguageConverter { $log = LoggerFactory::getInstance( 'languageconverter' ); $log->error( "Hit pcre.backtrack_limit in " . __METHOD__ . ". Disabling language conversion for this page.", - array( + [ "method" => __METHOD__, "variant" => $toVariant, "startOfText" => substr( $text, 0, 500 ) - ) + ] ); return $text; } @@ -427,7 +440,14 @@ class LanguageConverter { if ( $element !== '' && preg_match( '/^(<[^>\s]*+)\s([^>]*+)(.*+)$/', $element, $elementMatches ) ) { + // FIXME, this decodes entities, so if you have something + // like
the bar won't get + // translated since after entity decoding it looks like + // unclosed html and we call this method recursively + // on attributes. $attrs = Sanitizer::decodeTagAttributes( $elementMatches[2] ); + // Ensure self-closing tags stay self-closing. + $close = substr( $elementMatches[2], -1 ) === '/' ? ' /' : ''; $changed = false; foreach ( [ 'title', 'alt' ] as $attrName ) { if ( !isset( $attrs[$attrName] ) ) { @@ -446,7 +466,7 @@ class LanguageConverter { } if ( $changed ) { $element = $elementMatches[1] . Html::expandAttributes( $attrs ) . - $elementMatches[3]; + $close . $elementMatches[3]; } } $literalBlob .= $element . "\000"; @@ -658,29 +678,42 @@ class LanguageConverter { $out = ''; $length = strlen( $text ); $shouldConvert = !$this->guessVariant( $text, $variant ); - - while ( $startPos < $length ) { - $pos = strpos( $text, '-{', $startPos ); - - if ( $pos === false ) { + $continue = 1; + + $noScript = '.*?<\/script>(*SKIP)(*FAIL)'; + $noStyle = '.*?<\/style>(*SKIP)(*FAIL)'; + // phpcs:ignore Generic.Files.LineLength + $noHtml = '<(?:[^>=]*+(?>[^>=]*+=\s*+(?:"[^"]*"|\'[^\']*\'|[^\'">\s]*+))*+[^>=]*+>|.*+)(*SKIP)(*FAIL)'; + while ( $startPos < $length && $continue ) { + $continue = preg_match( + // Only match -{ outside of html. + "/$noScript|$noStyle|$noHtml|-\{/", + $text, + $m, + PREG_OFFSET_CAPTURE, + $startPos + ); + + if ( !$continue ) { // No more markup, append final segment $fragment = substr( $text, $startPos ); $out .= $shouldConvert ? $this->autoConvert( $fragment, $variant ) : $fragment; return $out; } - // Markup found + // Offset of the match of the regex pattern. + $pos = $m[0][1]; + // Append initial segment $fragment = substr( $text, $startPos, $pos - $startPos ); $out .= $shouldConvert ? $this->autoConvert( $fragment, $variant ) : $fragment; - - // Advance position + // -{ marker found, not in attribute + // Advance position up to -{ marker. 
$startPos = $pos; - // Do recursive conversion + // Note: This passes $startPos by reference, and advances it. $out .= $this->recursiveConvertRule( $text, $variant, $startPos, $depth + 1 ); } - return $out; } @@ -737,7 +770,7 @@ class LanguageConverter { $warningDone = true; } $startPos += 2; - continue; + break; } // Recursively parse another rule $inner .= $this->recursiveConvertRule( $text, $variant, $startPos, $depth + 1 );