Fix phpcs issues from LanguageConverter patches

[lhc/web/wiklou.git] / languages / LanguageConverter.php
diff --git a/languages/LanguageConverter.php b/languages/LanguageConverter.php

index 67c0ca7..8e98abd 100644 (file)
--- a/languages/LanguageConverter.php
+++ b/languages/LanguageConverter.php
@@ -20,6 +20,8 @@
   */
  use MediaWiki\MediaWikiServices;
  
+use MediaWiki\Logger\LoggerFactory;
+
  /**
   * Base class for language conversion.
   * @ingroup Language
@@ -353,24 +355,33 @@ class LanguageConverter {
                 }
  
                 /* we convert everything except:
-                * 1. HTML markups (anything between < and >)
-                * 2. HTML entities
-                * 3. placeholders created by the parser
-                */
-               $marker = '|' . Parser::MARKER_PREFIX . '[\-a-zA-Z0-9]+';
+                  1. HTML markups (anything between < and >)
+                  2. HTML entities
+                  3. placeholders created by the parser
+                  IMPORTANT: Beware of failure from pcre.backtrack_limit (T124404).
+                  Minimize use of backtracking where possible.
+               */
+               $marker = '|' . Parser::MARKER_PREFIX . '[^\x7f]++\x7f';
  
                 // this one is needed when the text is inside an HTML markup
-               $htmlfix = '|<[^>]+$|^[^<>]*>';
+               $htmlfix = '|<[^>\004]++(?=\004$)|^[^<>]*+>';
+
+               // Optimize for the common case where these tags have
+               // few or no children. Thus try and possesively get as much as
+               // possible, and only engage in backtracking when we hit a '<'.
  
                 // disable convert to variants between <code> tags
-               $codefix = '<code>.+?<\/code>|';
+               $codefix = '<code>[^<]*+(?:(?:(?!<\/code>).)[^<]*+)*+<\/code>|';
                 // disable conversion of <script> tags
-               $scriptfix = '<script.*?>.*?<\/script>|';
+               $scriptfix = '<script[^>]*+>[^<]*+(?:(?:(?!<\/script>).)[^<]*+)*+<\/script>|';
                 // disable conversion of <pre> tags
-               $prefix = '<pre.*?>.*?<\/pre>|';
+               $prefix = '<pre[^>]*+>[^<]*+(?:(?:(?!<\/pre>).)[^<]*+)*+<\/pre>|';
+               // The "|.*+)" at the end, is in case we missed some part of html syntax,
+               // we will fail securely (hopefully) by matching the rest of the string.
+               $htmlFullTag = '<(?:[^>=]*+(?>[^>=]*+=\s*+(?:"[^"]*"|\'[^\']*\'|[^\'">\s]*+))*+[^>=]*+>|.*+)|';
  
-               $reg = '/' . $codefix . $scriptfix . $prefix .
-                       '<[^>]+>|&[a-zA-Z#][a-z0-9]+;' . $marker . $htmlfix . '/s';
+               $reg = '/' . $codefix . $scriptfix . $prefix . $htmlFullTag .
+                       '&[a-zA-Z#][a-z0-9]++;' . $marker . $htmlfix . '|\004$/s';
                 $startPos = 0;
                 $sourceBlob = '';
                 $literalBlob = '';
@@ -381,15 +392,34 @@ class LanguageConverter {
  
                 $markupMatches = null;
                 $elementMatches = null;
+
+               // We add a marker (\004) at the end of text, to ensure we always match the
+               // entire text (Otherwise, pcre.backtrack_limit might cause silent failure)
                 while ( $startPos < strlen( $text ) ) {
-                       if ( preg_match( $reg, $text, $markupMatches, PREG_OFFSET_CAPTURE, $startPos ) ) {
+                       if ( preg_match( $reg, $text . "\004", $markupMatches, PREG_OFFSET_CAPTURE, $startPos ) ) {
                                 $elementPos = $markupMatches[0][1];
                                 $element = $markupMatches[0][0];
+                               if ( $element === "\004" ) {
+                                       // We hit the end.
+                                       $elementPos = strlen( $text );
+                                       $element = '';
+                               }
                         } else {
-                               $elementPos = strlen( $text );
-                               $element = '';
+                               // If we hit here, then Language Converter could be tricked
+                               // into doing an XSS, so we refuse to translate.
+                               // If non-crazy input manages to reach this code path,
+                               // we should consider it a bug.
+                               $log = LoggerFactory::getInstance( 'languageconverter' );
+                               $log->error( "Hit pcre.backtrack_limit in " . __METHOD__
+                                       . ". Disabling language conversion for this page.",
+                                       [
+                                               "method" => __METHOD__,
+                                               "variant" => $toVariant,
+                                               "startOfText" => substr( $text, 0, 500 )
+                                       ]
+                               );
+                               return $text;
                         }
-
                         // Queue the part before the markup for translation in a batch
                         $sourceBlob .= substr( $text, $startPos, $elementPos - $startPos ) . "\000";
  
@@ -398,7 +428,7 @@ class LanguageConverter {
  
                         // Translate any alt or title attributes inside the matched element
                         if ( $element !== ''
-                               && preg_match( '/^(<[^>\s]*)\s([^>]*)(.*)$/', $element, $elementMatches )
+                               && preg_match( '/^(<[^>\s]*+)\s([^>]*+)(.*+)$/', $element, $elementMatches )
                         ) {
                                 $attrs = Sanitizer::decodeTagAttributes( $elementMatches[2] );
                                 $changed = false;
@@ -631,29 +661,43 @@ class LanguageConverter {
                 $out = '';
                 $length = strlen( $text );
                 $shouldConvert = !$this->guessVariant( $text, $variant );
-
-               while ( $startPos < $length ) {
-                       $pos = strpos( $text, '-{', $startPos );
-
-                       if ( $pos === false ) {
+               $continue = 1;
+
+               $noScript = '<script.*?>.*?<\/script>(*SKIP)(*FAIL)';
+               $noStyle = '<style.*?>.*?<\/style>(*SKIP)(*FAIL)';
+               // @codingStandardsIgnoreStart Generic.Files.LineLength.TooLong
+               $noHtml = '<(?:[^>=]*+(?>[^>=]*+=\s*+(?:"[^"]*"|\'[^\']*\'|[^\'">\s]*+))*+[^>=]*+>|.*+)(*SKIP)(*FAIL)';
+               // @codingStandardsIgnoreEnd
+               while ( $startPos < $length && $continue ) {
+                       $continue = preg_match(
+                               // Only match -{ outside of html.
+                               "/$noScript|$noStyle|$noHtml|-\{/",
+                               $text,
+                               $m,
+                               PREG_OFFSET_CAPTURE,
+                               $startPos
+                       );
+
+                       if ( !$continue ) {
                                 // No more markup, append final segment
                                 $fragment = substr( $text, $startPos );
                                 $out .= $shouldConvert ? $this->autoConvert( $fragment, $variant ) : $fragment;
                                 return $out;
                         }
  
-                       // Markup found
+                       // Offset of the match of the regex pattern.
+                       $pos = $m[0][1];
+
                         // Append initial segment
                         $fragment = substr( $text, $startPos, $pos - $startPos );
                         $out .= $shouldConvert ? $this->autoConvert( $fragment, $variant ) : $fragment;
-
-                       // Advance position
+                       // -{ marker found, not in attribute
+                       // Advance position up to -{ marker.
                         $startPos = $pos;
-
                         // Do recursive conversion
+                       // Note: This passes $startPos by reference, and advances it.
                         $out .= $this->recursiveConvertRule( $text, $variant, $startPos, $depth + 1 );
                 }
-
                 return $out;
         }