Use "break" instead of "continue"
[lhc/web/wiklou.git] / languages / LanguageConverter.php
index 00bc02d..24b1d0c 100644 (file)
@@ -39,6 +39,7 @@ class LanguageConverter {
         */
        static public $languagesWithVariants = [
                'en',
+               'crh',
                'gan',
                'iu',
                'kk',
@@ -162,6 +163,8 @@ class LanguageConverter {
 
                $req = $this->getURLVariant();
 
+               Hooks::run( 'GetLangPreferredVariant', [ &$req ] );
+
                if ( $wgUser->isSafeToLoad() && $wgUser->isLoggedIn() && !$req ) {
                        $req = $this->getUserVariant();
                } elseif ( !$req ) {
@@ -353,7 +356,6 @@ class LanguageConverter {
                if ( $this->guessVariant( $text, $toVariant ) ) {
                        return $text;
                }
-
                /* we convert everything except:
                   1. HTML markups (anything between < and >)
                   2. HTML entities
@@ -376,9 +378,12 @@ class LanguageConverter {
                $scriptfix = '<script[^>]*+>[^<]*+(?:(?:(?!<\/script>).)[^<]*+)*+<\/script>|';
                // disable conversion of <pre> tags
                $prefix = '<pre[^>]*+>[^<]*+(?:(?:(?!<\/pre>).)[^<]*+)*+<\/pre>|';
+               // The "|.*+)" at the end, is in case we missed some part of html syntax,
+               // we will fail securely (hopefully) by matching the rest of the string.
+               $htmlFullTag = '<(?:[^>=]*+(?>[^>=]*+=\s*+(?:"[^"]*"|\'[^\']*\'|[^\'">\s]*+))*+[^>=]*+>|.*+)|';
 
-               $reg = '/' . $codefix . $scriptfix . $prefix .
-                       '<[^>]++>|&[a-zA-Z#][a-z0-9]++;' . $marker . $htmlfix . '|\004$/s';
+               $reg = '/' . $codefix . $scriptfix . $prefix . $htmlFullTag .
+                       '&[a-zA-Z#][a-z0-9]++;' . $marker . $htmlfix . '|\004$/s';
                $startPos = 0;
                $sourceBlob = '';
                $literalBlob = '';
@@ -386,6 +391,7 @@ class LanguageConverter {
                // Guard against delimiter nulls in the input
                // (should never happen: see T159174)
                $text = str_replace( "\000", '', $text );
+               $text = str_replace( "\004", '', $text );
 
                $markupMatches = null;
                $elementMatches = null;
@@ -400,6 +406,13 @@ class LanguageConverter {
                                        // We hit the end.
                                        $elementPos = strlen( $text );
                                        $element = '';
+                               } elseif ( substr( $element, -1 ) === "\004" ) {
+                                       // This can sometimes happen if we have
+                                       // unclosed html tags (For example
+                                       // when converting a title attribute
+                                       // during a recursive call that contains
+                                       // a &lt; e.g. <div title="&lt;">.
+                                       $element = substr( $element, 0, -1 );
                                }
                        } else {
                                // If we hit here, then Language Converter could be tricked
@@ -409,11 +422,11 @@ class LanguageConverter {
                                $log = LoggerFactory::getInstance( 'languageconverter' );
                                $log->error( "Hit pcre.backtrack_limit in " . __METHOD__
                                        . ". Disabling language conversion for this page.",
-                                       array(
+                                       [
                                                "method" => __METHOD__,
                                                "variant" => $toVariant,
                                                "startOfText" => substr( $text, 0, 500 )
-                                       )
+                                       ]
                                );
                                return $text;
                        }
@@ -427,7 +440,14 @@ class LanguageConverter {
                        if ( $element !== ''
                                && preg_match( '/^(<[^>\s]*+)\s([^>]*+)(.*+)$/', $element, $elementMatches )
                        ) {
+                               // FIXME, this decodes entities, so if you have something
+                               // like <div title="foo&lt;bar"> the bar won't get
+                               // translated since after entity decoding it looks like
+                               // unclosed html and we call this method recursively
+                               // on attributes.
                                $attrs = Sanitizer::decodeTagAttributes( $elementMatches[2] );
+                               // Ensure self-closing tags stay self-closing.
+                               $close = substr( $elementMatches[2], -1 ) === '/' ? ' /' : '';
                                $changed = false;
                                foreach ( [ 'title', 'alt' ] as $attrName ) {
                                        if ( !isset( $attrs[$attrName] ) ) {
@@ -446,7 +466,7 @@ class LanguageConverter {
                                }
                                if ( $changed ) {
                                        $element = $elementMatches[1] . Html::expandAttributes( $attrs ) .
-                                               $elementMatches[3];
+                                               $close . $elementMatches[3];
                                }
                        }
                        $literalBlob .= $element . "\000";
@@ -658,29 +678,42 @@ class LanguageConverter {
                $out = '';
                $length = strlen( $text );
                $shouldConvert = !$this->guessVariant( $text, $variant );
-
-               while ( $startPos < $length ) {
-                       $pos = strpos( $text, '-{', $startPos );
-
-                       if ( $pos === false ) {
+               $continue = 1;
+
+               $noScript = '<script.*?>.*?<\/script>(*SKIP)(*FAIL)';
+               $noStyle = '<style.*?>.*?<\/style>(*SKIP)(*FAIL)';
+               // phpcs:ignore Generic.Files.LineLength
+               $noHtml = '<(?:[^>=]*+(?>[^>=]*+=\s*+(?:"[^"]*"|\'[^\']*\'|[^\'">\s]*+))*+[^>=]*+>|.*+)(*SKIP)(*FAIL)';
+               while ( $startPos < $length && $continue ) {
+                       $continue = preg_match(
+                               // Only match -{ outside of html.
+                               "/$noScript|$noStyle|$noHtml|-\{/",
+                               $text,
+                               $m,
+                               PREG_OFFSET_CAPTURE,
+                               $startPos
+                       );
+
+                       if ( !$continue ) {
                                // No more markup, append final segment
                                $fragment = substr( $text, $startPos );
                                $out .= $shouldConvert ? $this->autoConvert( $fragment, $variant ) : $fragment;
                                return $out;
                        }
 
-                       // Markup found
+                       // Offset of the match of the regex pattern.
+                       $pos = $m[0][1];
+
                        // Append initial segment
                        $fragment = substr( $text, $startPos, $pos - $startPos );
                        $out .= $shouldConvert ? $this->autoConvert( $fragment, $variant ) : $fragment;
-
-                       // Advance position
+                       // -{ marker found, not in attribute
+                       // Advance position up to -{ marker.
                        $startPos = $pos;
-
                        // Do recursive conversion
+                       // Note: This passes $startPos by reference, and advances it.
                        $out .= $this->recursiveConvertRule( $text, $variant, $startPos, $depth + 1 );
                }
-
                return $out;
        }
 
@@ -737,7 +770,7 @@ class LanguageConverter {
                                                        $warningDone = true;
                                                }
                                                $startPos += 2;
-                                               continue;
+                                               break;
                                        }
                                        // Recursively parse another rule
                                        $inner .= $this->recursiveConvertRule( $text, $variant, $startPos, $depth + 1 );