*/
use MediaWiki\MediaWikiServices;
+use MediaWiki\Logger\LoggerFactory;
+
/**
* Base class for language conversion.
* @ingroup Language
*/
static public $languagesWithVariants = [
'en',
+ 'crh',
'gan',
'iu',
'kk',
if ( $this->guessVariant( $text, $toVariant ) ) {
return $text;
}
-
/* we convert everything except:
- * 1. HTML markups (anything between < and >)
- * 2. HTML entities
- * 3. placeholders created by the parser
- */
- $marker = '|' . Parser::MARKER_PREFIX . '[\-a-zA-Z0-9]+';
+ 1. HTML markups (anything between < and >)
+ 2. HTML entities
+ 3. placeholders created by the parser
+ IMPORTANT: Beware of failure from pcre.backtrack_limit (T124404).
+ Minimize use of backtracking where possible.
+ */
+ $marker = '|' . Parser::MARKER_PREFIX . '[^\x7f]++\x7f';
// this one is needed when the text is inside an HTML markup
- $htmlfix = '|<[^>]+$|^[^<>]*>';
+ $htmlfix = '|<[^>\004]++(?=\004$)|^[^<>]*+>';
+
+ // Optimize for the common case where these tags have
+ // few or no children. Thus try and possesively get as much as
+ // possible, and only engage in backtracking when we hit a '<'.
// disable convert to variants between <code> tags
- $codefix = '<code>.+?<\/code>|';
+ $codefix = '<code>[^<]*+(?:(?:(?!<\/code>).)[^<]*+)*+<\/code>|';
// disable conversion of <script> tags
- $scriptfix = '<script.*?>.*?<\/script>|';
+ $scriptfix = '<script[^>]*+>[^<]*+(?:(?:(?!<\/script>).)[^<]*+)*+<\/script>|';
// disable conversion of <pre> tags
- $prefix = '<pre.*?>.*?<\/pre>|';
+ $prefix = '<pre[^>]*+>[^<]*+(?:(?:(?!<\/pre>).)[^<]*+)*+<\/pre>|';
+ // The "|.*+)" at the end, is in case we missed some part of html syntax,
+ // we will fail securely (hopefully) by matching the rest of the string.
+ $htmlFullTag = '<(?:[^>=]*+(?>[^>=]*+=\s*+(?:"[^"]*"|\'[^\']*\'|[^\'">\s]*+))*+[^>=]*+>|.*+)|';
- $reg = '/' . $codefix . $scriptfix . $prefix .
- '<[^>]+>|&[a-zA-Z#][a-z0-9]+;' . $marker . $htmlfix . '/s';
+ $reg = '/' . $codefix . $scriptfix . $prefix . $htmlFullTag .
+ '&[a-zA-Z#][a-z0-9]++;' . $marker . $htmlfix . '|\004$/s';
$startPos = 0;
$sourceBlob = '';
$literalBlob = '';
// Guard against delimiter nulls in the input
// (should never happen: see T159174)
$text = str_replace( "\000", '', $text );
+ $text = str_replace( "\004", '', $text );
$markupMatches = null;
$elementMatches = null;
+
+ // We add a marker (\004) at the end of text, to ensure we always match the
+ // entire text (Otherwise, pcre.backtrack_limit might cause silent failure)
while ( $startPos < strlen( $text ) ) {
- if ( preg_match( $reg, $text, $markupMatches, PREG_OFFSET_CAPTURE, $startPos ) ) {
+ if ( preg_match( $reg, $text . "\004", $markupMatches, PREG_OFFSET_CAPTURE, $startPos ) ) {
$elementPos = $markupMatches[0][1];
$element = $markupMatches[0][0];
+ if ( $element === "\004" ) {
+ // We hit the end.
+ $elementPos = strlen( $text );
+ $element = '';
+ } elseif ( substr( $element, -1 ) === "\004" ) {
+ // This can sometimes happen if we have
+ // unclosed html tags (For example
+ // when converting a title attribute
+ // during a recursive call that contains
+ // a < e.g. <div title="<">.
+ $element = substr( $element, 0, -1 );
+ }
} else {
- $elementPos = strlen( $text );
- $element = '';
+ // If we hit here, then Language Converter could be tricked
+ // into doing an XSS, so we refuse to translate.
+ // If non-crazy input manages to reach this code path,
+ // we should consider it a bug.
+ $log = LoggerFactory::getInstance( 'languageconverter' );
+ $log->error( "Hit pcre.backtrack_limit in " . __METHOD__
+ . ". Disabling language conversion for this page.",
+ [
+ "method" => __METHOD__,
+ "variant" => $toVariant,
+ "startOfText" => substr( $text, 0, 500 )
+ ]
+ );
+ return $text;
}
-
// Queue the part before the markup for translation in a batch
$sourceBlob .= substr( $text, $startPos, $elementPos - $startPos ) . "\000";
// Translate any alt or title attributes inside the matched element
if ( $element !== ''
- && preg_match( '/^(<[^>\s]*)\s([^>]*)(.*)$/', $element, $elementMatches )
+ && preg_match( '/^(<[^>\s]*+)\s([^>]*+)(.*+)$/', $element, $elementMatches )
) {
+ // FIXME, this decodes entities, so if you have something
+ // like <div title="foo<bar"> the bar won't get
+ // translated since after entity decoding it looks like
+ // unclosed html and we call this method recursively
+ // on attributes.
$attrs = Sanitizer::decodeTagAttributes( $elementMatches[2] );
+ // Ensure self-closing tags stay self-closing.
+ $close = substr( $elementMatches[2], -1 ) === '/' ? ' /' : '';
$changed = false;
foreach ( [ 'title', 'alt' ] as $attrName ) {
if ( !isset( $attrs[$attrName] ) ) {
}
if ( $changed ) {
$element = $elementMatches[1] . Html::expandAttributes( $attrs ) .
- $elementMatches[3];
+ $close . $elementMatches[3];
}
}
$literalBlob .= $element . "\000";
$out = '';
$length = strlen( $text );
$shouldConvert = !$this->guessVariant( $text, $variant );
-
- while ( $startPos < $length ) {
- $pos = strpos( $text, '-{', $startPos );
-
- if ( $pos === false ) {
+ $continue = 1;
+
+ $noScript = '<script.*?>.*?<\/script>(*SKIP)(*FAIL)';
+ $noStyle = '<style.*?>.*?<\/style>(*SKIP)(*FAIL)';
+ // @codingStandardsIgnoreStart Generic.Files.LineLength.TooLong
+ $noHtml = '<(?:[^>=]*+(?>[^>=]*+=\s*+(?:"[^"]*"|\'[^\']*\'|[^\'">\s]*+))*+[^>=]*+>|.*+)(*SKIP)(*FAIL)';
+ // @codingStandardsIgnoreEnd
+ while ( $startPos < $length && $continue ) {
+ $continue = preg_match(
+ // Only match -{ outside of html.
+ "/$noScript|$noStyle|$noHtml|-\{/",
+ $text,
+ $m,
+ PREG_OFFSET_CAPTURE,
+ $startPos
+ );
+
+ if ( !$continue ) {
// No more markup, append final segment
$fragment = substr( $text, $startPos );
$out .= $shouldConvert ? $this->autoConvert( $fragment, $variant ) : $fragment;
return $out;
}
- // Markup found
+ // Offset of the match of the regex pattern.
+ $pos = $m[0][1];
+
// Append initial segment
$fragment = substr( $text, $startPos, $pos - $startPos );
$out .= $shouldConvert ? $this->autoConvert( $fragment, $variant ) : $fragment;
-
- // Advance position
+ // -{ marker found, not in attribute
+ // Advance position up to -{ marker.
$startPos = $pos;
-
// Do recursive conversion
+ // Note: This passes $startPos by reference, and advances it.
$out .= $this->recursiveConvertRule( $text, $variant, $startPos, $depth + 1 );
}
-
return $out;
}
*
* @param string $text Text to be converted
* @param string $variant The target variant code
- * @param int $startPos
+ * @param int &$startPos
* @param int $depth Depth of recursion
*
* @throws MWException