U+0000 is not allowed in HTML5, there's no reason to allow it in wikitext.
It simplifies our code if we can just strip them at the start. Strip in
PST as well so they don't sneak into our database either.
Tweaked the EXT_LINK URLs to account for the fact that invalid characters
get transformed into U+FFFD when using Preprocessor_DOM. See
73649741ed1e
(r65967) for context on that change.
Bug: T159174
Change-Id: I3f67e92b61aacc87a40c3662085c84d1dac08bfb
# Everything except bracket, space, or control characters
# \p{Zs} is unicode 'separator, space' category. It covers the space 0x20
# as well as U+3000 is IDEOGRAPHIC SPACE for T21052
# Everything except bracket, space, or control characters
# \p{Zs} is unicode 'separator, space' category. It covers the space 0x20
# as well as U+3000 is IDEOGRAPHIC SPACE for T21052
- const EXT_LINK_URL_CLASS = '[^][<>"\\x00-\\x20\\x7F\p{Zs}]';
+ # \x{FFFD} is the Unicode replacement character, which Preprocessor_DOM
+ # uses to replace invalid HTML characters.
+ const EXT_LINK_URL_CLASS = '[^][<>"\\x00-\\x20\\x7F\p{Zs}\x{FFFD}]';
# Simplified expression to match an IPv4 or IPv6 address, or
# at least one character of a host name (embeds EXT_LINK_URL_CLASS)
# Simplified expression to match an IPv4 or IPv6 address, or
# at least one character of a host name (embeds EXT_LINK_URL_CLASS)
- const EXT_LINK_ADDR = '(?:[0-9.]+|\\[(?i:[0-9a-f:.]+)\\]|[^][<>"\\x00-\\x20\\x7F\p{Zs}])';
+ const EXT_LINK_ADDR = '(?:[0-9.]+|\\[(?i:[0-9a-f:.]+)\\]|[^][<>"\\x00-\\x20\\x7F\p{Zs}\x{FFFD}])';
# RegExp to make image URLs (embeds IPv6 part of EXT_LINK_ADDR)
// @codingStandardsIgnoreStart Generic.Files.LineLength
# RegExp to make image URLs (embeds IPv6 part of EXT_LINK_ADDR)
// @codingStandardsIgnoreStart Generic.Files.LineLength
- const EXT_IMAGE_REGEX = '/^(http:\/\/|https:\/\/)((?:\\[(?i:[0-9a-f:.]+)\\])?[^][<>"\\x00-\\x20\\x7F\p{Zs}]+)
+ const EXT_IMAGE_REGEX = '/^(http:\/\/|https:\/\/)((?:\\[(?i:[0-9a-f:.]+)\\])?[^][<>"\\x00-\\x20\\x7F\p{Zs}\x{FFFD}]+)
\\/([A-Za-z0-9_.,~%\\-+&;#*?!=()@\\x80-\\xFF]+)\\.((?i)gif|png|jpg|jpeg)$/Sxu';
// @codingStandardsIgnoreEnd
\\/([A-Za-z0-9_.,~%\\-+&;#*?!=()@\\x80-\\xFF]+)\\.((?i)gif|png|jpg|jpeg)$/Sxu';
// @codingStandardsIgnoreEnd
$this->mUrlProtocols = wfUrlProtocols();
$this->mExtLinkBracketedRegex = '/\[(((?i)' . $this->mUrlProtocols . ')' .
self::EXT_LINK_ADDR .
$this->mUrlProtocols = wfUrlProtocols();
$this->mExtLinkBracketedRegex = '/\[(((?i)' . $this->mUrlProtocols . ')' .
self::EXT_LINK_ADDR .
- self::EXT_LINK_URL_CLASS . '*)\p{Zs}*([^\]\\x00-\\x08\\x0a-\\x1F]*?)\]/Su';
+ self::EXT_LINK_URL_CLASS . '*)\p{Zs}*([^\]\\x00-\\x08\\x0a-\\x1F\\x{FFFD}]*?)\]/Su';
if ( isset( $conf['preprocessorClass'] ) ) {
$this->mPreprocessorClass = $conf['preprocessorClass'];
} elseif ( defined( 'HPHP_VERSION' ) ) {
if ( isset( $conf['preprocessorClass'] ) ) {
$this->mPreprocessorClass = $conf['preprocessorClass'];
} elseif ( defined( 'HPHP_VERSION' ) ) {
$text = strtr( $text, "\x7f", "?" );
$magicScopeVariable = $this->lock();
}
$text = strtr( $text, "\x7f", "?" );
$magicScopeVariable = $this->lock();
}
+ // Strip U+0000 NULL (T159174)
+ $text = str_replace( "\000", '', $text );
$this->startParse( $title, $options, self::OT_HTML, $clearState );
$this->startParse( $title, $options, self::OT_HTML, $clearState );
$this->startParse( $title, $options, self::OT_WIKI, $clearState );
$this->setUser( $user );
$this->startParse( $title, $options, self::OT_WIKI, $clearState );
$this->setUser( $user );
+ // Strip U+0000 NULL (T159174)
+ $text = str_replace( "\000", '', $text );
+
// We still normalize line endings for backwards-compatibility
// with other code that just calls PST, but this should already
// be handled in TextContent subclasses
// We still normalize line endings for backwards-compatibility
// with other code that just calls PST, but this should already
// be handled in TextContent subclasses
$literalBlob = '';
// Guard against delimiter nulls in the input
$literalBlob = '';
// Guard against delimiter nulls in the input
+ // (should never happen: see T159174)
$text = str_replace( "\000", '', $text );
$markupMatches = null;
$text = str_replace( "\000", '', $text );
$markupMatches = null;