From 3e32d21210362b9a050862e28b75a11a52b6021e Mon Sep 17 00:00:00 2001 From: "C. Scott Ananian" Date: Mon, 27 Feb 2017 16:27:15 -0500 Subject: [PATCH] Strip U+0000 in wikitext U+0000 is not allowed in HTML5, there's no reason to allow it in wikitext. It simplifies our code if we can just strip them at the start. Strip in PST as well so they don't sneak into our database either. Tweaked the EXT_LINK URLs to account for the fact that invalid characters get transformed into U+FFFD when using Preprocessor_DOM. See 73649741ed1e (r65967) for context on that change. Bug: T159174 Change-Id: I3f67e92b61aacc87a40c3662085c84d1dac08bfb --- includes/parser/Parser.php | 15 +++++++++++---- languages/LanguageConverter.php | 1 + tests/parser/extraParserTests.txt | Bin 1243 -> 1994 bytes 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php index 9a9b9d8f19..8db1fe3794 100644 --- a/includes/parser/Parser.php +++ b/includes/parser/Parser.php @@ -89,13 +89,15 @@ class Parser { # Everything except bracket, space, or control characters # \p{Zs} is unicode 'separator, space' category. It covers the space 0x20 # as well as U+3000 is IDEOGRAPHIC SPACE for T21052 - const EXT_LINK_URL_CLASS = '[^][<>"\\x00-\\x20\\x7F\p{Zs}]'; + # \x{FFFD} is the Unicode replacement character, which Preprocessor_DOM + # uses to replace invalid HTML characters. + const EXT_LINK_URL_CLASS = '[^][<>"\\x00-\\x20\\x7F\p{Zs}\x{FFFD}]'; # Simplified expression to match an IPv4 or IPv6 address, or # at least one character of a host name (embeds EXT_LINK_URL_CLASS) - const EXT_LINK_ADDR = '(?:[0-9.]+|\\[(?i:[0-9a-f:.]+)\\]|[^][<>"\\x00-\\x20\\x7F\p{Zs}])'; + const EXT_LINK_ADDR = '(?:[0-9.]+|\\[(?i:[0-9a-f:.]+)\\]|[^][<>"\\x00-\\x20\\x7F\p{Zs}\x{FFFD}])'; # RegExp to make image URLs (embeds IPv6 part of EXT_LINK_ADDR) // @codingStandardsIgnoreStart Generic.Files.LineLength - const EXT_IMAGE_REGEX = '/^(http:\/\/|https:\/\/)((?:\\[(?i:[0-9a-f:.]+)\\])?[^][<>"\\x00-\\x20\\x7F\p{Zs}]+) + const EXT_IMAGE_REGEX = '/^(http:\/\/|https:\/\/)((?:\\[(?i:[0-9a-f:.]+)\\])?[^][<>"\\x00-\\x20\\x7F\p{Zs}\x{FFFD}]+) \\/([A-Za-z0-9_.,~%\\-+&;#*?!=()@\\x80-\\xFF]+)\\.((?i)gif|png|jpg|jpeg)$/Sxu'; // @codingStandardsIgnoreEnd @@ -264,7 +266,7 @@ class Parser { $this->mUrlProtocols = wfUrlProtocols(); $this->mExtLinkBracketedRegex = '/\[(((?i)' . $this->mUrlProtocols . ')' . self::EXT_LINK_ADDR . - self::EXT_LINK_URL_CLASS . '*)\p{Zs}*([^\]\\x00-\\x08\\x0a-\\x1F]*?)\]/Su'; + self::EXT_LINK_URL_CLASS . '*)\p{Zs}*([^\]\\x00-\\x08\\x0a-\\x1F\\x{FFFD}]*?)\]/Su'; if ( isset( $conf['preprocessorClass'] ) ) { $this->mPreprocessorClass = $conf['preprocessorClass']; } elseif ( defined( 'HPHP_VERSION' ) ) { @@ -417,6 +419,8 @@ class Parser { $text = strtr( $text, "\x7f", "?" ); $magicScopeVariable = $this->lock(); } + // Strip U+0000 NULL (T159174) + $text = str_replace( "\000", '', $text ); $this->startParse( $title, $options, self::OT_HTML, $clearState ); @@ -4463,6 +4467,9 @@ class Parser { $this->startParse( $title, $options, self::OT_WIKI, $clearState ); $this->setUser( $user ); + // Strip U+0000 NULL (T159174) + $text = str_replace( "\000", '', $text ); + // We still normalize line endings for backwards-compatibility // with other code that just calls PST, but this should already // be handled in TextContent subclasses diff --git a/languages/LanguageConverter.php b/languages/LanguageConverter.php index 361a9a785d..6a426e5671 100644 --- a/languages/LanguageConverter.php +++ b/languages/LanguageConverter.php @@ -380,6 +380,7 @@ class LanguageConverter { $literalBlob = ''; // Guard against delimiter nulls in the input + // (should never happen: see T159174) $text = str_replace( "\000", '', $text ); $markupMatches = null; diff --git a/tests/parser/extraParserTests.txt b/tests/parser/extraParserTests.txt index a48087e25d49c6a92025de4e102ff76c8ab1ef5b..50d1bc9e7a6e2854be9fb018ebcdcf1ca9d6c452 100644 GIT binary patch delta 274 zcmcc3d5V97H6yozMsZ0|W