X-Git-Url: https://git.heureux-cyclage.org/?p=lhc%2Fweb%2Fwiklou.git;a=blobdiff_plain;f=includes%2Fparser%2FParser.php;h=8db1fe3794c10357ef5e20a7e47fa8575de2f32d;hp=86aa06a18efe24a6b3bbf05d22332bfdd5e87a22;hb=49748181dd56ec97e7ba7c13e684a16abceb3cc0;hpb=8d4f21cff8a49c63d11d8b931c5b94386d2525dc diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php index 86aa06a18e..8db1fe3794 100644 --- a/includes/parser/Parser.php +++ b/includes/parser/Parser.php @@ -89,13 +89,15 @@ class Parser { # Everything except bracket, space, or control characters # \p{Zs} is unicode 'separator, space' category. It covers the space 0x20 # as well as U+3000 is IDEOGRAPHIC SPACE for T21052 - const EXT_LINK_URL_CLASS = '[^][<>"\\x00-\\x20\\x7F\p{Zs}]'; + # \x{FFFD} is the Unicode replacement character, which Preprocessor_DOM + # uses to replace invalid HTML characters. + const EXT_LINK_URL_CLASS = '[^][<>"\\x00-\\x20\\x7F\p{Zs}\x{FFFD}]'; # Simplified expression to match an IPv4 or IPv6 address, or # at least one character of a host name (embeds EXT_LINK_URL_CLASS) - const EXT_LINK_ADDR = '(?:[0-9.]+|\\[(?i:[0-9a-f:.]+)\\]|[^][<>"\\x00-\\x20\\x7F\p{Zs}])'; + const EXT_LINK_ADDR = '(?:[0-9.]+|\\[(?i:[0-9a-f:.]+)\\]|[^][<>"\\x00-\\x20\\x7F\p{Zs}\x{FFFD}])'; # RegExp to make image URLs (embeds IPv6 part of EXT_LINK_ADDR) // @codingStandardsIgnoreStart Generic.Files.LineLength - const EXT_IMAGE_REGEX = '/^(http:\/\/|https:\/\/)((?:\\[(?i:[0-9a-f:.]+)\\])?[^][<>"\\x00-\\x20\\x7F\p{Zs}]+) + const EXT_IMAGE_REGEX = '/^(http:\/\/|https:\/\/)((?:\\[(?i:[0-9a-f:.]+)\\])?[^][<>"\\x00-\\x20\\x7F\p{Zs}\x{FFFD}]+) \\/([A-Za-z0-9_.,~%\\-+&;#*?!=()@\\x80-\\xFF]+)\\.((?i)gif|png|jpg|jpeg)$/Sxu'; // @codingStandardsIgnoreEnd @@ -264,7 +266,7 @@ class Parser { $this->mUrlProtocols = wfUrlProtocols(); $this->mExtLinkBracketedRegex = '/\[(((?i)' . $this->mUrlProtocols . ')' . self::EXT_LINK_ADDR . - self::EXT_LINK_URL_CLASS . '*)\p{Zs}*([^\]\\x00-\\x08\\x0a-\\x1F]*?)\]/Su'; + self::EXT_LINK_URL_CLASS . '*)\p{Zs}*([^\]\\x00-\\x08\\x0a-\\x1F\\x{FFFD}]*?)\]/Su'; if ( isset( $conf['preprocessorClass'] ) ) { $this->mPreprocessorClass = $conf['preprocessorClass']; } elseif ( defined( 'HPHP_VERSION' ) ) { @@ -417,6 +419,8 @@ class Parser { $text = strtr( $text, "\x7f", "?" ); $magicScopeVariable = $this->lock(); } + // Strip U+0000 NULL (T159174) + $text = str_replace( "\000", '', $text ); $this->startParse( $title, $options, self::OT_HTML, $clearState ); @@ -1450,15 +1454,16 @@ class Parser { $spdash = "(?:-|$space)"; # a dash or a non-newline space $spaces = "$space++"; # possessive match of 1 or more spaces $text = preg_replace_callback( - '!(?: # Start cases - (].*?) | # m[1]: Skip link text - (<.*?>) | # m[2]: Skip stuff inside - # HTML elements' . " - (\b(?i:$prots)($addr$urlChar*)) | # m[3]: Free external links - # m[4]: Post-protocol path - \b(?:RFC|PMID) $spaces # m[5]: RFC or PMID, capture number + '!(?: # Start cases + (].*?) | # m[1]: Skip link text + (<.*?>) | # m[2]: Skip stuff inside HTML elements' . " + (\b # m[3]: Free external links + (?i:$prots) + ($addr$urlChar*) # m[4]: Post-protocol path + ) | + \b(?:RFC|PMID) $spaces # m[5]: RFC or PMID, capture number ([0-9]+)\b | - \bISBN $spaces ( # m[6]: ISBN, capture number + \bISBN $spaces ( # m[6]: ISBN, capture number (?: 97[89] $spdash? )? # optional 13-digit ISBN prefix (?: [0-9] $spdash? ){9} # 9 digits with opt. delimiters [0-9Xx] # check digit @@ -4462,6 +4467,9 @@ class Parser { $this->startParse( $title, $options, self::OT_WIKI, $clearState ); $this->setUser( $user ); + // Strip U+0000 NULL (T159174) + $text = str_replace( "\000", '', $text ); + // We still normalize line endings for backwards-compatibility // with other code that just calls PST, but this should already // be handled in TextContent subclasses