Support IPv6 URLs in bracketed and auto links.
authorC. Scott Ananian <cscott@cscott.net>
Thu, 8 Jan 2015 22:00:54 +0000 (17:00 -0500)
committerTim Starling <tstarling@wikimedia.org>
Tue, 18 Aug 2015 22:50:58 +0000 (22:50 +0000)
The corresponding patch for Parsoid is
Ibb33188cdfe2004e469c3f6ee6f30d34d1923283.

Task: T23261
Change-Id: Iff077bf31168b431febb243e2e62f2c6502616bc

includes/Sanitizer.php
includes/parser/Parser.php
tests/parser/parserTests.txt

index 387f24f..30981c3 100644 (file)
@@ -1809,6 +1809,11 @@ class Sanitizer {
 
                        $host = preg_replace( $strip, '', $host );
 
+                       // IPv6 host names are bracketed with [].  Url-decode these.
+                       if ( substr_compare( "//%5B", $host, 0, 5 ) === 0 && preg_match( '!^//%5B(.*?)%5D((:\d+)?)$!', $host, $matches ) ) {
+                               $host = '//[' . $matches[1] . ']' . $matches[2];
+                       }
+
                        // @todo FIXME: Validate hostnames here
 
                        return $protocol . $host . $rest;
index 6189997..2eec08b 100644 (file)
@@ -87,7 +87,11 @@ class Parser {
        # \p{Zs} is unicode 'separator, space' category. It covers the space 0x20
        # as well as U+3000 is IDEOGRAPHIC SPACE for bug 19052
        const EXT_LINK_URL_CLASS = '[^][<>"\\x00-\\x20\\x7F\p{Zs}]';
-       const EXT_IMAGE_REGEX = '/^(http:\/\/|https:\/\/)([^][<>"\\x00-\\x20\\x7F\p{Zs}]+)
+       # Simplified expression to match an IPv4 or IPv6 address, or
+       # at least one character of a host name (embeds EXT_LINK_URL_CLASS)
+       const EXT_LINK_ADDR = '(?:[0-9.]+|\\[(?i:[0-9a-f:.]+)\\]|[^][<>"\\x00-\\x20\\x7F\p{Zs}])';
+       # RegExp to make image URLs (embeds IPv6 part of EXT_LINK_ADDR)
+       const EXT_IMAGE_REGEX = '/^(http:\/\/|https:\/\/)((?:\\[(?i:[0-9a-f:.]+)\\])?[^][<>"\\x00-\\x20\\x7F\p{Zs}]+)
                \\/([A-Za-z0-9_.,~%\\-+&;#*?!=()@\\x80-\\xFF]+)\\.((?i)gif|png|jpg|jpeg)$/Sxu';
 
        # Regular expression for a non-newline space
@@ -254,7 +258,8 @@ class Parser {
                $this->mConf = $conf;
                $this->mUrlProtocols = wfUrlProtocols();
                $this->mExtLinkBracketedRegex = '/\[(((?i)' . $this->mUrlProtocols . ')' .
-                       self::EXT_LINK_URL_CLASS . '+)\p{Zs}*([^\]\\x00-\\x08\\x0a-\\x1F]*?)\]/Su';
+                       self::EXT_LINK_ADDR .
+                       self::EXT_LINK_URL_CLASS . '*)\p{Zs}*([^\]\\x00-\\x08\\x0a-\\x1F]*?)\]/Su';
                if ( isset( $conf['preprocessorClass'] ) ) {
                        $this->mPreprocessorClass = $conf['preprocessorClass'];
                } elseif ( defined( 'HPHP_VERSION' ) ) {
@@ -1378,6 +1383,7 @@ class Parser {
        public function doMagicLinks( $text ) {
                $prots = wfUrlProtocolsWithoutProtRel();
                $urlChar = self::EXT_LINK_URL_CLASS;
+               $addr = self::EXT_LINK_ADDR;
                $space = self::SPACE_NOT_NL; #  non-newline space
                $spdash = "(?:-|$space)"; # a dash or a non-newline space
                $spaces = "$space++"; # possessive match of 1 or more spaces
@@ -1386,7 +1392,7 @@ class Parser {
                                (<a[ \t\r\n>].*?</a>) |      # m[1]: Skip link text
                                (<.*?>) |                    # m[2]: Skip stuff inside
                                                             #       HTML elements' . "
-                               (\b(?i:$prots)($urlChar+)) | # m[3]: Free external links
+                               (\b(?i:$prots)($addr$urlChar*)) | # m[3]: Free external links
                                                             # m[4]: Post-protocol path
                                \b(?:RFC|PMID) $spaces       # m[5]: RFC or PMID, capture number
                                        ([0-9]+)\b |
@@ -1499,14 +1505,14 @@ class Parser {
                        $url = substr( $url, 0, -$numSepChars );
                }
 
-               $url = Sanitizer::cleanUrl( $url );
-
                # Verify that we still have a real URL after trail removal, and
                # not just lone protocol
                if ( strlen( $trail ) >= $numPostProto ) {
                        return $url . $trail;
                }
 
+               $url = Sanitizer::cleanUrl( $url );
+
                # Is this an external image?
                $text = $this->maybeMakeExternalImage( $url );
                if ( $text === false ) {
@@ -5415,9 +5421,10 @@ class Parser {
                                                case 'gallery-internal-link':
                                                        $linkValue = strip_tags( $this->replaceLinkHoldersText( $match ) );
                                                        $chars = self::EXT_LINK_URL_CLASS;
+                                                       $addr = self::EXT_LINK_ADDR;
                                                        $prots = $this->mUrlProtocols;
                                                        //check to see if link matches an absolute url, if not then it must be a wiki link.
-                                                       if ( preg_match( "/^($prots)$chars+$/u", $linkValue ) ) {
+                                                       if ( preg_match( "/^($prots)$addr$chars*$/u", $linkValue ) ) {
                                                                $link = $linkValue;
                                                        } else {
                                                                $localLinkTitle = Title::newFromText( $linkValue );
@@ -5599,13 +5606,14 @@ class Parser {
                                                        break;
                                                case 'link':
                                                        $chars = self::EXT_LINK_URL_CLASS;
+                                                       $addr = self::EXT_LINK_ADDR;
                                                        $prots = $this->mUrlProtocols;
                                                        if ( $value === '' ) {
                                                                $paramName = 'no-link';
                                                                $value = true;
                                                                $validated = true;
                                                        } elseif ( preg_match( "/^((?i)$prots)/", $value ) ) {
-                                                               if ( preg_match( "/^((?i)$prots)$chars+$/u", $value, $m ) ) {
+                                                               if ( preg_match( "/^((?i)$prots)$addr$chars*$/u", $value, $m ) ) {
                                                                        $paramName = 'link-url';
                                                                        $this->mOutput->addExternalLink( $value );
                                                                        if ( $this->mOptions->getExternalLinkTarget() ) {
index ffa435c..f6ca577 100644 (file)
@@ -5341,14 +5341,91 @@ http://example.com/index.php?foozoid&#x5B;&#x5D;=bar
 !! end
 
 !! test
-IPv6 urls (bug 21261)
-!! options
-disabled
+IPv6 urls, autolink format (T23261)
 !! wikitext
 http://[2404:130:0:1000::187:2]/index.php
+
+Examples from RFC2373, section 2.2:
+* http://[1080::8:800:200C:417A]/unicast
+* http://[FF01::101]/multicast
+* http://[::1]/loopback
+* http://[::]/unspecified
+* http://[::13.1.68.3]/ipv4compat
+* http://[::FFFF:129.144.52.38]/ipv4compat
+
+Examples from RFC 2732, section 2:
+* http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:80/index.html
+* http://[1080:0:0:0:8:800:200C:417A]/index.html
+* http://[3ffe:2a00:100:7031::1]
+* http://[1080::8:800:200C:417A]/foo
+* http://[::192.9.5.5]/ipng
+* http://[::FFFF:129.144.52.38]:80/index.html
+* http://[2010:836B:4179::836B:4179]
+
 !! html
 <p><a rel="nofollow" class="external free" href="http://[2404:130:0:1000::187:2]/index.php">http://[2404:130:0:1000::187:2]/index.php</a>
-</p>
+</p><p>Examples from RFC2373, section 2.2:
+</p>
+<ul><li> <a rel="nofollow" class="external free" href="http://[1080::8:800:200C:417A]/unicast">http://[1080::8:800:200C:417A]/unicast</a></li>
+<li> <a rel="nofollow" class="external free" href="http://[FF01::101]/multicast">http://[FF01::101]/multicast</a></li>
+<li> <a rel="nofollow" class="external free" href="http://[::1]/loopback">http://[::1]/loopback</a></li>
+<li> <a rel="nofollow" class="external free" href="http://[::]/unspecified">http://[::]/unspecified</a></li>
+<li> <a rel="nofollow" class="external free" href="http://[::13.1.68.3]/ipv4compat">http://[::13.1.68.3]/ipv4compat</a></li>
+<li> <a rel="nofollow" class="external free" href="http://[::FFFF:129.144.52.38]/ipv4compat">http://[::FFFF:129.144.52.38]/ipv4compat</a></li></ul>
+<p>Examples from <a class="external mw-magiclink-rfc" rel="nofollow" href="//tools.ietf.org/html/rfc2732">RFC 2732</a>, section 2:
+</p>
+<ul><li> <a rel="nofollow" class="external free" href="http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:80/index.html">http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:80/index.html</a></li>
+<li> <a rel="nofollow" class="external free" href="http://[1080:0:0:0:8:800:200C:417A]/index.html">http://[1080:0:0:0:8:800:200C:417A]/index.html</a></li>
+<li> <a rel="nofollow" class="external free" href="http://[3ffe:2a00:100:7031::1]">http://[3ffe:2a00:100:7031::1]</a></li>
+<li> <a rel="nofollow" class="external free" href="http://[1080::8:800:200C:417A]/foo">http://[1080::8:800:200C:417A]/foo</a></li>
+<li> <a rel="nofollow" class="external free" href="http://[::192.9.5.5]/ipng">http://[::192.9.5.5]/ipng</a></li>
+<li> <a rel="nofollow" class="external free" href="http://[::FFFF:129.144.52.38]:80/index.html">http://[::FFFF:129.144.52.38]:80/index.html</a></li>
+<li> <a rel="nofollow" class="external free" href="http://[2010:836B:4179::836B:4179]">http://[2010:836B:4179::836B:4179]</a></li></ul>
+
+!! end
+
+!! test
+IPv6 urls, bracketed format (T23261)
+!! wikitext
+[http://[2404:130:0:1000::187:2]/index.php test]
+
+Examples from RFC2373, section 2.2:
+* [http://[1080::8:800:200C:417A] unicast]
+* [http://[FF01::101] multicast]
+* [http://[::1]/ loopback]
+* [http://[::] unspecified]
+* [http://[::13.1.68.3] ipv4compat]
+* [http://[::FFFF:129.144.52.38] ipv4compat]
+
+Examples from RFC 2732, section 2:
+* [http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:80/index.html 1]
+* [http://[1080:0:0:0:8:800:200C:417A]/index.html 2]
+* [http://[3ffe:2a00:100:7031::1] 3]
+* [http://[1080::8:800:200C:417A]/foo 4]
+* [http://[::192.9.5.5]/ipng 5]
+* [http://[::FFFF:129.144.52.38]:80/index.html 6]
+* [http://[2010:836B:4179::836B:4179] 7]
+
+!! html
+<p><a rel="nofollow" class="external text" href="http://[2404:130:0:1000::187:2]/index.php">test</a>
+</p><p>Examples from RFC2373, section 2.2:
+</p>
+<ul><li> <a rel="nofollow" class="external text" href="http://[1080::8:800:200C:417A]">unicast</a></li>
+<li> <a rel="nofollow" class="external text" href="http://[FF01::101]">multicast</a></li>
+<li> <a rel="nofollow" class="external text" href="http://[::1]/">loopback</a></li>
+<li> <a rel="nofollow" class="external text" href="http://[::]">unspecified</a></li>
+<li> <a rel="nofollow" class="external text" href="http://[::13.1.68.3]">ipv4compat</a></li>
+<li> <a rel="nofollow" class="external text" href="http://[::FFFF:129.144.52.38]">ipv4compat</a></li></ul>
+<p>Examples from <a class="external mw-magiclink-rfc" rel="nofollow" href="//tools.ietf.org/html/rfc2732">RFC 2732</a>, section 2:
+</p>
+<ul><li> <a rel="nofollow" class="external text" href="http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:80/index.html">1</a></li>
+<li> <a rel="nofollow" class="external text" href="http://[1080:0:0:0:8:800:200C:417A]/index.html">2</a></li>
+<li> <a rel="nofollow" class="external text" href="http://[3ffe:2a00:100:7031::1]">3</a></li>
+<li> <a rel="nofollow" class="external text" href="http://[1080::8:800:200C:417A]/foo">4</a></li>
+<li> <a rel="nofollow" class="external text" href="http://[::192.9.5.5]/ipng">5</a></li>
+<li> <a rel="nofollow" class="external text" href="http://[::FFFF:129.144.52.38]:80/index.html">6</a></li>
+<li> <a rel="nofollow" class="external text" href="http://[2010:836B:4179::836B:4179]">7</a></li></ul>
+
 !! end
 
 !! test