Improve/rename Parser::replaceUnusualEscapes
authorBrad Jorsch <bjorsch@wikimedia.org>
Sat, 21 Dec 2013 02:14:48 +0000 (21:14 -0500)
committerTim Starling <tstarling@wikimedia.org>
Tue, 16 Sep 2014 23:00:16 +0000 (23:00 +0000)
The previous implementation would unescape '&', '=', '+', and '%'. The
first three will break the URL when unescaped in the query string, and
the last will break when unescaped anywhere.

The code is now changed to treat the path, query, and fragment parts of
the URL separately when unescaping. We also escape any unsafe characters
and ensure all percent-encodings use uppercase hexits.

And since the old name is no longer accurate,
Parser::replaceUnusualEscapes is deprecated in favor of
Parser::normalizeLinkUrl.

Bug: 57909
Change-Id: I77dc308d0d016c395ad737c08cf10a7711e25bbd

RELEASE-NOTES-1.24
includes/parser/Parser.php
tests/phpunit/includes/parser/ParserMethodsTest.php

index 20aaad6..0cd1131 100644 (file)
@@ -216,6 +216,8 @@ production.
 * (bug 69789) Title::getContentModel() now loads from the database when
   necessary instead of incorrectly returning the default content model.
 * (bug 69249) wfBaseConvert() now works around PHP Bug #50175 when using GMP.
+* (bug 57909) URLs in the externallinks table will no longer have certain
+  characters decoded in the query string.
 
 === Action API changes in 1.24 ===
 * action=parse API now supports prop=modules, which provides the list of
index 8bf400a..84bb224 100644 (file)
@@ -1402,7 +1402,7 @@ class Parser {
                                $this->getExternalLinkAttribs( $url ) );
                        # Register it in the output object...
                        # Replace unnecessary URL escape codes with their equivalent characters
-                       $pasteurized = self::replaceUnusualEscapes( $url );
+                       $pasteurized = self::normalizeLinkUrl( $url );
                        $this->mOutput->addExternalLink( $pasteurized );
                }
                wfProfileOut( __METHOD__ );
@@ -1710,7 +1710,7 @@ class Parser {
                        # Register link in the output object.
                        # Replace unnecessary URL escape codes with the referenced character
                        # This prevents spammers from hiding links from the filters
-                       $pasteurized = self::replaceUnusualEscapes( $url );
+                       $pasteurized = self::normalizeLinkUrl( $url );
                        $this->mOutput->addExternalLink( $pasteurized );
                }
 
@@ -1759,40 +1759,75 @@ class Parser {
        }
 
        /**
-        * Replace unusual URL escape codes with their equivalent characters
+        * Replace unusual escape codes in a URL with their equivalent characters
         *
+        * @deprecated since 1.24, use normalizeLinkUrl
         * @param string $url
         * @return string
-        *
-        * @todo This can merge genuinely required bits in the path or query string,
-        *       breaking legit URLs. A proper fix would treat the various parts of
-        *       the URL differently; as a workaround, just use the output for
-        *       statistical records, not for actual linking/output.
         */
        public static function replaceUnusualEscapes( $url ) {
-               return preg_replace_callback( '/%[0-9A-Fa-f]{2}/',
-                       array( __CLASS__, 'replaceUnusualEscapesCallback' ), $url );
+               wfDeprecated( __METHOD__, '1.24' );
+               return self::normalizeLinkUrl( $url );
        }
 
        /**
-        * Callback function used in replaceUnusualEscapes().
-        * Replaces unusual URL escape codes with their equivalent character
+        * Replace unusual escape codes in a URL with their equivalent characters
         *
-        * @param array $matches
+        * This generally follows the syntax defined in RFC 3986, with special
+        * consideration for HTTP query strings.
         *
+        * @param string $url
         * @return string
         */
-       private static function replaceUnusualEscapesCallback( $matches ) {
-               $char = urldecode( $matches[0] );
-               $ord = ord( $char );
-               # Is it an unsafe or HTTP reserved character according to RFC 1738?
-               if ( $ord > 32 && $ord < 127 && strpos( '<>"#{}|\^~[]`;/?', $char ) === false ) {
-                       # No, shouldn't be escaped
-                       return $char;
-               } else {
-                       # Yes, leave it escaped
-                       return $matches[0];
+       public static function normalizeLinkUrl( $url ) {
+               # First, make sure unsafe characters are encoded
+               $url = preg_replace_callback( '/[\x00-\x20"<>\[\\\\\]^`{|}\x7F-\xFF]/',
+                       function ( $m ) {
+                               return rawurlencode( $m[0] );
+                       },
+                       $url
+               );
+
+               $ret = '';
+               $end = strlen( $url );
+
+               # Fragment part - 'fragment'
+               $start = strpos( $url, '#' );
+               if ( $start !== false && $start < $end ) {
+                       $ret = self::normalizeUrlComponent(
+                               substr( $url, $start, $end - $start ), '"#%<>[\]^`{|}' ) . $ret;
+                       $end = $start;
+               }
+
+               # Query part - 'query' minus &=+;
+               $start = strpos( $url, '?' );
+               if ( $start !== false && $start < $end ) {
+                       $ret = self::normalizeUrlComponent(
+                               substr( $url, $start, $end - $start ), '"#%<>[\]^`{|}&=+;' ) . $ret;
+                       $end = $start;
                }
+
+               # Scheme and path part - 'pchar'
+               # (we assume no userinfo or encoded colons in the host)
+               $ret = self::normalizeUrlComponent(
+                       substr( $url, 0, $end ), '"#%<>[\]^`{|}/?' ) . $ret;
+
+               return $ret;
+       }
+
+       private static function normalizeUrlComponent( $component, $unsafe ) {
+               $callback = function ( $matches ) use ( $unsafe ) {
+                       $char = urldecode( $matches[0] );
+                       $ord = ord( $char );
+                       if ( $ord > 32 && $ord < 127 && strpos( $unsafe, $char ) === false ) {
+                               # Unescape it
+                               return $char;
+                       } else {
+                               # Leave it escaped, but use uppercase for a-f
+                               return strtoupper( $matches[0] );
+                       }
+               };
+               return preg_replace_callback( '/%[0-9A-Fa-f]{2}/', $callback, $component );
        }
 
        /**
index cbf4803..1790086 100644 (file)
@@ -147,6 +147,41 @@ class ParserMethodsTest extends MediaWikiLangTestCase {
                        ),
                ), $out->getSections(), 'getSections() with proper value when <h2> is used' );
        }
+
+       /**
+        * @dataProvider provideNormalizeLinkUrl
+        * @covers Parser::normalizeLinkUrl
+        * @covers Parser::normalizeUrlComponent
+        */
+       public function testNormalizeLinkUrl( $explanation, $url, $expected ) {
+               $this->assertEquals( $expected, Parser::normalizeLinkUrl( $url ), $explanation );
+       }
+
+       public static function provideNormalizeLinkUrl() {
+               return array(
+                       array(
+                               'Escaping of unsafe characters',
+                               'http://example.org/foo bar?param[]="value"&param[]=valüe',
+                               'http://example.org/foo%20bar?param%5B%5D=%22value%22&param%5B%5D=val%C3%BCe',
+                       ),
+                       array(
+                               'Case normalization of percent-encoded characters',
+                               'http://example.org/%ab%cD%Ef%FF',
+                               'http://example.org/%AB%CD%EF%FF',
+                       ),
+                       array(
+                               'Unescaping of safe characters',
+                               'http://example.org/%3C%66%6f%6F%3E?%3C%66%6f%6F%3E#%3C%66%6f%6F%3E',
+                               'http://example.org/%3Cfoo%3E?%3Cfoo%3E#%3Cfoo%3E',
+                       ),
+                       array(
+                               'Context-sensitive replacement of sometimes-safe characters',
+                               'http://example.org/%23%2F%3F%26%3D%2B%3B?%23%2F%3F%26%3D%2B%3B#%23%2F%3F%26%3D%2B%3B',
+                               'http://example.org/%23%2F%3F&=+;?%23/?%26%3D%2B%3B#%23/?&=+;',
+                       ),
+               );
+       }
+
        // @todo Add tests for cleanSig() / cleanSigInSig(), getSection(),
        // replaceSection(), getPreloadText()
 }