Use new externallinks.el_index_60 field
author Brad Jorsch <bjorsch@wikimedia.org>
Sat, 19 Nov 2016 00:50:43 +0000 (19:50 -0500)
committer Tim Starling <tstarling@wikimedia.org>
Mon, 12 Nov 2018 22:33:18 +0000 (22:33 +0000)
This adds a method to LinkFilter to build the query conditions necessary
to properly use it, and adjusts code to use it.

This also takes the opportunity to clean up the calculation of el_index:
IPs are handled more sensibly and IDNs are canonicalized.

Also, weird edge cases for invalid hosts like "http://.example.com" and
corresponding searches like "http://*..example.com" are now handled more
consistently instead of being treated as if the extra dot were omitted,
while explicit specification of the DNS root like "http://example.com./"
is canonicalized to the usual implicit specification.

Note that this patch will break link searches for links where the host
is an IP or IDN until refreshExternallinksIndex.php is run.

Bug: T59176
Bug: T130482
Change-Id: I84d224ef23de22dfe179009ec3a11fd0e4b5f56d

19 files changed:
RELEASE-NOTES-1.33
autoload.php
includes/GlobalFunctions.php
includes/LinkFilter.php
includes/api/ApiQueryBase.php
includes/api/ApiQueryExtLinksUsage.php
includes/api/ApiQueryExternalLinks.php
includes/deferred/LinksUpdate.php
includes/installer/DatabaseUpdater.php
includes/parser/Parser.php
includes/specials/SpecialLinkSearch.php
maintenance/cleanupSpam.php
maintenance/deleteSelfExternals.php
maintenance/mssql/tables.sql
maintenance/refreshExternallinksIndex.php [new file with mode: 0644]
maintenance/tables.sql
tests/phpunit/includes/GlobalFunctions/GlobalTest.php
tests/phpunit/includes/LinkFilterTest.php
tests/phpunit/includes/parser/ParserMethodsTest.php

index a6f8058..a16968e 100644 (file)
@@ -15,6 +15,9 @@ production.
   the current parse language where available.
 
 ==== Changed configuration ====
+* Some external link searches will not work correctly until update.php (or
+  refreshExternallinksIndex.php) is run. These include searches for links using
+  IP addresses, internationalized domain names, and possibly mailto links.
 * …
 
 ==== Removed configuration ====
index 8e764ae..02e35a8 100644 (file)
@@ -1191,6 +1191,7 @@ $wgAutoloadLocalClasses = [
        'RedisConnectionPool' => __DIR__ . '/includes/libs/redis/RedisConnectionPool.php',
        'RedisLockManager' => __DIR__ . '/includes/libs/lockmanager/RedisLockManager.php',
        'RedisPubSubFeedEngine' => __DIR__ . '/includes/rcfeed/RedisPubSubFeedEngine.php',
+       'RefreshExternallinksIndex' => __DIR__ . '/maintenance/refreshExternallinksIndex.php',
        'RefreshFileHeaders' => __DIR__ . '/maintenance/refreshFileHeaders.php',
        'RefreshImageMetadata' => __DIR__ . '/maintenance/refreshImageMetadata.php',
        'RefreshLinks' => __DIR__ . '/maintenance/refreshLinks.php',
index 6e95871..78d619d 100644 (file)
@@ -894,55 +894,13 @@ function wfExpandIRI( $url ) {
 /**
  * Make URL indexes, appropriate for the el_index field of externallinks.
  *
+ * @deprecated since 1.33, use LinkFilter::makeIndexes() instead
  * @param string $url
  * @return array
  */
 function wfMakeUrlIndexes( $url ) {
-       $bits = wfParseUrl( $url );
-
-       // Reverse the labels in the hostname, convert to lower case
-       // For emails reverse domainpart only
-       if ( $bits['scheme'] == 'mailto' ) {
-               $mailparts = explode( '@', $bits['host'], 2 );
-               if ( count( $mailparts ) === 2 ) {
-                       $domainpart = strtolower( implode( '.', array_reverse( explode( '.', $mailparts[1] ) ) ) );
-               } else {
-                       // No domain specified, don't mangle it
-                       $domainpart = '';
-               }
-               $reversedHost = $domainpart . '@' . $mailparts[0];
-       } else {
-               $reversedHost = strtolower( implode( '.', array_reverse( explode( '.', $bits['host'] ) ) ) );
-       }
-       // Add an extra dot to the end
-       // Why? Is it in wrong place in mailto links?
-       if ( substr( $reversedHost, -1, 1 ) !== '.' ) {
-               $reversedHost .= '.';
-       }
-       // Reconstruct the pseudo-URL
-       $prot = $bits['scheme'];
-       $index = $prot . $bits['delimiter'] . $reversedHost;
-       // Leave out user and password. Add the port, path, query and fragment
-       if ( isset( $bits['port'] ) ) {
-               $index .= ':' . $bits['port'];
-       }
-       if ( isset( $bits['path'] ) ) {
-               $index .= $bits['path'];
-       } else {
-               $index .= '/';
-       }
-       if ( isset( $bits['query'] ) ) {
-               $index .= '?' . $bits['query'];
-       }
-       if ( isset( $bits['fragment'] ) ) {
-               $index .= '#' . $bits['fragment'];
-       }
-
-       if ( $prot == '' ) {
-               return [ "http:$index", "https:$index" ];
-       } else {
-               return [ $index ];
-       }
+       wfDeprecated( __FUNCTION__, '1.33' );
+       return LinkFilter::makeIndexes( $url );
 }
 
 /**
index 3b03f87..ffb36e0 100644 (file)
@@ -32,6 +32,11 @@ use Wikimedia\Rdbms\LikeMatch;
  * Another cool thing to do would be a web interface for fast spam removal.
  */
 class LinkFilter {
+       /**
+        * Increment this when makeIndexes output changes. It'll cause
+        * maintenance/refreshExternallinksIndex.php to run from update.php.
+        */
+       const VERSION = 1;
 
        /**
         * Check whether $content contains a link to $filterEntry
@@ -58,6 +63,7 @@ class LinkFilter {
        /**
         * Builds a regex pattern for $filterEntry.
         *
+        * @todo This doesn't match the rest of the functionality here.
         * @param string $filterEntry URL, if it begins with "*.", it'll be
         *        replaced to match any subdomain
         * @param string $protocol 'http://' or 'https://'
@@ -75,23 +81,231 @@ class LinkFilter {
        }
 
        /**
-        * Make an array to be used for calls to Database::buildLike(), which
-        * will match the specified string. There are several kinds of filter entry:
-        *     *.domain.com    -  Produces http://com.domain.%, matches domain.com
-        *                        and www.domain.com
-        *     domain.com      -  Produces http://com.domain./%, matches domain.com
-        *                        or domain.com/ but not www.domain.com
-        *     *.domain.com/x  -  Produces http://com.domain.%/x%, matches
-        *                        www.domain.com/xy
-        *     domain.com/x    -  Produces http://com.domain./x%, matches
-        *                        domain.com/xy but not www.domain.com/xy
+        * Indicate whether LinkFilter IDN support is available
+        * @since 1.33
+        * @return bool
+        */
+       public static function supportsIDN() {
+               return is_callable( 'idn_to_utf8' ) && defined( 'INTL_IDNA_VARIANT_UTS46' );
+       }
+
+       /**
+        * Canonicalize a hostname for el_index
+        * @param string $host
+        * @return string
+        */
+       private static function indexifyHost( $host ) {
+               // NOTE: If you change the output of this method, you'll probably have to increment self::VERSION!
+
+               // Canonicalize.
+               $host = rawurldecode( $host );
+               if ( $host !== '' && self::supportsIDN() ) {
+                       // @todo Add a PHP fallback
+                       $tmp = idn_to_utf8( $host, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46 );
+                       if ( $tmp !== false ) {
+                               $host = $tmp;
+                       }
+               }
+               $okChars = 'a-zA-Z0-9\\-._~!$&\'()*+,;=';
+               if ( StringUtils::isUtf8( $host ) ) {
+                       // Save a little space by not percent-encoding valid UTF-8 bytes
+                       $okChars .= '\x80-\xf4';
+               }
+               $host = preg_replace_callback(
+                       '<[^' . $okChars . ']>',
+                       function ( $m ) {
+                               return rawurlencode( $m[0] );
+                       },
+                       strtolower( $host )
+               );
+
+               // IPv6? RFC 3986 syntax.
+               if ( preg_match( '/^\[([0-9a-f:*]+)\]$/', rawurldecode( $host ), $m ) ) {
+                       $ip = $m[1];
+                       if ( IP::isValid( $ip ) ) {
+                               return 'V6.' . implode( '.', explode( ':', IP::sanitizeIP( $ip ) ) ) . '.';
+                       }
+                       if ( substr( $ip, -2 ) === ':*' ) {
+                               $cutIp = substr( $ip, 0, -2 );
+                               if ( IP::isValid( "{$cutIp}::" ) ) {
+                                       // Wildcard IP doesn't contain "::", so multiple parts can be wild
+                                       $ct = count( explode( ':', $ip ) ) - 1;
+                                       return 'V6.' .
+                                               implode( '.', array_slice( explode( ':', IP::sanitizeIP( "{$cutIp}::" ) ), 0, $ct ) ) .
+                                               '.*.';
+                               }
+                               if ( IP::isValid( "{$cutIp}:1" ) ) {
+                                       // Wildcard IP does contain "::", so only the last part is wild
+                                       return 'V6.' .
+                                               substr( implode( '.', explode( ':', IP::sanitizeIP( "{$cutIp}:1" ) ) ), 0, -1 ) .
+                                               '*.';
+                               }
+                       }
+               }
+
+               // Regularize explicit specification of the DNS root.
+               // Browsers seem to do this for IPv4 literals too.
+               if ( substr( $host, -1 ) === '.' ) {
+                       $host = substr( $host, 0, -1 );
+               }
+
+               // IPv4?
+               $b = '(?:0*25[0-5]|0*2[0-4][0-9]|0*1[0-9][0-9]|0*[0-9]?[0-9])';
+               if ( preg_match( "/^(?:{$b}\.){3}{$b}$|^(?:{$b}\.){1,3}\*$/", $host ) ) {
+                       return 'V4.' . implode( '.', array_map( function ( $v ) {
+                               return $v === '*' ? $v : (int)$v;
+                       }, explode( '.', $host ) ) ) . '.';
+               }
+
+               // Must be a host name.
+               return implode( '.', array_reverse( explode( '.', $host ) ) ) . '.';
+       }
+
+       /**
+        * Converts a URL into a format for el_index
+        * @since 1.33
+        * @param string $url
+        * @return string[] Usually one entry, but might be two in case of
+        *  protocol-relative URLs. Empty array on error.
+        */
+       public static function makeIndexes( $url ) {
+               // NOTE: If you change the output of this method, you'll probably have to increment self::VERSION!
+
+               // NOTE: refreshExternallinksIndex.php assumes that only protocol-relative URLs return more
+               // than one index, and that the indexes for protocol-relative URLs only vary in the "http://"
+               // versus "https://" prefix. If you change that, you'll likely need to update
+               // refreshExternallinksIndex.php accordingly.
+
+               $bits = wfParseUrl( $url );
+               if ( !$bits ) {
+                       return [];
+               }
+
+               // Reverse the labels in the hostname, convert to lower case, unless it's an IP.
+               // For emails turn it into "domain.reversed@localpart"
+               if ( $bits['scheme'] == 'mailto' ) {
+                       $mailparts = explode( '@', $bits['host'], 2 );
+                       if ( count( $mailparts ) === 2 ) {
+                               $domainpart = self::indexifyHost( $mailparts[1] );
+                       } else {
+                               // No @, assume it's a local part with no domain
+                               $domainpart = '';
+                       }
+                       $bits['host'] = $domainpart . '@' . $mailparts[0];
+               } else {
+                       $bits['host'] = self::indexifyHost( $bits['host'] );
+               }
+
+               // Reconstruct the pseudo-URL
+               $index = $bits['scheme'] . $bits['delimiter'] . $bits['host'];
+               // Leave out user and password. Add the port, path, query and fragment
+               if ( isset( $bits['port'] ) ) {
+                       $index .= ':' . $bits['port'];
+               }
+               if ( isset( $bits['path'] ) ) {
+                       $index .= $bits['path'];
+               } else {
+                       $index .= '/';
+               }
+               if ( isset( $bits['query'] ) ) {
+                       $index .= '?' . $bits['query'];
+               }
+               if ( isset( $bits['fragment'] ) ) {
+                       $index .= '#' . $bits['fragment'];
+               }
+
+               if ( $bits['scheme'] == '' ) {
+                       return [ "http:$index", "https:$index" ];
+               } else {
+                       return [ $index ];
+               }
+       }
+
+       /**
+        * Return query conditions which will match the specified string. There are
+        * several kinds of filter entry:
+        *
+        *     *.domain.com    -  Matches domain.com and www.domain.com
+        *     domain.com      -  Matches domain.com or domain.com/ but not www.domain.com
+        *     *.domain.com/x  -  Matches domain.com/xy or www.domain.com/xy. Also probably matches
+        *                        domain.com/foobar/xy due to limitations of LIKE syntax.
+        *     domain.com/x    -  Matches domain.com/xy but not www.domain.com/xy
+        *     192.0.2.*       -  Matches any IP in 192.0.2.0/24. Can also have a path appended.
+        *     [2001:db8::*]   -  Matches any IP in 2001:db8::/112. Can also have a path appended.
+        *     [2001:db8:*]    -  Matches any IP in 2001:db8::/32. Can also have a path appended.
+        *     foo@domain.com  -  With protocol 'mailto:', matches the email address foo@domain.com.
+        *     *@domain.com    -  With protocol 'mailto:', matches any email address at domain.com, but
+        *                        not subdomains like foo@mail.domain.com
         *
         * Asterisks in any other location are considered invalid.
         *
-        * This function does the same as wfMakeUrlIndexes(), except it also takes care
+        * @since 1.33
+        * @param string $filterEntry Filter entry, as described above
+        * @param array $options Options are:
+        *   - protocol: (string) Protocol to query (default http://)
+        *   - oneWildcard: (bool) Stop at the first wildcard (default false)
+        *   - prefix: (string) Field prefix (default 'el'). The query will test
+        *     fields '{$prefix}_index' and '{$prefix}_index_60'
+        *   - db: (IDatabase|null) Database to use. Defaults to wfGetDB( DB_REPLICA ).
+        * @return array|bool Conditions to be used for the query (to be ANDed) or
+        *  false on error. To determine if the query is constant on the
+        *  '{$prefix}_index_60' field, check whether key '{$prefix}_index_60'
+        *  (i.e. 'el_index_60' with the default prefix) is set.
+        */
+       public static function getQueryConditions( $filterEntry, array $options = [] ) {
+               $options += [
+                       'protocol' => 'http://',
+                       'oneWildcard' => false,
+                       'prefix' => 'el',
+                       'db' => null,
+               ];
+
+               // First, get the like array
+               $like = self::makeLikeArray( $filterEntry, $options['protocol'] );
+               if ( $like === false ) {
+                       return $like;
+               }
+
+               // Get the constant prefix (i.e. everything up to the first wildcard)
+               $trimmedLike = self::keepOneWildcard( $like );
+               if ( $options['oneWildcard'] ) {
+                       $like = $trimmedLike;
+               }
+               if ( $trimmedLike[count( $trimmedLike ) - 1] instanceof LikeMatch ) {
+                       array_pop( $trimmedLike );
+               }
+               $index = implode( '', $trimmedLike );
+
+               $p = $options['prefix'];
+               $db = $options['db'] ?: wfGetDB( DB_REPLICA );
+
+               // Build the query
+               $l = strlen( $index );
+               if ( $l >= 60 ) {
+                       // The constant prefix is larger than el_index_60, so we can use a
+                       // constant comparison.
+                       return [
+                               "{$p}_index_60" => substr( $index, 0, 60 ),
+                               "{$p}_index" . $db->buildLike( $like ),
+                       ];
+               }
+
+               // The constant prefix is smaller than el_index_60, so we use a LIKE
+               // for a prefix search.
+               return [
+                       "{$p}_index_60" . $db->buildLike( [ $index, $db->anyString() ] ),
+                       "{$p}_index" . $db->buildLike( $like ),
+               ];
+       }
+
+       /**
+        * Make an array to be used for calls to Database::buildLike(), which
+        * will match the specified string.
+        *
+        * This function does the same as LinkFilter::makeIndexes(), except it also takes care
         * of adding wildcards
         *
-        * @param string $filterEntry Domainparts
+        * @note You probably want self::getQueryConditions() instead
+        * @param string $filterEntry Filter entry, @see self::getQueryConditions()
         * @param string $protocol Protocol (default http://)
         * @return array|bool Array to be passed to Database::buildLike() or false on error
         */
@@ -100,38 +314,27 @@ class LinkFilter {
 
                $target = $protocol . $filterEntry;
                $bits = wfParseUrl( $target );
-
-               if ( $bits == false ) {
-                       // Unknown protocol?
+               if ( !$bits ) {
                        return false;
                }
 
-               if ( substr( $bits['host'], 0, 2 ) == '*.' ) {
-                       $subdomains = true;
-                       $bits['host'] = substr( $bits['host'], 2 );
-                       if ( $bits['host'] == '' ) {
-                               // We don't want to make a clause that will match everything,
-                               // that could be dangerous
-                               return false;
-                       }
-               } else {
-                       $subdomains = false;
-               }
-
-               // Reverse the labels in the hostname, convert to lower case
-               // For emails reverse domainpart only
+               $subdomains = false;
                if ( $bits['scheme'] === 'mailto' && strpos( $bits['host'], '@' ) ) {
-                       // complete email address
-                       $mailparts = explode( '@', $bits['host'] );
-                       $domainpart = strtolower( implode( '.', array_reverse( explode( '.', $mailparts[1] ) ) ) );
-                       $bits['host'] = $domainpart . '@' . $mailparts[0];
-               } elseif ( $bits['scheme'] === 'mailto' ) {
-                       // domainpart of email address only, do not add '.'
-                       $bits['host'] = strtolower( implode( '.', array_reverse( explode( '.', $bits['host'] ) ) ) );
+                       // Email address with domain and non-empty local part
+                       $mailparts = explode( '@', $bits['host'], 2 );
+                       $domainpart = self::indexifyHost( $mailparts[1] );
+                       if ( $mailparts[0] === '*' ) {
+                               $subdomains = true;
+                               $bits['host'] = $domainpart . '@';
+                       } else {
+                               $bits['host'] = $domainpart . '@' . $mailparts[0];
+                       }
                } else {
-                       $bits['host'] = strtolower( implode( '.', array_reverse( explode( '.', $bits['host'] ) ) ) );
-                       if ( substr( $bits['host'], -1, 1 ) !== '.' ) {
-                               $bits['host'] .= '.';
+                       // Non-email, or email with only a domain part.
+                       $bits['host'] = self::indexifyHost( $bits['host'] );
+                       if ( substr( $bits['host'], -3 ) === '.*.' ) {
+                               $subdomains = true;
+                               $bits['host'] = substr( $bits['host'], 0, -2 );
                        }
                }
 
@@ -175,6 +378,7 @@ class LinkFilter {
         * Filters an array returned by makeLikeArray(), removing everything past first
         * pattern placeholder.
         *
+        * @note You probably want self::getQueryConditions() instead
         * @param array $arr Array to filter
         * @return array Filtered array
         */
index 8630561..fe01f03 100644 (file)
@@ -402,13 +402,15 @@ abstract class ApiQueryBase extends ApiBase {
        }
 
        /**
+        * @deprecated since 1.33, use LinkFilter::getQueryConditions() instead
         * @param string|null $query
         * @param string|null $protocol
         * @return null|string
         */
        public function prepareUrlQuerySearchString( $query = null, $protocol = null ) {
+               wfDeprecated( __METHOD__, '1.33' );
                $db = $this->getDB();
-               if ( !is_null( $query ) || $query != '' ) {
+               if ( $query !== null && $query !== '' ) {
                        if ( is_null( $protocol ) ) {
                                $protocol = 'http://';
                        }
index fc5d8a0..d508c55 100644 (file)
@@ -47,12 +47,12 @@ class ApiQueryExtLinksUsage extends ApiQueryGeneratorBase {
         */
        private function run( $resultPageSet = null ) {
                $params = $this->extractRequestParams();
+               $db = $this->getDB();
 
                $query = $params['query'];
                $protocol = self::getProtocolPrefix( $params['protocol'] );
 
-               $this->addTables( [ 'page', 'externallinks' ] ); // must be in this order for 'USE INDEX'
-               $this->addOption( 'USE INDEX', 'el_index' );
+               $this->addTables( [ 'page', 'externallinks' ] );
                $this->addWhere( 'page_id=el_from' );
 
                $miser_ns = [];
@@ -62,15 +62,46 @@ class ApiQueryExtLinksUsage extends ApiQueryGeneratorBase {
                        $this->addWhereFld( 'page_namespace', $params['namespace'] );
                }
 
-               // Normalize query to match the normalization applied for the externallinks table
-               $query = Parser::normalizeLinkUrl( $query );
+               $orderBy = [];
 
-               $whereQuery = $this->prepareUrlQuerySearchString( $query, $protocol );
+               if ( $query !== null && $query !== '' ) {
+                       if ( $protocol === null ) {
+                               $protocol = 'http://';
+                       }
+
+                       // Normalize query to match the normalization applied for the externallinks table
+                       $query = Parser::normalizeLinkUrl( $protocol . $query );
+
+                       $conds = LinkFilter::getQueryConditions( $query, [
+                               'protocol' => '',
+                               'oneWildcard' => true,
+                               'db' => $db
+                       ] );
+                       if ( !$conds ) {
+                               $this->dieWithError( 'apierror-badquery' );
+                       }
+                       $this->addWhere( $conds );
+                       if ( !isset( $conds['el_index_60'] ) ) {
+                               $orderBy[] = 'el_index_60';
+                       }
+               } else {
+                       $orderBy[] = 'el_index_60';
 
-               if ( $whereQuery !== null ) {
-                       $this->addWhere( $whereQuery );
+                       if ( $protocol !== null ) {
+                               $this->addWhere( 'el_index_60' . $db->buildLike( "$protocol", $db->anyString() ) );
+                       } else {
+                               // We're querying all protocols, filter out duplicate protocol-relative links
+                               $this->addWhere( $db->makeList( [
+                                       'el_to NOT' . $db->buildLike( '//', $db->anyString() ),
+                                       'el_index_60 ' . $db->buildLike( 'http://', $db->anyString() ),
+                               ], LIST_OR ) );
+                       }
                }
 
+               $orderBy[] = 'el_id';
+               $this->addOption( 'ORDER BY', $orderBy );
+               $this->addFields( $orderBy ); // Make sure
+
                $prop = array_flip( $params['prop'] );
                $fld_ids = isset( $prop['ids'] );
                $fld_title = isset( $prop['title'] );
@@ -88,10 +119,19 @@ class ApiQueryExtLinksUsage extends ApiQueryGeneratorBase {
                }
 
                $limit = $params['limit'];
-               $offset = $params['offset'];
                $this->addOption( 'LIMIT', $limit + 1 );
-               if ( isset( $offset ) ) {
-                       $this->addOption( 'OFFSET', $offset );
+
+               if ( $params['continue'] !== null ) {
+                       $cont = explode( '|', $params['continue'] );
+                       $this->dieContinueUsageIf( count( $cont ) !== count( $orderBy ) );
+                       $i = count( $cont ) - 1;
+                       $cond = $orderBy[$i] . ' >= ' . $db->addQuotes( rawurldecode( $cont[$i] ) );
+                       while ( $i-- > 0 ) {
+                               $field = $orderBy[$i];
+                               $v = $db->addQuotes( rawurldecode( $cont[$i] ) );
+                               $cond = "($field > $v OR ($field = $v AND $cond))";
+                       }
+                       $this->addWhere( $cond );
                }
 
                $res = $this->select( __METHOD__ );
@@ -102,7 +142,7 @@ class ApiQueryExtLinksUsage extends ApiQueryGeneratorBase {
                        if ( ++$count > $limit ) {
                                // We've reached the one extra which shows that there are
                                // additional pages to be had. Stop here...
-                               $this->setContinueEnumParameter( 'offset', $offset + $limit );
+                               $this->setContinue( $orderBy, $row );
                                break;
                        }
 
@@ -131,7 +171,7 @@ class ApiQueryExtLinksUsage extends ApiQueryGeneratorBase {
                                }
                                $fit = $result->addValue( [ 'query', $this->getModuleName() ], null, $vals );
                                if ( !$fit ) {
-                                       $this->setContinueEnumParameter( 'offset', $offset + $count - 1 );
+                                       $this->setContinue( $orderBy, $row );
                                        break;
                                }
                        } else {
@@ -145,6 +185,14 @@ class ApiQueryExtLinksUsage extends ApiQueryGeneratorBase {
                }
        }
 
+       private function setContinue( $orderBy, $row ) {
+               $fields = [];
+               foreach ( $orderBy as $field ) {
+                       $fields[] = strtr( $row->$field, [ '%' => '%25', '|' => '%7C' ] );
+               }
+               $this->setContinueEnumParameter( 'continue', implode( '|', $fields ) );
+       }
+
        public function getAllowedParams() {
                $ret = [
                        'prop' => [
@@ -157,8 +205,7 @@ class ApiQueryExtLinksUsage extends ApiQueryGeneratorBase {
                                ],
                                ApiBase::PARAM_HELP_MSG_PER_VALUE => [],
                        ],
-                       'offset' => [
-                               ApiBase::PARAM_TYPE => 'integer',
+                       'continue' => [
                                ApiBase::PARAM_HELP_MSG => 'api-help-param-continue',
                        ],
                        'protocol' => [
index 6c219d4..b5731a3 100644 (file)
@@ -37,6 +37,7 @@ class ApiQueryExternalLinks extends ApiQueryBase {
                }
 
                $params = $this->extractRequestParams();
+               $db = $this->getDB();
 
                $query = $params['query'];
                $protocol = ApiQueryExtLinksUsage::getProtocolPrefix( $params['protocol'] );
@@ -49,26 +50,64 @@ class ApiQueryExternalLinks extends ApiQueryBase {
                $this->addTables( 'externallinks' );
                $this->addWhereFld( 'el_from', array_keys( $this->getPageSet()->getGoodTitles() ) );
 
-               $whereQuery = $this->prepareUrlQuerySearchString( $query, $protocol );
-
-               if ( $whereQuery !== null ) {
-                       $this->addWhere( $whereQuery );
-               }
+               $orderBy = [];
 
                // Don't order by el_from if it's constant in the WHERE clause
                if ( count( $this->getPageSet()->getGoodTitles() ) != 1 ) {
-                       $this->addOption( 'ORDER BY', 'el_from' );
+                       $orderBy[] = 'el_from';
                }
 
-               // If we're querying all protocols, use DISTINCT to avoid repeating protocol-relative links twice
-               if ( $protocol === null ) {
-                       $this->addOption( 'DISTINCT' );
+               if ( $query !== null && $query !== '' ) {
+                       if ( $protocol === null ) {
+                               $protocol = 'http://';
+                       }
+
+                       // Normalize query to match the normalization applied for the externallinks table
+                       $query = Parser::normalizeLinkUrl( $protocol . $query );
+
+                       $conds = LinkFilter::getQueryConditions( $query, [
+                               'protocol' => '',
+                               'oneWildcard' => true,
+                               'db' => $db
+                       ] );
+                       if ( !$conds ) {
+                               $this->dieWithError( 'apierror-badquery' );
+                       }
+                       $this->addWhere( $conds );
+                       if ( !isset( $conds['el_index_60'] ) ) {
+                               $orderBy[] = 'el_index_60';
+                       }
+               } else {
+                       $orderBy[] = 'el_index_60';
+
+                       if ( $protocol !== null ) {
+                               $this->addWhere( 'el_index_60' . $db->buildLike( "$protocol", $db->anyString() ) );
+                       } else {
+                               // We're querying all protocols, filter out duplicate protocol-relative links
+                               $this->addWhere( $db->makeList( [
+                                       'el_to NOT' . $db->buildLike( '//', $db->anyString() ),
+                                       'el_index_60 ' . $db->buildLike( 'http://', $db->anyString() ),
+                               ], LIST_OR ) );
+                       }
                }
 
+               $orderBy[] = 'el_id';
+               $this->addOption( 'ORDER BY', $orderBy );
+               $this->addFields( $orderBy ); // Make sure
+
                $this->addOption( 'LIMIT', $params['limit'] + 1 );
-               $offset = $params['offset'] ?? 0;
-               if ( $offset ) {
-                       $this->addOption( 'OFFSET', $params['offset'] );
+
+               if ( $params['continue'] !== null ) {
+                       $cont = explode( '|', $params['continue'] );
+                       $this->dieContinueUsageIf( count( $cont ) !== count( $orderBy ) );
+                       $i = count( $cont ) - 1;
+                       $cond = $orderBy[$i] . ' >= ' . $db->addQuotes( rawurldecode( $cont[$i] ) );
+                       while ( $i-- > 0 ) {
+                               $field = $orderBy[$i];
+                               $v = $db->addQuotes( rawurldecode( $cont[$i] ) );
+                               $cond = "($field > $v OR ($field = $v AND $cond))";
+                       }
+                       $this->addWhere( $cond );
                }
 
                $res = $this->select( __METHOD__ );
@@ -78,7 +117,7 @@ class ApiQueryExternalLinks extends ApiQueryBase {
                        if ( ++$count > $params['limit'] ) {
                                // We've reached the one extra which shows that
                                // there are additional pages to be had. Stop here...
-                               $this->setContinueEnumParameter( 'offset', $offset + $params['limit'] );
+                               $this->setContinue( $orderBy, $row );
                                break;
                        }
                        $entry = [];
@@ -90,12 +129,20 @@ class ApiQueryExternalLinks extends ApiQueryBase {
                        ApiResult::setContentValue( $entry, 'url', $to );
                        $fit = $this->addPageSubItem( $row->el_from, $entry );
                        if ( !$fit ) {
-                               $this->setContinueEnumParameter( 'offset', $offset + $count - 1 );
+                               $this->setContinue( $orderBy, $row );
                                break;
                        }
                }
        }
 
+       /**
+        * Build the 'continue' parameter from the ORDER BY fields of a result row.
+        *
+        * Every value has '%' and '|' percent-escaped, then the values are joined
+        * with '|' in the same order as $orderBy.
+        *
+        * @param string[] $orderBy Field names to read from $row, in order
+        * @param object $row Row carrying a property for each field in $orderBy
+        */
+       private function setContinue( $orderBy, $row ) {
+               $escapes = [ '%' => '%25', '|' => '%7C' ];
+               $encoded = array_map(
+                       function ( $name ) use ( $row, $escapes ) {
+                               return strtr( $row->$name, $escapes );
+                       },
+                       $orderBy
+               );
+               $this->setContinueEnumParameter( 'continue', implode( '|', $encoded ) );
+       }
+
        public function getCacheMode( $params ) {
                return 'public';
        }
@@ -109,8 +156,7 @@ class ApiQueryExternalLinks extends ApiQueryBase {
                                ApiBase::PARAM_MAX => ApiBase::LIMIT_BIG1,
                                ApiBase::PARAM_MAX2 => ApiBase::LIMIT_BIG2
                        ],
-                       'offset' => [
-                               ApiBase::PARAM_TYPE => 'integer',
+                       'continue' => [
                                ApiBase::PARAM_HELP_MSG => 'api-help-param-continue',
                        ],
                        'protocol' => [
index 577a272..b4863f8 100644 (file)
@@ -567,7 +567,7 @@ class LinksUpdate extends DataUpdate implements EnqueueableDataUpdate {
                $arr = [];
                $diffs = array_diff_key( $this->mExternals, $existing );
                foreach ( $diffs as $url => $dummy ) {
-                       foreach ( wfMakeUrlIndexes( $url ) as $index ) {
+                       foreach ( LinkFilter::makeIndexes( $url ) as $index ) {
                                $arr[] = [
                                        'el_from' => $this->mId,
                                        'el_to' => $url,
index f5d01d6..925fc5a 100644 (file)
@@ -87,6 +87,7 @@ abstract class DatabaseUpdater {
                AddRFCandPMIDInterwiki::class,
                PopulatePPSortKey::class,
                PopulateIpChanges::class,
+               RefreshExternallinksIndex::class,
        ];
 
        /**
index 3dc2eeb..93dfb9d 100644 (file)
@@ -2026,7 +2026,19 @@ class Parser {
         * @return string
         */
        public static function normalizeLinkUrl( $url ) {
-               # First, make sure unsafe characters are encoded
+               # Test for RFC 3986 IPv6 syntax
+               $scheme = '[a-z][a-z0-9+.-]*:';
+               $userinfo = '(?:[a-z0-9\-._~!$&\'()*+,;=:]|%[0-9a-f]{2})*';
+               $ipv6Host = '\\[((?:[0-9a-f:]|%3[0-A]|%[46][1-6])+)\\]';
+               if ( preg_match( "<^(?:{$scheme})?//(?:{$userinfo}@)?{$ipv6Host}(?:[:/?#].*|)$>i", $url, $m ) &&
+                       IP::isValid( rawurldecode( $m[1] ) )
+               ) {
+                       $isIPv6 = rawurldecode( $m[1] );
+               } else {
+                       $isIPv6 = false;
+               }
+
+               # Make sure unsafe characters are encoded
                $url = preg_replace_callback( '/[\x00-\x20"<>\[\\\\\]^`{|}\x7F-\xFF]/',
                        function ( $m ) {
                                return rawurlencode( $m[0] );
@@ -2058,6 +2070,16 @@ class Parser {
                $ret = self::normalizeUrlComponent(
                        substr( $url, 0, $end ), '"#%<>[\]^`{|}/?' ) . $ret;
 
+               # Fix IPv6 syntax
+               if ( $isIPv6 !== false ) {
+                       $ipv6Host = "%5B({$isIPv6})%5D";
+                       $ret = preg_replace(
+                               "<^((?:{$scheme})?//(?:{$userinfo}@)?){$ipv6Host}(?=[:/?#]|$)>i",
+                               "$1[$2]",
+                               $ret
+                       );
+               }
+
                return $ret;
        }
 
index ef95254..d08fe5c 100644 (file)
@@ -69,7 +69,7 @@ class LinkSearchPage extends QueryPage {
                        }
                }
 
-               $target2 = $target;
+               $target2 = Parser::normalizeLinkUrl( $target );
                // Get protocol, default is http://
                $protocol = 'http://';
                $bits = wfParseUrl( $target );
@@ -128,7 +128,7 @@ class LinkSearchPage extends QueryPage {
 
                if ( $target != '' ) {
                        $this->setParams( [
-                               'query' => Parser::normalizeLinkUrl( $target2 ),
+                               'query' => $target2,
                                'namespace' => $namespace,
                                'protocol' => $protocol ] );
                        parent::execute( $par );
@@ -146,37 +146,6 @@ class LinkSearchPage extends QueryPage {
                return false;
        }
 
-       /**
-        * Return an appropriately formatted LIKE query and the clause
-        *
-        * @param string $query Search pattern to search for
-        * @param string $prot Protocol, e.g. 'http://'
-        *
-        * @return array
-        */
-       static function mungeQuery( $query, $prot ) {
-               $field = 'el_index';
-               $dbr = wfGetDB( DB_REPLICA );
-
-               if ( $query === '*' && $prot !== '' ) {
-                       // Allow queries like 'ftp://*' to find all ftp links
-                       $rv = [ $prot, $dbr->anyString() ];
-               } else {
-                       $rv = LinkFilter::makeLikeArray( $query, $prot );
-               }
-
-               if ( $rv === false ) {
-                       // LinkFilter doesn't handle wildcard in IP, so we'll have to munge here.
-                       $pattern = '/^(:?[0-9]{1,3}\.)+\*\s*$|^(:?[0-9]{1,3}\.){3}[0-9]{1,3}:[0-9]*\*\s*$/';
-                       if ( preg_match( $pattern, $query ) ) {
-                               $rv = [ $prot . rtrim( $query, " \t*" ), $dbr->anyString() ];
-                               $field = 'el_to';
-                       }
-               }
-
-               return [ $rv, $field ];
-       }
-
        function linkParameters() {
                $params = [];
                $params['target'] = $this->mProt . $this->mQuery;
@@ -189,16 +158,29 @@ class LinkSearchPage extends QueryPage {
 
        public function getQueryInfo() {
                $dbr = wfGetDB( DB_REPLICA );
-               // strip everything past first wildcard, so that
-               // index-based-only lookup would be done
-               list( $this->mungedQuery, $clause ) = self::mungeQuery( $this->mQuery, $this->mProt );
+
+               if ( $this->mQuery === '*' && $this->mProt !== '' ) {
+                       $this->mungedQuery = [
+                               'el_index_60' . $dbr->buildLike( $this->mProt, $dbr->anyString() ),
+                       ];
+               } else {
+                       $this->mungedQuery = LinkFilter::getQueryConditions( $this->mQuery, [
+                               'protocol' => $this->mProt,
+                               'oneWildcard' => true,
+                               'db' => $dbr
+                       ] );
+               }
                if ( $this->mungedQuery === false ) {
                        // Invalid query; return no results
                        return [ 'tables' => 'page', 'fields' => 'page_id', 'conds' => '0=1' ];
                }
 
-               $stripped = LinkFilter::keepOneWildcard( $this->mungedQuery );
-               $like = $dbr->buildLike( $stripped );
+               $orderBy = [];
+               if ( !isset( $this->mungedQuery['el_index_60'] ) ) {
+                       $orderBy[] = 'el_index_60';
+               }
+               $orderBy[] = 'el_id';
+
                $retval = [
                        'tables' => [ 'page', 'externallinks' ],
                        'fields' => [
@@ -207,11 +189,13 @@ class LinkSearchPage extends QueryPage {
                                'value' => 'el_index',
                                'url' => 'el_to'
                        ],
-                       'conds' => [
-                               'page_id = el_from',
-                               "$clause $like"
-                       ],
-                       'options' => [ 'USE INDEX' => $clause ]
+                       'conds' => array_merge(
+                               [
+                                       'page_id = el_from',
+                               ],
+                               $this->mungedQuery
+                       ),
+                       'options' => [ 'ORDER BY' => $orderBy ]
                ];
 
                if ( $this->mNs !== null && !$this->getConfig()->get( 'MiserMode' ) ) {
@@ -248,9 +232,7 @@ class LinkSearchPage extends QueryPage {
 
        /**
         * Override to squash the ORDER BY.
-        * We do a truncated index search, so the optimizer won't trust
-        * it as good enough for optimizing sort. The implicit ordering
-        * from the scan will usually do well enough for our needs.
+        * Not much point in descending order here.
         * @return array
         */
        function getOrderFields() {
index 24ca86d..17d2e18 100644 (file)
@@ -54,13 +54,13 @@ class CleanupSpam extends Maintenance {
 
                $spec = $this->getArg();
 
-               $likes = [];
+               $protConds = [];
                foreach ( [ 'http://', 'https://' ] as $prot ) {
-                       $like = LinkFilter::makeLikeArray( $spec, $prot );
-                       if ( !$like ) {
+                       $conds = LinkFilter::getQueryConditions( $spec, [ 'protocol' => $prot ] );
+                       if ( !$conds ) {
                                $this->fatalError( "Not a valid hostname specification: $spec" );
                        }
-                       $likes[$prot] = $like;
+                       $protConds[$prot] = $conds;
                }
 
                if ( $this->hasOption( 'all' ) ) {
@@ -71,11 +71,11 @@ class CleanupSpam extends Maintenance {
                                /** @var $dbr Database */
                                $dbr = $this->getDB( DB_REPLICA, [], $wikiID );
 
-                               foreach ( $likes as $like ) {
+                               foreach ( $protConds as $conds ) {
                                        $count = $dbr->selectField(
                                                'externallinks',
                                                'COUNT(*)',
-                                               [ 'el_index' . $dbr->buildLike( $like ) ],
+                                               $conds,
                                                __METHOD__
                                        );
                                        if ( $count ) {
@@ -99,11 +99,11 @@ class CleanupSpam extends Maintenance {
                        $count = 0;
                        /** @var $dbr Database */
                        $dbr = $this->getDB( DB_REPLICA );
-                       foreach ( $likes as $prot => $like ) {
+                       foreach ( $protConds as $prot => $conds ) {
                                $res = $dbr->select(
                                        'externallinks',
                                        [ 'DISTINCT el_from' ],
-                                       [ 'el_index' . $dbr->buildLike( $like ) ],
+                                       $conds,
                                        __METHOD__
                                );
                                $count = $dbr->numRows( $res );
index 9849dc5..76a6a1f 100644 (file)
@@ -38,17 +38,44 @@ class DeleteSelfExternals extends Maintenance {
 
        public function execute() {
                global $wgServer;
+
+               // Extract the host and scheme from $wgServer
+               $bits = wfParseUrl( $wgServer );
+               if ( !$bits ) {
+                       $this->error( 'Could not parse $wgServer' );
+                       exit( 1 );
+               }
+
                $this->output( "Deleting self externals from $wgServer\n" );
                $db = $this->getDB( DB_MASTER );
-               while ( 1 ) {
-                       $this->commitTransaction( $db, __METHOD__ );
-                       $q = $db->limitResult( "DELETE /* deleteSelfExternals */ FROM externallinks WHERE el_to"
-                               . $db->buildLike( $wgServer . '/', $db->anyString() ), $this->getBatchSize() );
-                       $this->output( "Deleting a batch\n" );
-                       $db->query( $q );
-                       if ( !$db->affectedRows() ) {
-                               return;
+
+               // If it's protocol-relative, we need to do both http and https.
+               // Otherwise, just do the specified scheme.
+               $host = $bits['host'];
+               if ( isset( $bits['port'] ) ) {
+                       $host .= ':' . $bits['port'];
+               }
+               if ( $bits['scheme'] != '' ) {
+                       $conds = [ LinkFilter::getQueryConditions( $host, [ 'protocol' => $bits['scheme'] . '://' ] ) ];
+               } else {
+                       $conds = [
+                               LinkFilter::getQueryConditions( $host, [ 'protocol' => 'http://' ] ),
+                               LinkFilter::getQueryConditions( $host, [ 'protocol' => 'https://' ] ),
+                       ];
+               }
+
+               foreach ( $conds as $cond ) {
+                       if ( !$cond ) {
+                               continue;
                        }
+                       $cond = $db->makeList( $cond, LIST_AND );
+                       do {
+                               $this->commitTransaction( $db, __METHOD__ );
+                               $q = $db->limitResult( "DELETE /* deleteSelfExternals */ FROM externallinks WHERE $cond",
+                                       $this->mBatchSize );
+                               $this->output( "Deleting a batch\n" );
+                               $db->query( $q );
+                       } while ( $db->affectedRows() );
                }
        }
 }
index 2b95b43..63e0aa8 100644 (file)
@@ -545,6 +545,9 @@ CREATE TABLE /*_*/externallinks (
   -- which allows for fast searching for all pages under example.com with the
   -- clause:
   --      WHERE el_index LIKE 'http://com.example.%'
+  --
+  -- Note: if you enable or disable PHP's intl extension, you'll need to run
+  -- maintenance/refreshExternallinksIndex.php to refresh this field.
   el_index nvarchar(450) NOT NULL,
 
   -- This is el_index truncated to 60 bytes to allow for sortable queries that
diff --git a/maintenance/refreshExternallinksIndex.php b/maintenance/refreshExternallinksIndex.php
new file mode 100644 (file)
index 0000000..1551a94
--- /dev/null
@@ -0,0 +1,120 @@
+<?php
+/**
+ * Refresh the externallinks table el_index and el_index_60 from el_to
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ * @ingroup Maintenance
+ */
+
+require_once __DIR__ . '/Maintenance.php';
+
+/**
+ * Maintenance script that refreshes the externallinks table el_index and
+ * el_index_60 from el_to
+ *
+ * @ingroup Maintenance
+ * @since 1.33
+ */
+class RefreshExternallinksIndex extends LoggedUpdateMaintenance {
+       public function __construct() {
+               parent::__construct();
+               $this->addDescription(
+                       'Refresh the externallinks table el_index and el_index_60 from el_to' );
+               $this->setBatchSize( 10000 );
+       }
+
+       /**
+        * The key embeds the LinkFilter version and whether IDN support is
+        * available, so a change in either one yields a different key.
+        *
+        * @return string
+        */
+       protected function getUpdateKey() {
+               return static::class
+                       . ' v' . LinkFilter::VERSION
+                       . ( LinkFilter::supportsIDN() ? '+' : '-' ) . 'IDN';
+       }
+
+       /**
+        * @return string Message used when the update is skipped
+        */
+       protected function updateSkippedMessage() {
+               return 'externallinks table indexes up to date';
+       }
+
+       /**
+        * Recompute el_index and el_index_60 for the rows of externallinks, walking
+        * the table in el_id ranges of the configured batch size.
+        *
+        * Rows for which LinkFilter::makeIndexes() returns nothing are deleted;
+        * rows whose el_index is already one of the computed indexes are left alone.
+        *
+        * @return bool False if the externallinks table does not exist, true otherwise
+        */
+       protected function doDBUpdates() {
+               $dbw = $this->getDB( DB_MASTER );
+               if ( !$dbw->tableExists( 'externallinks' ) ) {
+                       $this->error( "externallinks table does not exist" );
+                       return false;
+               }
+               $this->output( "Updating externallinks table index fields\n" );
+
+               $minmax = $dbw->selectRow(
+                       'externallinks',
+                       [ 'min' => 'MIN(el_id)', 'max' => 'MAX(el_id)' ],
+                       '',
+                       __METHOD__
+               );
+
+               $updated = 0;
+               $deleted = 0;
+               if ( !$minmax || $minmax->min === null ) {
+                       // Empty table: MIN()/MAX() are NULL, so there is no el_id range to walk
+                       $this->output( "Done, $updated rows updated, $deleted deleted.\n" );
+                       return true;
+               }
+
+               // Cast to int: these values are interpolated into the range conditions below
+               $start = (int)$minmax->min - 1;
+               $last = (int)$minmax->max;
+               // A batch size below 1 would never advance $start
+               $batchSize = max( 1, (int)$this->getBatchSize() );
+               while ( $start < $last ) {
+                       $end = min( $start + $batchSize, $last );
+                       $this->output( "el_id $start - $end of $last\n" );
+                       $res = $dbw->select( 'externallinks', [ 'el_id', 'el_to', 'el_index' ],
+                               [
+                                       "el_id > $start",
+                                       "el_id <= $end",
+                               ],
+                               __METHOD__,
+                               [ 'ORDER BY' => 'el_id' ]
+                       );
+                       foreach ( $res as $row ) {
+                               $newIndexes = LinkFilter::makeIndexes( $row->el_to );
+                               if ( !$newIndexes ) {
+                                       // No index can be computed for this URL; drop the row
+                                       $dbw->delete( 'externallinks', [ 'el_id' => $row->el_id ], __METHOD__ );
+                                       $deleted++;
+                                       continue;
+                               }
+                               if ( in_array( $row->el_index, $newIndexes, true ) ) {
+                                       // Already up to date
+                                       continue;
+                               }
+
+                               $newIndex = self::pickIndex( $row->el_index, $newIndexes );
+                               $dbw->update( 'externallinks',
+                                       [
+                                               'el_index' => $newIndex,
+                                               'el_index_60' => substr( $newIndex, 0, 60 ),
+                                       ],
+                                       [ 'el_id' => $row->el_id ],
+                                       __METHOD__
+                               );
+                               $updated++;
+                       }
+                       wfWaitForSlaves();
+                       $start = $end;
+               }
+               $this->output( "Done, $updated rows updated, $deleted deleted.\n" );
+
+               return true;
+       }
+
+       /**
+        * Choose which freshly computed index replaces a row's current el_index.
+        *
+        * @param string $oldIndex The row's current el_index
+        * @param string[] $newIndexes Non-empty result of LinkFilter::makeIndexes()
+        * @return string
+        */
+       private static function pickIndex( $oldIndex, array $newIndexes ) {
+               if ( count( $newIndexes ) === 1 ) {
+                       return $newIndexes[0];
+               }
+
+               // Assume the scheme is the only difference between the different $newIndexes.
+               // Keep this row's scheme, assuming there's another row with the other scheme.
+               $oldColon = strpos( $oldIndex, ':' );
+               $newColon = strpos( $newIndexes[0], ':' );
+               if ( $oldColon === false || $newColon === false ) {
+                       // No scheme separator to splice on; use the first new index as-is
+                       return $newIndexes[0];
+               }
+               return substr( $oldIndex, 0, $oldColon ) . substr( $newIndexes[0], $newColon );
+       }
+}
+
+$maintClass = "RefreshExternallinksIndex";
+require_once RUN_MAINTENANCE_IF_MAIN;
index 8edc3c3..c46e4c6 100644 (file)
@@ -930,6 +930,9 @@ CREATE TABLE /*_*/externallinks (
   -- which allows for fast searching for all pages under example.com with the
   -- clause:
   --      WHERE el_index LIKE 'http://com.example.%'
+  --
+  -- Note: if you enable or disable PHP's intl extension, you'll need to run
+  -- maintenance/refreshExternallinksIndex.php to refresh this field.
   el_index blob NOT NULL,
 
   -- This is el_index truncated to 60 bytes to allow for sortable queries that
index 32c190e..22fe3ce 100644 (file)
@@ -589,63 +589,6 @@ class GlobalTest extends MediaWikiTestCase {
                ];
        }
 
-       /**
-        * @dataProvider provideMakeUrlIndexes()
-        * @covers ::wfMakeUrlIndexes
-        */
-       public function testMakeUrlIndexes( $url, $expected ) {
-               $index = wfMakeUrlIndexes( $url );
-               $this->assertEquals( $expected, $index, "wfMakeUrlIndexes(\"$url\")" );
-       }
-
-       public static function provideMakeUrlIndexes() {
-               return [
-                       // Testcase for T30627
-                       [
-                               'https://example.org/test.cgi?id=12345',
-                               [ 'https://org.example./test.cgi?id=12345' ]
-                       ],
-                       [
-                               // mailtos are handled special
-                               // is this really right though? that final . probably belongs earlier?
-                               'mailto:wiki@wikimedia.org',
-                               [ 'mailto:org.wikimedia@wiki.' ]
-                       ],
-
-                       // file URL cases per T30627...
-                       [
-                               // three slashes: local filesystem path Unix-style
-                               'file:///whatever/you/like.txt',
-                               [ 'file://./whatever/you/like.txt' ]
-                       ],
-                       [
-                               // three slashes: local filesystem path Windows-style
-                               'file:///c:/whatever/you/like.txt',
-                               [ 'file://./c:/whatever/you/like.txt' ]
-                       ],
-                       [
-                               // two slashes: UNC filesystem path Windows-style
-                               'file://intranet/whatever/you/like.txt',
-                               [ 'file://intranet./whatever/you/like.txt' ]
-                       ],
-                       // Multiple-slash cases that can sorta work on Mozilla
-                       // if you hack it just right are kinda pathological,
-                       // and unreliable cross-platform or on IE which means they're
-                       // unlikely to appear on intranets.
-                       // Those will survive the algorithm but with results that
-                       // are less consistent.
-
-                       // protocol-relative URL cases per T31854...
-                       [
-                               '//example.org/test.cgi?id=12345',
-                               [
-                                       'http://org.example./test.cgi?id=12345',
-                                       'https://org.example./test.cgi?id=12345'
-                               ]
-                       ],
-               ];
-       }
-
        /**
         * @dataProvider provideWfMatchesDomainList
         * @covers ::wfMatchesDomainList
index 51b54d2..02fbd81 100644 (file)
@@ -75,7 +75,10 @@ class LinkFilterTest extends MediaWikiLangTestCase {
                        [ 'http://', 'test.com', 'http://name:pass@test.com' ],
                        [ 'http://', '*.test.com', 'http://a.b.c.test.com/dir/dir/file?a=6' ],
                        [ null, 'http://*.test.com', 'http://www.test.com' ],
+                       [ 'http://', '.test.com', 'http://.test.com' ],
+                       [ 'http://', '*..test.com', 'http://foo..test.com' ],
                        [ 'mailto:', 'name@mail.test123.com', 'mailto:name@mail.test123.com' ],
+                       [ 'mailto:', '*@mail.test123.com', 'mailto:name@mail.test123.com' ],
                        [ '',
                                'http://name:pass@www.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg',
                                'http://name:pass@www.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg'
@@ -127,39 +130,66 @@ class LinkFilterTest extends MediaWikiLangTestCase {
                                'http://xx23124:__ffdfdef__@www.test.com:12345/dir' ,
                                'http://name:pass@www.test.com:12345/dir/dir/file.xyz.php#__se__?arg1=_&arg2[]=4rtg'
                        ],
+                       [ 'http://', '127.0.0.1', 'http://127.000.000.001' ],
+                       [ 'http://', '127.0.0.*', 'http://127.000.000.010' ],
+                       [ 'http://', '127.0.*', 'http://127.000.123.010' ],
+                       [ 'http://', '127.*', 'http://127.127.127.127' ],
+                       [ 'http://', '[0:0:0:0:0:0:0:0001]', 'http://[::1]' ],
+                       [ 'http://', '[2001:db8:0:0:*]', 'http://[2001:0DB8::]' ],
+                       [ 'http://', '[2001:db8:0:0:*]', 'http://[2001:0DB8::123]' ],
+                       [ 'http://', '[2001:db8:0:0:*]', 'http://[2001:0DB8::123:456]' ],
+                       [ 'http://', 'xn--f-vgaa.example.com', 'http://fóó.example.com', [ 'idn' => true ] ],
+                       [ 'http://', 'xn--f-vgaa.example.com', 'http://f%c3%b3%C3%B3.example.com', [ 'idn' => true ] ],
+                       [ 'http://', 'fóó.example.com', 'http://xn--f-vgaa.example.com', [ 'idn' => true ] ],
+                       [ 'http://', 'f%c3%b3%C3%B3.example.com', 'http://xn--f-vgaa.example.com', [ 'idn' => true ] ],
+                       [ 'http://', 'f%c3%b3%C3%B3.example.com', 'http://fóó.example.com' ],
+                       [ 'http://', 'fóó.example.com', 'http://f%c3%b3%C3%B3.example.com' ],
+
+                       [ 'http://', 'example.com./foo', 'http://example.com/foo' ],
+                       [ 'http://', 'example.com/foo', 'http://example.com./foo' ],
+                       [ 'http://', '127.0.0.1./foo', 'http://127.0.0.1/foo' ],
+                       [ 'http://', '127.0.0.1/foo', 'http://127.0.0.1./foo' ],
 
                        // Tests for false positives
-                       [ 'http://', 'test.com', 'http://www.test.com', false ],
-                       [ 'http://', 'www1.test.com', 'http://www.test.com', false ],
-                       [ 'http://', '*.test.com', 'http://www.test.t.com', false ],
-                       [ '', 'http://test.com:8080', 'http://www.test.com:8080', false ],
-                       [ '', 'https://test.com', 'http://test.com', false ],
-                       [ '', 'http://test.com', 'https://test.com', false ],
-                       [ 'http://', 'http://test.com', 'http://test.com', false ],
-                       [ null, 'http://www.test.com', 'http://www.test.com:80', false ],
-                       [ null, 'http://www.test.com:80', 'http://www.test.com', false ],
-                       [ null, 'http://*.test.com:80', 'http://www.test.com', false ],
+                       [ 'http://', 'test.com', 'http://www.test.com', [ 'found' => false ] ],
+                       [ 'http://', 'www1.test.com', 'http://www.test.com', [ 'found' => false ] ],
+                       [ 'http://', '*.test.com', 'http://www.test.t.com', [ 'found' => false ] ],
+                       [ 'http://', 'test.com', 'http://xtest.com', [ 'found' => false ] ],
+                       [ 'http://', '*.test.com', 'http://xtest.com', [ 'found' => false ] ],
+                       [ 'http://', '.test.com', 'http://test.com', [ 'found' => false ] ],
+                       [ 'http://', '.test.com', 'http://www.test.com', [ 'found' => false ] ],
+                       [ 'http://', '*..test.com', 'http://test.com', [ 'found' => false ] ],
+                       [ 'http://', '*..test.com', 'http://www.test.com', [ 'found' => false ] ],
+                       [ '', 'http://test.com:8080', 'http://www.test.com:8080', [ 'found' => false ] ],
+                       [ '', 'https://test.com', 'http://test.com', [ 'found' => false ] ],
+                       [ '', 'http://test.com', 'https://test.com', [ 'found' => false ] ],
+                       [ 'http://', 'http://test.com', 'http://test.com', [ 'found' => false ] ],
+                       [ null, 'http://www.test.com', 'http://www.test.com:80', [ 'found' => false ] ],
+                       [ null, 'http://www.test.com:80', 'http://www.test.com', [ 'found' => false ] ],
+                       [ null, 'http://*.test.com:80', 'http://www.test.com', [ 'found' => false ] ],
                        [ '', 'https://gerrit.wikimedia.org/r/#/XXX/status:open,n,z',
-                               'https://gerrit.wikimedia.org/r/#/q/status:open,n,z', false ],
+                               'https://gerrit.wikimedia.org/r/#/q/status:open,n,z', [ 'found' => false ] ],
                        [ '', 'https://*.wikimedia.org/r/#/q/status:open,n,z',
-                               'https://gerrit.wikimedia.org/r/#/XXX/status:open,n,z', false ],
-                       [ 'mailto:', '@test.com', '@abc.test.com', false ],
-                       [ 'mailto:', 'mail@test.com', 'mail2@test.com', false ],
-                       [ '', 'mailto:mail@test.com', 'mail2@test.com', false ],
-                       [ '', 'mailto:@test.com', '@abc.test.com', false ],
-                       [ 'ftp://', '*.co', 'ftp://www.co.uk', false ],
-                       [ 'ftp://', '*.co', 'ftp://www.co.m', false ],
-                       [ 'ftp://', '*.co/dir/', 'ftp://www.co/dir2/', false ],
-                       [ 'ftp://', 'www.co/dir/', 'ftp://www.co/dir2/', false ],
-                       [ 'ftp://', 'test.com/dir/', 'ftp://test.com/', false ],
-                       [ '', 'http://test.com:8080/dir/', 'http://test.com:808/dir/', false ],
-                       [ '', 'http://test.com/dir/index.html', 'http://test.com/dir/index.php', false ],
+                               'https://gerrit.wikimedia.org/r/#/XXX/status:open,n,z', [ 'found' => false ] ],
+                       [ 'mailto:', '@test.com', '@abc.test.com', [ 'found' => false ] ],
+                       [ 'mailto:', 'mail@test.com', 'mail2@test.com', [ 'found' => false ] ],
+                       [ '', 'mailto:mail@test.com', 'mail2@test.com', [ 'found' => false ] ],
+                       [ '', 'mailto:@test.com', '@abc.test.com', [ 'found' => false ] ],
+                       [ 'ftp://', '*.co', 'ftp://www.co.uk', [ 'found' => false ] ],
+                       [ 'ftp://', '*.co', 'ftp://www.co.m', [ 'found' => false ] ],
+                       [ 'ftp://', '*.co/dir/', 'ftp://www.co/dir2/', [ 'found' => false ] ],
+                       [ 'ftp://', 'www.co/dir/', 'ftp://www.co/dir2/', [ 'found' => false ] ],
+                       [ 'ftp://', 'test.com/dir/', 'ftp://test.com/', [ 'found' => false ] ],
+                       [ '', 'http://test.com:8080/dir/', 'http://test.com:808/dir/', [ 'found' => false ] ],
+                       [ '', 'http://test.com/dir/index.html', 'http://test.com/dir/index.php', [ 'found' => false ] ],
+                       [ 'http://', '127.0.0.*', 'http://127.0.1.0', [ 'found' => false ] ],
+                       [ 'http://', '[2001:db8::*]', 'http://[2001:0DB8::123:456]', [ 'found' => false ] ],
 
                        // These are false positives too and ideally shouldn't match, but that
                        // would require using regexes and RLIKE instead of LIKE
-                       // [ null, 'http://*.test.com', 'http://www.test.com:80', false ],
+                       // [ null, 'http://*.test.com', 'http://www.test.com:80', [ 'found' => false ] ],
                        // [ '', 'https://*.wikimedia.org/r/#/q/status:open,n,z',
-                       //      'https://gerrit.wikimedia.org/XXX/r/#/q/status:open,n,z', false ],
+                       //      'https://gerrit.wikimedia.org/XXX/r/#/q/status:open,n,z', [ 'found' => false ] ],
                ];
        }
 
@@ -167,17 +197,24 @@ class LinkFilterTest extends MediaWikiLangTestCase {
         * testMakeLikeArrayWithValidPatterns()
         *
         * Tests whether the LIKE clause produced by LinkFilter::makeLikeArray($pattern, $protocol)
-        * will find one of the URL indexes produced by wfMakeUrlIndexes($url)
+        * will find one of the URL indexes produced by LinkFilter::makeIndexes($url)
         *
         * @dataProvider provideValidPatterns
         *
         * @param string $protocol Protocol, e.g. 'http://' or 'mailto:'
         * @param string $pattern Search pattern to feed to LinkFilter::makeLikeArray
-        * @param string $url URL to feed to wfMakeUrlIndexes
-        * @param bool $shouldBeFound Should the URL be found? (defaults true)
+        * @param string $url URL to feed to LinkFilter::makeIndexes
+        * @param array $options
+        *  - found: (bool) Should the URL be found? (defaults true)
+        *  - idn: (bool) Does this test require the idn conversion (default false)
         */
-       function testMakeLikeArrayWithValidPatterns( $protocol, $pattern, $url, $shouldBeFound = true ) {
-               $indexes = wfMakeUrlIndexes( $url );
+       function testMakeLikeArrayWithValidPatterns( $protocol, $pattern, $url, $options = [] ) {
+               $options += [ 'found' => true, 'idn' => false ];
+               if ( !empty( $options['idn'] ) && !LinkFilter::supportsIDN() ) {
+                       $this->markTestSkipped( 'LinkFilter IDN support is not available' );
+               }
+
+               $indexes = LinkFilter::makeIndexes( $url );
                $likeArray = LinkFilter::makeLikeArray( $pattern, $protocol );
 
                $this->assertTrue( $likeArray !== false,
@@ -186,7 +223,7 @@ class LinkFilterTest extends MediaWikiLangTestCase {
 
                $regex = $this->createRegexFromLIKE( $likeArray );
                $debugmsg = "Regex: '" . $regex . "'\n";
-               $debugmsg .= count( $indexes ) . " index(es) created by wfMakeUrlIndexes():\n";
+               $debugmsg .= count( $indexes ) . " index(es) created by LinkFilter::makeIndexes():\n";
 
                $matches = 0;
 
@@ -195,7 +232,7 @@ class LinkFilterTest extends MediaWikiLangTestCase {
                        $debugmsg .= "\t'$index'\n";
                }
 
-               if ( $shouldBeFound ) {
+               if ( !empty( $options['found'] ) ) {
                        $this->assertTrue(
                                $matches > 0,
                                "Search pattern '$protocol$pattern' does not find url '$url' \n$debugmsg"
@@ -251,4 +288,183 @@ class LinkFilterTest extends MediaWikiLangTestCase {
                );
        }
 
+       /**
+        * Checks that LinkFilter::makeIndexes() produces the expected index strings for a URL.
+        *
+        * @dataProvider provideMakeIndexes()
+        * @covers LinkFilter::makeIndexes
+        *
+        * @param string $url URL to feed to LinkFilter::makeIndexes
+        * @param string[] $expected Index strings the URL is expected to produce
+        */
+       public function testMakeIndexes( $url, $expected ) {
+               // file:// is a non-default protocol; it has to be registered here,
+               // next to the usual ones, so that the file:// datasets can work.
+               $protocols = [ 'http://', 'https://', 'mailto:', '//', 'file://' ];
+               $this->setMwGlobals( [ 'wgUrlProtocols' => $protocols ] );
+
+               $message = "LinkFilter::makeIndexes(\"$url\")";
+               $this->assertEquals( $expected, LinkFilter::makeIndexes( $url ), $message );
+       }
+
+       /**
+        * Data provider for testMakeIndexes().
+        *
+        * Every dataset carries a descriptive name, as in provideGetQueryConditions(),
+        * so a failing case is identified by name rather than by numeric offset.
+        *
+        * @return array[] Each dataset is [ URL, list of expected index strings ]
+        */
+       public static function provideMakeIndexes() {
+               return [
+                       // Testcase for T30627
+                       'HTTPS URL with query string' => [
+                               'https://example.org/test.cgi?id=12345',
+                               [ 'https://org.example./test.cgi?id=12345' ]
+                       ],
+                       // mailtos are handled specially
+                       'mailto with domain' => [
+                               'mailto:wiki@wikimedia.org',
+                               [ 'mailto:org.wikimedia.@wiki' ]
+                       ],
+                       'mailto without domain' => [
+                               'mailto:wiki',
+                               [ 'mailto:@wiki' ]
+                       ],
+
+                       // file URL cases per T30627...
+                       'file URL, three slashes, Unix-style local path' => [
+                               'file:///whatever/you/like.txt',
+                               [ 'file://./whatever/you/like.txt' ]
+                       ],
+                       'file URL, three slashes, Windows-style local path' => [
+                               'file:///c:/whatever/you/like.txt',
+                               [ 'file://./c:/whatever/you/like.txt' ]
+                       ],
+                       'file URL, two slashes, Windows-style UNC path' => [
+                               'file://intranet/whatever/you/like.txt',
+                               [ 'file://intranet./whatever/you/like.txt' ]
+                       ],
+                       // Multiple-slash cases that can sorta work on Mozilla
+                       // if you hack it just right are kinda pathological,
+                       // and unreliable cross-platform or on IE which means they're
+                       // unlikely to appear on intranets.
+                       // Those will survive the algorithm but with results that
+                       // are less consistent.
+
+                       // protocol-relative URL cases per T31854...
+                       'Protocol-relative URL' => [
+                               '//example.org/test.cgi?id=12345',
+                               [
+                                       'http://org.example./test.cgi?id=12345',
+                                       'https://org.example./test.cgi?id=12345'
+                               ]
+                       ],
+
+                       // IP addresses
+                       'IPv4 address' => [
+                               'http://192.0.2.0/foo',
+                               [ 'http://V4.192.0.2.0./foo' ]
+                       ],
+                       'IPv4 address with leading zeros in an octet' => [
+                               'http://192.0.0002.0/foo',
+                               [ 'http://V4.192.0.2.0./foo' ]
+                       ],
+                       'IPv6 address' => [
+                               'http://[2001:db8::1]/foo',
+                               [ 'http://V6.2001.DB8.0.0.0.0.0.1./foo' ]
+                       ],
+
+                       // Explicit specification of the DNS root
+                       'Explicit DNS root on a hostname' => [
+                               'http://example.com./foo',
+                               [ 'http://com.example./foo' ]
+                       ],
+                       'Explicit DNS root on an IPv4 address' => [
+                               'http://192.0.2.0./foo',
+                               [ 'http://V4.192.0.2.0./foo' ]
+                       ],
+
+                       // Weird edge case
+                       'Host with a leading dot' => [
+                               'http://.example.com/foo',
+                               [ 'http://com.example../foo' ]
+                       ],
+               ];
+       }
+
+       /**
+        * Checks the value LinkFilter::getQueryConditions() returns for a search.
+        *
+        * @dataProvider provideGetQueryConditions
+        * @covers LinkFilter::getQueryConditions
+        *
+        * @param string $query Search pattern passed to LinkFilter::getQueryConditions
+        * @param array $options Options passed to LinkFilter::getQueryConditions
+        * @param array|bool $expected Expected return value: an array of conditions, or false
+        */
+       public function testGetQueryConditions( $query, $options, $expected ) {
+               $this->assertEquals(
+                       $expected,
+                       LinkFilter::getQueryConditions( $query, $options )
+               );
+       }
+
+       /**
+        * Data provider for testGetQueryConditions().
+        *
+        * SQL fragments are written as double-quoted strings so the embedded
+        * single quotes need no backslash escaping.
+        *
+        * @return array[] Each named dataset is [ query, options, expected return value ]
+        */
+       public static function provideGetQueryConditions() {
+               return [
+                       'Basic example' => [
+                               'example.com',
+                               [],
+                               [
+                                       "el_index_60 LIKE 'http://com.example./%' ESCAPE '`' ",
+                                       "el_index LIKE 'http://com.example./%' ESCAPE '`' ",
+                               ],
+                       ],
+                       'Basic example with path' => [
+                               'example.com/foobar',
+                               [],
+                               [
+                                       "el_index_60 LIKE 'http://com.example./foobar%' ESCAPE '`' ",
+                                       "el_index LIKE 'http://com.example./foobar%' ESCAPE '`' ",
+                               ],
+                       ],
+                       'Wildcard domain' => [
+                               '*.example.com',
+                               [],
+                               [
+                                       "el_index_60 LIKE 'http://com.example.%' ESCAPE '`' ",
+                                       "el_index LIKE 'http://com.example.%' ESCAPE '`' ",
+                               ],
+                       ],
+                       'Wildcard domain with path' => [
+                               '*.example.com/foobar',
+                               [],
+                               [
+                                       "el_index_60 LIKE 'http://com.example.%' ESCAPE '`' ",
+                                       "el_index LIKE 'http://com.example.%/foobar%' ESCAPE '`' ",
+                               ],
+                       ],
+                       'Wildcard domain with path, oneWildcard=true' => [
+                               '*.example.com/foobar',
+                               [ 'oneWildcard' => true ],
+                               [
+                                       "el_index_60 LIKE 'http://com.example.%' ESCAPE '`' ",
+                                       "el_index LIKE 'http://com.example.%' ESCAPE '`' ",
+                               ],
+                       ],
+                       'Constant prefix' => [
+                               'example.com/blah/blah/blah/blah/blah/blah/blah/blah/blah/blah?foo=',
+                               [],
+                               [
+                                       'el_index_60' => 'http://com.example./blah/blah/blah/blah/blah/blah/blah/blah/',
+                                       "el_index LIKE " .
+                                               "'http://com.example./blah/blah/blah/blah/blah/blah/blah/blah/blah/blah?foo=%' " .
+                                               "ESCAPE '`' ",
+                               ],
+                       ],
+                       'Bad protocol' => [
+                               'test/',
+                               [ 'protocol' => 'invalid://' ],
+                               false
+                       ],
+                       'Various options' => [
+                               'example.com',
+                               [ 'protocol' => 'https://', 'prefix' => 'xx' ],
+                               [
+                                       "xx_index_60 LIKE 'https://com.example./%' ESCAPE '`' ",
+                                       "xx_index LIKE 'https://com.example./%' ESCAPE '`' ",
+                               ],
+                       ],
+               ];
+       }
+
 }
index d702084..e102b9b 100644 (file)
@@ -181,6 +181,16 @@ class ParserMethodsTest extends MediaWikiLangTestCase {
                                'http://example.org/%23%2F%3F%26%3D%2B%3B?%23%2F%3F%26%3D%2B%3B#%23%2F%3F%26%3D%2B%3B',
                                'http://example.org/%23%2F%3F&=+;?%23/?%26%3D%2B%3B#%23/?&=+;',
                        ],
+                       [
+                               'IPv6 links aren\'t escaped',
+                               'http://[::1]/foobar',
+                               'http://[::1]/foobar',
+                       ],
+                       [
+                               'non-IPv6 links aren\'t unescaped',
+                               'http://%5B::1%5D/foobar',
+                               'http://%5B::1%5D/foobar',
+                       ],
                ];
        }