Split parser related files to have one class in one file
[lhc/web/wiklou.git] / includes / LinkFilter.php
index 3b03f87..a4e94da 100644 (file)
@@ -32,6 +32,11 @@ use Wikimedia\Rdbms\LikeMatch;
  * Another cool thing to do would be a web interface for fast spam removal.
  */
 class LinkFilter {
+       /**
+        * Increment this when makeIndexes output changes. It'll cause
+        * maintenance/refreshExternallinksIndex.php to run from update.php.
+        */
+       const VERSION = 1;
 
        /**
         * Check whether $content contains a link to $filterEntry
@@ -49,7 +54,7 @@ class LinkFilter {
                        return 0;
                }
 
-               $text = $content->getNativeData();
+               $text = $content->getText();
 
                $regex = self::makeRegex( $filterEntry, $protocol );
                return preg_match( $regex, $text );
@@ -58,6 +63,7 @@ class LinkFilter {
        /**
         * Builds a regex pattern for $filterEntry.
         *
+        * @todo This doesn't match the rest of the functionality here.
         * @param string $filterEntry URL, if it begins with "*.", it'll be
         *        replaced to match any subdomain
         * @param string $protocol 'http://' or 'https://'
@@ -75,23 +81,231 @@ class LinkFilter {
        }
 
        /**
-        * Make an array to be used for calls to Database::buildLike(), which
-        * will match the specified string. There are several kinds of filter entry:
-        *     *.domain.com    -  Produces http://com.domain.%, matches domain.com
-        *                        and www.domain.com
-        *     domain.com      -  Produces http://com.domain./%, matches domain.com
-        *                        or domain.com/ but not www.domain.com
-        *     *.domain.com/x  -  Produces http://com.domain.%/x%, matches
-        *                        www.domain.com/xy
-        *     domain.com/x    -  Produces http://com.domain./x%, matches
-        *                        domain.com/xy but not www.domain.com/xy
+        * Indicate whether LinkFilter IDN support is available
+        * @since 1.33
+        * @return bool
+        */
+       public static function supportsIDN() {
+               return is_callable( 'idn_to_utf8' ) && defined( 'INTL_IDNA_VARIANT_UTS46' );
+       }
+
+       /**
+        * Canonicalize a hostname for el_index
+        * @param string $hose
+        * @return string
+        */
+       private static function indexifyHost( $host ) {
+               // NOTE: If you change the output of this method, you'll probably have to increment self::VERSION!
+
+               // Canonicalize.
+               $host = rawurldecode( $host );
+               if ( $host !== '' && self::supportsIDN() ) {
+                       // @todo Add a PHP fallback
+                       $tmp = idn_to_utf8( $host, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46 );
+                       if ( $tmp !== false ) {
+                               $host = $tmp;
+                       }
+               }
+               $okChars = 'a-zA-Z0-9\\-._~!$&\'()*+,;=';
+               if ( StringUtils::isUtf8( $host ) ) {
+                       // Save a little space by not percent-encoding valid UTF-8 bytes
+                       $okChars .= '\x80-\xf4';
+               }
+               $host = preg_replace_callback(
+                       '<[^' . $okChars . ']>',
+                       function ( $m ) {
+                               return rawurlencode( $m[0] );
+                       },
+                       strtolower( $host )
+               );
+
+               // IPv6? RFC 3986 syntax.
+               if ( preg_match( '/^\[([0-9a-f:*]+)\]$/', rawurldecode( $host ), $m ) ) {
+                       $ip = $m[1];
+                       if ( IP::isValid( $ip ) ) {
+                               return 'V6.' . implode( '.', explode( ':', IP::sanitizeIP( $ip ) ) ) . '.';
+                       }
+                       if ( substr( $ip, -2 ) === ':*' ) {
+                               $cutIp = substr( $ip, 0, -2 );
+                               if ( IP::isValid( "{$cutIp}::" ) ) {
+                                       // Wildcard IP doesn't contain "::", so multiple parts can be wild
+                                       $ct = count( explode( ':', $ip ) ) - 1;
+                                       return 'V6.' .
+                                               implode( '.', array_slice( explode( ':', IP::sanitizeIP( "{$cutIp}::" ) ), 0, $ct ) ) .
+                                               '.*.';
+                               }
+                               if ( IP::isValid( "{$cutIp}:1" ) ) {
+                                       // Wildcard IP does contain "::", so only the last part is wild
+                                       return 'V6.' .
+                                               substr( implode( '.', explode( ':', IP::sanitizeIP( "{$cutIp}:1" ) ) ), 0, -1 ) .
+                                               '*.';
+                               }
+                       }
+               }
+
+               // Regularlize explicit specification of the DNS root.
+               // Browsers seem to do this for IPv4 literals too.
+               if ( substr( $host, -1 ) === '.' ) {
+                       $host = substr( $host, 0, -1 );
+               }
+
+               // IPv4?
+               $b = '(?:0*25[0-5]|0*2[0-4][0-9]|0*1[0-9][0-9]|0*[0-9]?[0-9])';
+               if ( preg_match( "/^(?:{$b}\.){3}{$b}$|^(?:{$b}\.){1,3}\*$/", $host ) ) {
+                       return 'V4.' . implode( '.', array_map( function ( $v ) {
+                               return $v === '*' ? $v : (int)$v;
+                       }, explode( '.', $host ) ) ) . '.';
+               }
+
+               // Must be a host name.
+               return implode( '.', array_reverse( explode( '.', $host ) ) ) . '.';
+       }
+
+       /**
+        * Converts a URL into a format for el_index
+        * @since 1.33
+        * @param string $url
+        * @return string[] Usually one entry, but might be two in case of
+        *  protocol-relative URLs. Empty array on error.
+        */
+       public static function makeIndexes( $url ) {
+               // NOTE: If you change the output of this method, you'll probably have to increment self::VERSION!
+
+               // NOTE: refreshExternallinksIndex.php assumes that only protocol-relative URLs return more
+               // than one index, and that the indexes for protocol-relative URLs only vary in the "http://"
+               // versus "https://" prefix. If you change that, you'll likely need to update
+               // refreshExternallinksIndex.php accordingly.
+
+               $bits = wfParseUrl( $url );
+               if ( !$bits ) {
+                       return [];
+               }
+
+               // Reverse the labels in the hostname, convert to lower case, unless it's an IP.
+               // For emails turn it into "domain.reversed@localpart"
+               if ( $bits['scheme'] == 'mailto' ) {
+                       $mailparts = explode( '@', $bits['host'], 2 );
+                       if ( count( $mailparts ) === 2 ) {
+                               $domainpart = self::indexifyHost( $mailparts[1] );
+                       } else {
+                               // No @, assume it's a local part with no domain
+                               $domainpart = '';
+                       }
+                       $bits['host'] = $domainpart . '@' . $mailparts[0];
+               } else {
+                       $bits['host'] = self::indexifyHost( $bits['host'] );
+               }
+
+               // Reconstruct the pseudo-URL
+               $index = $bits['scheme'] . $bits['delimiter'] . $bits['host'];
+               // Leave out user and password. Add the port, path, query and fragment
+               if ( isset( $bits['port'] ) ) {
+                       $index .= ':' . $bits['port'];
+               }
+               if ( isset( $bits['path'] ) ) {
+                       $index .= $bits['path'];
+               } else {
+                       $index .= '/';
+               }
+               if ( isset( $bits['query'] ) ) {
+                       $index .= '?' . $bits['query'];
+               }
+               if ( isset( $bits['fragment'] ) ) {
+                       $index .= '#' . $bits['fragment'];
+               }
+
+               if ( $bits['scheme'] == '' ) {
+                       return [ "http:$index", "https:$index" ];
+               } else {
+                       return [ $index ];
+               }
+       }
+
+       /**
+        * Return query conditions which will match the specified string. There are
+        * several kinds of filter entry:
+        *
+        *     *.domain.com    -  Matches domain.com and www.domain.com
+        *     domain.com      -  Matches domain.com or domain.com/ but not www.domain.com
+        *     *.domain.com/x  -  Matches domain.com/xy or www.domain.com/xy. Also probably matches
+        *                        domain.com/foobar/xy due to limitations of LIKE syntax.
+        *     domain.com/x    -  Matches domain.com/xy but not www.domain.com/xy
+        *     192.0.2.*       -  Matches any IP in 192.0.2.0/24. Can also have a path appended.
+        *     [2001:db8::*]   -  Matches any IP in 2001:db8::/112. Can also have a path appended.
+        *     [2001:db8:*]    -  Matches any IP in 2001:db8::/32. Can also have a path appended.
+        *     foo@domain.com  -  With protocol 'mailto:', matches the email address foo@domain.com.
+        *     *@domain.com    -  With protocol 'mailto:', matches any email address at domain.com, but
+        *                        not subdomains like foo@mail.domain.com
         *
         * Asterisks in any other location are considered invalid.
         *
-        * This function does the same as wfMakeUrlIndexes(), except it also takes care
+        * @since 1.33
+        * @param string $filterEntry Filter entry, as described above
+        * @param array $options Options are:
+        *   - protocol: (string) Protocol to query (default http://)
+        *   - oneWildcard: (bool) Stop at the first wildcard (default false)
+        *   - prefix: (string) Field prefix (default 'el'). The query will test
+        *     fields '{$prefix}_index' and '{$prefix}_index_60'
+        *   - db: (IDatabase|null) Database to use.
+        * @return array|bool Conditions to be used for the query (to be ANDed) or
+        *  false on error. To determine if the query is constant on the
+        *  el_index_60 field, check whether key 'el_index_60' is set.
+        */
+       public static function getQueryConditions( $filterEntry, array $options = [] ) {
+               $options += [
+                       'protocol' => 'http://',
+                       'oneWildcard' => false,
+                       'prefix' => 'el',
+                       'db' => null,
+               ];
+
+               // First, get the like array
+               $like = self::makeLikeArray( $filterEntry, $options['protocol'] );
+               if ( $like === false ) {
+                       return $like;
+               }
+
+               // Get the constant prefix (i.e. everything up to the first wildcard)
+               $trimmedLike = self::keepOneWildcard( $like );
+               if ( $options['oneWildcard'] ) {
+                       $like = $trimmedLike;
+               }
+               if ( $trimmedLike[count( $trimmedLike ) - 1] instanceof LikeMatch ) {
+                       array_pop( $trimmedLike );
+               }
+               $index = implode( '', $trimmedLike );
+
+               $p = $options['prefix'];
+               $db = $options['db'] ?: wfGetDB( DB_REPLICA );
+
+               // Build the query
+               $l = strlen( $index );
+               if ( $l >= 60 ) {
+                       // The constant prefix is larger than el_index_60, so we can use a
+                       // constant comparison.
+                       return [
+                               "{$p}_index_60" => substr( $index, 0, 60 ),
+                               "{$p}_index" . $db->buildLike( $like ),
+                       ];
+               }
+
+               // The constant prefix is smaller than el_index_60, so we use a LIKE
+               // for a prefix search.
+               return [
+                       "{$p}_index_60" . $db->buildLike( [ $index, $db->anyString() ] ),
+                       "{$p}_index" . $db->buildLike( $like ),
+               ];
+       }
+
+       /**
+        * Make an array to be used for calls to Database::buildLike(), which
+        * will match the specified string.
+        *
+        * This function does the same as LinkFilter::makeIndexes(), except it also takes care
         * of adding wildcards
         *
-        * @param string $filterEntry Domainparts
+        * @note You probably want self::getQueryConditions() instead
+        * @param string $filterEntry Filter entry, @see self::getQueryConditions()
         * @param string $protocol Protocol (default http://)
         * @return array|bool Array to be passed to Database::buildLike() or false on error
         */
@@ -100,38 +314,27 @@ class LinkFilter {
 
                $target = $protocol . $filterEntry;
                $bits = wfParseUrl( $target );
-
-               if ( $bits == false ) {
-                       // Unknown protocol?
+               if ( !$bits ) {
                        return false;
                }
 
-               if ( substr( $bits['host'], 0, 2 ) == '*.' ) {
-                       $subdomains = true;
-                       $bits['host'] = substr( $bits['host'], 2 );
-                       if ( $bits['host'] == '' ) {
-                               // We don't want to make a clause that will match everything,
-                               // that could be dangerous
-                               return false;
-                       }
-               } else {
-                       $subdomains = false;
-               }
-
-               // Reverse the labels in the hostname, convert to lower case
-               // For emails reverse domainpart only
+               $subdomains = false;
                if ( $bits['scheme'] === 'mailto' && strpos( $bits['host'], '@' ) ) {
-                       // complete email address
-                       $mailparts = explode( '@', $bits['host'] );
-                       $domainpart = strtolower( implode( '.', array_reverse( explode( '.', $mailparts[1] ) ) ) );
-                       $bits['host'] = $domainpart . '@' . $mailparts[0];
-               } elseif ( $bits['scheme'] === 'mailto' ) {
-                       // domainpart of email address only, do not add '.'
-                       $bits['host'] = strtolower( implode( '.', array_reverse( explode( '.', $bits['host'] ) ) ) );
+                       // Email address with domain and non-empty local part
+                       $mailparts = explode( '@', $bits['host'], 2 );
+                       $domainpart = self::indexifyHost( $mailparts[1] );
+                       if ( $mailparts[0] === '*' ) {
+                               $subdomains = true;
+                               $bits['host'] = $domainpart . '@';
+                       } else {
+                               $bits['host'] = $domainpart . '@' . $mailparts[0];
+                       }
                } else {
-                       $bits['host'] = strtolower( implode( '.', array_reverse( explode( '.', $bits['host'] ) ) ) );
-                       if ( substr( $bits['host'], -1, 1 ) !== '.' ) {
-                               $bits['host'] .= '.';
+                       // Non-email, or email with only a domain part.
+                       $bits['host'] = self::indexifyHost( $bits['host'] );
+                       if ( substr( $bits['host'], -3 ) === '.*.' ) {
+                               $subdomains = true;
+                               $bits['host'] = substr( $bits['host'], 0, -2 );
                        }
                }
 
@@ -175,6 +378,7 @@ class LinkFilter {
         * Filters an array returned by makeLikeArray(), removing everything past first
         * pattern placeholder.
         *
+        * @note You probably want self::getQueryConditions() instead
         * @param array $arr Array to filter
         * @return array Filtered array
         */