* Another cool thing to do would be a web interface for fast spam removal.
*/
class LinkFilter {
+ /**
+ * Increment this when makeIndexes output changes. It'll cause
+ * maintenance/refreshExternallinksIndex.php to run from update.php.
+ */
+ const VERSION = 1;
/**
* Check whether $content contains a link to $filterEntry
/**
* Builds a regex pattern for $filterEntry.
*
+ * @todo This doesn't match the rest of the functionality here.
* @param string $filterEntry URL, if it begins with "*.", it'll be
* replaced to match any subdomain
* @param string $protocol 'http://' or 'https://'
* @return string Regex pattern, for preg_match()
*/
private static function makeRegex( $filterEntry, $protocol ) {
- $regex = '!' . preg_quote( $protocol );
+ $regex = '!' . preg_quote( $protocol, '!' );
if ( substr( $filterEntry, 0, 2 ) == '*.' ) {
$regex .= '(?:[A-Za-z0-9.-]+\.|)';
$filterEntry = substr( $filterEntry, 2 );
}
/**
- * Make an array to be used for calls to Database::buildLike(), which
- * will match the specified string. There are several kinds of filter entry:
- * *.domain.com - Produces http://com.domain.%, matches domain.com
- * and www.domain.com
- * domain.com - Produces http://com.domain./%, matches domain.com
- * or domain.com/ but not www.domain.com
- * *.domain.com/x - Produces http://com.domain.%/x%, matches
- * www.domain.com/xy
- * domain.com/x - Produces http://com.domain./x%, matches
- * domain.com/xy but not www.domain.com/xy
+ * Indicate whether LinkFilter IDN support is available
+ * @since 1.33
+ * @return bool
+ */
+ public static function supportsIDN() {
+ return is_callable( 'idn_to_utf8' ) && defined( 'INTL_IDNA_VARIANT_UTS46' );
+ }
+
+ /**
+ * Canonicalize a hostname for el_index
+ * @param string $hose
+ * @return string
+ */
+ private static function indexifyHost( $host ) {
+ // NOTE: If you change the output of this method, you'll probably have to increment self::VERSION!
+
+ // Canonicalize.
+ $host = rawurldecode( $host );
+ if ( $host !== '' && self::supportsIDN() ) {
+ // @todo Add a PHP fallback
+ $tmp = idn_to_utf8( $host, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46 );
+ if ( $tmp !== false ) {
+ $host = $tmp;
+ }
+ }
+ $okChars = 'a-zA-Z0-9\\-._~!$&\'()*+,;=';
+ if ( StringUtils::isUtf8( $host ) ) {
+ // Save a little space by not percent-encoding valid UTF-8 bytes
+ $okChars .= '\x80-\xf4';
+ }
+ $host = preg_replace_callback(
+ '<[^' . $okChars . ']>',
+ function ( $m ) {
+ return rawurlencode( $m[0] );
+ },
+ strtolower( $host )
+ );
+
+ // IPv6? RFC 3986 syntax.
+ if ( preg_match( '/^\[([0-9a-f:*]+)\]$/', rawurldecode( $host ), $m ) ) {
+ $ip = $m[1];
+ if ( IP::isValid( $ip ) ) {
+ return 'V6.' . implode( '.', explode( ':', IP::sanitizeIP( $ip ) ) ) . '.';
+ }
+ if ( substr( $ip, -2 ) === ':*' ) {
+ $cutIp = substr( $ip, 0, -2 );
+ if ( IP::isValid( "{$cutIp}::" ) ) {
+ // Wildcard IP doesn't contain "::", so multiple parts can be wild
+ $ct = count( explode( ':', $ip ) ) - 1;
+ return 'V6.' .
+ implode( '.', array_slice( explode( ':', IP::sanitizeIP( "{$cutIp}::" ) ), 0, $ct ) ) .
+ '.*.';
+ }
+ if ( IP::isValid( "{$cutIp}:1" ) ) {
+ // Wildcard IP does contain "::", so only the last part is wild
+ return 'V6.' .
+ substr( implode( '.', explode( ':', IP::sanitizeIP( "{$cutIp}:1" ) ) ), 0, -1 ) .
+ '*.';
+ }
+ }
+ }
+
+ // Regularlize explicit specification of the DNS root.
+ // Browsers seem to do this for IPv4 literals too.
+ if ( substr( $host, -1 ) === '.' ) {
+ $host = substr( $host, 0, -1 );
+ }
+
+ // IPv4?
+ $b = '(?:0*25[0-5]|0*2[0-4][0-9]|0*1[0-9][0-9]|0*[0-9]?[0-9])';
+ if ( preg_match( "/^(?:{$b}\.){3}{$b}$|^(?:{$b}\.){1,3}\*$/", $host ) ) {
+ return 'V4.' . implode( '.', array_map( function ( $v ) {
+ return $v === '*' ? $v : (int)$v;
+ }, explode( '.', $host ) ) ) . '.';
+ }
+
+ // Must be a host name.
+ return implode( '.', array_reverse( explode( '.', $host ) ) ) . '.';
+ }
+
+ /**
+ * Converts a URL into a format for el_index
+ * @since 1.33
+ * @param string $url
+ * @return string[] Usually one entry, but might be two in case of
+ * protocol-relative URLs. Empty array on error.
+ */
+ public static function makeIndexes( $url ) {
+ // NOTE: If you change the output of this method, you'll probably have to increment self::VERSION!
+
+ // NOTE: refreshExternallinksIndex.php assumes that only protocol-relative URLs return more
+ // than one index, and that the indexes for protocol-relative URLs only vary in the "http://"
+ // versus "https://" prefix. If you change that, you'll likely need to update
+ // refreshExternallinksIndex.php accordingly.
+
+ $bits = wfParseUrl( $url );
+ if ( !$bits ) {
+ return [];
+ }
+
+ // Reverse the labels in the hostname, convert to lower case, unless it's an IP.
+ // For emails turn it into "domain.reversed@localpart"
+ if ( $bits['scheme'] == 'mailto' ) {
+ $mailparts = explode( '@', $bits['host'], 2 );
+ if ( count( $mailparts ) === 2 ) {
+ $domainpart = self::indexifyHost( $mailparts[1] );
+ } else {
+ // No @, assume it's a local part with no domain
+ $domainpart = '';
+ }
+ $bits['host'] = $domainpart . '@' . $mailparts[0];
+ } else {
+ $bits['host'] = self::indexifyHost( $bits['host'] );
+ }
+
+ // Reconstruct the pseudo-URL
+ $index = $bits['scheme'] . $bits['delimiter'] . $bits['host'];
+ // Leave out user and password. Add the port, path, query and fragment
+ if ( isset( $bits['port'] ) ) {
+ $index .= ':' . $bits['port'];
+ }
+ if ( isset( $bits['path'] ) ) {
+ $index .= $bits['path'];
+ } else {
+ $index .= '/';
+ }
+ if ( isset( $bits['query'] ) ) {
+ $index .= '?' . $bits['query'];
+ }
+ if ( isset( $bits['fragment'] ) ) {
+ $index .= '#' . $bits['fragment'];
+ }
+
+ if ( $bits['scheme'] == '' ) {
+ return [ "http:$index", "https:$index" ];
+ } else {
+ return [ $index ];
+ }
+ }
+
+ /**
+ * Return query conditions which will match the specified string. There are
+ * several kinds of filter entry:
+ *
+ * *.domain.com - Matches domain.com and www.domain.com
+ * domain.com - Matches domain.com or domain.com/ but not www.domain.com
+ * *.domain.com/x - Matches domain.com/xy or www.domain.com/xy. Also probably matches
+ * domain.com/foobar/xy due to limitations of LIKE syntax.
+ * domain.com/x - Matches domain.com/xy but not www.domain.com/xy
+ * 192.0.2.* - Matches any IP in 192.0.2.0/24. Can also have a path appended.
+ * [2001:db8::*] - Matches any IP in 2001:db8::/112. Can also have a path appended.
+ * [2001:db8:*] - Matches any IP in 2001:db8::/32. Can also have a path appended.
+ * foo@domain.com - With protocol 'mailto:', matches the email address foo@domain.com.
+ * *@domain.com - With protocol 'mailto:', matches any email address at domain.com, but
+ * not subdomains like foo@mail.domain.com
*
* Asterisks in any other location are considered invalid.
*
- * This function does the same as wfMakeUrlIndexes(), except it also takes care
+ * @since 1.33
+ * @param string $filterEntry Filter entry, as described above
+ * @param array $options Options are:
+ * - protocol: (string) Protocol to query (default http://)
+ * - oneWildcard: (bool) Stop at the first wildcard (default false)
+ * - prefix: (string) Field prefix (default 'el'). The query will test
+ * fields '{$prefix}_index' and '{$prefix}_index_60'
+ * - db: (IDatabase|null) Database to use.
+ * @return array|bool Conditions to be used for the query (to be ANDed) or
+ * false on error. To determine if the query is constant on the
+ * el_index_60 field, check whether key 'el_index_60' is set.
+ */
+ public static function getQueryConditions( $filterEntry, array $options = [] ) {
+ $options += [
+ 'protocol' => 'http://',
+ 'oneWildcard' => false,
+ 'prefix' => 'el',
+ 'db' => null,
+ ];
+
+ // First, get the like array
+ $like = self::makeLikeArray( $filterEntry, $options['protocol'] );
+ if ( $like === false ) {
+ return $like;
+ }
+
+ // Get the constant prefix (i.e. everything up to the first wildcard)
+ $trimmedLike = self::keepOneWildcard( $like );
+ if ( $options['oneWildcard'] ) {
+ $like = $trimmedLike;
+ }
+ if ( $trimmedLike[count( $trimmedLike ) - 1] instanceof LikeMatch ) {
+ array_pop( $trimmedLike );
+ }
+ $index = implode( '', $trimmedLike );
+
+ $p = $options['prefix'];
+ $db = $options['db'] ?: wfGetDB( DB_REPLICA );
+
+ // Build the query
+ $l = strlen( $index );
+ if ( $l >= 60 ) {
+ // The constant prefix is larger than el_index_60, so we can use a
+ // constant comparison.
+ return [
+ "{$p}_index_60" => substr( $index, 0, 60 ),
+ "{$p}_index" . $db->buildLike( $like ),
+ ];
+ }
+
+ // The constant prefix is smaller than el_index_60, so we use a LIKE
+ // for a prefix search.
+ return [
+ "{$p}_index_60" . $db->buildLike( [ $index, $db->anyString() ] ),
+ "{$p}_index" . $db->buildLike( $like ),
+ ];
+ }
+
+ /**
+ * Make an array to be used for calls to Database::buildLike(), which
+ * will match the specified string.
+ *
+ * This function does the same as LinkFilter::makeIndexes(), except it also takes care
* of adding wildcards
*
- * @param string $filterEntry Domainparts
+ * @note You probably want self::getQueryConditions() instead
+ * @param string $filterEntry Filter entry, @see self::getQueryConditions()
* @param string $protocol Protocol (default http://)
* @return array|bool Array to be passed to Database::buildLike() or false on error
*/
$target = $protocol . $filterEntry;
$bits = wfParseUrl( $target );
-
- if ( $bits == false ) {
- // Unknown protocol?
+ if ( !$bits ) {
return false;
}
- if ( substr( $bits['host'], 0, 2 ) == '*.' ) {
- $subdomains = true;
- $bits['host'] = substr( $bits['host'], 2 );
- if ( $bits['host'] == '' ) {
- // We don't want to make a clause that will match everything,
- // that could be dangerous
- return false;
- }
- } else {
- $subdomains = false;
- }
-
- // Reverse the labels in the hostname, convert to lower case
- // For emails reverse domainpart only
+ $subdomains = false;
if ( $bits['scheme'] === 'mailto' && strpos( $bits['host'], '@' ) ) {
- // complete email address
- $mailparts = explode( '@', $bits['host'] );
- $domainpart = strtolower( implode( '.', array_reverse( explode( '.', $mailparts[1] ) ) ) );
- $bits['host'] = $domainpart . '@' . $mailparts[0];
- } elseif ( $bits['scheme'] === 'mailto' ) {
- // domainpart of email address only, do not add '.'
- $bits['host'] = strtolower( implode( '.', array_reverse( explode( '.', $bits['host'] ) ) ) );
+ // Email address with domain and non-empty local part
+ $mailparts = explode( '@', $bits['host'], 2 );
+ $domainpart = self::indexifyHost( $mailparts[1] );
+ if ( $mailparts[0] === '*' ) {
+ $subdomains = true;
+ $bits['host'] = $domainpart . '@';
+ } else {
+ $bits['host'] = $domainpart . '@' . $mailparts[0];
+ }
} else {
- $bits['host'] = strtolower( implode( '.', array_reverse( explode( '.', $bits['host'] ) ) ) );
- if ( substr( $bits['host'], -1, 1 ) !== '.' ) {
- $bits['host'] .= '.';
+ // Non-email, or email with only a domain part.
+ $bits['host'] = self::indexifyHost( $bits['host'] );
+ if ( substr( $bits['host'], -3 ) === '.*.' ) {
+ $subdomains = true;
+ $bits['host'] = substr( $bits['host'], 0, -2 );
}
}
* Filters an array returned by makeLikeArray(), removing everything past first
* pattern placeholder.
*
+ * @note You probably want self::getQueryConditions() instead
* @param array $arr Array to filter
* @return array Filtered array
*/