X-Git-Url: https://git.heureux-cyclage.org/?a=blobdiff_plain;f=includes%2FLinkFilter.php;h=a4e94da7cf12daf3b282e169cbbc661c31d9f061;hb=3f16fda078025ccf2716f0a6bf10a650244c9d89;hp=3b03f879764d83b60df40e7d31c0df221cb06496;hpb=0c045de899977975884aa93f3f4e2642dd175f83;p=lhc%2Fweb%2Fwiklou.git diff --git a/includes/LinkFilter.php b/includes/LinkFilter.php index 3b03f87976..a4e94da7cf 100644 --- a/includes/LinkFilter.php +++ b/includes/LinkFilter.php @@ -32,6 +32,11 @@ use Wikimedia\Rdbms\LikeMatch; * Another cool thing to do would be a web interface for fast spam removal. */ class LinkFilter { + /** + * Increment this when makeIndexes output changes. It'll cause + * maintenance/refreshExternallinksIndex.php to run from update.php. + */ + const VERSION = 1; /** * Check whether $content contains a link to $filterEntry @@ -49,7 +54,7 @@ class LinkFilter { return 0; } - $text = $content->getNativeData(); + $text = $content->getText(); $regex = self::makeRegex( $filterEntry, $protocol ); return preg_match( $regex, $text ); @@ -58,6 +63,7 @@ class LinkFilter { /** * Builds a regex pattern for $filterEntry. * + * @todo This doesn't match the rest of the functionality here. * @param string $filterEntry URL, if it begins with "*.", it'll be * replaced to match any subdomain * @param string $protocol 'http://' or 'https://' @@ -75,23 +81,231 @@ class LinkFilter { } /** - * Make an array to be used for calls to Database::buildLike(), which - * will match the specified string. There are several kinds of filter entry: - * *.domain.com - Produces http://com.domain.%, matches domain.com - * and www.domain.com - * domain.com - Produces http://com.domain./%, matches domain.com - * or domain.com/ but not www.domain.com - * *.domain.com/x - Produces http://com.domain.%/x%, matches - * www.domain.com/xy - * domain.com/x - Produces http://com.domain./x%, matches - * domain.com/xy but not www.domain.com/xy + * Indicate whether LinkFilter IDN support is available + * @since 1.33 + * @return bool + */ + public static function supportsIDN() { + return is_callable( 'idn_to_utf8' ) && defined( 'INTL_IDNA_VARIANT_UTS46' ); + } + + /** + * Canonicalize a hostname for el_index + * @param string $hose + * @return string + */ + private static function indexifyHost( $host ) { + // NOTE: If you change the output of this method, you'll probably have to increment self::VERSION! + + // Canonicalize. + $host = rawurldecode( $host ); + if ( $host !== '' && self::supportsIDN() ) { + // @todo Add a PHP fallback + $tmp = idn_to_utf8( $host, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46 ); + if ( $tmp !== false ) { + $host = $tmp; + } + } + $okChars = 'a-zA-Z0-9\\-._~!$&\'()*+,;='; + if ( StringUtils::isUtf8( $host ) ) { + // Save a little space by not percent-encoding valid UTF-8 bytes + $okChars .= '\x80-\xf4'; + } + $host = preg_replace_callback( + '<[^' . $okChars . ']>', + function ( $m ) { + return rawurlencode( $m[0] ); + }, + strtolower( $host ) + ); + + // IPv6? RFC 3986 syntax. + if ( preg_match( '/^\[([0-9a-f:*]+)\]$/', rawurldecode( $host ), $m ) ) { + $ip = $m[1]; + if ( IP::isValid( $ip ) ) { + return 'V6.' . implode( '.', explode( ':', IP::sanitizeIP( $ip ) ) ) . '.'; + } + if ( substr( $ip, -2 ) === ':*' ) { + $cutIp = substr( $ip, 0, -2 ); + if ( IP::isValid( "{$cutIp}::" ) ) { + // Wildcard IP doesn't contain "::", so multiple parts can be wild + $ct = count( explode( ':', $ip ) ) - 1; + return 'V6.' . + implode( '.', array_slice( explode( ':', IP::sanitizeIP( "{$cutIp}::" ) ), 0, $ct ) ) . + '.*.'; + } + if ( IP::isValid( "{$cutIp}:1" ) ) { + // Wildcard IP does contain "::", so only the last part is wild + return 'V6.' . + substr( implode( '.', explode( ':', IP::sanitizeIP( "{$cutIp}:1" ) ) ), 0, -1 ) . + '*.'; + } + } + } + + // Regularlize explicit specification of the DNS root. + // Browsers seem to do this for IPv4 literals too. + if ( substr( $host, -1 ) === '.' ) { + $host = substr( $host, 0, -1 ); + } + + // IPv4? + $b = '(?:0*25[0-5]|0*2[0-4][0-9]|0*1[0-9][0-9]|0*[0-9]?[0-9])'; + if ( preg_match( "/^(?:{$b}\.){3}{$b}$|^(?:{$b}\.){1,3}\*$/", $host ) ) { + return 'V4.' . implode( '.', array_map( function ( $v ) { + return $v === '*' ? $v : (int)$v; + }, explode( '.', $host ) ) ) . '.'; + } + + // Must be a host name. + return implode( '.', array_reverse( explode( '.', $host ) ) ) . '.'; + } + + /** + * Converts a URL into a format for el_index + * @since 1.33 + * @param string $url + * @return string[] Usually one entry, but might be two in case of + * protocol-relative URLs. Empty array on error. + */ + public static function makeIndexes( $url ) { + // NOTE: If you change the output of this method, you'll probably have to increment self::VERSION! + + // NOTE: refreshExternallinksIndex.php assumes that only protocol-relative URLs return more + // than one index, and that the indexes for protocol-relative URLs only vary in the "http://" + // versus "https://" prefix. If you change that, you'll likely need to update + // refreshExternallinksIndex.php accordingly. + + $bits = wfParseUrl( $url ); + if ( !$bits ) { + return []; + } + + // Reverse the labels in the hostname, convert to lower case, unless it's an IP. + // For emails turn it into "domain.reversed@localpart" + if ( $bits['scheme'] == 'mailto' ) { + $mailparts = explode( '@', $bits['host'], 2 ); + if ( count( $mailparts ) === 2 ) { + $domainpart = self::indexifyHost( $mailparts[1] ); + } else { + // No @, assume it's a local part with no domain + $domainpart = ''; + } + $bits['host'] = $domainpart . '@' . $mailparts[0]; + } else { + $bits['host'] = self::indexifyHost( $bits['host'] ); + } + + // Reconstruct the pseudo-URL + $index = $bits['scheme'] . $bits['delimiter'] . $bits['host']; + // Leave out user and password. Add the port, path, query and fragment + if ( isset( $bits['port'] ) ) { + $index .= ':' . $bits['port']; + } + if ( isset( $bits['path'] ) ) { + $index .= $bits['path']; + } else { + $index .= '/'; + } + if ( isset( $bits['query'] ) ) { + $index .= '?' . $bits['query']; + } + if ( isset( $bits['fragment'] ) ) { + $index .= '#' . $bits['fragment']; + } + + if ( $bits['scheme'] == '' ) { + return [ "http:$index", "https:$index" ]; + } else { + return [ $index ]; + } + } + + /** + * Return query conditions which will match the specified string. There are + * several kinds of filter entry: + * + * *.domain.com - Matches domain.com and www.domain.com + * domain.com - Matches domain.com or domain.com/ but not www.domain.com + * *.domain.com/x - Matches domain.com/xy or www.domain.com/xy. Also probably matches + * domain.com/foobar/xy due to limitations of LIKE syntax. + * domain.com/x - Matches domain.com/xy but not www.domain.com/xy + * 192.0.2.* - Matches any IP in 192.0.2.0/24. Can also have a path appended. + * [2001:db8::*] - Matches any IP in 2001:db8::/112. Can also have a path appended. + * [2001:db8:*] - Matches any IP in 2001:db8::/32. Can also have a path appended. + * foo@domain.com - With protocol 'mailto:', matches the email address foo@domain.com. + * *@domain.com - With protocol 'mailto:', matches any email address at domain.com, but + * not subdomains like foo@mail.domain.com * * Asterisks in any other location are considered invalid. * - * This function does the same as wfMakeUrlIndexes(), except it also takes care + * @since 1.33 + * @param string $filterEntry Filter entry, as described above + * @param array $options Options are: + * - protocol: (string) Protocol to query (default http://) + * - oneWildcard: (bool) Stop at the first wildcard (default false) + * - prefix: (string) Field prefix (default 'el'). The query will test + * fields '{$prefix}_index' and '{$prefix}_index_60' + * - db: (IDatabase|null) Database to use. + * @return array|bool Conditions to be used for the query (to be ANDed) or + * false on error. To determine if the query is constant on the + * el_index_60 field, check whether key 'el_index_60' is set. + */ + public static function getQueryConditions( $filterEntry, array $options = [] ) { + $options += [ + 'protocol' => 'http://', + 'oneWildcard' => false, + 'prefix' => 'el', + 'db' => null, + ]; + + // First, get the like array + $like = self::makeLikeArray( $filterEntry, $options['protocol'] ); + if ( $like === false ) { + return $like; + } + + // Get the constant prefix (i.e. everything up to the first wildcard) + $trimmedLike = self::keepOneWildcard( $like ); + if ( $options['oneWildcard'] ) { + $like = $trimmedLike; + } + if ( $trimmedLike[count( $trimmedLike ) - 1] instanceof LikeMatch ) { + array_pop( $trimmedLike ); + } + $index = implode( '', $trimmedLike ); + + $p = $options['prefix']; + $db = $options['db'] ?: wfGetDB( DB_REPLICA ); + + // Build the query + $l = strlen( $index ); + if ( $l >= 60 ) { + // The constant prefix is larger than el_index_60, so we can use a + // constant comparison. + return [ + "{$p}_index_60" => substr( $index, 0, 60 ), + "{$p}_index" . $db->buildLike( $like ), + ]; + } + + // The constant prefix is smaller than el_index_60, so we use a LIKE + // for a prefix search. + return [ + "{$p}_index_60" . $db->buildLike( [ $index, $db->anyString() ] ), + "{$p}_index" . $db->buildLike( $like ), + ]; + } + + /** + * Make an array to be used for calls to Database::buildLike(), which + * will match the specified string. + * + * This function does the same as LinkFilter::makeIndexes(), except it also takes care * of adding wildcards * - * @param string $filterEntry Domainparts + * @note You probably want self::getQueryConditions() instead + * @param string $filterEntry Filter entry, @see self::getQueryConditions() * @param string $protocol Protocol (default http://) * @return array|bool Array to be passed to Database::buildLike() or false on error */ @@ -100,38 +314,27 @@ class LinkFilter { $target = $protocol . $filterEntry; $bits = wfParseUrl( $target ); - - if ( $bits == false ) { - // Unknown protocol? + if ( !$bits ) { return false; } - if ( substr( $bits['host'], 0, 2 ) == '*.' ) { - $subdomains = true; - $bits['host'] = substr( $bits['host'], 2 ); - if ( $bits['host'] == '' ) { - // We don't want to make a clause that will match everything, - // that could be dangerous - return false; - } - } else { - $subdomains = false; - } - - // Reverse the labels in the hostname, convert to lower case - // For emails reverse domainpart only + $subdomains = false; if ( $bits['scheme'] === 'mailto' && strpos( $bits['host'], '@' ) ) { - // complete email address - $mailparts = explode( '@', $bits['host'] ); - $domainpart = strtolower( implode( '.', array_reverse( explode( '.', $mailparts[1] ) ) ) ); - $bits['host'] = $domainpart . '@' . $mailparts[0]; - } elseif ( $bits['scheme'] === 'mailto' ) { - // domainpart of email address only, do not add '.' - $bits['host'] = strtolower( implode( '.', array_reverse( explode( '.', $bits['host'] ) ) ) ); + // Email address with domain and non-empty local part + $mailparts = explode( '@', $bits['host'], 2 ); + $domainpart = self::indexifyHost( $mailparts[1] ); + if ( $mailparts[0] === '*' ) { + $subdomains = true; + $bits['host'] = $domainpart . '@'; + } else { + $bits['host'] = $domainpart . '@' . $mailparts[0]; + } } else { - $bits['host'] = strtolower( implode( '.', array_reverse( explode( '.', $bits['host'] ) ) ) ); - if ( substr( $bits['host'], -1, 1 ) !== '.' ) { - $bits['host'] .= '.'; + // Non-email, or email with only a domain part. + $bits['host'] = self::indexifyHost( $bits['host'] ); + if ( substr( $bits['host'], -3 ) === '.*.' ) { + $subdomains = true; + $bits['host'] = substr( $bits['host'], 0, -2 ); } } @@ -175,6 +378,7 @@ class LinkFilter { * Filters an array returned by makeLikeArray(), removing everything past first * pattern placeholder. * + * @note You probably want self::getQueryConditions() instead * @param array $arr Array to filter * @return array Filtered array */