Do not insert page titles into querycache.qc_value
[lhc/web/wiklou.git] / includes / LinkFilter.php
1 <?php
2 /**
3 * Functions to help implement an external link filter for spam control.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 * http://www.gnu.org/copyleft/gpl.html
19 *
20 * @file
21 */
22 use Wikimedia\Rdbms\LikeMatch;
23
24 /**
25 * Some functions to help implement an external link filter for spam control.
26 *
27 * @todo implement the filter. Currently these are just some functions to help
28 * maintenance/cleanupSpam.php remove links to a single specified domain. The
29 * next thing is to implement functions for checking a given page against a big
30 * list of domains.
31 *
32 * Another cool thing to do would be a web interface for fast spam removal.
33 */
class LinkFilter {
	/**
	 * Increment this when makeIndexes output changes. It'll cause
	 * maintenance/refreshExternallinksIndex.php to run from update.php.
	 */
	const VERSION = 1;

	/**
	 * Check whether $content contains a link to $filterEntry
	 *
	 * Only TextContent is inspected; every other content type is reported as
	 * "no match".
	 *
	 * @param Content $content Content to check
	 * @param string $filterEntry Domainparts, see makeRegex() for more details
	 * @param string $protocol 'http://' or 'https://'
	 * @return int 0 if no match or 1 if there's at least one match
	 */
	public static function matchEntry( Content $content, $filterEntry, $protocol = 'http://' ) {
		if ( !( $content instanceof TextContent ) ) {
			// TODO: handle other types of content too.
			// Maybe create ContentHandler::matchFilter( LinkFilter ).
			// Think about a common base class for LinkFilter and MagicWord.
			return 0;
		}

		$regex = self::makeRegex( $filterEntry, $protocol );

		// preg_match() yields false on a PCRE failure; normalise that to 0 so
		// the documented int contract always holds.
		return (int)preg_match( $regex, $content->getText() );
	}

	/**
	 * Builds a regex pattern for $filterEntry.
	 *
	 * The pattern is delimited with "!" and uses the "S" and "i" modifiers.
	 *
	 * @todo This doesn't match the rest of the functionality here.
	 * @param string $filterEntry URL, if it begins with "*.", it'll be
	 *        replaced to match any subdomain
	 * @param string $protocol 'http://' or 'https://'
	 *
	 * @return string Regex pattern, for preg_match()
	 */
	private static function makeRegex( $filterEntry, $protocol ) {
		$regex = '!' . preg_quote( $protocol, '!' );
		if ( substr( $filterEntry, 0, 2 ) === '*.' ) {
			// Leading "*." becomes an optional run of subdomain labels
			$regex .= '(?:[A-Za-z0-9.-]+\.|)';
			$filterEntry = substr( $filterEntry, 2 );
		}
		$regex .= preg_quote( $filterEntry, '!' ) . '!Si';
		return $regex;
	}

	/**
	 * Indicate whether LinkFilter IDN support is available
	 * @since 1.33
	 * @return bool True when idn_to_utf8() is callable and the UTS #46
	 *  variant constant is defined
	 */
	public static function supportsIDN() {
		return is_callable( 'idn_to_utf8' ) && defined( 'INTL_IDNA_VARIANT_UTS46' );
	}

	/**
	 * Canonicalize a hostname for el_index
	 *
	 * Output forms, as produced by the code below:
	 *  - IPv6 literal (possibly with a trailing ":*" wildcard): "V6.<parts>."
	 *  - IPv4 literal (possibly with a trailing ".*" wildcard): "V4.<parts>."
	 *  - anything else: the dot-separated labels in reverse order, followed by "."
	 *
	 * @param string $host
	 * @return string
	 */
	private static function indexifyHost( $host ) {
		// NOTE: If you change the output of this method, you'll probably have to increment self::VERSION!

		// Canonicalize.
		$host = rawurldecode( $host );
		if ( $host !== '' && self::supportsIDN() ) {
			// @todo Add a PHP fallback
			$decoded = idn_to_utf8( $host, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46 );
			if ( $decoded !== false ) {
				$host = $decoded;
			}
		}

		// Everything outside this character class gets percent-encoded.
		$okChars = 'a-zA-Z0-9\\-._~!$&\'()*+,;=';
		if ( StringUtils::isUtf8( $host ) ) {
			// Save a little space by not percent-encoding valid UTF-8 bytes
			$okChars .= '\x80-\xf4';
		}
		$host = preg_replace_callback(
			'<[^' . $okChars . ']>',
			static function ( $m ) {
				return rawurlencode( $m[0] );
			},
			strtolower( $host )
		);

		// IPv6? RFC 3986 syntax. The brackets were percent-encoded above, so
		// match against the decoded form.
		if ( preg_match( '/^\[([0-9a-f:*]+)\]$/', rawurldecode( $host ), $m ) ) {
			$v6 = self::indexifyIPv6( $m[1] );
			if ( $v6 !== null ) {
				return $v6;
			}
			// Not a usable IPv6 literal: fall through and treat it like any other host.
		}

		// Regularize explicit specification of the DNS root.
		// Browsers seem to do this for IPv4 literals too.
		if ( substr( $host, -1 ) === '.' ) {
			$host = substr( $host, 0, -1 );
		}

		// IPv4? Either a full dotted quad, or one to three octets followed by ".*".
		$b = '(?:0*25[0-5]|0*2[0-4][0-9]|0*1[0-9][0-9]|0*[0-9]?[0-9])';
		if ( preg_match( "/^(?:{$b}\.){3}{$b}$|^(?:{$b}\.){1,3}\*$/", $host ) ) {
			$octets = array_map(
				static function ( $v ) {
					// The integer cast strips leading zeroes; the wildcard is kept verbatim
					return $v === '*' ? $v : (int)$v;
				},
				explode( '.', $host )
			);
			return 'V4.' . implode( '.', $octets ) . '.';
		}

		// Must be a host name.
		return implode( '.', array_reverse( explode( '.', $host ) ) ) . '.';
	}

	/**
	 * Canonicalize the inside of a bracketed IPv6 literal for el_index
	 *
	 * @param string $ip Text found between "[" and "]", may end in ":*"
	 * @return string|null Index form, or null if $ip is neither a valid IPv6
	 *  address nor a recognised wildcard form
	 */
	private static function indexifyIPv6( $ip ) {
		if ( IP::isValid( $ip ) ) {
			return 'V6.' . implode( '.', explode( ':', IP::sanitizeIP( $ip ) ) ) . '.';
		}

		if ( substr( $ip, -2 ) !== ':*' ) {
			return null;
		}

		$cutIp = substr( $ip, 0, -2 );
		if ( IP::isValid( "{$cutIp}::" ) ) {
			// Wildcard IP doesn't contain "::", so multiple parts can be wild
			$ct = count( explode( ':', $ip ) ) - 1;
			$parts = array_slice( explode( ':', IP::sanitizeIP( "{$cutIp}::" ) ), 0, $ct );
			return 'V6.' . implode( '.', $parts ) . '.*.';
		}
		if ( IP::isValid( "{$cutIp}:1" ) ) {
			// Wildcard IP does contain "::", so only the last part is wild.
			// Drop the final character (from the ":1" placeholder) and put the wildcard there.
			$dotted = implode( '.', explode( ':', IP::sanitizeIP( "{$cutIp}:1" ) ) );
			return 'V6.' . substr( $dotted, 0, -1 ) . '*.';
		}

		return null;
	}

	/**
	 * Converts a URL into a format for el_index
	 * @since 1.33
	 * @param string $url
	 * @return string[] Usually one entry, but might be two in case of
	 *  protocol-relative URLs. Empty array on error.
	 */
	public static function makeIndexes( $url ) {
		// NOTE: If you change the output of this method, you'll probably have to increment self::VERSION!

		// NOTE: refreshExternallinksIndex.php assumes that only protocol-relative URLs return more
		// than one index, and that the indexes for protocol-relative URLs only vary in the "http://"
		// versus "https://" prefix. If you change that, you'll likely need to update
		// refreshExternallinksIndex.php accordingly.

		$bits = wfParseUrl( $url );
		if ( !$bits ) {
			return [];
		}

		// Reverse the labels in the hostname, convert to lower case, unless it's an IP.
		// For emails turn it into "domain.reversed@localpart"
		if ( $bits['scheme'] === 'mailto' ) {
			$mailparts = explode( '@', $bits['host'], 2 );
			// No @ means it's a local part with no domain
			$domainpart = count( $mailparts ) === 2 ? self::indexifyHost( $mailparts[1] ) : '';
			$bits['host'] = $domainpart . '@' . $mailparts[0];
		} else {
			$bits['host'] = self::indexifyHost( $bits['host'] );
		}

		// Reconstruct the pseudo-URL.
		// Leave out user and password. Add the port, path, query and fragment
		$index = $bits['scheme'] . $bits['delimiter'] . $bits['host'];
		if ( isset( $bits['port'] ) ) {
			$index .= ':' . $bits['port'];
		}
		$index .= isset( $bits['path'] ) ? $bits['path'] : '/';
		if ( isset( $bits['query'] ) ) {
			$index .= '?' . $bits['query'];
		}
		if ( isset( $bits['fragment'] ) ) {
			$index .= '#' . $bits['fragment'];
		}

		// An empty scheme is a protocol-relative URL: index it under both protocols
		if ( $bits['scheme'] === '' ) {
			return [ "http:$index", "https:$index" ];
		}

		return [ $index ];
	}

	/**
	 * Return query conditions which will match the specified string. There are
	 * several kinds of filter entry:
	 *
	 *     *.domain.com    -  Matches domain.com and www.domain.com
	 *     domain.com      -  Matches domain.com or domain.com/ but not www.domain.com
	 *     *.domain.com/x  -  Matches domain.com/xy or www.domain.com/xy. Also probably matches
	 *                        domain.com/foobar/xy due to limitations of LIKE syntax.
	 *     domain.com/x    -  Matches domain.com/xy but not www.domain.com/xy
	 *     192.0.2.*       -  Matches any IP in 192.0.2.0/24. Can also have a path appended.
	 *     [2001:db8::*]   -  Matches any IP in 2001:db8::/112. Can also have a path appended.
	 *     [2001:db8:*]    -  Matches any IP in 2001:db8::/32. Can also have a path appended.
	 *     foo@domain.com  -  With protocol 'mailto:', matches the email address foo@domain.com.
	 *     *@domain.com    -  With protocol 'mailto:', matches any email address at domain.com, but
	 *                        not subdomains like foo@mail.domain.com
	 *
	 * Asterisks in any other location are considered invalid.
	 *
	 * @since 1.33
	 * @param string $filterEntry Filter entry, as described above
	 * @param array $options Options are:
	 *   - protocol: (string) Protocol to query (default http://)
	 *   - oneWildcard: (bool) Stop at the first wildcard (default false)
	 *   - prefix: (string) Field prefix (default 'el'). The query will test
	 *     fields '{$prefix}_index' and '{$prefix}_index_60'
	 *   - db: (IDatabase|null) Database to use.
	 * @return array|bool Conditions to be used for the query (to be ANDed) or
	 *  false on error. To determine if the query is constant on the
	 *  '{$prefix}_index_60' field, check whether that key is set.
	 */
	public static function getQueryConditions( $filterEntry, array $options = [] ) {
		$options += [
			'protocol' => 'http://',
			'oneWildcard' => false,
			'prefix' => 'el',
			'db' => null,
		];

		// First, get the like array
		$like = self::makeLikeArray( $filterEntry, $options['protocol'] );
		if ( $like === false ) {
			return false;
		}

		// Get the constant prefix (i.e. everything up to the first wildcard)
		$trimmedLike = self::keepOneWildcard( $like );
		if ( $options['oneWildcard'] ) {
			$like = $trimmedLike;
		}
		if ( end( $trimmedLike ) instanceof LikeMatch ) {
			array_pop( $trimmedLike );
		}
		$index = implode( '', $trimmedLike );

		$p = $options['prefix'];
		$db = $options['db'] ?: wfGetDB( DB_REPLICA );

		// Build the query
		if ( strlen( $index ) >= 60 ) {
			// The constant prefix is larger than el_index_60, so we can use a
			// constant comparison.
			return [
				"{$p}_index_60" => substr( $index, 0, 60 ),
				"{$p}_index" . $db->buildLike( $like ),
			];
		}

		// The constant prefix is smaller than el_index_60, so we use a LIKE
		// for a prefix search.
		return [
			"{$p}_index_60" . $db->buildLike( $index, $db->anyString() ),
			"{$p}_index" . $db->buildLike( $like ),
		];
	}

	/**
	 * Make an array to be used for calls to Database::buildLike(), which
	 * will match the specified string.
	 *
	 * This function does the same as LinkFilter::makeIndexes(), except it also takes care
	 * of adding wildcards
	 *
	 * @note You probably want self::getQueryConditions() instead
	 * @param string $filterEntry Filter entry, @see self::getQueryConditions()
	 * @param string $protocol Protocol (default http://)
	 * @return array|bool Array to be passed to Database::buildLike() or false on error
	 */
	public static function makeLikeArray( $filterEntry, $protocol = 'http://' ) {
		$db = wfGetDB( DB_REPLICA );

		$bits = wfParseUrl( $protocol . $filterEntry );
		if ( !$bits ) {
			return false;
		}

		// Is this an email address with a domain and a non-empty local part?
		// ("@" must be present and must not be the first character.)
		$isAddress = false;
		if ( $bits['scheme'] === 'mailto' ) {
			$atPos = strpos( $bits['host'], '@' );
			$isAddress = $atPos !== false && $atPos > 0;
		}

		$subdomains = false;
		if ( $isAddress ) {
			$mailparts = explode( '@', $bits['host'], 2 );
			$domainpart = self::indexifyHost( $mailparts[1] );
			if ( $mailparts[0] === '*' ) {
				// "*@domain": any local part, so the wildcard goes right after the "@"
				$subdomains = true;
				$bits['host'] = $domainpart . '@';
			} else {
				$bits['host'] = $domainpart . '@' . $mailparts[0];
			}
		} else {
			// Non-email, or email with only a domain part.
			$bits['host'] = self::indexifyHost( $bits['host'] );
			if ( substr( $bits['host'], -3 ) === '.*.' ) {
				// Strip the "*." left by the wildcard label; a LIKE wildcard replaces it
				$subdomains = true;
				$bits['host'] = substr( $bits['host'], 0, -2 );
			}
		}

		$like = [ $bits['scheme'] . $bits['delimiter'] . $bits['host'] ];

		if ( $subdomains ) {
			$like[] = $db->anyString();
		}

		if ( isset( $bits['port'] ) ) {
			$like[] = ':' . $bits['port'];
		}
		if ( isset( $bits['path'] ) ) {
			$like[] = $bits['path'];
		} elseif ( !$subdomains ) {
			$like[] = '/';
		}
		if ( isset( $bits['query'] ) ) {
			$like[] = '?' . $bits['query'];
		}
		if ( isset( $bits['fragment'] ) ) {
			$like[] = '#' . $bits['fragment'];
		}

		// Check for stray asterisks: asterisk only allowed at the start of the domain
		foreach ( $like as $likepart ) {
			if ( !( $likepart instanceof LikeMatch ) && strpos( $likepart, '*' ) !== false ) {
				return false;
			}
		}

		if ( !( end( $like ) instanceof LikeMatch ) ) {
			// Add wildcard at the end if there isn't one already
			$like[] = $db->anyString();
		}

		return $like;
	}

	/**
	 * Filters an array returned by makeLikeArray(), removing everything past first
	 * pattern placeholder.
	 *
	 * @note You probably want self::getQueryConditions() instead
	 * @param array $arr Array to filter
	 * @return array Filtered array; a non-array argument is returned unchanged
	 */
	public static function keepOneWildcard( $arr ) {
		if ( !is_array( $arr ) ) {
			return $arr;
		}

		// Count positions rather than trusting the keys, so arrays that are
		// not plain 0-based lists are cut at the right element too.
		$position = 0;
		foreach ( $arr as $value ) {
			$position++;
			if ( $value instanceof LikeMatch ) {
				return array_slice( $arr, 0, $position );
			}
		}

		return $arr;
	}
}