Merge "Doc: result domain for GlobalFunctions::wfRandom()"
[lhc/web/wiklou.git] / includes / parser / Sanitizer.php
1 <?php
2 /**
3 * HTML sanitizer for %MediaWiki.
4 *
5 * Copyright © 2002-2005 Brion Vibber <brion@pobox.com> et al
6 * https://www.mediawiki.org/
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @file
24 * @ingroup Parser
25 */
26
27 use MediaWiki\MediaWikiServices;
28
29 /**
30 * HTML sanitizer for MediaWiki
31 * @ingroup Parser
32 */
33 class Sanitizer {
34 /**
35 * Regular expression to match various types of character references in
36 * Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences
37 */
38 const CHAR_REFS_REGEX =
39 '/&([A-Za-z0-9\x80-\xff]+);
40 |&\#([0-9]+);
41 |&\#[xX]([0-9A-Fa-f]+);
42 |(&)/x';
43
44 /**
45 * Acceptable tag name charset from HTML5 parsing spec
46 * https://www.w3.org/TR/html5/syntax.html#tag-open-state
47 */
48 const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
49
50 /**
51 * Blacklist for evil uris like javascript:
52 * WARNING: DO NOT use this in any place that actually requires blacklisting
53 * for security reasons. There are NUMEROUS[1] ways to bypass blacklisting, the
54 * only way to be secure from javascript: uri based xss vectors is to whitelist
55 * things that you know are safe and deny everything else.
56 * [1]: http://ha.ckers.org/xss.html
57 */
58 const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
59 const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
60
61 /**
62 * Tells escapeIdForAttribute() to encode the ID using the wiki's primary encoding.
63 *
64 * @since 1.30
65 */
66 const ID_PRIMARY = 0;
67
68 /**
69 * Tells escapeIdForAttribute() to encode the ID using the fallback encoding, or return false
70 * if no fallback is configured.
71 *
72 * @since 1.30
73 */
74 const ID_FALLBACK = 1;
75
76 /**
77 * List of all named character entities defined in HTML 4.01
78 * https://www.w3.org/TR/html4/sgml/entities.html
79 * As well as &apos; which is only defined starting in XHTML1.
80 */
81 private static $htmlEntities = [
82 'Aacute' => 193,
83 'aacute' => 225,
84 'Acirc' => 194,
85 'acirc' => 226,
86 'acute' => 180,
87 'AElig' => 198,
88 'aelig' => 230,
89 'Agrave' => 192,
90 'agrave' => 224,
91 'alefsym' => 8501,
92 'Alpha' => 913,
93 'alpha' => 945,
94 'amp' => 38,
95 'and' => 8743,
96 'ang' => 8736,
97 'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
98 'Aring' => 197,
99 'aring' => 229,
100 'asymp' => 8776,
101 'Atilde' => 195,
102 'atilde' => 227,
103 'Auml' => 196,
104 'auml' => 228,
105 'bdquo' => 8222,
106 'Beta' => 914,
107 'beta' => 946,
108 'brvbar' => 166,
109 'bull' => 8226,
110 'cap' => 8745,
111 'Ccedil' => 199,
112 'ccedil' => 231,
113 'cedil' => 184,
114 'cent' => 162,
115 'Chi' => 935,
116 'chi' => 967,
117 'circ' => 710,
118 'clubs' => 9827,
119 'cong' => 8773,
120 'copy' => 169,
121 'crarr' => 8629,
122 'cup' => 8746,
123 'curren' => 164,
124 'dagger' => 8224,
125 'Dagger' => 8225,
126 'darr' => 8595,
127 'dArr' => 8659,
128 'deg' => 176,
129 'Delta' => 916,
130 'delta' => 948,
131 'diams' => 9830,
132 'divide' => 247,
133 'Eacute' => 201,
134 'eacute' => 233,
135 'Ecirc' => 202,
136 'ecirc' => 234,
137 'Egrave' => 200,
138 'egrave' => 232,
139 'empty' => 8709,
140 'emsp' => 8195,
141 'ensp' => 8194,
142 'Epsilon' => 917,
143 'epsilon' => 949,
144 'equiv' => 8801,
145 'Eta' => 919,
146 'eta' => 951,
147 'ETH' => 208,
148 'eth' => 240,
149 'Euml' => 203,
150 'euml' => 235,
151 'euro' => 8364,
152 'exist' => 8707,
153 'fnof' => 402,
154 'forall' => 8704,
155 'frac12' => 189,
156 'frac14' => 188,
157 'frac34' => 190,
158 'frasl' => 8260,
159 'Gamma' => 915,
160 'gamma' => 947,
161 'ge' => 8805,
162 'gt' => 62,
163 'harr' => 8596,
164 'hArr' => 8660,
165 'hearts' => 9829,
166 'hellip' => 8230,
167 'Iacute' => 205,
168 'iacute' => 237,
169 'Icirc' => 206,
170 'icirc' => 238,
171 'iexcl' => 161,
172 'Igrave' => 204,
173 'igrave' => 236,
174 'image' => 8465,
175 'infin' => 8734,
176 'int' => 8747,
177 'Iota' => 921,
178 'iota' => 953,
179 'iquest' => 191,
180 'isin' => 8712,
181 'Iuml' => 207,
182 'iuml' => 239,
183 'Kappa' => 922,
184 'kappa' => 954,
185 'Lambda' => 923,
186 'lambda' => 955,
187 'lang' => 9001,
188 'laquo' => 171,
189 'larr' => 8592,
190 'lArr' => 8656,
191 'lceil' => 8968,
192 'ldquo' => 8220,
193 'le' => 8804,
194 'lfloor' => 8970,
195 'lowast' => 8727,
196 'loz' => 9674,
197 'lrm' => 8206,
198 'lsaquo' => 8249,
199 'lsquo' => 8216,
200 'lt' => 60,
201 'macr' => 175,
202 'mdash' => 8212,
203 'micro' => 181,
204 'middot' => 183,
205 'minus' => 8722,
206 'Mu' => 924,
207 'mu' => 956,
208 'nabla' => 8711,
209 'nbsp' => 160,
210 'ndash' => 8211,
211 'ne' => 8800,
212 'ni' => 8715,
213 'not' => 172,
214 'notin' => 8713,
215 'nsub' => 8836,
216 'Ntilde' => 209,
217 'ntilde' => 241,
218 'Nu' => 925,
219 'nu' => 957,
220 'Oacute' => 211,
221 'oacute' => 243,
222 'Ocirc' => 212,
223 'ocirc' => 244,
224 'OElig' => 338,
225 'oelig' => 339,
226 'Ograve' => 210,
227 'ograve' => 242,
228 'oline' => 8254,
229 'Omega' => 937,
230 'omega' => 969,
231 'Omicron' => 927,
232 'omicron' => 959,
233 'oplus' => 8853,
234 'or' => 8744,
235 'ordf' => 170,
236 'ordm' => 186,
237 'Oslash' => 216,
238 'oslash' => 248,
239 'Otilde' => 213,
240 'otilde' => 245,
241 'otimes' => 8855,
242 'Ouml' => 214,
243 'ouml' => 246,
244 'para' => 182,
245 'part' => 8706,
246 'permil' => 8240,
247 'perp' => 8869,
248 'Phi' => 934,
249 'phi' => 966,
250 'Pi' => 928,
251 'pi' => 960,
252 'piv' => 982,
253 'plusmn' => 177,
254 'pound' => 163,
255 'prime' => 8242,
256 'Prime' => 8243,
257 'prod' => 8719,
258 'prop' => 8733,
259 'Psi' => 936,
260 'psi' => 968,
261 'quot' => 34,
262 'radic' => 8730,
263 'rang' => 9002,
264 'raquo' => 187,
265 'rarr' => 8594,
266 'rArr' => 8658,
267 'rceil' => 8969,
268 'rdquo' => 8221,
269 'real' => 8476,
270 'reg' => 174,
271 'rfloor' => 8971,
272 'Rho' => 929,
273 'rho' => 961,
274 'rlm' => 8207,
275 'rsaquo' => 8250,
276 'rsquo' => 8217,
277 'sbquo' => 8218,
278 'Scaron' => 352,
279 'scaron' => 353,
280 'sdot' => 8901,
281 'sect' => 167,
282 'shy' => 173,
283 'Sigma' => 931,
284 'sigma' => 963,
285 'sigmaf' => 962,
286 'sim' => 8764,
287 'spades' => 9824,
288 'sub' => 8834,
289 'sube' => 8838,
290 'sum' => 8721,
291 'sup' => 8835,
292 'sup1' => 185,
293 'sup2' => 178,
294 'sup3' => 179,
295 'supe' => 8839,
296 'szlig' => 223,
297 'Tau' => 932,
298 'tau' => 964,
299 'there4' => 8756,
300 'Theta' => 920,
301 'theta' => 952,
302 'thetasym' => 977,
303 'thinsp' => 8201,
304 'THORN' => 222,
305 'thorn' => 254,
306 'tilde' => 732,
307 'times' => 215,
308 'trade' => 8482,
309 'Uacute' => 218,
310 'uacute' => 250,
311 'uarr' => 8593,
312 'uArr' => 8657,
313 'Ucirc' => 219,
314 'ucirc' => 251,
315 'Ugrave' => 217,
316 'ugrave' => 249,
317 'uml' => 168,
318 'upsih' => 978,
319 'Upsilon' => 933,
320 'upsilon' => 965,
321 'Uuml' => 220,
322 'uuml' => 252,
323 'weierp' => 8472,
324 'Xi' => 926,
325 'xi' => 958,
326 'Yacute' => 221,
327 'yacute' => 253,
328 'yen' => 165,
329 'Yuml' => 376,
330 'yuml' => 255,
331 'Zeta' => 918,
332 'zeta' => 950,
333 'zwj' => 8205,
334 'zwnj' => 8204
335 ];
336
337 /**
338 * Character entity aliases accepted by MediaWiki
339 */
340 private static $htmlEntityAliases = [
341 'רלמ' => 'rlm',
342 'رلم' => 'rlm',
343 ];
344
345 /**
346 * Lazy-initialised attributes regex, see getAttribsRegex()
347 */
348 private static $attribsRegex;
349
350 /**
351 * Regular expression to match HTML/XML attribute pairs within a tag.
352 * Allows some... latitude. Based on,
353 * https://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
354 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
355 * @return string
356 */
static function getAttribsRegex() {
	// The pattern is assembled at most once and then served from
	// self::$attribsRegex on every later call.
	if ( self::$attribsRegex !== null ) {
		return self::$attribsRegex;
	}

	// Allowed first character of an attribute name, allowed following
	// characters, and the whitespace set (tab, LF, FF, CR, space).
	$nameStart = "[:_\p{L}\p{N}]";
	$nameChar = "[:_\.\-\p{L}\p{N}]";
	$ws = '[\x09\x0a\x0c\x0d\x20]';

	// Extended (x) mode: layout whitespace and the "#" comment inside the
	// pattern are ignored by PCRE.
	self::$attribsRegex =
		"/(?:^|$ws)({$nameStart}{$nameChar}*)
		($ws*=$ws*
		(?:
		# The attribute value: quoted or alone
		\"([^\"]*)(?:\"|\$)
		| '([^']*)(?:'|\$)
		| (((?!$ws|>).)*)
		)
		)?(?=$ws|\$)/sxu";

	return self::$attribsRegex;
}
375
376 /**
377 * Return the various lists of recognized tags
378 * @param array $extratags For any extra tags to include
379 * @param array $removetags For any tags (default or extra) to exclude
380 * @return array
381 */
public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
	// Builds (and caches in function-static variables) the lookup tables of
	// recognised HTML tags, then layers the caller's $extratags/$removetags
	// on top. Every returned list is a hashtable keyed by tag name, so
	// callers test membership with isset().
	global $wgAllowImageTag;

	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic, $staticInitialised;

	// Key the cache on the global config state so that the tables are
	// rebuilt if the global changes between calls (e.g. in tests).
	// $staticInitialised is null until the first build and afterwards holds
	// the boolean context the tables were built for. Using a strict
	// comparison against a boolean means a falsy $wgAllowImageTag is cached
	// too, instead of forcing a full rebuild on every single call.
	$globalContext = (bool)$wgAllowImageTag;
	if ( $staticInitialised !== $globalContext ) {
		$htmlpairsStatic = [ # Tags that must be closed
			'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
			'kbd', 'samp', 'data', 'time', 'mark'
		];
		$htmlsingle = [
			'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
		];

		# Elements that cannot have close tags. This is (not coincidentally)
		# also the list of tags for which the HTML 5 parsing algorithm
		# requires you to "acknowledge the token's self-closing flag", i.e.
		# a self-closing tag like <br/> is not an HTML 5 parse error only
		# for this list.
		$htmlsingleonly = [
			'br', 'wbr', 'hr', 'meta', 'link'
		];

		$htmlnest = [ # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
			'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
		];
		$tabletags = [ # Can only appear inside table, we will close them
			'td', 'th', 'tr',
		];
		$htmllist = [ # Tags used by list
			'ul', 'ol',
		];
		$listtags = [ # Tags that can appear in a list
			'li',
		];

		if ( $globalContext ) {
			$htmlsingle[] = 'img';
			$htmlsingleonly[] = 'img';
		}

		$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
		$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

		# Convert them all to hashtables (tag name => index) for faster lookup
		$htmlpairsStatic = array_flip( $htmlpairsStatic );
		$htmlsingle = array_flip( $htmlsingle );
		$htmlsingleonly = array_flip( $htmlsingleonly );
		$htmlnest = array_flip( $htmlnest );
		$tabletags = array_flip( $tabletags );
		$htmllist = array_flip( $htmllist );
		$listtags = array_flip( $listtags );
		$htmlsingleallowed = array_flip( $htmlsingleallowed );
		$htmlelementsStatic = array_flip( $htmlelementsStatic );

		$staticInitialised = $globalContext;
	}

	# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays.
	# Extra tags are treated as paired tags; removed tags are only taken out
	# of the overall element list.
	$extratags = array_flip( $extratags );
	$removetags = array_flip( $removetags );
	$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
	$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );

	return [
		'htmlpairs' => $htmlpairs,
		'htmlsingle' => $htmlsingle,
		'htmlsingleonly' => $htmlsingleonly,
		'htmlnest' => $htmlnest,
		'tabletags' => $tabletags,
		'htmllist' => $htmllist,
		'listtags' => $listtags,
		'htmlsingleallowed' => $htmlsingleallowed,
		'htmlelements' => $htmlelements,
	];
}
463
464 /**
465 * Cleans up HTML, removes dangerous tags and attributes, and
466 * removes HTML comments
467 * @param string $text
468 * @param callable|null $processCallback Callback to do any variable or parameter
469 * replacements in HTML attribute values
470 * @param array|bool $args Arguments for the processing callback
471 * @param array $extratags For any extra tags to include
472 * @param array $removetags For any tags (default or extra) to exclude
473 * @param callable|null $warnCallback (Deprecated) Callback allowing the
474 * addition of a tracking category when bad input is encountered.
475 * DO NOT ADD NEW PARAMETERS AFTER $warnCallback, since it will be
476 * removed shortly.
477 * @return string
478 */
public static function removeHTMLtags( $text, $processCallback = null,
	$args = [], $extratags = [], $removetags = [], $warnCallback = null
) {
	// Overview: strip comments, split the text on '<', then decide for each
	// fragment whether it starts with an acceptable tag. Accepted tags are
	// re-emitted with cleaned attributes; everything else has its '<' (and
	// any '>') escaped so it shows up as literal text.
	$tagData = self::getRecognizedTagData( $extratags, $removetags );
	$htmlpairs = $tagData['htmlpairs'];
	$htmlsingle = $tagData['htmlsingle'];
	$htmlsingleonly = $tagData['htmlsingleonly'];
	$htmlnest = $tagData['htmlnest'];
	$tabletags = $tagData['tabletags'];
	$htmllist = $tagData['htmllist'];
	$listtags = $tagData['listtags'];
	$htmlsingleallowed = $tagData['htmlsingleallowed'];
	$htmlelements = $tagData['htmlelements'];

	# Remove HTML comments
	$text = self::removeHTMLcomments( $text );
	// Each element of $bits is the text that followed one '<'.
	$bits = explode( '<', $text );
	// The leading fragment precedes the first '<', so it cannot be a tag;
	// only stray '>' characters need escaping.
	$text = str_replace( '>', '&gt;', array_shift( $bits ) );
	if ( !MWTidy::isEnabled() ) {
		// Without tidy this method balances tags itself, tracking open
		// elements in $tagstack. $tablestack saves the outer $tagstack
		// while inside a <table>.
		wfDeprecated( 'disabling tidy', '1.33' );
		$tagstack = $tablestack = [];
		foreach ( $bits as $x ) {
			$regs = [];
			# $slash: Does the current element start with a '/'?
			# $t: Current element name
			# $params: String between element name and >
			# $brace: Ending '>' or '/>'
			# $rest: Everything until the next element of $bits
			if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
				list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
			} else {
				// Not tag-shaped; falls through to the escaping at the
				// bottom of the loop because $t will not be recognised.
				$slash = $t = $params = $brace = $rest = null;
			}

			$badtag = false;
			$t = strtolower( $t );
			if ( isset( $htmlelements[$t] ) ) {
				# Check our stack
				if ( $slash && isset( $htmlsingleonly[$t] ) ) {
					// A close tag for an element that cannot have one
					$badtag = true;
				} elseif ( $slash ) {
					# Closing a tag... is it the one we just opened?
					Wikimedia\suppressWarnings();
					$ot = array_pop( $tagstack );
					Wikimedia\restoreWarnings();

					if ( $ot != $t ) {
						if ( isset( $htmlsingleallowed[$ot] ) ) {
							# Pop all elements with an optional close tag
							# and see if we find a match below them
							$optstack = [];
							array_push( $optstack, $ot );
							Wikimedia\suppressWarnings();
							$ot = array_pop( $tagstack );
							Wikimedia\restoreWarnings();
							while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
								array_push( $optstack, $ot );
								Wikimedia\suppressWarnings();
								$ot = array_pop( $tagstack );
								Wikimedia\restoreWarnings();
							}
							if ( $t != $ot ) {
								# No match. Push the optional elements back again
								$badtag = true;
								Wikimedia\suppressWarnings();
								$ot = array_pop( $optstack );
								Wikimedia\restoreWarnings();
								while ( $ot ) {
									array_push( $tagstack, $ot );
									Wikimedia\suppressWarnings();
									$ot = array_pop( $optstack );
									Wikimedia\restoreWarnings();
								}
							}
						} else {
							// Put the popped element back; it was not ours to close.
							Wikimedia\suppressWarnings();
							array_push( $tagstack, $ot );
							Wikimedia\restoreWarnings();

							# <li> can be nested in <ul> or <ol>, skip those cases:
							if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
								$badtag = true;
							}
						}
					} else {
						if ( $t == 'table' ) {
							// Leaving a table: restore the stack saved when it was opened.
							$tagstack = array_pop( $tablestack );
						}
					}
					// Close tags never carry attributes.
					$newparams = '';
				} else {
					# Keep track for later
					if ( isset( $tabletags[$t] ) && !in_array( 'table', $tagstack ) ) {
						// Table-only element outside of any open <table>
						$badtag = true;
					} elseif ( in_array( $t, $tagstack ) && !isset( $htmlnest[$t] ) ) {
						// Already open and not allowed to nest
						$badtag = true;
					# Is it a self closed htmlpair ? (T7487)
					} elseif ( $brace == '/>' && isset( $htmlpairs[$t] ) ) {
						// Eventually we'll just remove the self-closing
						// slash, in order to be consistent with HTML5
						// semantics.
						// $brace = '>';
						// For now, let's just warn authors to clean up.
						if ( is_callable( $warnCallback ) ) {
							call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
						}
						$badtag = true;
					} elseif ( isset( $htmlsingleonly[$t] ) ) {
						# Hack to force empty tag for unclosable elements
						$brace = '/>';
					} elseif ( isset( $htmlsingle[$t] ) ) {
						# Hack to not close $htmlsingle tags
						$brace = null;
						# Still need to push this optionally-closed tag to
						# the tag stack so that we can match end tags
						# instead of marking them as bad.
						array_push( $tagstack, $t );
					} elseif ( isset( $tabletags[$t] ) && in_array( $t, $tagstack ) ) {
						// New table tag but forgot to close the previous one
						$text .= "</$t>";
					} else {
						if ( $t == 'table' ) {
							// Entering a table: stash the current stack and
							// start a fresh one for the table's contents.
							array_push( $tablestack, $tagstack );
							$tagstack = [];
						}
						array_push( $tagstack, $t );
					}

					# Replace any variables or template parameters with
					# plaintext results.
					if ( is_callable( $processCallback ) ) {
						// $params is passed by reference so the callback can rewrite it.
						call_user_func_array( $processCallback, [ &$params, $args ] );
					}

					if ( !self::validateTag( $params, $t ) ) {
						$badtag = true;
					}

					# Strip non-approved attributes from the tag
					$newparams = self::fixTagAttributes( $params, $t );
				}
				if ( !$badtag ) {
					// Emit the accepted tag followed by its escaped trailing text.
					$rest = str_replace( '>', '&gt;', $rest );
					$close = ( $brace == '/>' && !$slash ) ? ' /' : '';
					$text .= "<$slash$t$newparams$close>$rest";
					continue;
				}
			}
			// Rejected or unrecognised: output the whole fragment as escaped text.
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
		# Close off any remaining tags
		while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
			$text .= "</$t>\n";
			if ( $t == 'table' ) {
				$tagstack = array_pop( $tablestack );
			}
		}
	} else {
		// Tidy is enabled: no tag balancing is done here, only
		// per-tag validation and attribute cleaning.
		# this might be possible using tidy itself
		foreach ( $bits as $x ) {
			if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
				list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;

				$badtag = false;
				$t = strtolower( $t );
				if ( isset( $htmlelements[$t] ) ) {
					if ( is_callable( $processCallback ) ) {
						// $params is passed by reference so the callback can rewrite it.
						call_user_func_array( $processCallback, [ &$params, $args ] );
					}

					if ( $brace == '/>' && !( isset( $htmlsingle[$t] ) || isset( $htmlsingleonly[$t] ) ) ) {
						// Eventually we'll just remove the self-closing
						// slash, in order to be consistent with HTML5
						// semantics.
						// $brace = '>';
						// For now, let's just warn authors to clean up.
						if ( is_callable( $warnCallback ) ) {
							call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
						}
					}
					if ( !self::validateTag( $params, $t ) ) {
						$badtag = true;
					}

					$newparams = self::fixTagAttributes( $params, $t );
					if ( !$badtag ) {
						if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) {
							# Interpret self-closing tags as empty tags even when
							# HTML 5 would interpret them as start tags. Such input
							# is commonly seen on Wikimedia wikis with this intention.
							$brace = "></$t>";
						}

						$rest = str_replace( '>', '&gt;', $rest );
						$text .= "<$slash$t$newparams$brace$rest";
						continue;
					}
				}
			}
			// Rejected or unrecognised: output the whole fragment as escaped text.
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
	}
	return $text;
}
683
684 /**
685 * Remove '<!--', '-->', and everything between.
686 * To avoid leaving blank lines, when a comment is both preceded
687 * and followed by a newline (ignoring spaces), trim leading and
688 * trailing spaces and one of the newlines.
689 *
690 * @param string $text
691 * @return string
692 */
public static function removeHTMLcomments( $text ) {
	// Strips every terminated "<!-- ... -->" span. The search restarts from
	// the beginning after each removal, so a comment opener that only
	// appears once an earlier comment has been cut out is handled as well.
	$opener = '<!--';
	$closer = '-->';

	for ( ; ; ) {
		$commentStart = strpos( $text, $opener );
		if ( $commentStart === false ) {
			// Nothing (more) to remove
			break;
		}

		$closerPos = strpos( $text, $closer, $commentStart + strlen( $opener ) );
		if ( $closerPos === false ) {
			# Unterminated comment; leave the rest of the text alone
			break;
		}
		$commentEnd = $closerPos + strlen( $closer );

		# Walk outwards over the spaces surrounding the comment:
		# $left ends up on the first non-space character before it
		# (or at offset 0), $right on the first non-space after it.
		$left = max( $commentStart - 1, 0 );
		while ( $left > 0 && substr( $text, $left, 1 ) === ' ' ) {
			$left--;
		}
		$right = $commentEnd;
		while ( substr( $text, $right, 1 ) === ' ' ) {
			$right++;
		}

		$onOwnLine = substr( $text, $left, 1 ) === "\n"
			&& substr( $text, $right, 1 ) === "\n";

		if ( $onOwnLine ) {
			# The comment sits alone between two newlines: drop it together
			# with the surrounding spaces and collapse both newlines into one.
			$text = substr_replace( $text, "\n", $left, $right - $left + 1 );
		} else {
			# Otherwise cut out exactly the comment and nothing else.
			$text = substr_replace( $text, '', $commentStart, $commentEnd - $commentStart );
		}
	}

	return $text;
}
726
727 /**
728 * Takes attribute names and values for a tag and the tag name and
729 * validates that the tag is allowed to be present.
730 * This DOES NOT validate the attributes, nor does it validate the
731 * tags themselves. This method only handles the special circumstances
732 * where we may want to allow a tag within content but ONLY when it has
733 * specific attributes set.
734 *
735 * @param string $params
736 * @param string $element
737 * @return bool
738 */
static function validateTag( $params, $element ) {
	$attribs = self::decodeTagAttributes( $params );

	// Only <meta> and <link> have presence requirements; every other
	// element is accepted here without looking at its attributes.
	if ( !( $element == 'meta' || $element == 'link' ) ) {
		return true;
	}

	// <meta> and <link> must have an itemprop="" otherwise they are not
	// valid or safe in content
	if ( !isset( $attribs['itemprop'] ) ) {
		return false;
	}

	// Collect the attribute each element additionally needs:
	// <meta> a content="" for the itemprop, <link> an associated href="".
	$mandatory = [];
	if ( $element == 'meta' ) {
		$mandatory[] = 'content';
	}
	if ( $element == 'link' ) {
		$mandatory[] = 'href';
	}

	foreach ( $mandatory as $name ) {
		if ( !isset( $attribs[$name] ) ) {
			return false;
		}
	}

	return true;
}
759
760 /**
761 * Take an array of attribute names and values and normalize or discard
762 * illegal values for the given element type.
763 *
764 * - Discards attributes not on a whitelist for the given element
765 * - Unsafe style attributes are discarded
766 * - Invalid id attributes are re-encoded
767 *
768 * @param array $attribs
769 * @param string $element
770 * @return array
771 *
772 * @todo Check for legal values where the DTD limits things.
773 * @todo Check for unique id attribute :P
774 */
static function validateTagAttributes( $attribs, $element ) {
	// Look up the attribute names permitted on this element, then let the
	// generic whitelist-based validator do the actual filtering.
	$allowedForElement = self::attributeWhitelist( $element );

	return self::validateAttributes( $attribs, $allowedForElement );
}
779
780 /**
781 * Take an array of attribute names and values and normalize or discard
782 * illegal values for the given whitelist.
783 *
784 * - Discards attributes not on the given whitelist
785 * - Unsafe style attributes are discarded
786 * - Invalid id attributes are re-encoded
787 *
788 * @param array $attribs
789 * @param array $whitelist List of allowed attribute names
790 * @return array
791 *
792 * @todo Check for legal values where the DTD limits things.
793 * @todo Check for unique id attribute :P
794 */
static function validateAttributes( $attribs, $whitelist ) {
	// Filters $attribs down to the entries that are allowed and safe,
	// normalising style, id and id-reference values on the way.
	$allowed = array_flip( $whitelist );
	$urlPattern = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	// Attributes whose value is a list of element ids
	$idRefListAttribs = [
		'aria-describedby', 'aria-flowto', 'aria-labelledby', 'aria-owns'
	];
	// RDFa and HTML5 microdata attributes; these may hold URLs, URIs
	// and/or CURIs and get screened for script URIs.
	$linkedDataAttribs = [
		'rel', 'rev',
		# RDFa
		'about', 'property', 'resource', 'datatype', 'typeof',
		# HTML5 microdata
		'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype'
	];
	// Attributes that must hold a URL with an allowed protocol
	$urlAttribs = [ 'href', 'src', 'poster' ];

	$clean = [];
	foreach ( $attribs as $name => $val ) {
		# XML namespace declarations are let through (for RDFa) as long as
		# the value does not look like a script URI.
		if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $name ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $val ) ) {
				$clean[$name] = $val;
			}
			continue;
		}

		# Anything starting with "data-" is acceptable provided it has no
		# colon (i.e. is not namespaced); other names must be whitelisted.
		# Data attributes reserved for MediaWiki code are always refused.
		$isPermitted = preg_match( '/^data-[^:]*$/i', $name )
			|| isset( $allowed[$name] );
		if ( !$isPermitted || self::isReservedDataAttribute( $name ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# https://msdn.microsoft.com/en-us/library/ms537634.aspx
		if ( $name == 'style' ) {
			$val = self::checkCss( $val );
		}

		# Escape HTML id attributes
		if ( $name === 'id' ) {
			$val = self::escapeIdForAttribute( $val, self::ID_PRIMARY );
		}

		# Escape HTML id reference lists
		if ( in_array( $name, $idRefListAttribs, true ) ) {
			$val = self::escapeIdReferenceList( $val );
		}

		// Paranoia. Allow "simple" values in linked-data attributes but
		// drop the attribute if it smells like javascript.
		if ( in_array( $name, $linkedDataAttribs, true )
			&& preg_match( self::EVIL_URI_PATTERN, $val )
		) {
			continue;
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		# validation code that can be used by tag hook handlers, etc.
		# Values not using an allowed protocol are dropped; this also
		# drops all relative URLs.
		if ( in_array( $name, $urlAttribs, true )
			&& !preg_match( $urlPattern, $val )
		) {
			continue;
		}

		// A later occurrence of the same name replaces an earlier one, so
		// the output holds at most one attribute of each name.
		$clean[$name] = $val;
	}

	# itemtype, itemid, itemref don't make sense without itemscope
	if ( !array_key_exists( 'itemscope', $clean ) ) {
		unset( $clean['itemtype'], $clean['itemid'], $clean['itemref'] );
	}
	# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

	return $clean;
}
884
885 /**
886 * Given an attribute name, checks whether it is a reserved data attribute
887 * (such as data-mw-foo) which is unavailable to user-generated HTML so MediaWiki
888 * core and extension code can safely use it to communicate with frontend code.
889 * @param string $attr Attribute name.
890 * @return bool
891 */
public static function isReservedDataAttribute( $attr ) {
	// Reserved prefixes (matched case-insensitively at the start of the name):
	//  - data-ooui: reserved for ooui.
	//  - data-mw and data-parsoid: reserved for parsoid.
	//  - data-mw-<name here>: reserved for extensions (or core) that need
	//    to pass data to the client and must be sure it does not come
	//    from an untrusted user.
	// Namespaces are not considered, since user-generated HTML can't use
	// them anymore.
	$reservedPrefix = '/^data-(ooui|mw|parsoid)/i';

	return preg_match( $reservedPrefix, $attr ) === 1;
}
902
903 /**
904 * Merge two sets of HTML attributes. Conflicting items in the second set
905 * will override those in the first, except for 'class' attributes which
906 * will be combined (if they're both strings).
907 *
908 * @todo implement merging for other attributes such as style
909 * @param array $a
910 * @param array $b
911 * @return array
912 */
static function mergeAttributes( $a, $b ) {
	// Entries of $b win over entries of $a ...
	$merged = array_merge( $a, $b );

	// ... except for 'class', which is combined when both sides supply a
	// string and the two strings are not already identical.
	if ( !isset( $a['class'], $b['class'] ) ) {
		return $merged;
	}
	$first = $a['class'];
	$second = $b['class'];
	if ( !is_string( $first ) || !is_string( $second ) || $first === $second ) {
		return $merged;
	}

	// Split the concatenation on whitespace, keep the first occurrence of
	// each class name and join the survivors with single spaces.
	$names = preg_split( '/\s+/', "{$first} {$second}", -1, PREG_SPLIT_NO_EMPTY );
	$merged['class'] = implode( ' ', array_unique( $names ) );

	return $merged;
}
925
926 /**
927 * Normalize CSS into a format we can easily search for hostile input
928 * - decode character references
929 * - decode escape sequences
930 * - convert characters that IE6 interprets into ascii
931 * - remove comments, unless the entire value is one single comment
932 * @param string $value the css string
933 * @return string normalized css
934 */
public static function normalizeCss( $value ) {
	// Rewrites a CSS string into a canonical form in which hostile
	// constructs can be found with simple pattern matching. The steps run
	// in a fixed order because each one can expose input for the next.

	// Decode character references like &#123;
	$value = self::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	static $decodeRegex;
	if ( !$decodeRegex ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$decodeRegex = "/ $backslash
			(?:
			($nl) | # 1. Line continuation
			([0-9A-Fa-f]{1,6})$space? | # 2. character number
			(.) | # 3. backslash cancelling special meaning
			() | # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback( $decodeRegex,
		[ __CLASS__, 'cssDecodeCallback' ], $value );

	// Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii.
	// The range is U+FF01 to U+FF5A, excluding U+FF3C (T60088). It is
	// spelled with code point escapes rather than literal fullwidth
	// characters: literal characters are silently folded to ASCII by any
	// tool that applies compatibility normalisation to the source, which
	// turns the class into one that never matches the intended characters.
	$value = preg_replace_callback(
		'/[\x{FF01}-\x{FF3B}\x{FF3D}-\x{FF5A}]/u',
		function ( $matches ) {
			$cp = UtfNormal\Utils::utf8ToCodepoint( $matches[0] );
			if ( $cp === false ) {
				return '';
			}
			// 65248 = 0xFEE0, the offset between the fullwidth forms
			// and their ASCII counterparts.
			return chr( $cp - 65248 ); // ASCII range \x21-\x7A
		},
		$value
	);

	// Convert more characters IE6 might treat as ascii
	// U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
	$value = str_replace(
		[ 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ],
		[ 'r', 'n', 'n', 'l', 'i', '(', '(' ],
		$value
	);

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
		// Remove any comments; IE gets token splitting wrong
		// This must be done AFTER decoding character references and
		// escape sequences, because those steps can introduce comments
		// This step cannot introduce character references or escape
		// sequences, because it replaces comments with spaces rather
		// than removing them completely.
		$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

		// Remove anything after a comment-start token, to guard against
		// incorrect client implementations.
		$commentPos = strpos( $value, '/*' );
		if ( $commentPos !== false ) {
			$value = substr( $value, 0, $commentPos );
		}
	}

	// S followed by repeat, iteration, or prolonged sound marks,
	// which IE will treat as "ss"
	$value = preg_replace(
		'/s(?:
			\xE3\x80\xB1 | # U+3031
			\xE3\x82\x9D | # U+309D
			\xE3\x83\xBC | # U+30FC
			\xE3\x83\xBD | # U+30FD
			\xEF\xB9\xBC | # U+FE7C
			\xEF\xB9\xBD | # U+FE7D
			\xEF\xBD\xB0 # U+FF70
		)/ix',
		'ss',
		$value
	);

	return $value;
}
1023
/**
 * Normalize a CSS value and refuse it if it contains unsafe constructs.
 *
 * The value is first passed through normalizeCss(). It is then replaced
 * by a short CSS comment when it contains either of the following:
 *  - a control character or the UTF-8 replacement character;
 *  - one of the forbidden constructs matched case-insensitively below:
 *    "expression", "filter:", "accelerator:", "-o-link:",
 *    "-o-link-source:", "-o-replace:", "url(", "image(", "image-set("
 *    or an "attr(" whose arguments mention "url".
 *
 * NOTE: Despite the fact that character references are decoded, the
 * returned string may contain character references given certain
 * clever input strings. These character references must
 * be escaped before the return value is embedded in HTML.
 *
 * @param string $value
 * @return string The normalized value, or a CSS comment explaining the rejection
 */
static function checkCss( $value ) {
	$value = self::normalizeCss( $value );

	// Reject control characters and anything that decoded to U+FFFD
	$hasControlChar = preg_match( '/[\000-\010\013\016-\037\177]/', $value );
	if ( $hasControlChar
		|| strpos( $value, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false
	) {
		return '/* invalid control char */';
	}

	// Reject problematic keywords
	$forbidden = '! expression
			| filter\s*:
			| accelerator\s*:
			| -o-link\s*:
			| -o-link-source\s*:
			| -o-replace\s*:
			| url\s*\(
			| image\s*\(
			| image-set\s*\(
			| attr\s*\([^)]+[\s,]+url
		!ix';
	if ( preg_match( $forbidden, $value ) ) {
		return '/* insecure input */';
	}

	return $value;
}
1065
/**
 * Regex callback resolving a single CSS backslash escape.
 *
 * Group 1 marks a line continuation, group 2 holds the hex digits of a
 * numeric escape and group 3 a single escaped character; if all are
 * empty the escape is treated as a lone backslash. (The pattern that
 * produces these groups lives in the caller.)
 *
 * @param array $matches
 * @return string Replacement text for the escape sequence
 */
static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		// Line continuation: drop it entirely
		return '';
	}

	if ( $matches[2] !== '' ) {
		$decoded = UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$decoded = $matches[3];
	} else {
		$decoded = '\\';
	}

	// Newline, quotes and backslash must stay escaped inside strings, so
	// emit them as a clean hex escape to avoid parsing errors by clients.
	$keepEscaped = [ "\n", '"', "'", '\\' ];
	if ( in_array( $decoded, $keepEscaped ) ) {
		return '\\' . dechex( ord( $decoded ) ) . ' ';
	}

	// Everything else was escaped unnecessarily; return it decoded
	return $decoded;
}
1090
/**
 * Turn a tag-soup attribute fragment into a well-formed attribute string.
 *
 * The fragment is decoded with decodeTagAttributes(), filtered for the
 * given element with validateTagAttributes(), optionally sorted by
 * attribute name, and re-encoded with safeEncodeTagAttributes().
 * A blank fragment yields an empty string.
 *
 * - Normalizes attribute names to lowercase
 * - Discards attributes not on a whitelist for the given element
 * - Turns broken or invalid entities into plaintext
 * - Double-quotes all attribute values
 * - Attributes without values are given the name as attribute
 * - Double attributes are discarded
 * - Unsafe style attributes are discarded
 * - Prepends space if there are attributes.
 * - (Optionally) Sorts attributes by name.
 *
 * @param string $text
 * @param string $element
 * @param bool $sorted Whether to sort the attributes (default: false)
 * @return string
 */
static function fixTagAttributes( $text, $element, $sorted = false ) {
	if ( trim( $text ) === '' ) {
		return '';
	}

	$attribs = self::validateTagAttributes(
		self::decodeTagAttributes( $text ),
		$element
	);

	if ( $sorted ) {
		ksort( $attribs );
	}

	return self::safeEncodeTagAttributes( $attribs );
}
1126
/**
 * Encode an attribute value for HTML output.
 *
 * Besides the usual htmlspecialchars() escaping (both quote styles),
 * newline, carriage return and tab are written as numeric character
 * references.
 *
 * @param string $text
 * @return string HTML-encoded text fragment
 */
static function encodeAttribute( $text ) {
	// Whitespace is normalized during attribute decoding, so any
	// non-space whitespace must be encoded up front or it will be lost.
	return str_replace(
		[ "\n", "\r", "\t" ],
		[ '&#10;', '&#13;', '&#9;' ],
		htmlspecialchars( $text, ENT_QUOTES )
	);
}
1146
/**
 * Armor French spaces with a replacement character
 *
 * A plain space is replaced when it sits between a non-space character
 * and one of ? : ; ! % » › that is not followed by a word character,
 * or directly after « or ‹.
 *
 * @since 1.32
 * @param string $text Text to armor
 * @param string $space Space character for the French spaces, defaults to '&#160;'
 * @return string Armored text
 */
public static function armorFrenchSpaces( $text, $space = '&#160;' ) {
	// Replace $ with \$ and \ with \\ so $space is taken literally
	$literalSpace = preg_replace( '#(?<!\\\\)(\\$|\\\\)#', '\\\\$1', $space );

	// Both patterns capture the character before the space as group 1
	$patterns = [
		// Space before closing punctuation / right guillemets
		'/(\S) (?=[?:;!%»›](?!\w))/u',
		// Space after left guillemets
		'/([«‹]) /u',
	];

	return preg_replace( $patterns, '\\1' . $literalSpace, $text );
}
1168
/**
 * Encode an attribute value for HTML tags, with extra armoring
 * against further wiki processing.
 *
 * On top of encodeAttribute(), characters and keywords that later
 * parsing stages could interpret as wikitext are written as character
 * references, French spaces are armored with '&#32;', and the colon of
 * anything matching wfUrlProtocols() is encoded.
 *
 * @param string $text
 * @return string HTML-encoded text fragment
 */
static function safeEncodeAttribute( $text ) {
	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$wikitextArmor = [
		'<' => '&lt;', // This should never happen,
		'>' => '&gt;', // we've received invalid input
		'"' => '&quot;', // which should have been escaped.
		'{' => '&#123;',
		'}' => '&#125;', // prevent unpaired language conversion syntax
		'[' => '&#91;',
		']' => '&#93;',
		"''" => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC' => '&#82;FC',
		'PMID' => '&#80;MID',
		'|' => '&#124;',
		'__' => '&#95;_',
	];
	$armored = strtr( self::encodeAttribute( $text ), $wikitextArmor );

	# Armor against French spaces detection (T5158)
	$armored = self::armorFrenchSpaces( $armored, '&#32;' );

	# Encode the colon in URL protocols so they are not autolinked
	$protocolPattern = '/((?i)' . wfUrlProtocols() . ')/';
	$hideColon = function ( $matches ) {
		return str_replace( ':', '&#58;', $matches[1] );
	};

	return preg_replace_callback( $protocolPattern, $hideColon, $armored );
}
1208
/**
 * Given a value, escape it so that it can be used in an id attribute and
 * return it, using HTML4-style escaping.
 *
 * Spaces become underscores, the result is URL-encoded, and then "%3A"
 * is turned back into ":" while every remaining "%" becomes ".". Unless
 * the 'noninitial' option is given, an "x" is prepended when the result
 * does not start with an ASCII letter.
 *
 * @deprecated since 1.30, use one of this class' escapeIdFor*() functions
 *
 * @see https://www.w3.org/TR/html401/types.html#type-name Valid characters
 *   in the id and name attributes
 * @see https://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with
 *   the id attribute
 * @see https://www.w3.org/TR/html5/dom.html#the-id-attribute
 *   HTML5 definition of id attribute
 *
 * @param string $id Id to escape
 * @param string|array $options String or array of strings (default is array()):
 *   'noninitial': This is a non-initial fragment of an id, not a full id,
 *       so don't pay attention if the first character isn't valid at the
 *       beginning of an id.
 * @return string
 */
static function escapeId( $id, $options = [] ) {
	$options = (array)$options;

	// HTML4-style escaping
	$escaped = strtr(
		urlencode( str_replace( ' ', '_', $id ) ),
		[
			'%3A' => ':',
			'%' => '.'
		]
	);

	// Initial character must be a letter, unless this is only a fragment
	$startsWithLetter = preg_match( '/^[a-zA-Z]/', $escaped );
	if ( $startsWithLetter || in_array( 'noninitial', $options ) ) {
		return $escaped;
	}

	return "x$escaped";
}
1255
/**
 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 * a valid HTML id attribute.
 *
 * The encoding is taken from $wgFragmentMode[$mode] and applied by escapeIdInternal().
 *
 * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe,
 * be sure to use proper escaping.
 *
 * @param string $id String to escape
 * @param int $mode One of ID_* constants, specifying whether the primary or fallback encoding
 *     should be used.
 * @return string|bool Escaped ID or false if fallback encoding is requested but it's not
 *     configured.
 * @throws UnexpectedValueException If the primary mode is requested but not configured
 *
 * @since 1.30
 */
public static function escapeIdForAttribute( $id, $mode = self::ID_PRIMARY ) {
	global $wgFragmentMode;

	if ( isset( $wgFragmentMode[$mode] ) ) {
		return self::escapeIdInternal( $id, $wgFragmentMode[$mode] );
	}

	// A missing primary mode is a configuration error; a missing
	// fallback simply means there is nothing to return.
	if ( $mode === self::ID_PRIMARY ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return false;
}
1285
/**
 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 * a valid URL fragment.
 *
 * Always uses the primary encoding from $wgFragmentMode.
 *
 * WARNING: unlike escapeId(), the output of this function is not guaranteed to be HTML safe,
 * be sure to use proper escaping.
 *
 * @param string $id String to escape
 * @return string Escaped ID
 * @throws UnexpectedValueException If no primary mode is configured
 *
 * @since 1.30
 */
public static function escapeIdForLink( $id ) {
	global $wgFragmentMode;

	$hasPrimary = isset( $wgFragmentMode[self::ID_PRIMARY] );
	if ( !$hasPrimary ) {
		throw new UnexpectedValueException( '$wgFragmentMode is configured with no primary mode' );
	}

	return self::escapeIdInternal( $id, $wgFragmentMode[self::ID_PRIMARY] );
}
1311
/**
 * Given a section name or other user-generated or otherwise unsafe string, escapes it to be
 * a valid URL fragment for external interwikis.
 *
 * The encoding used is the one named by $wgExternalInterwikiFragmentMode.
 *
 * @param string $id String to escape
 * @return string Escaped ID
 *
 * @since 1.30
 */
public static function escapeIdForExternalInterwiki( $id ) {
	global $wgExternalInterwikiFragmentMode;

	return self::escapeIdInternal( $id, $wgExternalInterwikiFragmentMode );
}
1328
/**
 * Helper for escapeIdFor*() functions. Performs most of the actual escaping.
 *
 * Supported modes:
 *  - 'html5': spaces are replaced by underscores, nothing else changes.
 *  - 'legacy': spaces are replaced by underscores, the result is
 *    URL-encoded, then "%3A" becomes ":" and every other "%" becomes ".".
 *
 * @param string $id String to escape
 * @param string $mode One of modes from $wgFragmentMode
 * @return string
 * @throws InvalidArgumentException If $mode is not a supported mode
 */
private static function escapeIdInternal( $id, $mode ) {
	switch ( $mode ) {
		case 'html5':
			$id = str_replace( ' ', '_', $id );
			break;
		case 'legacy':
			// This corresponds to 'noninitial' mode of the old escapeId()
			static $replace = [
				'%3A' => ':',
				'%' => '.'
			];

			$id = urlencode( str_replace( ' ', '_', $id ) );
			$id = strtr( $id, $replace );
			break;
		default:
			// Close the quote opened around the method name so the
			// message reads "... passed to 'Class::method'".
			throw new InvalidArgumentException(
				"Invalid mode '$mode' passed to '" . __METHOD__ . "'"
			);
	}

	return $id;
}
1357
/**
 * Given a string containing a space delimited list of ids, escape each id
 * with escapeIdForAttribute() and return the re-joined list.
 *
 * @todo remove $options completely in 1.32
 *
 * @since 1.27
 *
 * @param string $referenceString Space delimited list of ids
 * @param string|array $options Deprecated and does nothing.
 * @return string Escaped ids separated by single spaces ('' if there were none)
 */
static function escapeIdReferenceList( $referenceString, $options = [] ) {
	if ( $options ) {
		wfDeprecated( __METHOD__ . ' with $options', '1.31' );
	}

	# Split on any run of whitespace, ignoring empty tokens
	$tokens = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );

	# Escape each token as an id
	$escaped = [];
	foreach ( $tokens as $token ) {
		$escaped[] = self::escapeIdForAttribute( $token );
	}

	# An empty list joins to an empty string
	return implode( ' ', $escaped );
}
1388
/**
 * Given a value, escape it so that it can be used as a CSS class and
 * return it.
 *
 * A leading digit or hyphen, ASCII control characters and spaces, most
 * ASCII punctuation and the byte sequence C2 A0 are replaced by
 * underscores; runs of underscores are collapsed and trailing
 * underscores removed.
 *
 * @todo For extra validity, input should be validated UTF-8.
 *
 * @see https://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 *
 * @param string $class
 * @return string
 */
static function escapeClass( $class ) {
	// Convert ugly stuff to underscores
	$escaped = preg_replace(
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		'_',
		$class
	);

	// Kill underscores in ugly places: collapse runs, strip the tail
	$escaped = preg_replace( '/_+/', '_', $escaped );

	return rtrim( $escaped, '_' );
}
1407
/**
 * Given HTML input, escape with htmlspecialchars but un-escape entities.
 * This allows (generally harmless) entities like &#160; to survive.
 *
 * Character references are decoded first, then the whole string is
 * escaped once.
 *
 * @param string $html HTML to escape
 * @return string Escaped input
 */
static function escapeHtmlAllowEntities( $html ) {
	$decoded = self::decodeCharReferences( $html );

	# Both quote styles are escaped as a matter of course. ENT_SUBSTITUTE
	# keeps an incorrectly truncated multibyte character from making the
	# entire string disappear.
	return htmlspecialchars( $decoded, ENT_QUOTES | ENT_SUBSTITUTE );
}
1423
/**
 * Return an associative array of attribute names and values from
 * a partial tag string. Attribute names are forced to lowercase,
 * whitespace in values is collapsed and trimmed, and character
 * references are decoded to UTF-8 text.
 *
 * If an attribute name occurs more than once, the last occurrence wins.
 *
 * @param string $text
 * @return array Map of lowercase attribute name to decoded value
 */
public static function decodeTagAttributes( $text ) {
	$attribs = [];

	if ( trim( $text ) === '' ) {
		return $attribs;
	}

	$sets = [];
	$found = preg_match_all( self::getAttribsRegex(), $text, $sets, PREG_SET_ORDER );
	if ( !$found ) {
		return $attribs;
	}

	foreach ( $sets as $set ) {
		$name = strtolower( $set[1] );
		$raw = self::getTagAttributeCallback( $set );

		// Normalize whitespace
		$collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

		// Decode character references
		$attribs[$name] = self::decodeCharReferences( $collapsed );
	}

	return $attribs;
}
1460
/**
 * Build a partial tag string from an associative array of attribute
 * names and values as returned by decodeTagAttributes.
 *
 * Names are passed through htmlspecialchars() and values through
 * safeEncodeAttribute(); each pair is written as name="value".
 *
 * @param array $assoc_array
 * @return string Attributes preceded by a single space, or '' if there are none
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
	$pairs = [];
	foreach ( $assoc_array as $name => $rawValue ) {
		$pairs[] = htmlspecialchars( $name ) . '="' . self::safeEncodeAttribute( $rawValue ) . '"';
	}

	if ( !$pairs ) {
		return '';
	}

	return ' ' . implode( ' ', $pairs );
}
1478
/**
 * Pick the appropriate attribute value from a match set from the
 * attribs regex matches.
 *
 * @param array $set
 * @throws MWException When tag conditions are not met.
 * @return string
 */
private static function getTagAttributeCallback( $set ) {
	// Checked in this order: 5 = unquoted value, 4 = single-quoted value,
	// 3 = double-quoted value. The first group that is set wins.
	foreach ( [ 5, 4, 3 ] as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( !isset( $set[2] ) ) {
		# In XHTML, attributes must have a value so return an empty string.
		# See "Empty attribute syntax",
		# https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
		return "";
	}

	throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
1506
/**
 * Collapse each run of spaces, tabs, carriage returns and line feeds
 * into a single space, then trim the result.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
	$collapsed = preg_replace( '/(?:\r\n|[\x20\x0d\x0a\x09])+/', ' ', $text );

	return trim( $collapsed );
}
1517
/**
 * Normalizes whitespace in a section name, such as might be returned
 * by Parser::stripSectionName(), for use in the id's that are used for
 * section links.
 *
 * Every run of spaces and/or underscores becomes a single space, and
 * the result is trimmed.
 *
 * @param string $section
 * @return string
 */
static function normalizeSectionNameWhitespace( $section ) {
	$spaced = preg_replace( '/[ _]+/', ' ', $section );

	return trim( $spaced );
}
1529
/**
 * Ensure that any entities and character references are legal
 * for XML and XHTML specifically. Any stray bits will be
 * &amp;-escaped to result in a valid text fragment.
 *
 * Every match of CHAR_REFS_REGEX is rewritten by
 * normalizeCharReferencesCallback():
 *
 * a. named char refs can only be &lt; &gt; &amp; &quot;, others are
 *   numericized (this way we're well-formed even without a DTD)
 * b. any numeric char refs must be legal chars, not invalid or forbidden
 * c. use lower cased "&#x", not "&#X"
 *
 * @param string $text
 * @return string
 * @private
 */
static function normalizeCharReferences( $text ) {
	$callback = [ self::class, 'normalizeCharReferencesCallback' ];

	return preg_replace_callback( self::CHAR_REFS_REGEX, $callback, $text );
}
1551
/**
 * Callback for normalizeCharReferences(): rewrite one CHAR_REFS_REGEX match.
 *
 * Group 1 is a named entity, group 2 a decimal reference and group 3 a
 * hexadecimal reference. When none of them yields a replacement (a bare
 * ampersand, or an invalid numeric reference) the matched text is
 * HTML-escaped instead.
 *
 * @param array $matches
 * @return string
 */
static function normalizeCharReferencesCallback( $matches ) {
	$normalized = null;

	if ( $matches[1] != '' ) {
		$normalized = self::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		$normalized = self::decCharReference( $matches[2] );
	} elseif ( $matches[3] != '' ) {
		$normalized = self::hexCharReference( $matches[3] );
	}

	if ( $normalized !== null ) {
		return $normalized;
	}

	return htmlspecialchars( $matches[0] );
}
1571
/**
 * Normalize a named entity, given without the surrounding "&" and ";".
 *
 * Checked in this order:
 *  - a MediaWiki-specific alias returns the aliased HTML entity;
 *  - lt, gt, amp and quot are returned as named entities;
 *  - any other name known to the entity table is returned as the
 *    equivalent decimal numeric reference;
 *  - anything else returns HTML-escaped text of the pseudo-entity
 *    source (eg &amp;foo;).
 *
 * @param string $name
 * @return string
 */
static function normalizeEntity( $name ) {
	if ( isset( self::$htmlEntityAliases[$name] ) ) {
		return '&' . self::$htmlEntityAliases[$name] . ';';
	}

	$coreEntities = [ 'lt', 'gt', 'amp', 'quot' ];
	if ( in_array( $name, $coreEntities ) ) {
		return "&$name;";
	}

	if ( isset( self::$htmlEntities[$name] ) ) {
		return '&#' . self::$htmlEntities[$name] . ';';
	}

	return "&amp;$name;";
}
1593
/**
 * Normalize a decimal character reference.
 *
 * @param int|string $codepoint Digits of the reference; converted with intval()
 * @return null|string The reference as "&#N;", or null if the code point
 *   is rejected by validateCodepoint()
 */
static function decCharReference( $codepoint ) {
	$value = intval( $codepoint );

	return self::validateCodepoint( $value )
		? sprintf( '&#%d;', $value )
		: null;
}
1606
/**
 * Normalize a hexadecimal character reference.
 *
 * @param int|string $codepoint Hex digits of the reference; converted with hexdec()
 * @return null|string The reference as lower-case "&#xN;", or null if the
 *   code point is rejected by validateCodepoint()
 */
static function hexCharReference( $codepoint ) {
	$value = hexdec( $codepoint );

	return self::validateCodepoint( $value )
		? sprintf( '&#x%x;', $value )
		: null;
}
1619
/**
 * Returns true if a given Unicode codepoint is a valid character in
 * both HTML5 and XML.
 *
 * Accepted: tab, line feed, U+0020-U+007E, U+00A0-U+D7FF,
 * U+E000-U+FFFD and U+10000-U+10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).
	if ( $codepoint == 0x09 || $codepoint == 0x0a ) {
		return true;
	}

	$allowedRanges = [
		[ 0x20, 0x7e ],
		[ 0xa0, 0xd7ff ],
		[ 0xe000, 0xfffd ],
		[ 0x10000, 0x10ffff ],
	];
	foreach ( $allowedRanges as $range ) {
		if ( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
			return true;
		}
	}

	return false;
}
1637
/**
 * Decode any character references, numeric or named entities,
 * in the text and return a UTF-8 string.
 *
 * Every match of CHAR_REFS_REGEX is handed to
 * decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	$callback = [ self::class, 'decodeCharReferencesCallback' ];

	return preg_replace_callback( self::CHAR_REFS_REGEX, $callback, $text );
}
1651
/**
 * Decode any character references, numeric or named entities,
 * in the text and normalize the resulting string. (T16952)
 *
 * This is useful for page titles, not for text to be displayed,
 * MediaWiki allows HTML entities to escape normalization as a feature.
 *
 * The content language's normalize() is only invoked when the regex
 * actually matched something; otherwise the text is returned as is.
 *
 * @param string $text Already normalized, containing entities
 * @return string Still normalized, without entities
 */
public static function decodeCharReferencesAndNormalize( $text ) {
	$replacements = 0;
	$decoded = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		[ self::class, 'decodeCharReferencesCallback' ],
		$text,
		-1, // no limit
		$replacements
	);

	if ( !$replacements ) {
		return $decoded;
	}

	return MediaWikiServices::getInstance()->getContentLanguage()->normalize( $decoded );
}
1677
/**
 * Callback for decodeCharReferences(): decode one CHAR_REFS_REGEX match.
 *
 * Group 1 is a named entity, group 2 a decimal reference and group 3 a
 * hexadecimal reference.
 *
 * @param array $matches
 * @return string
 */
static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		return self::decodeEntity( $matches[1] );
	}

	if ( $matches[2] != '' ) {
		return self::decodeChar( intval( $matches[2] ) );
	}

	if ( $matches[3] != '' ) {
		return self::decodeChar( hexdec( $matches[3] ) );
	}

	# Last case should be an ampersand by itself
	return $matches[0];
}
1693
/**
 * Return UTF-8 string for a codepoint if that is a valid
 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 *
 * Validity is decided by validateCodepoint().
 *
 * @param int $codepoint
 * @return string
 * @private
 */
static function decodeChar( $codepoint ) {
	return self::validateCodepoint( $codepoint )
		? UtfNormal\Utils::codepointToUtf8( $codepoint )
		: UtfNormal\Constants::UTF8_REPLACEMENT;
}
1708
/**
 * Decode a named entity, given without the surrounding "&" and ";".
 *
 * MediaWiki-specific aliases are resolved first. If the resulting name
 * is in the entity table, the UTF-8 encoding of that character is
 * returned. Otherwise, returns pseudo-entity source (eg "&foo;").
 *
 * @param string $name
 * @return string
 */
static function decodeEntity( $name ) {
	$resolved = self::$htmlEntityAliases[$name] ?? $name;

	if ( !isset( self::$htmlEntities[$resolved] ) ) {
		return "&$resolved;";
	}

	return UtfNormal\Utils::codepointToUtf8( self::$htmlEntities[$resolved] );
}
1727
/**
 * Fetch the whitelist of acceptable attributes for a given element name.
 *
 * @param string $element
 * @return array Attribute names, or an empty array if the element is
 *   not in the table built by setupAttributeWhitelist()
 */
static function attributeWhitelist( $element ) {
	$allElements = self::setupAttributeWhitelist();

	if ( isset( $allElements[$element] ) ) {
		return $allElements[$element];
	}

	return [];
}
1738
/**
 * Foreach array key (an allowed HTML element), return an array
 * of allowed attributes
 *
 * The table is built on the first call and kept in a static variable
 * for subsequent calls.
 *
 * @return array Map of element name to list of attribute names
 */
static function setupAttributeWhitelist() {
	static $whitelist = null;

	if ( $whitelist !== null ) {
		return $whitelist;
	}

	$common = [
		# HTML
		'id',
		'class',
		'style',
		'lang',
		'dir',
		'title',

		# WAI-ARIA
		'aria-describedby',
		'aria-flowto',
		'aria-label',
		'aria-labelledby',
		'aria-owns',
		'role',

		# RDFa
		# These attributes are specified in section 9 of
		# https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
		'about',
		'property',
		'resource',
		'datatype',
		'typeof',

		# Microdata. These are specified by
		# https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
		'itemid',
		'itemprop',
		'itemref',
		'itemscope',
		'itemtype',
	];

	// Shorthand for "the common attributes followed by these extra lists"
	$commonPlus = function ( array ...$extras ) use ( $common ) {
		return array_merge( $common, ...$extras );
	};

	$block = $commonPlus( [ 'align' ] );
	$tablealign = [ 'align', 'valign' ];
	$tablecell = [
		'abbr',
		'axis',
		'headers',
		'scope',
		'rowspan',
		'colspan',
		'nowrap', # deprecated
		'width', # deprecated
		'height', # deprecated
		'bgcolor', # deprecated
	];
	$citable = $commonPlus( [ 'cite' ] );
	$edit = $commonPlus( [ 'cite', 'datetime' ] );
	$column = $commonPlus( [ 'span' ] );
	$cell = $commonPlus( $tablecell, $tablealign );

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: https://www.w3.org/TR/html4/
	$whitelist = [
		# 7.5.4
		'div' => $block,
		'center' => $common, # deprecated
		'span' => $common,

		# 7.5.5
		'h1' => $block,
		'h2' => $block,
		'h3' => $block,
		'h4' => $block,
		'h5' => $block,
		'h6' => $block,

		# 7.5.6
		# address

		# 8.2.4
		'bdo' => $common,

		# 9.2.1
		'em' => $common,
		'strong' => $common,
		'cite' => $common,
		'dfn' => $common,
		'code' => $common,
		'samp' => $common,
		'kbd' => $common,
		'var' => $common,
		'abbr' => $common,
		# acronym

		# 9.2.2
		'blockquote' => $citable,
		'q' => $citable,

		# 9.2.3
		'sub' => $common,
		'sup' => $common,

		# 9.3.1
		'p' => $block,

		# 9.3.2
		'br' => $commonPlus( [ 'clear' ] ),

		# https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
		'wbr' => $common,

		# 9.3.4
		'pre' => $commonPlus( [ 'width' ] ),

		# 9.4
		'ins' => $edit,
		'del' => $edit,

		# 10.2
		'ul' => $commonPlus( [ 'type' ] ),
		'ol' => $commonPlus( [ 'type', 'start', 'reversed' ] ),
		'li' => $commonPlus( [ 'type', 'value' ] ),

		# 10.3
		'dl' => $common,
		'dd' => $common,
		'dt' => $common,

		# 11.2.1
		'table' => $commonPlus( [
			'summary', 'width', 'border', 'frame',
			'rules', 'cellspacing', 'cellpadding',
			'align', 'bgcolor',
		] ),

		# 11.2.2
		'caption' => $block,

		# 11.2.3
		'thead' => $common,
		'tfoot' => $common,
		'tbody' => $common,

		# 11.2.4
		'colgroup' => $column,
		'col' => $column,

		# 11.2.5
		'tr' => $commonPlus( [ 'bgcolor' ], $tablealign ),

		# 11.2.6
		'td' => $cell,
		'th' => $cell,

		# 12.2
		# NOTE: <a> is not allowed directly, but the attrib
		# whitelist is used from the Parser object
		'a' => $commonPlus( [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized, or if $wgAllowImageTag is
		# true
		'img' => $commonPlus( [ 'alt', 'src', 'width', 'height', 'srcset' ] ),

		'video' => $commonPlus( [ 'poster', 'controls', 'preload', 'width', 'height' ] ),
		'source' => $commonPlus( [ 'type', 'src' ] ),
		'track' => $commonPlus( [ 'type', 'src', 'srclang', 'kind', 'label' ] ),

		# 15.2.1
		'tt' => $common,
		'b' => $common,
		'i' => $common,
		'big' => $common,
		'small' => $common,
		'strike' => $common,
		's' => $common,
		'u' => $common,

		# 15.2.2
		'font' => $commonPlus( [ 'size', 'color', 'face' ] ),
		# basefont

		# 15.3
		'hr' => $commonPlus( [ 'width' ] ),

		# HTML Ruby annotation text module, simple ruby only.
		# https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
		'ruby' => $common,
		# rbc
		'rb' => $common,
		'rp' => $common,
		'rt' => $common, # rbspan is not included
		'rtc' => $common,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# https://www.w3.org/TR/REC-MathML/
		'math' => [ 'class', 'style', 'id', 'title' ],

		// HTML 5 section 4.5
		'figure' => $common,
		'figcaption' => $common,

		# HTML 5 section 4.6
		'bdi' => $common,

		# HTML5 elements, defined by:
		# https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
		'data' => $commonPlus( [ 'value' ] ),
		'time' => $commonPlus( [ 'datetime' ] ),
		'mark' => $common,

		// meta and link are only permitted by removeHTMLtags when Microdata
		// is enabled so we don't bother adding a conditional to hide these
		// Also meta and link are only valid in WikiText as Microdata elements
		// (ie: validateTag rejects tags missing the attributes needed for Microdata)
		// So we don't bother including $common attributes that have no purpose.
		'meta' => [ 'itemprop', 'content' ],
		'link' => [ 'itemprop', 'href', 'title' ],
	];

	return $whitelist;
}
1965
/**
 * Take a fragment of (potentially invalid) HTML and return
 * a version with any tags removed, encoded as plain text.
 *
 * The fragment is tokenized with RemexHtml, the text collected by a
 * RemexStripTagHandler is taken as the result, and its whitespace is
 * normalized with normalizeWhitespace().
 *
 * Warning: this return value must be further escaped for literal
 * inclusion in HTML output as of 1.10!
 *
 * @param string $html HTML fragment
 * @return string
 */
static function stripAllTags( $html ) {
	// Use RemexHtml to tokenize $html and extract the text
	$handler = new RemexStripTagHandler;
	$tokenizerOptions = [
		'ignoreErrors' => true,
		// don't ignore char refs, we want them to be decoded
		'ignoreNulls' => true,
		'skipPreprocess' => true,
	];
	$tokenizer = new RemexHtml\Tokenizer\Tokenizer( $handler, $html, $tokenizerOptions );
	$tokenizer->execute();

	return self::normalizeWhitespace( $handler->getResult() );
}
1991
/**
 * Hack up a private DOCTYPE with HTML's standard entity declarations.
 * PHP 4 seemed to know these if you gave it an HTML doctype, but
 * PHP 5.1 doesn't.
 *
 * One ENTITY declaration is emitted per entry of the entity table,
 * mapping the name to its decimal numeric reference.
 *
 * Use for passing XHTML fragments to PHP's XML parsing functions
 *
 * @return string
 */
static function hackDocType() {
	$declarations = '';
	foreach ( self::$htmlEntities as $name => $codepoint ) {
		$declarations .= '<!ENTITY ' . $name . ' "&#' . $codepoint . ';">';
	}

	return "<!DOCTYPE html [\n" . $declarations . "]>\n";
}
2009
/**
 * Clean up a URL taken from wikitext.
 *
 * Steps, in order:
 *  - character references are decoded;
 *  - brackets, angle brackets, double quotes, pipes, control characters,
 *    spaces and DEL are URL-encoded;
 *  - if the URL splits into protocol, optional //host and rest, characters
 *    that are ignored in IDNs are stripped from the host, and a
 *    URL-encoded bracketed IPv6 host is turned back into [address].
 *
 * A URL that does not split this way is returned after the first two steps.
 *
 * @param string $url
 * @return string
 */
static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = self::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step
	$url = preg_replace_callback(
		'/[\][<>"\\x00-\\x20\\x7F\|]/',
		[ __CLASS__, 'cleanUrlCallback' ],
		$url
	);

	# Validate hostname portion
	$parts = [];
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $parts ) ) {
		return $url;
	}
	$protocol = $parts[1];
	$host = $parts[2];
	$rest = $parts[3];

	// Characters that will be ignored in IDNs.
	// https://tools.ietf.org/html/rfc3454#section-3.1
	// Strip them before further processing so blacklists and such work.
	$ignoredInIdn = "/
		\\s| # general whitespace
		\xc2\xad| # 00ad SOFT HYPHEN
		\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
		\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
		\xe2\x81\xa0| # 2060 WORD JOINER
		\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
		\xcd\x8f| # 034f COMBINING GRAPHEME JOINER
		\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
		\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
		\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
		\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
		\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
		[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
		/xuD";
	$host = preg_replace( $ignoredInIdn, '', $host );

	// IPv6 host names are bracketed with []. Url-decode these.
	$ipv6 = [];
	$looksBracketed = substr_compare( "//%5B", $host, 0, 5 ) === 0;
	if ( $looksBracketed
		&& preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $ipv6 )
	) {
		$host = '//[' . $ipv6[1] . ']' . $ipv6[2];
	}

	// @todo FIXME: Validate hostnames here

	return $protocol . $host . $rest;
}
2063
/**
 * Callback for the preg_replace_callback() call in cleanUrl(): URL-encodes
 * the matched text.
 *
 * @param array $matches
 * @return string
 */
static function cleanUrlCallback( $matches ) {
	// Index 0 of a preg match array is the full matched text; return it
	// URL-encoded so it can stand in for the raw characters.
	$matchedText = $matches[0];
	$encoded = urlencode( $matchedText );

	return $encoded;
}
2071
2072 /**
2073 * Does a string look like an e-mail address?
2074 *
2075 * This validates an email address using an HTML5 specification found at:
2076 * http://www.whatwg.org/html/states-of-the-type-attribute.html#valid-e-mail-address
2077 * Which as of 2011-01-24 says:
2078 *
2079 * A valid e-mail address is a string that matches the ABNF production
2080 * 1*( atext / "." ) "@" ldh-str *( "." ldh-str ) where atext is defined
2081 * in RFC 5322 section 3.2.3, and ldh-str is defined in RFC 1034 section
2082 * 3.5.
2083 *
2084 * This function is an implementation of the specification as requested in
2085 * T24449.
2086 *
2087 * Client-side forms will use the same standard validation rules via JS or
2088 * HTML 5 validation; additional restrictions can be enforced server-side
2089 * by extensions via the 'isValidEmailAddr' hook.
2090 *
2091 * Note that this validation doesn't 100% match RFC 2822, but is believed
2092 * to be liberal enough for wide use. Some invalid addresses will still
2093 * pass validation here.
2094 *
2095 * @since 1.18
2096 *
2097 * @param string $addr E-mail address
2098 * @return bool
2099 */
public static function validateEmail( $addr ) {
	// Run the 'isValidEmailAddr' hook first. When the hook run is aborted,
	// whatever was stored in $result is returned and the built-in check
	// below is skipped.
	$result = null;
	if ( !Hooks::run( 'isValidEmailAddr', [ $addr, &$result ] ) ) {
		return $result;
	}

	// Please note strings below are enclosed in brackets [], this make the
	// hyphen "-" a range indicator. Hence it is double backslashed below.
	// See T28948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	// The D modifier makes "$" match only at the very end of the subject.
	// Without it "$" also matches just before a trailing newline, so an
	// address such as "user@example.org\n" would be accepted.
	$html5_email_regexp = "/
		^ # start of string
		[$rfc5322_atext\\.]+ # user part which is liberal :p
		@ # at sign between user and domain parts
		[$rfc1034_ldh_str]+ # First domain part
		(\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
		$ # End of string
		/ixD"; // case Insensitive, eXtended, Dollar at end only

	return (bool)preg_match( $html5_email_regexp, $addr );
}
2123 }