Merge "tests: Add PHPUnit tests for methods in MagicWordFactory::class"
[lhc/web/wiklou.git] / tests / phpunit / includes / parser / SanitizerTest.php
1 <?php
2
/**
 * Tests for the static methods of Sanitizer.
 *
 * @todo Tests covering decodeCharReferences can be refactored into a single
 * method and dataprovider.
 *
 * @group Sanitizer
 */
class SanitizerTest extends MediaWikiTestCase {

	/**
	 * Destroy the MWTidy singleton after every test, then run the parent teardown.
	 *
	 * Some tests in this class replace the instance through MWTidy::setInstance();
	 * presumably this keeps that replacement from leaking into later tests.
	 */
	protected function tearDown() {
		MWTidy::destroySingleton();
		parent::tearDown();
	}

	/**
	 * Disable tidy for the current test.
	 *
	 * Both deprecation notices are hidden first, because they are registered
	 * before MWTidy::setInstance( false ) is called.
	 */
	private function disableTidy() {
		$this->hideDeprecated( 'disabling tidy' );
		$this->hideDeprecated( 'MWTidy::setInstance' );
		MWTidy::setInstance( false );
	}

	/**
	 * A named entity is decoded to its UTF-8 byte sequence.
	 *
	 * @covers Sanitizer::decodeCharReferences
	 */
	public function testDecodeNamedEntities() {
		$this->assertEquals(
			"\xc3\xa9cole",
			Sanitizer::decodeCharReferences( '&eacute;cole' ),
			'decode named entities'
		);
	}

	/**
	 * Hexadecimal and decimal numeric entities are decoded.
	 *
	 * @covers Sanitizer::decodeCharReferences
	 */
	public function testDecodeNumericEntities() {
		$this->assertEquals(
			"\xc4\x88io bonas dans l'\xc3\xa9cole!",
			Sanitizer::decodeCharReferences( "&#x108;io bonas dans l'&#233;cole!" ),
			'decode numeric entities'
		);
	}

	/**
	 * Numeric and named entities may be mixed in one string.
	 *
	 * @covers Sanitizer::decodeCharReferences
	 */
	public function testDecodeMixedEntities() {
		$this->assertEquals(
			"\xc4\x88io bonas dans l'\xc3\xa9cole!",
			Sanitizer::decodeCharReferences( "&#x108;io bonas dans l'&eacute;cole!" ),
			'decode mixed numeric/named entities'
		);
	}

	/**
	 * An escaped ampersand is decoded once only: the entity-looking text it
	 * produces must not be decoded a second time.
	 *
	 * @covers Sanitizer::decodeCharReferences
	 */
	public function testDecodeMixedComplexEntities() {
		$this->assertEquals(
			"\xc4\x88io bonas dans l'\xc3\xa9cole! (mais pas &#x108;io dans l'&eacute;cole)",
			Sanitizer::decodeCharReferences(
				"&#x108;io bonas dans l'&eacute;cole! (mais pas &amp;#x108;io dans l'&#38;eacute;cole)"
			),
			'decode mixed complex entities'
		);
	}

	/**
	 * A bare ampersand that starts no entity is returned unchanged.
	 *
	 * @covers Sanitizer::decodeCharReferences
	 */
	public function testInvalidAmpersand() {
		$this->assertEquals(
			'a & b',
			Sanitizer::decodeCharReferences( 'a & b' ),
			'Invalid ampersand'
		);
	}

	/**
	 * An unknown named entity is returned unchanged.
	 *
	 * @covers Sanitizer::decodeCharReferences
	 */
	public function testInvalidEntities() {
		$this->assertEquals(
			'&foo;',
			Sanitizer::decodeCharReferences( '&foo;' ),
			'Invalid named entity'
		);
	}

	/**
	 * An out-of-range numeric entity decodes to the UTF-8 replacement character.
	 *
	 * @covers Sanitizer::decodeCharReferences
	 */
	public function testInvalidNumberedEntities() {
		$this->assertEquals(
			UtfNormal\Constants::UTF8_REPLACEMENT,
			Sanitizer::decodeCharReferences( "&#88888888888888;" ),
			'Invalid numbered entity'
		);
	}

	/**
	 * @covers Sanitizer::removeHTMLtags
	 * @dataProvider provideHtml5Tags
	 *
	 * @param string $tag Name of an HTML5 element (ie: 'video')
	 * @param bool $escaped Whether sanitizer let the tag in or escape it (ie: '&lt;video&gt;')
	 */
	public function testRemovehtmltagsOnHtml5Tags( $tag, $escaped ) {
		$this->disableTidy();

		if ( $escaped ) {
			$this->assertEquals( "&lt;$tag&gt;",
				Sanitizer::removeHTMLtags( "<$tag>" )
			);
		} else {
			$this->assertEquals( "<$tag></$tag>\n",
				Sanitizer::removeHTMLtags( "<$tag>" )
			);
		}
	}

	/**
	 * Provide HTML5 tags
	 *
	 * @return array[] Rows of [ <tag name>, <whether the tag is expected to be escaped> ]
	 */
	public static function provideHtml5Tags() {
		$ESCAPED = true; # We want tag to be escaped
		$VERBATIM = false; # We want to keep the tag
		return [
			[ 'data', $VERBATIM ],
			[ 'mark', $VERBATIM ],
			[ 'time', $VERBATIM ],
			[ 'video', $ESCAPED ],
		];
	}

	/**
	 * Datasets for testRemoveHTMLtags().
	 *
	 * @return array[] Rows of [ <input>, <expected output>, <message> ]
	 */
	public static function dataRemoveHTMLtags() {
		return [
			// former testSelfClosingTag
			[
				'<div>Hello world</div />',
				'<div>Hello world</div>',
				'Self-closing closing div'
			],
			// Make sure special nested HTML5 semantics are not broken
			// https://html.spec.whatwg.org/multipage/semantics.html#the-kbd-element
			[
				'<kbd><kbd>Shift</kbd>+<kbd>F3</kbd></kbd>',
				'<kbd><kbd>Shift</kbd>+<kbd>F3</kbd></kbd>',
				'Nested <kbd>.'
			],
			// https://html.spec.whatwg.org/multipage/semantics.html#the-sub-and-sup-elements
			[
				'<var>x<sub><var>i</var></sub></var>, <var>y<sub><var>i</var></sub></var>',
				'<var>x<sub><var>i</var></sub></var>, <var>y<sub><var>i</var></sub></var>',
				'Nested <var>.'
			],
			// https://html.spec.whatwg.org/multipage/semantics.html#the-dfn-element
			[
				'<dfn><abbr title="Garage Door Opener">GDO</abbr></dfn>',
				'<dfn><abbr title="Garage Door Opener">GDO</abbr></dfn>',
				'<abbr> inside <dfn>',
			],
		];
	}

	/**
	 * @dataProvider dataRemoveHTMLtags
	 * @covers Sanitizer::removeHTMLtags
	 *
	 * @param string $input HTML handed to the sanitizer
	 * @param string $output Expected sanitized HTML
	 * @param string|null $msg Assertion message
	 */
	public function testRemoveHTMLtags( $input, $output, $msg = null ) {
		$this->disableTidy();
		$this->assertEquals( $output, Sanitizer::removeHTMLtags( $input ), $msg );
	}

	/**
	 * @dataProvider provideTagAttributesToDecode
	 * @covers Sanitizer::decodeTagAttributes
	 *
	 * @param array $expected Expected attribute name => value map
	 * @param string $attributes Raw attribute text
	 * @param string $message Assertion message
	 */
	public function testDecodeTagAttributes( $expected, $attributes, $message = '' ) {
		$this->assertEquals( $expected,
			Sanitizer::decodeTagAttributes( $attributes ),
			$message
		);
	}

	/**
	 * Datasets for testDecodeTagAttributes().
	 *
	 * @return array[] Rows of [ <expected map>, <attribute text>, <message> ]
	 */
	public static function provideTagAttributesToDecode() {
		return [
			[ [ 'foo' => 'bar' ], 'foo=bar', 'Unquoted attribute' ],
			[ [ 'עברית' => 'bar' ], 'עברית=bar', 'Non-Latin attribute' ],
			[ [ '६' => 'bar' ], '६=bar', 'Devanagari number' ],
			[ [ '搭𨋢' => 'bar' ], '搭𨋢=bar', 'Non-BMP character' ],
			// "\xcc\x81" is U+0301 COMBINING ACUTE ACCENT; it is spelled out so that
			// the input cannot be silently normalized into a precomposed letter.
			[ [], "n\xcc\x81gh=bar", 'Combining accent is not allowed' ],
			[ [ 'foo' => 'bar' ], ' foo = bar ', 'Spaced attribute' ],
			[ [ 'foo' => 'bar' ], 'foo="bar"', 'Double-quoted attribute' ],
			[ [ 'foo' => 'bar' ], 'foo=\'bar\'', 'Single-quoted attribute' ],
			// NOTE(review): the next three datasets are identical as written;
			// presumably they were meant to differ (for example in the whitespace
			// separating the two attributes) - confirm against the file history.
			[
				[ 'foo' => 'bar', 'baz' => 'foo' ],
				'foo=\'bar\' baz="foo"',
				'Several attributes'
			],
			[
				[ 'foo' => 'bar', 'baz' => 'foo' ],
				'foo=\'bar\' baz="foo"',
				'Several attributes'
			],
			[
				[ 'foo' => 'bar', 'baz' => 'foo' ],
				'foo=\'bar\' baz="foo"',
				'Several attributes'
			],
			[ [ ':foo' => 'bar' ], ':foo=\'bar\'', 'Leading :' ],
			[ [ '_foo' => 'bar' ], '_foo=\'bar\'', 'Leading _' ],
			[ [ 'foo' => 'bar' ], 'Foo=\'bar\'', 'Leading capital' ],
			[ [ 'foo' => 'BAR' ], 'FOO=BAR', 'Attribute keys are normalized to lowercase' ],

			# Characters that are forbidden as the first character of a name,
			# followed by the same characters in positions where they are allowed
			[ [], '-foo=bar', 'Leading - is forbidden' ],
			[ [], '.foo=bar', 'Leading . is forbidden' ],
			[ [ 'foo-bar' => 'bar' ], 'foo-bar=bar', 'A - is allowed inside the attribute' ],
			[ [ 'foo-' => 'bar' ], 'foo-=bar', 'A - is allowed inside the attribute' ],
			[ [ 'foo.bar' => 'baz' ], 'foo.bar=baz', 'A . is allowed inside the attribute' ],
			[ [ 'foo.' => 'baz' ], 'foo.=baz', 'A . is allowed as last character' ],
			[ [ 'foo6' => 'baz' ], 'foo6=baz', 'Numbers are allowed' ],

			# This bit is more relaxed than XML rules, but some extensions use
			# it, like ProofreadPage (see T29539)
			[ [ '1foo' => 'baz' ], '1foo=baz', 'Leading numbers are allowed' ],
			[ [], 'foo$=baz', 'Symbols are not allowed' ],
			[ [], 'foo@=baz', 'Symbols are not allowed' ],
			[ [], 'foo~=baz', 'Symbols are not allowed' ],
			[
				[ 'foo' => '1[#^`*%w/(' ],
				'foo=1[#^`*%w/(',
				'All kind of characters are allowed as values'
			],
			[
				[ 'foo' => '1[#^`*%\'w/(' ],
				'foo="1[#^`*%\'w/("',
				'Single quotes are allowed if quoted by double quotes'
			],
			[
				[ 'foo' => '1[#^`*%"w/(' ],
				'foo=\'1[#^`*%"w/(\'',
				'Double quotes are allowed if quoted by single quotes'
			],
			[ [ 'foo' => '&"' ], 'foo=&amp;&quot;', 'Special chars can be provided as entities' ],
			[ [ 'foo' => '&foobar;' ], 'foo=&foobar;', 'Entity-like items are accepted' ],
		];
	}

	/**
	 * Deprecated presentational attributes come back unaltered, prefixed with
	 * a single space.
	 *
	 * @dataProvider provideDeprecatedAttributes
	 * @covers Sanitizer::fixTagAttributes
	 *
	 * @param string $inputAttr Attribute text
	 * @param string $inputEl Element name the attribute belongs to
	 * @param string $message Assertion message
	 */
	public function testDeprecatedAttributesUnaltered( $inputAttr, $inputEl, $message = '' ) {
		$this->assertEquals( " $inputAttr",
			Sanitizer::fixTagAttributes( $inputAttr, $inputEl ),
			$message
		);
	}

	/**
	 * Datasets for testDeprecatedAttributesUnaltered().
	 *
	 * @return array[]
	 */
	public static function provideDeprecatedAttributes() {
		/** [ <attribute>, <element>, [message] ] */
		return [
			[ 'clear="left"', 'br' ],
			[ 'clear="all"', 'br' ],
			[ 'width="100"', 'td' ],
			[ 'nowrap="true"', 'td' ],
			[ 'nowrap=""', 'td' ],
			[ 'align="right"', 'td' ],
			[ 'align="center"', 'table' ],
			[ 'align="left"', 'tr' ],
			[ 'align="center"', 'div' ],
			[ 'align="left"', 'h1' ],
			[ 'align="left"', 'p' ],
		];
	}

	/**
	 * @dataProvider provideCssCommentsFixtures
	 * @covers Sanitizer::checkCss
	 *
	 * @param string $expected Expected result of the CSS check
	 * @param string $css CSS handed to the checker
	 * @param string $message Assertion message
	 */
	public function testCssCommentsChecking( $expected, $css, $message = '' ) {
		$this->assertEquals( $expected,
			Sanitizer::checkCss( $css ),
			$message
		);
	}

	/**
	 * Datasets for testCssCommentsChecking().
	 *
	 * @return array[]
	 */
	public static function provideCssCommentsFixtures() {
		/** [ <expected>, <css>, [message] ] */
		return [
			// Valid comments spanning entire input
			[ '/**/', '/**/' ],
			[ '/* comment */', '/* comment */' ],
			// Weird stuff
			[ ' ', '/****/' ],
			[ ' ', '/* /* */' ],
			[ 'display: block;', "display:/* foo */block;" ],
			[ 'display: block;', "display:\\2f\\2a foo \\2a\\2f block;",
				'Backslash-escaped comments must be stripped (T30450)' ],
			[ '', '/* unfinished comment structure',
				'Remove anything after a comment-start token' ],
			[ '', "\\2f\\2a unifinished comment'",
				'Remove anything after a backslash-escaped comment-start token' ],
			[
				'/* insecure input */',
				'filter: progid:DXImageTransform.Microsoft.AlphaImageLoader'
					. '(src=\'asdf.png\',sizingMethod=\'scale\');'
			],
			[
				'/* insecure input */',
				'-ms-filter: "progid:DXImageTransform.Microsoft.AlphaImageLoader'
					. '(src=\'asdf.png\',sizingMethod=\'scale\')";'
			],
			[ '/* insecure input */', 'width: expression(1+1);' ],
			[ '/* insecure input */', 'background-image: image(asdf.png);' ],
			[ '/* insecure input */', 'background-image: -webkit-image(asdf.png);' ],
			[ '/* insecure input */', 'background-image: -moz-image(asdf.png);' ],
			[ '/* insecure input */', 'background-image: image-set("asdf.png" 1x, "asdf.png" 2x);' ],
			[
				'/* insecure input */',
				'background-image: -webkit-image-set("asdf.png" 1x, "asdf.png" 2x);'
			],
			[
				'/* insecure input */',
				'background-image: -moz-image-set("asdf.png" 1x, "asdf.png" 2x);'
			],
			[ '/* insecure input */', 'foo: attr( title, url );' ],
			[ '/* insecure input */', 'foo: attr( title url );' ],
		];
	}

	/**
	 * @dataProvider provideEscapeHtmlAllowEntities
	 * @covers Sanitizer::escapeHtmlAllowEntities
	 *
	 * @param string $expected Expected escaped output
	 * @param string $html Input text
	 */
	public function testEscapeHtmlAllowEntities( $expected, $html ) {
		$this->assertEquals(
			$expected,
			Sanitizer::escapeHtmlAllowEntities( $html )
		);
	}

	/**
	 * Datasets for testEscapeHtmlAllowEntities().
	 *
	 * @return array[] Rows of [ <expected>, <input> ]
	 */
	public static function provideEscapeHtmlAllowEntities() {
		return [
			[ 'foo', 'foo' ],
			[ 'a¡b', 'a&#161;b' ],
			[ 'foo&#039;bar', "foo'bar" ],
			[ '&lt;script&gt;foo&lt;/script&gt;', '<script>foo</script>' ],
		];
	}

	/**
	 * Test Sanitizer::escapeId
	 *
	 * Every dataset is run with the 'noninitial' and 'legacy' options.
	 *
	 * @dataProvider provideEscapeId
	 * @covers Sanitizer::escapeId
	 *
	 * @param string $input Raw id
	 * @param string $output Expected escaped id
	 */
	public function testEscapeId( $input, $output ) {
		$this->assertEquals(
			$output,
			Sanitizer::escapeId( $input, [ 'noninitial', 'legacy' ] )
		);
	}

	/**
	 * Datasets for testEscapeId().
	 *
	 * @return array[] Rows of [ <input>, <expected output> ]
	 */
	public static function provideEscapeId() {
		return [
			[ '+', '.2B' ],
			[ '&', '.26' ],
			[ '=', '.3D' ],
			[ ':', ':' ],
			[ ';', '.3B' ],
			[ '@', '.40' ],
			[ '$', '.24' ],
			[ '-_.', '-_.' ],
			[ '!', '.21' ],
			[ '*', '.2A' ],
			[ '/', '.2F' ],
			[ '[]', '.5B.5D' ],
			[ '<>', '.3C.3E' ],
			[ '\'', '.27' ],
			[ '§', '.C2.A7' ],
			[ 'Test:A & B/Here', 'Test:A_.26_B.2FHere' ],
			[ 'A&B&amp;C&amp;amp;D&amp;amp;amp;E', 'A.26B.26amp.3BC.26amp.3Bamp.3BD.26amp.3Bamp.3Bamp.3BE' ],
		];
	}

	/**
	 * Test escapeIdReferenceList for consistency with escapeIdForAttribute
	 *
	 * The reference value is built by escaping both ids individually and
	 * joining them with a single space; the list escaper must produce the same.
	 *
	 * @dataProvider provideEscapeIdReferenceList
	 * @covers Sanitizer::escapeIdReferenceList
	 *
	 * @param string $referenceList Space-separated list of two ids
	 * @param string $id1 First id of the list
	 * @param string $id2 Second id of the list
	 */
	public function testEscapeIdReferenceList( $referenceList, $id1, $id2 ) {
		$expected = Sanitizer::escapeIdForAttribute( $id1 )
			. ' '
			. Sanitizer::escapeIdForAttribute( $id2 );

		// Expected value goes first so that failure output labels both sides correctly.
		$this->assertEquals(
			$expected,
			Sanitizer::escapeIdReferenceList( $referenceList )
		);
	}

	/**
	 * Datasets for testEscapeIdReferenceList().
	 *
	 * @return array[]
	 */
	public static function provideEscapeIdReferenceList() {
		/** [ <reference list>, <individual id 1>, <individual id 2> ] */
		return [
			[ 'foo bar', 'foo', 'bar' ],
			[ '#1 #2', '#1', '#2' ],
			[ '+1 +2', '+1', '+2' ],
		];
	}

	/**
	 * @dataProvider provideIsReservedDataAttribute
	 * @covers Sanitizer::isReservedDataAttribute
	 *
	 * @param string $attr Attribute name
	 * @param bool $expected Whether the attribute is expected to be reported as reserved
	 */
	public function testIsReservedDataAttribute( $attr, $expected ) {
		$this->assertSame( $expected, Sanitizer::isReservedDataAttribute( $attr ) );
	}

	/**
	 * Datasets for testIsReservedDataAttribute().
	 *
	 * @return array[] Rows of [ <attribute name>, <expected result> ]
	 */
	public static function provideIsReservedDataAttribute() {
		return [
			[ 'foo', false ],
			[ 'data', false ],
			[ 'data-foo', false ],
			[ 'data-mw', true ],
			[ 'data-ooui', true ],
			[ 'data-parsoid', true ],
			[ 'data-mw-foo', true ],
			[ 'data-ooui-foo', true ],
			[ 'data-mwfoo', true ], // could be false but this is how it's implemented currently
		];
	}

	/**
	 * Run one of the Sanitizer::escapeIdFor*() methods under a given
	 * fragment-mode configuration.
	 *
	 * @dataProvider provideEscapeIdForStuff
	 *
	 * @covers Sanitizer::escapeIdForAttribute()
	 * @covers Sanitizer::escapeIdForLink()
	 * @covers Sanitizer::escapeIdForExternalInterwiki()
	 * @covers Sanitizer::escapeIdInternal()
	 *
	 * @param string $stuff Suffix of the method to call ('Attribute', 'Link', ...)
	 * @param string[] $config Fragment modes; the last element is used for
	 *  $wgExternalInterwikiFragmentMode, the remaining ones for $wgFragmentMode
	 * @param string $id
	 * @param string|false $expected
	 * @param int|null $mode
	 */
	public function testEscapeIdForStuff( $stuff, array $config, $id, $expected, $mode = null ) {
		$func = "Sanitizer::escapeIdFor{$stuff}";
		// array_pop() removes the interwiki flavour, leaving only $wgFragmentMode in $config.
		$iwFlavor = array_pop( $config );
		$this->setMwGlobals( [
			'wgFragmentMode' => $config,
			'wgExternalInterwikiFragmentMode' => $iwFlavor,
		] );
		$escaped = call_user_func( $func, $id, $mode );
		$this->assertEquals( $expected, $escaped );
	}

	/**
	 * Datasets for testEscapeIdForStuff().
	 *
	 * @return array[] Rows of [ <method suffix>, <config>, <id>, <expected>, [mode] ]
	 */
	public static function provideEscapeIdForStuff() {
		// Test inputs and outputs
		$text = 'foo тест_#%!\'()[]:<>&&amp;&amp;amp;';
		$legacyEncoded = 'foo_.D1.82.D0.B5.D1.81.D1.82_.23.25.21.27.28.29.5B.5D:.3C.3E' .
			'.26.26amp.3B.26amp.3Bamp.3B';
		$html5Encoded = 'foo_тест_#%!\'()[]:<>&&amp;&amp;amp;';

		// Settings: last element is $wgExternalInterwikiFragmentMode, the rest is $wgFragmentMode
		$legacy = [ 'legacy', 'legacy' ];
		$legacyNew = [ 'legacy', 'html5', 'legacy' ];
		$newLegacy = [ 'html5', 'legacy', 'legacy' ];
		$new = [ 'html5', 'legacy' ];
		$allNew = [ 'html5', 'html5' ];

		return [
			// Pure legacy: how MW worked before 2017
			[ 'Attribute', $legacy, $text, $legacyEncoded, Sanitizer::ID_PRIMARY ],
			[ 'Attribute', $legacy, $text, false, Sanitizer::ID_FALLBACK ],
			[ 'Link', $legacy, $text, $legacyEncoded ],
			[ 'ExternalInterwiki', $legacy, $text, $legacyEncoded ],

			// Transition to a new world: legacy links with HTML5 fallback
			[ 'Attribute', $legacyNew, $text, $legacyEncoded, Sanitizer::ID_PRIMARY ],
			[ 'Attribute', $legacyNew, $text, $html5Encoded, Sanitizer::ID_FALLBACK ],
			[ 'Link', $legacyNew, $text, $legacyEncoded ],
			[ 'ExternalInterwiki', $legacyNew, $text, $legacyEncoded ],

			// New world: HTML5 links, legacy fallbacks
			[ 'Attribute', $newLegacy, $text, $html5Encoded, Sanitizer::ID_PRIMARY ],
			[ 'Attribute', $newLegacy, $text, $legacyEncoded, Sanitizer::ID_FALLBACK ],
			[ 'Link', $newLegacy, $text, $html5Encoded ],
			[ 'ExternalInterwiki', $newLegacy, $text, $legacyEncoded ],

			// Distant future: no legacy fallbacks, but still linking to legacy wikis
			[ 'Attribute', $new, $text, $html5Encoded, Sanitizer::ID_PRIMARY ],
			[ 'Attribute', $new, $text, false, Sanitizer::ID_FALLBACK ],
			[ 'Link', $new, $text, $html5Encoded ],
			[ 'ExternalInterwiki', $new, $text, $legacyEncoded ],

			// Just before the heat death of universe: external interwikis are also HTML5 \m/
			[ 'Attribute', $allNew, $text, $html5Encoded, Sanitizer::ID_PRIMARY ],
			[ 'Attribute', $allNew, $text, false, Sanitizer::ID_FALLBACK ],
			[ 'Link', $allNew, $text, $html5Encoded ],
			[ 'ExternalInterwiki', $allNew, $text, $html5Encoded ],
		];
	}

	/**
	 * @dataProvider provideStripAllTags
	 *
	 * @covers Sanitizer::stripAllTags()
	 * @covers RemexStripTagHandler
	 *
	 * @param string $input
	 * @param string $expected
	 */
	public function testStripAllTags( $input, $expected ) {
		$this->assertEquals( $expected, Sanitizer::stripAllTags( $input ) );
	}

	/**
	 * Datasets for testStripAllTags().
	 *
	 * @return array[] Rows of [ <input HTML>, <expected plain text> ]
	 */
	public static function provideStripAllTags() {
		return [
			[ '<p>Foo</p>', 'Foo' ],
			[ '<p id="one">Foo</p><p id="two">Bar</p>', 'Foo Bar' ],
			[ "<p>Foo</p>\n<p>Bar</p>", 'Foo Bar' ],
			[ '<p>Hello &lt;strong&gt; wor&#x6c;&#100; caf&eacute;</p>', 'Hello <strong> world café' ],
			[
				'<p><small data-foo=\'bar"&lt;baz>quux\'><a href="./Foo">Bar</a></small> Whee!</p>',
				'Bar Whee!'
			],
			[ '1<span class="<?php">2</span>3', '123' ],
			[ '1<span class="<?">2</span>3', '123' ],
		];
	}

	/**
	 * An unknown fragment mode is expected to raise InvalidArgumentException.
	 *
	 * @expectedException InvalidArgumentException
	 * @covers Sanitizer::escapeIdInternal()
	 */
	public function testInvalidFragmentThrows() {
		$this->setMwGlobals( 'wgFragmentMode', [ 'boom!' ] );
		Sanitizer::escapeIdForAttribute( 'This should throw' );
	}

	/**
	 * A $wgFragmentMode array without an element at index 0 is expected to
	 * raise UnexpectedValueException when escaping for an attribute.
	 *
	 * @expectedException UnexpectedValueException
	 * @covers Sanitizer::escapeIdForAttribute()
	 */
	public function testNoPrimaryFragmentModeThrows() {
		$this->setMwGlobals( 'wgFragmentMode', [ 666 => 'html5' ] );
		Sanitizer::escapeIdForAttribute( 'This should throw' );
	}

	/**
	 * A $wgFragmentMode array without an element at index 0 is expected to
	 * raise UnexpectedValueException when escaping for a link.
	 *
	 * @expectedException UnexpectedValueException
	 * @covers Sanitizer::escapeIdForLink()
	 */
	public function testNoPrimaryFragmentModeThrows2() {
		$this->setMwGlobals( 'wgFragmentMode', [ 666 => 'html5' ] );
		Sanitizer::escapeIdForLink( 'This should throw' );
	}
}