Enable additional balancer tests (those starting with `<!DOCTYPE html>`)
[lhc/web/wiklou.git] / tests / phpunit / includes / tidy / BalancerTest.php
1 <?php
2
/**
 * Runs the html5lib tree-construction test cases (as stored in
 * html5lib-tests.json next to this file) through MediaWiki\Tidy\Balancer,
 * skipping the cases which rely on HTML constructs the balancer does
 * not support.
 */
class BalancerTest extends MediaWikiTestCase {
	/** @var MediaWiki\Tidy\Balancer Balancer under test; recreated in setUp(). */
	private $balancer;

	/**
	 * Anything that needs to happen before your tests should go here.
	 */
	protected function setUp() {
		// Be sure to call the parent setup and teardown functions.
		// This makes sure that all the various cleanup and restorations
		// happen as they should (including the restoration for setMwGlobals).
		parent::setUp();
		$this->balancer = new MediaWiki\Tidy\Balancer( [
			'strict' => false, /* not strict */
			'allowedHtmlElements' => null, /* no sanitization */
			'tidyCompat' => false, /* standard parser */
			'allowComments' => true, /* comment parsing */
		] );
	}

	/**
	 * Any cleanup you need to do should go here.
	 */
	protected function tearDown() {
		parent::tearDown();
	}

	/**
	 * Balance the input of one html5lib test case and compare the result
	 * with the expected serialization.
	 *
	 * @covers MediaWiki\Tidy\Balancer::balance
	 * @dataProvider provideBalancerTests
	 *
	 * @param string $description Message shown when the assertion fails
	 * @param string $input HTML handed to the balancer
	 * @param string $expected Expected balanced HTML
	 */
	public function testBalancer( $description, $input, $expected ) {
		$output = $this->balancer->balance( $input );

		// Ignore self-closing tags
		$output = preg_replace( '/\s*\/>/', '>', $output );

		// assertSame rather than assertEquals: a loose comparison would let
		// numeric-looking strings (or a null from a failed preg_replace
		// against an empty expectation) pass even though they differ.
		$this->assertSame( $expected, $output, $description );
	}

	/**
	 * Data provider for testBalancer().
	 *
	 * Reads html5lib-tests.json from this directory and turns every
	 * supported case into a [ description, input, expected output ] triple.
	 *
	 * @return array[] List of argument lists for testBalancer()
	 * @throws RuntimeException If the fixture cannot be read or decoded
	 */
	public static function provideBalancerTests() {
		// Get the tests from html5lib-tests.json
		$json = self::loadHtml5libTests( __DIR__ . '/html5lib-tests.json' );

		// Only documents with an empty <head> and either no doctype or the
		// plain HTML5 doctype are considered at all.
		$okre = "~ \A
			(?i:<!DOCTYPE\ html>)?
			<html><head></head><body>
			.*
			</body></html>
		\z ~xs";

		// Munge this slightly into the format phpunit expects
		// for providers, and filter out HTML constructs which
		// the balancer doesn't support.
		$tests = [];
		foreach ( $json as $filename => $cases ) {
			foreach ( $cases as $case ) {
				if ( !preg_match( $okre, $case['document']['html'] ) ) {
					// Skip tests which involve stuff in the <head> or
					// weird doctypes.
					continue;
				}
				// The expected output is not cut out of the full document
				// above; instead we use a different field in the test case,
				// which reports how domino would parse this case in a
				// no-quirks <body> context. (The original test case may
				// have had a different context, or relied on quirks mode.)
				$html = $case['document']['noQuirksBodyHtml'];
				// Normalize case of SVG attributes.
				$html = str_replace( 'foreignObject', 'foreignobject', $html );
				// Normalize case of MathML attributes.
				$html = str_replace( 'definitionURL', 'definitionurl', $html );

				if ( self::isUnsupportedCase( $filename, $case, $html ) ) {
					continue;
				}

				// The balancer works on body content, so the (standard)
				// doctype is dropped from the input.
				$data = preg_replace(
					'~<!DOCTYPE html>~i', '', $case['data']
				);
				$tests[] = [
					$filename, # use better description?
					$data,
					$html
				];
			}
		}
		return $tests;
	}

	/**
	 * Read and decode the JSON fixture holding the html5lib test cases.
	 *
	 * Failing loudly here keeps a missing or corrupt fixture from turning
	 * into a provider that silently yields no test cases.
	 *
	 * @param string $path Path of the JSON file
	 * @return array Decoded JSON, iterated by the provider as file name => list of cases
	 * @throws RuntimeException If the file cannot be read or is not a JSON array/object
	 */
	private static function loadHtml5libTests( $path ) {
		$contents = file_get_contents( $path );
		if ( $contents === false ) {
			throw new RuntimeException( "Unable to read balancer test fixture: $path" );
		}
		$json = json_decode( $contents, true );
		if ( !is_array( $json ) ) {
			throw new RuntimeException(
				"Unable to decode balancer test fixture: $path" .
				' (json_last_error() = ' . json_last_error() . ')'
			);
		}
		return $json;
	}

	/**
	 * Tell whether a test case relies on something the balancer does not
	 * support, so that the provider should skip it.
	 *
	 * @param string $filename Name of the html5lib test file the case belongs to
	 * @param array $case Decoded test case; its 'data' and 'document' entries are read
	 * @param string $html Expected output of the case, after attribute-case normalization
	 * @return bool True if the case must be skipped
	 */
	private static function isUnsupportedCase( $filename, $case, $html ) {
		$input = $case['data'];

		if (
			isset( $case['document']['props']['comment'] ) &&
			preg_match( ',<!--[^>]*<,', $html )
		) {
			// Skip tests which include HTML comments containing
			// the < character, which we don't support.
			return true;
		}

		if ( strpos( $input, '<![CDATA[' ) !== false ) {
			// Skip tests involving <![CDATA[ ]]> quoting.
			return true;
		}

		if (
			stripos( $input, '<!DOCTYPE' ) !== false &&
			stripos( $input, '<!DOCTYPE html>' ) === false
		) {
			// Skip tests involving unusual doctypes.
			return true;
		}

		$literalre = "~ <rdar: | <isindex | < /? (
			html | head | body | frame | frameset | plaintext
		) > ~xi";
		if ( preg_match( $literalre, $input ) ) {
			// Skip tests involving some literal tags, which are
			// unsupported but don't show up in the expected output.
			return true;
		}

		$unsupportedTags = [
			'iframe',
			'noembed',
			'noscript',
			'script',
			'svg script',
			'svg title',
			'title',
			'xmp',
		];
		foreach ( $unsupportedTags as $tag ) {
			if ( isset( $case['document']['props']['tags'][$tag] ) ) {
				// Skip tests with unsupported tags which *do* show
				// up in the expected output.
				return true;
			}
		}

		if (
			$filename === 'entities01.dat' ||
			$filename === 'entities02.dat' ||
			preg_match( '/&([a-z]+|#x[0-9A-F]+);/i', $input ) ||
			preg_match( '/^(&|&#|&#X|&#x|&#45|&x-test|&AMP)$/', $input )
		) {
			// Skip tests involving entity encoding.
			return true;
		}

		if (
			isset( $case['document']['props']['tagWithLt'] ) ||
			isset( $case['document']['props']['attrWithFunnyChar'] ) ||
			preg_match( ':^(</b test|<di|<foo bar=qux/>)$:', $input ) ||
			preg_match( ':</p<p>:', $input ) ||
			preg_match( ':<b &=&amp>|<p/x/y/z>:', $input )
		) {
			// Skip tests with funny tag or attribute names,
			// which are really tests of the HTML tokenizer, not
			// the tree builder.
			return true;
		}

		if (
			preg_match( ':encoding=" text/html "|type=" hidden":', $input )
		) {
			// The Sanitizer normalizes whitespace in attribute
			// values, which makes this test case invalid.
			return true;
		}

		if ( $filename === 'plain-text-unsafe.dat' ) {
			// Skip tests with ASCII null, etc.
			return true;
		}

		return false;
	}
}