New parserTests.php features
author Tim Starling <tstarling@wikimedia.org>
Wed, 9 Mar 2016 04:42:33 +0000 (15:42 +1100)
committer Tim Starling <tstarling@wikimedia.org>
Fri, 8 Jul 2016 05:05:06 +0000 (05:05 +0000)
Features to support T89331 analysis:

* Support dwdiff for word-level diffing
* Add --mark-ws feature which produces cleaner diffs when line breaks
  differ
* Add optional normalization of parser test output, allowing significant
  differences to be separated from insignificant differences.

Change-Id: I0e151caad1f8b2f97bf20b219f26f3101be82506

tests/TestsAutoLoader.php
tests/parser/parserTest.inc
tests/parserTests.php

index 8b100a2..2bb1d2e 100644 (file)
@@ -143,6 +143,7 @@ $wgAutoloadClasses += [
        'NewParserTest' => "$testDir/phpunit/includes/parser/NewParserTest.php",
        'MediaWikiParserTest' => "$testDir/phpunit/includes/parser/MediaWikiParserTest.php",
        'ParserTest' => "$testDir/parser/parserTest.inc",
+       'ParserTestResultNormalizer' => "$testDir/parser/parserTest.inc",
        'ParserTestParserHook' => "$testDir/parser/parserTestsParserHook.php",
 
        # tests/phpunit/includes/site
index e519f59..d602194 100644 (file)
@@ -82,6 +82,9 @@ class ParserTest {
 
        public $regex = "";
        private $savedGlobals = [];
+       private $useDwdiff = false;
+       private $markWhitespace = false;
+       private $normalizationFunctions = [];
 
        /**
         * Sets terminal colorization and diff/quick modes depending on OS and
@@ -116,6 +119,18 @@ class ParserTest {
                                || isset( $options['compare'] ) ) ); // redundant output
 
                $this->showOutput = isset( $options['show-output'] );
+               $this->useDwdiff = isset( $options['dwdiff'] );
+               $this->markWhitespace = isset( $options['mark-ws'] );
+
+               if ( isset( $options['norm'] ) ) {
+                       foreach ( explode( ',', $options['norm'] ) as $func ) {
+                               if ( in_array( $func, [ 'removeTbody', 'trimWhitespace' ] ) ) {
+                                       $this->normalizationFunctions[] = $func;
+                               } else {
+                                       echo "Warning: unknown normalization option \"$func\"\n";
+                               }
+                       }
+               }
 
                if ( isset( $options['filter'] ) ) {
                        $options['regex'] = $options['filter'];
@@ -700,6 +715,11 @@ class ParserTest {
 
                $this->teardownGlobals();
 
+               if ( count( $this->normalizationFunctions ) ) {
+                       $result = ParserTestResultNormalizer::normalize( $result, $this->normalizationFunctions );
+                       $out = ParserTestResultNormalizer::normalize( $out, $this->normalizationFunctions );
+               }
+
                $testResult = new ParserTestResult( $desc );
                $testResult->expected = $result;
                $testResult->actual = $out;
@@ -1469,6 +1489,16 @@ class ParserTest {
        protected function quickDiff( $input, $output,
                $inFileTail = 'expected', $outFileTail = 'actual'
        ) {
+               if ( $this->markWhitespace ) {
+                       $pairs = [
+                               "\n" => '¶',
+                               ' ' => '·',
+                               "\t" => '→'
+                       ];
+                       $input = strtr( $input, $pairs );
+                       $output = strtr( $output, $pairs );
+               }
+
                # Windows, or at least the fc utility, is retarded
                $slash = wfIsWindows() ? '\\' : '/';
                $prefix = wfTempDir() . "{$slash}mwParser-" . mt_rand();
@@ -1484,14 +1514,22 @@ class ParserTest {
 
                global $wgDiff3;
                // we assume that people with diff3 also have usual diff
-               $shellCommand = ( wfIsWindows() && !$wgDiff3 ) ? 'fc' : 'diff -au';
+               if ( $this->useDwdiff ) {
+                       $shellCommand = 'dwdiff -Pc';
+               } else {
+                       $shellCommand = ( wfIsWindows() && !$wgDiff3 ) ? 'fc' : 'diff -au';
+               }
 
                $diff = wfShellExec( "$shellCommand $shellInfile $shellOutfile" );
 
                unlink( $infile );
                unlink( $outfile );
 
-               return $this->colorDiff( $diff );
+               if ( $this->useDwdiff ) {
+                       return $diff;
+               } else {
+                       return $this->colorDiff( $diff );
+               }
        }
 
        /**
@@ -1699,3 +1737,84 @@ class ParserTest {
                return true;
        }
 }
+
+/**
+ * Parses parser test output as XML and applies a list of normalization
+ * functions to the resulting DOM, so that the expected and actual output can
+ * be compared after irrelevant differences have been removed.
+ */
+class ParserTestResultNormalizer {
+       /** @var DOMDocument Document holding the text wrapped in <html><body> */
+       protected $doc;
+
+       /** @var DOMXPath XPath evaluator bound to $doc */
+       protected $xpath;
+
+       /** @var bool True when the text could not be loaded as XML */
+       protected $invalid = false;
+
+       /** @var DOMNode|null First <body> element of $doc, null if there is none */
+       protected $body;
+
+       /**
+        * Apply the given normalization functions, in order, to $text.
+        *
+        * @param string $text Text to normalize
+        * @param string[] $funcs Names of methods of this class to invoke; the
+        *   caller is responsible for passing only names that exist
+        * @return string The serialized normalized text, or $text unchanged when
+        *   it could not be loaded as XML
+        */
+       public static function normalize( $text, $funcs ) {
+               $norm = new self( $text );
+               if ( $norm->invalid ) {
+                       return $text;
+               }
+               foreach ( $funcs as $func ) {
+                       $norm->$func();
+               }
+               return $norm->serialize();
+       }
+
+       /**
+        * Load $text into a DOM, recording in $this->invalid whether that failed.
+        *
+        * @param string $text Fragment to wrap in <html><body> and parse
+        */
+       protected function __construct( $text ) {
+               $this->doc = new DOMDocument( '1.0', 'utf-8' );
+
+               // Note: parsing a supposedly XHTML document with an XML parser is not
+               // guaranteed to give accurate results. For example, it may introduce
+               // differences in the number of line breaks in <pre> tags.
+
+               MediaWiki\suppressWarnings();
+               $loaded = $this->doc->loadXML( '<html><body>' . $text . '</body></html>' );
+               MediaWiki\restoreWarnings();
+
+               $this->invalid = !$loaded;
+               $this->xpath = new DOMXPath( $this->doc );
+               $this->body = $this->xpath->query( '//body' )->item( 0 );
+       }
+
+       /**
+        * Replace every <tbody> element with its own children, keeping their order.
+        */
+       protected function removeTbody() {
+               foreach ( $this->xpath->query( '//tbody' ) as $tbody ) {
+                       while ( $tbody->firstChild ) {
+                               $child = $tbody->firstChild;
+                               $tbody->removeChild( $child );
+                               $tbody->parentNode->insertBefore( $child, $tbody );
+                       }
+                       $tbody->parentNode->removeChild( $tbody );
+               }
+       }
+
+       /**
+        * The point of this function is to produce a normalized DOM in which
+        * Tidy's output matches the output of html5depurate. Tidy both trims
+        * and pretty-prints, so this requires fairly aggressive treatment.
+        *
+        * In particular, note that Tidy converts <pre>x</pre> to <pre>\nx\n</pre>,
+        * which theoretically affects display since the second line break is not
+        * ignored by compliant HTML parsers.
+        *
+        * Text nodes that are empty after trimming are removed from the DOM.
+        */
+       protected function trimWhitespace() {
+               foreach ( $this->xpath->query( '//text()' ) as $child ) {
+                       $data = (string)$child->data;
+                       if ( strtolower( $child->parentNode->nodeName ) === 'pre' ) {
+                               // Just trim one line break from the start and end.
+                               // Only the first/last byte is inspected: substr_compare()
+                               // without a length compares the whole string, which made
+                               // the leading check match only a node that was exactly "\n".
+                               if ( substr( $data, 0, 1 ) === "\n" ) {
+                                       $data = (string)substr( $data, 1 );
+                               }
+                               if ( substr( $data, -1 ) === "\n" ) {
+                                       $data = (string)substr( $data, 0, -1 );
+                               }
+                       } else {
+                               // Trim all whitespace
+                               $data = trim( $data );
+                       }
+                       if ( $data === '' ) {
+                               $child->parentNode->removeChild( $child );
+                       } else {
+                               $child->data = $data;
+                       }
+               }
+       }
+
+       /**
+        * Serialize the XML DOM for comparison purposes. This does not generate HTML.
+        *
+        * @return string Serialization of the <body> element with the <body>
+        *   wrapper tags stripped
+        */
+       protected function serialize() {
+               // '<body/>' covers a body left without any children, which is
+               // serialized as a self-closing tag rather than an open/close pair.
+               return strtr( $this->doc->saveXML( $this->body ),
+                       [ '<body>' => '', '</body>' => '', '<body/>' => '' ] );
+       }
+}
index b3cb89a..5e15694 100644 (file)
@@ -27,8 +27,8 @@
 define( 'MW_PARSER_TEST', true );
 
 $options = [ 'quick', 'color', 'quiet', 'help', 'show-output',
-       'record', 'run-disabled', 'run-parsoid' ];
-$optionsWithArgs = [ 'regex', 'filter', 'seed', 'setversion', 'file' ];
+       'record', 'run-disabled', 'run-parsoid', 'dwdiff', 'mark-ws' ];
+$optionsWithArgs = [ 'regex', 'filter', 'seed', 'setversion', 'file', 'norm' ];
 
 require_once __DIR__ . '/../maintenance/commandLine.inc';
 require_once __DIR__ . '/TestsAutoLoader.php';
@@ -54,9 +54,16 @@ Options:
   --keep-uploads   Re-use the same upload directory for each test, don't delete it
   --fuzz           Do a fuzz test instead of a normal test
   --seed <n>       Start the fuzz test from the specified seed
-  --help           Show this help message
   --run-disabled   run disabled tests
   --run-parsoid    run parsoid tests (normally disabled)
+  --dwdiff         Use dwdiff to display diff output
+  --mark-ws        Mark whitespace in diffs by replacing it with symbols
+  --norm=<funcs>   Apply a comma-separated list of normalization functions to
+                   both the expected and actual output in order to resolve
+                   irrelevant differences. The accepted normalization functions
+                   are: removeTbody to remove <tbody> tags; and trimWhitespace
+                   to trim whitespace from the start and end of text nodes.
+  --help           Show this help message
 
 ENDS;
        exit( 0 );