Abstract and refactor Tidy support
authorTim Starling <tstarling@wikimedia.org>
Mon, 31 Aug 2015 04:42:55 +0000 (14:42 +1000)
committerOri Livneh <ori@wikimedia.org>
Fri, 11 Sep 2015 03:18:52 +0000 (20:18 -0700)
* Split tidy implementations into a class hierarchy
* Bring all tidy configuration into a single associative array and
  deprecate the old configuration.
* Remove $wgAlwaysUseTidy

This is preparatory to replacement of Tidy (T89331). I used the name
"Raggett" for things relating to Dave Raggett's Tidy, since if we use
"tidy" to mean the new abstract system as well as Raggett's tidy, it
gets confusing.

Change-Id: I77af1a16cbbb47fc226d05fb9aad56c58e8910b5

24 files changed:
RELEASE-NOTES-1.26
autoload.php
includes/DefaultSettings.php
includes/Sanitizer.php
includes/parser/MWTidy.php
includes/parser/Parser.php
includes/tidy.conf [deleted file]
includes/tidy/RaggettBase.php [new file with mode: 0644]
includes/tidy/RaggettExternal.php [new file with mode: 0644]
includes/tidy/RaggettInternalHHVM.php [new file with mode: 0644]
includes/tidy/RaggettInternalPHP.php [new file with mode: 0644]
includes/tidy/RaggettWrapper.php [new file with mode: 0644]
includes/tidy/TidyDriverBase.php [new file with mode: 0644]
includes/tidy/tidy.conf [new file with mode: 0644]
maintenance/refreshLinks.php
tests/parser/parserTest.inc
tests/phpunit/MediaWikiTestCase.php
tests/phpunit/includes/ExtraParserTest.php
tests/phpunit/includes/SanitizerTest.php
tests/phpunit/includes/content/TextContentHandlerTest.php
tests/phpunit/includes/content/TextContentTest.php
tests/phpunit/includes/parser/NewParserTest.php
tests/phpunit/includes/parser/TagHooksTest.php
tests/phpunit/includes/parser/TidyTest.php

index b5d9d5a..8f5db0e 100644 (file)
@@ -29,6 +29,7 @@ production.
 * Fields in ParserOptions are now private. Use the accessors instead.
 * Custom LESS functions (defined via $wgResourceLoaderLESSFunctions)
   have been removed, after being deprecated in 1.24.
+* $wgAlwaysUseTidy has been removed.
 
 === New features in 1.26 ===
 * (T51506) Now action=info gives estimates of actual watchers for a page.
index 82a45b4..62d6f09 100644 (file)
@@ -724,7 +724,6 @@ $wgAutoloadLocalClasses = array(
        'MWOldPassword' => __DIR__ . '/includes/password/MWOldPassword.php',
        'MWSaltedPassword' => __DIR__ . '/includes/password/MWSaltedPassword.php',
        'MWTidy' => __DIR__ . '/includes/parser/MWTidy.php',
-       'MWTidyWrapper' => __DIR__ . '/includes/parser/MWTidy.php',
        'MWTimestamp' => __DIR__ . '/includes/MWTimestamp.php',
        'MachineReadableRCFeedFormatter' => __DIR__ . '/includes/rcfeed/MachineReadableRCFeedFormatter.php',
        'MagicWord' => __DIR__ . '/includes/MagicWord.php',
@@ -761,6 +760,12 @@ $wgAutoloadLocalClasses = array(
        'MediaWiki\\Logger\\Monolog\\WikiProcessor' => __DIR__ . '/includes/debug/logger/monolog/WikiProcessor.php',
        'MediaWiki\\Logger\\NullSpi' => __DIR__ . '/includes/debug/logger/NullSpi.php',
        'MediaWiki\\Logger\\Spi' => __DIR__ . '/includes/debug/logger/Spi.php',
+       'MediaWiki\\Tidy\\RaggettBase' => __DIR__ . '/includes/tidy/RaggettBase.php',
+       'MediaWiki\\Tidy\\RaggettExternal' => __DIR__ . '/includes/tidy/RaggettExternal.php',
+       'MediaWiki\\Tidy\\RaggettInternalHHVM' => __DIR__ . '/includes/tidy/RaggettInternalHHVM.php',
+       'MediaWiki\\Tidy\\RaggettInternalPHP' => __DIR__ . '/includes/tidy/RaggettInternalPHP.php',
+       'MediaWiki\\Tidy\\RaggettWrapper' => __DIR__ . '/includes/tidy/RaggettWrapper.php',
+       'MediaWiki\\Tidy\\TidyDriverBase' => __DIR__ . '/includes/tidy/TidyDriverBase.php',
        'MediaWiki\\Widget\\ComplexNamespaceInputWidget' => __DIR__ . '/includes/widget/ComplexNamespaceInputWidget.php',
        'MediaWiki\\Widget\\NamespaceInputWidget' => __DIR__ . '/includes/widget/NamespaceInputWidget.php',
        'MediaWiki\\Widget\\TitleInputWidget' => __DIR__ . '/includes/widget/TitleInputWidget.php',
index 06cd55a..89c04f9 100644 (file)
@@ -4104,44 +4104,55 @@ $wgEnableImageWhitelist = true;
 $wgAllowImageTag = false;
 
 /**
- * $wgUseTidy: use tidy to make sure HTML output is sane.
- * Tidy is a free tool that fixes broken HTML.
- * See http://www.w3.org/People/Raggett/tidy/
- *
- * - $wgTidyBin should be set to the path of the binary and
- * - $wgTidyConf to the path of the configuration file.
- * - $wgTidyOpts can include any number of parameters.
- * - $wgTidyInternal controls the use of the PECL extension or the
- *   libtidy (PHP >= 5) extension to use an in-process tidy library instead
- *   of spawning a separate program.
- *   Normally you shouldn't need to override the setting except for
- *   debugging. To install, use 'pear install tidy' and add a line
- *   'extension=tidy.so' to php.ini.
+ * Configuration for HTML postprocessing tool. Set this to a configuration
+ * array to enable an external tool. Dave Raggett's "HTML Tidy" is typically
+ * used. See http://www.w3.org/People/Raggett/tidy/
+ *
+ * If this is null and $wgUseTidy is true, the deprecated configuration
+ * parameters will be used instead.
+ *
+ * If this is null and $wgUseTidy is false, a pure PHP fallback will be used.
+ *
+ * Keys are:
+ *  - driver: May be:
+ *    - RaggettInternalHHVM: Use the limited-functionality HHVM extension
+ *    - RaggettInternalPHP: Use the PECL extension
+ *    - RaggettExternal: Shell out to an external binary (tidyBin)
+ *
+ *  - tidyConfigFile: Path to configuration file for any of the Raggett drivers
+ *  - debugComment: True to add a comment to the output with warning messages
+ *  - tidyBin: For RaggettExternal, the path to the tidy binary.
+ *  - tidyCommandLine: For RaggettExternal, additional command line options.
  */
-$wgUseTidy = false;
+$wgTidyConfig = null;
 
 /**
- * @see $wgUseTidy
+ * Set this to true to use the deprecated tidy configuration parameters.
+ * @deprecated use $wgTidyConfig
  */
-$wgAlwaysUseTidy = false;
+$wgUseTidy = false;
 
 /**
- * @see $wgUseTidy
+ * The path to the tidy binary.
+ * @deprecated Use $wgTidyConfig['tidyBin']
  */
 $wgTidyBin = 'tidy';
 
 /**
- * @see $wgUseTidy
+ * The path to the tidy config file
+ * @deprecated Use $wgTidyConfig['tidyConfigFile']
  */
-$wgTidyConf = $IP . '/includes/tidy.conf';
+$wgTidyConf = $IP . '/includes/tidy/tidy.conf';
 
 /**
- * @see $wgUseTidy
+ * The command line options to the tidy binary
+ * @deprecated Use $wgTidyConfig['tidyCommandLine']
  */
 $wgTidyOpts = '';
 
 /**
- * @see $wgUseTidy
+ * Set this to true to use the tidy extension
+ * @deprecated Use $wgTidyConfig['driver']
  */
 $wgTidyInternal = extension_loaded( 'tidy' );
 
index 8179905..de63af7 100644 (file)
@@ -454,15 +454,13 @@ class Sanitizer {
        public static function removeHTMLtags( $text, $processCallback = null,
                $args = array(), $extratags = array(), $removetags = array()
        ) {
-               global $wgUseTidy;
-
                extract( self::getRecognizedTagData( $extratags, $removetags ) );
 
                # Remove HTML comments
                $text = Sanitizer::removeHTMLcomments( $text );
                $bits = explode( '<', $text );
                $text = str_replace( '>', '&gt;', array_shift( $bits ) );
-               if ( !$wgUseTidy ) {
+               if ( !MWTidy::isEnabled() ) {
                        $tagstack = $tablestack = array();
                        foreach ( $bits as $x ) {
                                $regs = array();
index e29ee88..d0e50bc 100644 (file)
  * @ingroup Parser
  */
 
-/**
- * Class used to hide mw:editsection tokens from Tidy so that it doesn't break them
- * or break on them. This is a bit of a hack for now, but hopefully in the future
- * we may create a real postprocessor or something that will replace this.
- * It's called wrapper because for now it basically takes over MWTidy::tidy's task
- * of wrapping the text in a xhtml block
- *
- * This re-uses some of the parser's UNIQ tricks, though some of it is private so it's
- * duplicated. Perhaps we should create an abstract marker hiding class.
- *
- * @ingroup Parser
- */
-class MWTidyWrapper {
-
-       /**
-        * @var ReplacementArray
-        */
-       protected $mTokens;
-
-       protected $mMarkerIndex;
-
-       public function __construct() {
-               $this->mTokens = null;
-       }
-
-       /**
-        * @param string $text
-        * @return string
-        */
-       public function getWrapped( $text ) {
-               $this->mTokens = new ReplacementArray;
-               $this->mMarkerIndex = 0;
-
-               // Replace <mw:editsection> elements with placeholders
-               $wrappedtext = preg_replace_callback( ParserOutput::EDITSECTION_REGEX,
-                       array( &$this, 'replaceCallback' ), $text );
-               // ...and <mw:toc> markers
-               $wrappedtext = preg_replace_callback( '/\<\\/?mw:toc\>/',
-                       array( &$this, 'replaceCallback' ), $wrappedtext );
-               // ... and <math> tags
-               $wrappedtext = preg_replace_callback( '/\<math(.*?)\<\\/math\>/s',
-                       array( &$this, 'replaceCallback' ), $wrappedtext );
-               // Modify inline Microdata <link> and <meta> elements so they say <html-link> and <html-meta> so
-               // we can trick Tidy into not stripping them out by including them in tidy's new-empty-tags config
-               $wrappedtext = preg_replace( '!<(link|meta)([^>]*?)(/{0,1}>)!', '<html-$1$2$3', $wrappedtext );
-
-               // Wrap the whole thing in a doctype and body for Tidy.
-               $wrappedtext = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"' .
-                       ' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html>' .
-                       '<head><title>test</title></head><body>' . $wrappedtext . '</body></html>';
-
-               return $wrappedtext;
-       }
-
-       /**
-        * @param array $m
-        *
-        * @return string
-        */
-       public function replaceCallback( $m ) {
-               $marker = Parser::MARKER_PREFIX . "-item-{$this->mMarkerIndex}" . Parser::MARKER_SUFFIX;
-               $this->mMarkerIndex++;
-               $this->mTokens->setPair( $marker, $m[0] );
-               return $marker;
-       }
-
-       /**
-        * @param string $text
-        * @return string
-        */
-       public function postprocess( $text ) {
-               // Revert <html-{link,meta}> back to <{link,meta}>
-               $text = preg_replace( '!<html-(link|meta)([^>]*?)(/{0,1}>)!', '<$1$2$3', $text );
-
-               // Restore the contents of placeholder tokens
-               $text = $this->mTokens->replace( $text );
-
-               return $text;
-       }
-
-}
-
 /**
  * Class to interact with HTML tidy
  *
@@ -113,8 +31,10 @@ class MWTidyWrapper {
  * @ingroup Parser
  */
 class MWTidy {
+       private static $instance;
+
        /**
-        * Interface with html tidy, used if $wgUseTidy = true.
+        * Interface with html tidy.
         * If tidy isn't able to correct the markup, the original will be
         * returned in all its glory with a warning comment appended.
         *
@@ -122,23 +42,12 @@ class MWTidy {
         * @return string Corrected HTML output
         */
        public static function tidy( $text ) {
-               $wrapper = new MWTidyWrapper;
-               $wrappedtext = $wrapper->getWrapped( $text );
-
-               $retVal = null;
-               $correctedtext = self::clean( $wrappedtext, false, $retVal );
-
-               if ( $retVal < 0 ) {
-                       wfDebug( "Possible tidy configuration error!\n" );
-                       return $text . "\n<!-- Tidy was unable to run -->\n";
-               } elseif ( is_null( $correctedtext ) ) {
-                       wfDebug( "Tidy error detected!\n" );
-                       return $text . "\n<!-- Tidy found serious XHTML errors -->\n";
+               $driver = self::singleton();
+               if ( !$driver ) {
+                       throw new MWException( __METHOD__.
+                               ': tidy is disabled, caller should have checked MWTidy::isEnabled()' );
                }
-
-               $correctedtext = $wrapper->postprocess( $correctedtext ); // restore any hidden tokens
-
-               return $correctedtext;
+               return $driver->tidy( $text );
        }
 
        /**
@@ -149,170 +58,77 @@ class MWTidy {
         * @return bool Whether the HTML is valid
         */
        public static function checkErrors( $text, &$errorStr = null ) {
-               $retval = 0;
-               $errorStr = self::clean( $text, true, $retval );
-               return ( $retval < 0 && $errorStr == '' ) || $retval == 0;
+               $driver = self::singleton();
+               if ( !$driver ) {
+                       throw new MWException( __METHOD__.
+                               ': tidy is disabled, caller should have checked MWTidy::isEnabled()' );
+               }
+               if ( $driver->supportsValidate() ) {
+                       return $driver->validate( $text, $errorStr );
+               } else {
+                       throw new MWException( __METHOD__ . ": error text return from HHVM tidy is not supported" );
+               }
        }
 
-       /**
-        * Perform a clean/repair operation
-        * @param string $text HTML to check
-        * @param bool $stderr Whether to read result from STDERR rather than STDOUT
-        * @param int &$retval Exit code (-1 on internal error)
-        * @return null|string
-        * @throws MWException
-        */
-       private static function clean( $text, $stderr = false, &$retval = null ) {
-               global $wgTidyInternal;
+       public static function isEnabled() {
+               return self::singleton() !== false;
+       }
 
-               if ( $wgTidyInternal ) {
-                       if ( wfIsHHVM() ) {
-                               if ( $stderr ) {
-                                       throw new MWException( __METHOD__ . ": error text return from HHVM tidy is not supported" );
+       protected static function singleton() {
+               global $wgUseTidy, $wgTidyInternal, $wgTidyConf, $wgDebugTidy, $wgTidyConfig,
+                       $wgTidyBin, $wgTidyOpts;
+
+               if ( self::$instance === null ) {
+                       if ( $wgTidyConfig !== null ) {
+                               $config = $wgTidyConfig;
+                       } elseif ( $wgUseTidy ) {
+                               // b/c configuration
+                               $config = array(
+                                       'tidyConfigFile' => $wgTidyConf,
+                                       'debugComment' => $wgDebugTidy,
+                                       'tidyBin' => $wgTidyBin,
+                                       'tidyCommandLine' => $wgTidyOpts );
+                               if ( $wgTidyInternal ) {
+                                       if ( wfIsHHVM() ) {
+                                               $config['driver'] = 'RaggettInternalHHVM';
+                                       } else {
+                                               $config['driver'] = 'RaggettInternalPHP';
+                                       }
+                               } else {
+                                       $config['driver'] = 'RaggettExternal';
                                }
-                               return self::hhvmClean( $text, $retval );
                        } else {
-                               return self::phpClean( $text, $stderr, $retval );
+                               return false;
                        }
-               } else {
-                       return self::externalClean( $text, $stderr, $retval );
-               }
-       }
-
-       /**
-        * Spawn an external HTML tidy process and get corrected markup back from it.
-        * Also called in OutputHandler.php for full page validation
-        *
-        * @param string $text HTML to check
-        * @param bool $stderr Whether to read result from STDERR rather than STDOUT
-        * @param int &$retval Exit code (-1 on internal error)
-        * @return string|null
-        */
-       private static function externalClean( $text, $stderr = false, &$retval = null ) {
-               global $wgTidyConf, $wgTidyBin, $wgTidyOpts;
-
-               $cleansource = '';
-               $opts = ' -utf8';
-
-               if ( $stderr ) {
-                       $descriptorspec = array(
-                               0 => array( 'pipe', 'r' ),
-                               1 => array( 'file', wfGetNull(), 'a' ),
-                               2 => array( 'pipe', 'w' )
-                       );
-               } else {
-                       $descriptorspec = array(
-                               0 => array( 'pipe', 'r' ),
-                               1 => array( 'pipe', 'w' ),
-                               2 => array( 'file', wfGetNull(), 'a' )
-                       );
-               }
-
-               $readpipe = $stderr ? 2 : 1;
-               $pipes = array();
-
-               $process = proc_open(
-                       "$wgTidyBin -config $wgTidyConf $wgTidyOpts$opts", $descriptorspec, $pipes );
-
-               //NOTE: At least on linux, the process will be created even if tidy is not installed.
-               //      This means that missing tidy will be treated as a validation failure.
-
-               if ( is_resource( $process ) ) {
-                       // Theoretically, this style of communication could cause a deadlock
-                       // here. If the stdout buffer fills up, then writes to stdin could
-                       // block. This doesn't appear to happen with tidy, because tidy only
-                       // writes to stdout after it's finished reading from stdin. Search
-                       // for tidyParseStdin and tidySaveStdout in console/tidy.c
-                       fwrite( $pipes[0], $text );
-                       fclose( $pipes[0] );
-                       while ( !feof( $pipes[$readpipe] ) ) {
-                               $cleansource .= fgets( $pipes[$readpipe], 1024 );
+                       switch ( $config['driver'] ) {
+                               case 'RaggettInternalHHVM':
+                                       self::$instance = new MediaWiki\Tidy\RaggettInternalHHVM( $config );
+                                       break;
+                               case 'RaggettInternalPHP':
+                                       self::$instance = new MediaWiki\Tidy\RaggettInternalPHP( $config );
+                                       break;
+                               case 'RaggettExternal':
+                                       self::$instance = new MediaWiki\Tidy\RaggettExternal( $config );
+                                       break;
+                               default:
+                                       throw new MWException( "Invalid tidy driver: \"{$config['driver']}\"" );
                        }
-                       fclose( $pipes[$readpipe] );
-                       $retval = proc_close( $process );
-               } else {
-                       wfWarn( "Unable to start external tidy process" );
-                       $retval = -1;
                }
-
-               if ( !$stderr && $cleansource == '' && $text != '' ) {
-                       // Some kind of error happened, so we couldn't get the corrected text.
-                       // Just give up; we'll use the source text and append a warning.
-                       $cleansource = null;
-               }
-
-               return $cleansource;
+               return self::$instance;
        }
 
        /**
-        * Use the HTML tidy extension to use the tidy library in-process,
-        * saving the overhead of spawning a new process.
-        *
-        * @param string $text HTML to check
-        * @param bool $stderr Whether to read result from error status instead of output
-        * @param int &$retval Exit code (-1 on internal error)
-        * @return string|null
+        * Set the driver to be used. This is for testing.
+        * @param TidyDriverBase|false|null $instance
         */
-       private static function phpClean( $text, $stderr = false, &$retval = null ) {
-               global $wgTidyConf, $wgDebugTidy;
-
-               if ( ( !wfIsHHVM() && !class_exists( 'tidy' ) ) ||
-                       ( wfIsHHVM() && !function_exists( 'tidy_repair_string' ) )
-               ) {
-                       wfWarn( "Unable to load internal tidy class." );
-                       $retval = -1;
-
-                       return null;
-               }
-
-               $tidy = new tidy;
-               $tidy->parseString( $text, $wgTidyConf, 'utf8' );
-
-               if ( $stderr ) {
-                       $retval = $tidy->getStatus();
-                       return $tidy->errorBuffer;
-               }
-
-               $tidy->cleanRepair();
-               $retval = $tidy->getStatus();
-               if ( $retval == 2 ) {
-                       // 2 is magic number for fatal error
-                       // http://www.php.net/manual/en/function.tidy-get-status.php
-                       $cleansource = null;
-               } else {
-                       $cleansource = tidy_get_output( $tidy );
-                       if ( $wgDebugTidy && $retval > 0 ) {
-                               $cleansource .= "<!--\nTidy reports:\n" .
-                                       str_replace( '-->', '--&gt;', $tidy->errorBuffer ) .
-                                       "\n-->";
-                       }
-               }
-
-               return $cleansource;
+       public static function setInstance( $instance ) {
+               self::$instance = $instance;
        }
 
        /**
-        * Use the tidy extension for HHVM from
-        * https://github.com/wikimedia/mediawiki-php-tidy
-        *
-        * This currently does not support the object-oriented interface, but
-        * tidy_repair_string() can be used for the most common tasks.
-        *
-        * @param string $text HTML to check
-        * @param int &$retval Exit code (-1 on internal error)
-        * @return string|null
+        * Destroy the current singleton instance
         */
-       private static function hhvmClean( $text, &$retval ) {
-               global $wgTidyConf;
-
-               $cleansource = tidy_repair_string( $text, $wgTidyConf, 'utf8' );
-               if ( $cleansource === false ) {
-                       $cleansource = null;
-                       $retval = -1;
-               } else {
-                       $retval = 0;
-               }
-
-               return $cleansource;
+       public static function destroySingleton() {
+               self::$instance = null;
        }
 }
index ab5c586..677da63 100644 (file)
@@ -1283,8 +1283,6 @@ class Parser {
         * @return string
         */
        private function internalParseHalfParsed( $text, $isMain = true, $linestart = true ) {
-               global $wgUseTidy, $wgAlwaysUseTidy;
-
                $text = $this->mStripState->unstripGeneral( $text );
 
                if ( $isMain ) {
@@ -1335,7 +1333,7 @@ class Parser {
 
                $text = Sanitizer::normalizeCharReferences( $text );
 
-               if ( ( $wgUseTidy && $this->mOptions->getTidy() ) || $wgAlwaysUseTidy ) {
+               if ( MWTidy::isEnabled() && $this->mOptions->getTidy() ) {
                        $text = MWTidy::tidy( $text );
                } else {
                        # attempt to sanitize at least some nesting problems
diff --git a/includes/tidy.conf b/includes/tidy.conf
deleted file mode 100644 (file)
index 4c4daed..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
-# html tidy (http://tidy.sf.net) configuration
-# tidy - validate, correct, and pretty-print HTML files
-# see: man 1 tidy, http://tidy.sourceforge.net/docs/quickref.html
-
-show-body-only: yes
-force-output: yes
-tidy-mark: no
-wrap: 0
-wrap-attributes: no
-literal-attributes: yes
-output-xhtml: yes
-numeric-entities: yes
-enclose-text: yes
-enclose-block-text: yes
-quiet: yes
-quote-nbsp: yes
-fix-backslash: no
-fix-uri: no
-# Don't strip html5 elements we support
-# html-{meta,link} is a hack we use to prevent Tidy from stripping <meta> and <link> used in the body for Microdata
-new-empty-tags: html-meta, html-link, wbr
-new-inline-tags: video, audio, source, track, bdi, data, time, mark
diff --git a/includes/tidy/RaggettBase.php b/includes/tidy/RaggettBase.php
new file mode 100644 (file)
index 0000000..a3717b2
--- /dev/null
@@ -0,0 +1,47 @@
+<?php
+
+namespace MediaWiki\Tidy;
+
+abstract class RaggettBase extends TidyDriverBase {
+       /**
+        * Generic interface for wrapping and unwrapping HTML for Dave Raggett's tidy.
+        *
+        * @param string $text Hideous HTML input
+        * @return string Corrected HTML output
+        */
+       public function tidy( $text ) {
+               $wrapper = new RaggettWrapper;
+               $wrappedtext = $wrapper->getWrapped( $text );
+
+               $retVal = null;
+               $correctedtext = $this->cleanWrapped( $wrappedtext, false, $retVal );
+
+               if ( $retVal < 0 ) {
+                       wfDebug( "Possible tidy configuration error!\n" );
+                       return $text . "\n<!-- Tidy was unable to run -->\n";
+               } elseif ( is_null( $correctedtext ) ) {
+                       wfDebug( "Tidy error detected!\n" );
+                       return $text . "\n<!-- Tidy found serious XHTML errors -->\n";
+               }
+
+               $correctedtext = $wrapper->postprocess( $correctedtext ); // restore any hidden tokens
+
+               return $correctedtext;
+       }
+
+       public function validate( $text, &$errorStr ) {
+               $retval = 0;
+               $errorStr = $this->cleanWrapped( $text, true, $retval );
+               return ( $retval < 0 && $errorStr == '' ) || $retval == 0;
+       }
+
+       /**
+        * Perform a clean/repair operation
+        * @param string $text HTML to check
+        * @param bool $stderr Whether to read result from STDERR rather than STDOUT
+        * @param int &$retval Exit code (-1 on internal error)
+        * @return null|string
+        * @throws MWException
+        */
+       abstract protected function cleanWrapped( $text, $stderr = false, &$retval = null );
+}
diff --git a/includes/tidy/RaggettExternal.php b/includes/tidy/RaggettExternal.php
new file mode 100644 (file)
index 0000000..1193318
--- /dev/null
@@ -0,0 +1,73 @@
+<?php
+
+namespace MediaWiki\Tidy;
+
+class RaggettExternal extends RaggettBase {
+       /**
+        * Spawn an external HTML tidy process and get corrected markup back from it.
+        * Also called in OutputHandler.php for full page validation
+        *
+        * @param string $text HTML to check
+        * @param bool $stderr Whether to read result from STDERR rather than STDOUT
+        * @param int &$retval Exit code (-1 on internal error)
+        * @return string|null
+        */
+       protected function cleanWrapped( $text, $stderr = false, &$retval = null ) {
+               $cleansource = '';
+               $opts = ' -utf8';
+
+               if ( $stderr ) {
+                       $descriptorspec = array(
+                               0 => array( 'pipe', 'r' ),
+                               1 => array( 'file', wfGetNull(), 'a' ),
+                               2 => array( 'pipe', 'w' )
+                       );
+               } else {
+                       $descriptorspec = array(
+                               0 => array( 'pipe', 'r' ),
+                               1 => array( 'pipe', 'w' ),
+                               2 => array( 'file', wfGetNull(), 'a' )
+                       );
+               }
+
+               $readpipe = $stderr ? 2 : 1;
+               $pipes = array();
+
+               $process = proc_open(
+                       "{$this->config['tidyBin']} -config {$this->config['tidyConfigFile']} " .
+                       $this->config['tidyCommandLine'] . $opts, $descriptorspec, $pipes );
+
+               //NOTE: At least on linux, the process will be created even if tidy is not installed.
+               //      This means that missing tidy will be treated as a validation failure.
+
+               if ( is_resource( $process ) ) {
+                       // Theoretically, this style of communication could cause a deadlock
+                       // here. If the stdout buffer fills up, then writes to stdin could
+                       // block. This doesn't appear to happen with tidy, because tidy only
+                       // writes to stdout after it's finished reading from stdin. Search
+                       // for tidyParseStdin and tidySaveStdout in console/tidy.c
+                       fwrite( $pipes[0], $text );
+                       fclose( $pipes[0] );
+                       while ( !feof( $pipes[$readpipe] ) ) {
+                               $cleansource .= fgets( $pipes[$readpipe], 1024 );
+                       }
+                       fclose( $pipes[$readpipe] );
+                       $retval = proc_close( $process );
+               } else {
+                       wfWarn( "Unable to start external tidy process" );
+                       $retval = -1;
+               }
+
+               if ( !$stderr && $cleansource == '' && $text != '' ) {
+                       // Some kind of error happened, so we couldn't get the corrected text.
+                       // Just give up; we'll use the source text and append a warning.
+                       $cleansource = null;
+               }
+
+               return $cleansource;
+       }
+
+       public function supportsValidate() {
+               return true;
+       }
+}
diff --git a/includes/tidy/RaggettInternalHHVM.php b/includes/tidy/RaggettInternalHHVM.php
new file mode 100644 (file)
index 0000000..03860be
--- /dev/null
@@ -0,0 +1,29 @@
+<?php
+
+namespace MediaWiki\Tidy;
+
+class RaggettInternalHHVM extends RaggettBase {
+       /**
+        * Use the HTML tidy extension to use the tidy library in-process,
+        * saving the overhead of spawning a new process.
+        *
+        * @param string $text HTML to check
+        * @param bool $stderr Whether to read result from error status instead of output
+        * @param int &$retval Exit code (-1 on internal error)
+        * @return string|null
+        */
+       protected function cleanWrapped( $text, $stderr = false, &$retval ) {
+               if ( $stderr ) {
+                       throw new Exception( "\$stderr cannot be used with RaggettInternalHHVM" );
+               }
+               $cleansource = tidy_repair_string( $text, $this->config['tidyConfigFile'], 'utf8' );
+               if ( $cleansource === false ) {
+                       $cleansource = null;
+                       $retval = -1;
+               } else {
+                       $retval = 0;
+               }
+
+               return $cleansource;
+       }
+}
diff --git a/includes/tidy/RaggettInternalPHP.php b/includes/tidy/RaggettInternalPHP.php
new file mode 100644 (file)
index 0000000..1ce14b6
--- /dev/null
@@ -0,0 +1,52 @@
+<?php
+
+namespace MediaWiki\Tidy;
+
+class RaggettInternalPHP extends RaggettBase {
+       /**
+        * Use the HTML tidy extension to use the tidy library in-process,
+        * saving the overhead of spawning a new process.
+        *
+        * @param string $text HTML to check
+        * @param bool $stderr Whether to read result from error status instead of output
+        * @param int &$retval Exit code (-1 on internal error)
+        * @return string|null
+        */
+       protected function cleanWrapped( $text, $stderr = false, &$retval = null ) {
+               if ( !class_exists( 'tidy' ) ) {
+                       wfWarn( "Unable to load internal tidy class." );
+                       $retval = -1;
+
+                       return null;
+               }
+
+               $tidy = new \tidy;
+               $tidy->parseString( $text, $this->config['tidyConfigFile'], 'utf8' );
+
+               if ( $stderr ) {
+                       $retval = $tidy->getStatus();
+                       return $tidy->errorBuffer;
+               }
+
+               $tidy->cleanRepair();
+               $retval = $tidy->getStatus();
+               if ( $retval == 2 ) {
+                       // 2 is magic number for fatal error
+                       // http://www.php.net/manual/en/function.tidy-get-status.php
+                       $cleansource = null;
+               } else {
+                       $cleansource = tidy_get_output( $tidy );
+                       if ( !empty( $this->config['debugComment'] ) && $retval > 0 ) {
+                               $cleansource .= "<!--\nTidy reports:\n" .
+                                       str_replace( '-->', '--&gt;', $tidy->errorBuffer ) .
+                                       "\n-->";
+                       }
+               }
+
+               return $cleansource;
+       }
+
+       public function supportsValidate() {
+               return true;
+       }
+}
diff --git a/includes/tidy/RaggettWrapper.php b/includes/tidy/RaggettWrapper.php
new file mode 100644 (file)
index 0000000..083f402
--- /dev/null
@@ -0,0 +1,89 @@
+<?php
+namespace MediaWiki\Tidy;
+
+use ReplacementArray;
+use ParserOutput;
+use Parser;
+
+/**
+ * Class used to hide mw:editsection tokens from Tidy so that it doesn't break them
+ * or break on them. This is a bit of a hack for now, but hopefully in the future
+ * we may create a real postprocessor or something that will replace this.
+ * It's called wrapper because for now it basically takes over MWTidy::tidy's task
+ * of wrapping the text in a xhtml block
+ *
+ * This re-uses some of the parser's UNIQ tricks, though some of it is private so it's
+ * duplicated. Perhaps we should create an abstract marker hiding class.
+ *
+ * @ingroup Parser
+ */
+class RaggettWrapper {
+
+       /**
+        * @var ReplacementArray
+        */
+       protected $mTokens;
+
+       protected $mMarkerIndex;
+
+       public function __construct() {
+               $this->mTokens = null;
+       }
+
+       /**
+        * @param string $text
+        * @return string
+        */
+       public function getWrapped( $text ) {
+               $this->mTokens = new ReplacementArray;
+               $this->mMarkerIndex = 0;
+
+               // Replace <mw:editsection> elements with placeholders
+               $wrappedtext = preg_replace_callback( ParserOutput::EDITSECTION_REGEX,
+                       array( &$this, 'replaceCallback' ), $text );
+               // ...and <mw:toc> markers
+               $wrappedtext = preg_replace_callback( '/\<\\/?mw:toc\>/',
+                       array( &$this, 'replaceCallback' ), $wrappedtext );
+               // ... and <math> tags
+               $wrappedtext = preg_replace_callback( '/\<math(.*?)\<\\/math\>/s',
+                       array( &$this, 'replaceCallback' ), $wrappedtext );
+               // Modify inline Microdata <link> and <meta> elements so they say <html-link> and <html-meta> so
+               // we can trick Tidy into not stripping them out by including them in tidy's new-empty-tags config
+               $wrappedtext = preg_replace( '!<(link|meta)([^>]*?)(/{0,1}>)!', '<html-$1$2$3', $wrappedtext );
+
+               // Wrap the whole thing in a doctype and body for Tidy.
+               $wrappedtext = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"' .
+                       ' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html>' .
+                       '<head><title>test</title></head><body>' . $wrappedtext . '</body></html>';
+
+               return $wrappedtext;
+       }
+
+       /**
+        * @param array $m
+        *
+        * @return string
+        */
+       public function replaceCallback( $m ) {
+               $marker = Parser::MARKER_PREFIX . "-item-{$this->mMarkerIndex}" . Parser::MARKER_SUFFIX;
+               $this->mMarkerIndex++;
+               $this->mTokens->setPair( $marker, $m[0] );
+               return $marker;
+       }
+
+       /**
+        * @param string $text
+        * @return string
+        */
+       public function postprocess( $text ) {
+               // Revert <html-{link,meta}> back to <{link,meta}>
+               $text = preg_replace( '!<html-(link|meta)([^>]*?)(/{0,1}>)!', '<$1$2$3', $text );
+
+               // Restore the contents of placeholder tokens
+               $text = $this->mTokens->replace( $text );
+
+               return $text;
+       }
+
+}
+?>
diff --git a/includes/tidy/TidyDriverBase.php b/includes/tidy/TidyDriverBase.php
new file mode 100644 (file)
index 0000000..1d994aa
--- /dev/null
@@ -0,0 +1,40 @@
+<?php
+
+namespace MediaWiki\Tidy;
+
+/**
+ * Base class for HTML cleanup utilities
+ */
+abstract class TidyDriverBase {
+       protected $config;
+
+       function __construct( $config ) {
+               $this->config = $config;
+       }
+
+       /**
+        * Return true if validate() can be used
+        */
+       public function supportsValidate() {
+               return false;
+       }
+
+       /**
+        * Check HTML for errors, used if $wgValidateAllHtml = true.
+        *
+        * @param string $text
+        * @param string &$errorStr Return the error string
+        * @return bool Whether the HTML is valid
+        */
+       public function validate( $text, &$errorStr ) {
+               throw new MWException( get_class( $this ) . " does not support validate()" );
+       }
+
+       /**
+        * Clean up HTML
+        *
+        * @param string HTML document fragment to clean up
+        * @param string The corrected HTML output
+        */
+       public abstract function tidy( $text );
+}
diff --git a/includes/tidy/tidy.conf b/includes/tidy/tidy.conf
new file mode 100644 (file)
index 0000000..4c4daed
--- /dev/null
@@ -0,0 +1,22 @@
+# html tidy (http://tidy.sf.net) configuration
+# tidy - validate, correct, and pretty-print HTML files
+# see: man 1 tidy, http://tidy.sourceforge.net/docs/quickref.html
+
+show-body-only: yes
+force-output: yes
+tidy-mark: no
+wrap: 0
+wrap-attributes: no
+literal-attributes: yes
+output-xhtml: yes
+numeric-entities: yes
+enclose-text: yes
+enclose-block-text: yes
+quiet: yes
+quote-nbsp: yes
+fix-backslash: no
+fix-uri: no
+# Don't strip html5 elements we support
+# html-{meta,link} is a hack we use to prevent Tidy from stripping <meta> and <link> used in the body for Microdata
+new-empty-tags: html-meta, html-link, wbr
+new-inline-tags: video, audio, source, track, bdi, data, time, mark
index 763d97e..06e1449 100644 (file)
@@ -73,7 +73,7 @@ class RefreshLinks extends Maintenance {
        private function doRefreshLinks( $start, $newOnly = false,
                $end = null, $redirectsOnly = false, $oldRedirectsOnly = false
        ) {
-               global $wgParser, $wgUseTidy;
+               global $wgParser;
 
                $reportingInterval = 100;
                $dbr = wfGetDB( DB_SLAVE );
@@ -88,9 +88,6 @@ class RefreshLinks extends Maintenance {
                # Don't generate extension images (e.g. Timeline)
                $wgParser->clearTagHooks();
 
-               # Don't use HTML tidy
-               $wgUseTidy = false;
-
                $what = $redirectsOnly ? "redirects" : "links";
 
                if ( $oldRedirectsOnly ) {
index cc0df7a..f429c30 100644 (file)
@@ -889,9 +889,9 @@ class ParserTest {
                        'wgDisableTitleConversion' => false,
                        // Tidy options.
                        'wgUseTidy' => isset( $opts['tidy'] ),
-                       'wgAlwaysUseTidy' => false,
+                       'wgTidyConfig' => null,
                        'wgDebugTidy' => false,
-                       'wgTidyConf' => $IP . '/includes/tidy.conf',
+                       'wgTidyConf' => $IP . '/includes/tidy/tidy.conf',
                        'wgTidyOpts' => '',
                        'wgTidyInternal' => $this->tidySupport->isInternal(),
                );
@@ -936,6 +936,7 @@ class ParserTest {
                $wgHooks['ParserGetVariableValueTs'][] = 'ParserTest::getFakeTimestamp';
 
                MagicWord::clearCache();
+               MWTidy::destroySingleton();
 
                return $context;
        }
@@ -1218,6 +1219,7 @@ class ParserTest {
                FileBackendGroup::destroySingleton();
                LockManagerGroup::destroySingletons();
                LinkCache::singleton()->clear();
+               MWTidy::destroySingleton();
 
                foreach ( $this->savedGlobals as $var => $val ) {
                        $GLOBALS[$var] = $val;
index ac214a2..7dc7027 100644 (file)
@@ -1117,7 +1117,7 @@ abstract class MediaWikiTestCase extends PHPUnit_Framework_TestCase {
                // of tidy. In that case however, we can not reliably detect whether a failing validation
                // is due to malformed HTML, or caused by tidy not being installed as a command line tool.
                // That would cause all HTML assertions to fail on a system that has no tidy installed.
-               if ( !$GLOBALS['wgTidyInternal'] ) {
+               if ( !$GLOBALS['wgTidyInternal'] || !MWTidy::isEnabled() ) {
                        $this->markTestSkipped( 'Tidy extension not installed' );
                }
 
index 77b26b3..7b60fb3 100644 (file)
@@ -22,7 +22,6 @@ class ExtraParserTest extends MediaWikiTestCase {
                        'wgContLang' => $contLang,
                        'wgLang' => Language::factory( 'en' ),
                        'wgMemc' => new EmptyBagOStuff,
-                       'wgAlwaysUseTidy' => false,
                        'wgCleanSignatures' => true,
                ) );
 
index c615c46..d3dc512 100644 (file)
@@ -6,6 +6,11 @@
  */
 class SanitizerTest extends MediaWikiTestCase {
 
+       protected function tearDown() {
+               MWTidy::destroySingleton();
+               parent::tearDown();
+       }
+
        /**
         * @covers Sanitizer::decodeCharReferences
         */
@@ -93,9 +98,7 @@ class SanitizerTest extends MediaWikiTestCase {
         * @param bool $escaped Whether sanitizer let the tag in or escape it (ie: '&lt;video&gt;')
         */
        public function testRemovehtmltagsOnHtml5Tags( $tag, $escaped ) {
-               $this->setMwGlobals( array(
-                       'wgUseTidy' => false
-               ) );
+               MWTidy::setInstance( false );
 
                if ( $escaped ) {
                        $this->assertEquals( "&lt;$tag&gt;",
@@ -157,7 +160,7 @@ class SanitizerTest extends MediaWikiTestCase {
         * @covers Sanitizer::removeHTMLtags
         */
        public function testRemoveHTMLtags( $input, $output, $msg = null ) {
-               $GLOBALS['wgUseTidy'] = false;
+               MWTidy::setInstance( false );
                $this->assertEquals( $output, Sanitizer::removeHTMLtags( $input ), $msg );
        }
 
@@ -360,5 +363,4 @@ class SanitizerTest extends MediaWikiTestCase {
                        array( '&lt;script&gt;foo&lt;/script&gt;', '<script>foo</script>' ),
                );
        }
-
 }
index 33861f1..492fec6 100644 (file)
@@ -4,7 +4,6 @@
  * @group ContentHandler
  */
 class TextContentHandlerTest extends MediaWikiLangTestCase {
-
        public function testSupportsDirectEditing() {
                $handler = new TextContentHandler();
                $this->assertTrue( $handler->supportsDirectEditing(), 'direct editing is supported' );
index dd61f85..fe26375 100644 (file)
@@ -27,10 +27,16 @@ class TextContentTest extends MediaWikiLangTestCase {
                                CONTENT_MODEL_JAVASCRIPT,
                        ),
                        'wgUseTidy' => false,
-                       'wgAlwaysUseTidy' => false,
                        'wgCapitalLinks' => true,
                        'wgHooks' => array(), // bypass hook ContentGetParserOutput that force custom rendering
                ) );
+
+               MWTidy::destroySingleton();
+       }
+
+       protected function tearDown() {
+               MWTidy::destroySingleton();
+               parent::tearDown();
        }
 
        public function newContent( $text ) {
index c7a3103..0d2129b 100644 (file)
@@ -160,10 +160,10 @@ class NewParserTest extends MediaWikiTestCase {
                $this->djVuSupport = new DjVuSupport();
                // Tidy support
                $this->tidySupport = new TidySupport();
+               $tmpGlobals['wgTidyConfig'] = null;
                $tmpGlobals['wgUseTidy'] = false;
-               $tmpGlobals['wgAlwaysUseTidy'] = false;
                $tmpGlobals['wgDebugTidy'] = false;
-               $tmpGlobals['wgTidyConf'] = $IP . '/includes/tidy.conf';
+               $tmpGlobals['wgTidyConf'] = $IP . '/includes/tidy/tidy.conf';
                $tmpGlobals['wgTidyOpts'] = '';
                $tmpGlobals['wgTidyInternal'] = $this->tidySupport->isInternal();
 
@@ -185,6 +185,8 @@ class NewParserTest extends MediaWikiTestCase {
                $wgNamespaceAliases['Image'] = $this->savedWeirdGlobals['image_alias'];
                $wgNamespaceAliases['Image_talk'] = $this->savedWeirdGlobals['image_talk_alias'];
 
+               MWTidy::destroySingleton();
+
                // Restore backends
                RepoGroup::destroySingleton();
                FileBackendGroup::destroySingleton();
@@ -454,6 +456,7 @@ class NewParserTest extends MediaWikiTestCase {
                        $GLOBALS[$var] = $val;
                }
 
+               MWTidy::destroySingleton();
                MagicWord::clearCache();
 
                # The entries saved into RepoGroup cache with previous globals will be wrong.
index 3605e50..4af3898 100644 (file)
@@ -19,12 +19,6 @@ class TagHookTest extends MediaWikiTestCase {
                return array( array( "foo<bar" ), array( "foo>bar" ), array( "foo\nbar" ), array( "foo\rbar" ) );
        }
 
-       protected function setUp() {
-               parent::setUp();
-
-               $this->setMwGlobals( 'wgAlwaysUseTidy', false );
-       }
-
        /**
         * @dataProvider provideValidNames
         * @covers Parser::setHook
index f656a74..5db2908 100644 (file)
@@ -7,8 +7,7 @@ class TidyTest extends MediaWikiTestCase {
 
        protected function setUp() {
                parent::setUp();
-               $check = MWTidy::tidy( '' );
-               if ( strpos( $check, '<!--' ) !== false ) {
+               if ( !MWTidy::isEnabled() ) {
                        $this->markTestSkipped( 'Tidy not found' );
                }
        }