Add Html5Depurate tidy driver
author Tim Starling <tstarling@wikimedia.org>
Thu, 3 Sep 2015 04:46:48 +0000 (14:46 +1000)
committer Ori.livneh <ori@wikimedia.org>
Fri, 11 Sep 2015 03:32:32 +0000 (03:32 +0000)
Also document input format for MWTidy::tidy().

Change-Id: I77071d3db0524695c2baf9a4670ca2455438c83d

autoload.php
includes/parser/MWTidy.php
includes/tidy/Html5Depurate.php [new file with mode: 0644]

index 62d6f09..4db3ec7 100644 (file)
@@ -760,6 +760,7 @@ $wgAutoloadLocalClasses = array(
        'MediaWiki\\Logger\\Monolog\\WikiProcessor' => __DIR__ . '/includes/debug/logger/monolog/WikiProcessor.php',
        'MediaWiki\\Logger\\NullSpi' => __DIR__ . '/includes/debug/logger/NullSpi.php',
        'MediaWiki\\Logger\\Spi' => __DIR__ . '/includes/debug/logger/Spi.php',
        'MediaWiki\\Logger\\Monolog\\WikiProcessor' => __DIR__ . '/includes/debug/logger/monolog/WikiProcessor.php',
        'MediaWiki\\Logger\\NullSpi' => __DIR__ . '/includes/debug/logger/NullSpi.php',
        'MediaWiki\\Logger\\Spi' => __DIR__ . '/includes/debug/logger/Spi.php',
+       'MediaWiki\\Tidy\\Html5Depurate' => __DIR__ . '/includes/tidy/Html5Depurate.php',
        'MediaWiki\\Tidy\\RaggettBase' => __DIR__ . '/includes/tidy/RaggettBase.php',
        'MediaWiki\\Tidy\\RaggettExternal' => __DIR__ . '/includes/tidy/RaggettExternal.php',
        'MediaWiki\\Tidy\\RaggettInternalHHVM' => __DIR__ . '/includes/tidy/RaggettInternalHHVM.php',
        'MediaWiki\\Tidy\\RaggettBase' => __DIR__ . '/includes/tidy/RaggettBase.php',
        'MediaWiki\\Tidy\\RaggettExternal' => __DIR__ . '/includes/tidy/RaggettExternal.php',
        'MediaWiki\\Tidy\\RaggettInternalHHVM' => __DIR__ . '/includes/tidy/RaggettInternalHHVM.php',
index d0e50bc..807842b 100644 (file)
@@ -38,7 +38,8 @@ class MWTidy {
         * If tidy isn't able to correct the markup, the original will be
         * returned in all its glory with a warning comment appended.
         *
         * If tidy isn't able to correct the markup, the original will be
         * returned in all its glory with a warning comment appended.
         *
-        * @param string $text Hideous HTML input
+        * @param string $text HTML input fragment. This should not contain a
+        *                     <body> or <html> tag.
         * @return string Corrected HTML output
         */
        public static function tidy( $text ) {
         * @return string Corrected HTML output
         */
        public static function tidy( $text ) {
@@ -110,6 +111,9 @@ class MWTidy {
                                case 'RaggettExternal':
                                        self::$instance = new MediaWiki\Tidy\RaggettExternal( $config );
                                        break;
                                case 'RaggettExternal':
                                        self::$instance = new MediaWiki\Tidy\RaggettExternal( $config );
                                        break;
+                               case 'Html5Depurate':
+                                       self::$instance = new MediaWiki\Tidy\Html5Depurate( $config );
+                                       break;
                                default:
                                        throw new MWException( "Invalid tidy driver: \"{$config['driver']}\"" );
                        }
                                default:
                                        throw new MWException( "Invalid tidy driver: \"{$config['driver']}\"" );
                        }
diff --git a/includes/tidy/Html5Depurate.php b/includes/tidy/Html5Depurate.php
new file mode 100644 (file)
index 0000000..23e445f
--- /dev/null
@@ -0,0 +1,67 @@
+<?php
+
+namespace MediaWiki\Tidy;
+use MWHttpRequest;
+use Exception;
+
+/**
+ * Tidy driver that sends HTML to an Html5Depurate HTTP service and
+ * returns the contents of the <body> element of the response.
+ */
+class Html5Depurate extends TidyDriverBase {
+       /**
+        * @param array $config Driver configuration, passed to the parent
+        *   constructor. Keys that are not supplied default to:
+        *   - url: 'http://localhost:4339/document'
+        *   - timeout: 10
+        *   - connectTimeout: 0.5
+        */
+       public function __construct( array $config ) {
+               parent::__construct( $config + array(
+                       'url' => 'http://localhost:4339/document',
+                       'timeout' => 10,
+                       'connectTimeout' => 0.5,
+               ) );
+       }
+
+       /**
+        * Wrap an HTML fragment in a minimal document, POST it to the
+        * configured URL and extract the body of the returned document.
+        *
+        * @param string $text HTML fragment to clean up
+        * @return string Contents of the returned <body> element, or the
+        *   original text followed by a warning comment if a <body>...</body>
+        *   pair could not be located in the response
+        * @throws Exception If the request fails or the HTTP status is not 200
+        */
+       public function tidy( $text ) {
+               $wrappedtext = '<!DOCTYPE html><html>' .
+                       '<body>' . $text . '</body></html>';
+
+               $req = MWHttpRequest::factory( $this->config['url'],
+                       array(
+                               'method' => 'POST',
+                               'timeout' => $this->config['timeout'],
+                               'connectTimeout' => $this->config['connectTimeout'],
+                               'postData' => array(
+                                       'text' => $wrappedtext
+                               )
+                       ) );
+               $status = $req->execute();
+               if ( !$status->isOK() ) {
+                       throw new Exception( "Error contacting depurate service: " . $status->getWikiText() );
+               } elseif ( $req->getStatus() !== 200 ) {
+                       // $status passed the isOK() check above, so it does not describe
+                       // this failure; the HTTP status code is what identifies it.
+                       throw new Exception( "Depurate returned error: HTTP status " . $req->getStatus() );
+               }
+               $result = $req->getContent();
+               $startBody = strpos( $result, "<body>" );
+               $endBody = strrpos( $result, "</body>" );
+               if ( $startBody !== false && $endBody !== false && $endBody > $startBody ) {
+                       $startBody += strlen( "<body>" );
+                       return substr( $result, $startBody, $endBody - $startBody );
+               } else {
+                       return $text . "\n<!-- Html5Depurate returned an invalid result -->";
+               }
+       }
+}