Use Remex in Sanitizer::stripAllTags()

author Roan Kattouw <roan.kattouw@gmail.com>

Tue, 14 Nov 2017 22:22:31 +0000 (14:22 -0800)

committer James D. Forrester <jforrester@wikimedia.org>

Thu, 16 Nov 2017 01:31:31 +0000 (17:31 -0800)
author Roan Kattouw <roan.kattouw@gmail.com>
Tue, 14 Nov 2017 22:22:31 +0000 (14:22 -0800)
committer James D. Forrester <jforrester@wikimedia.org>
Thu, 16 Nov 2017 01:31:31 +0000 (17:31 -0800)
diff --git a/autoload.php b/autoload.php

index 2f6fbda..3f5a3c1 100644 (file)
--- a/autoload.php
+++ b/autoload.php
@@ -1218,6 +1218,7 @@ $wgAutoloadLocalClasses = [
         'RefreshLinks' => __DIR__ . '/maintenance/refreshLinks.php',
         'RefreshLinksJob' => __DIR__ . '/includes/jobqueue/jobs/RefreshLinksJob.php',
         'RegexlikeReplacer' => __DIR__ . '/includes/libs/replacers/RegexlikeReplacer.php',
         'RefreshLinks' => __DIR__ . '/maintenance/refreshLinks.php',
         'RefreshLinksJob' => __DIR__ . '/includes/jobqueue/jobs/RefreshLinksJob.php',
         'RegexlikeReplacer' => __DIR__ . '/includes/libs/replacers/RegexlikeReplacer.php',
+       'RemexStripTagHandler' => __DIR__ . '/includes/parser/RemexStripTagHandler.php',
         'RemoveInvalidEmails' => __DIR__ . '/maintenance/removeInvalidEmails.php',
         'RemoveUnusedAccounts' => __DIR__ . '/maintenance/removeUnusedAccounts.php',
         'RenameDbPrefix' => __DIR__ . '/maintenance/renameDbPrefix.php',
         'RemoveInvalidEmails' => __DIR__ . '/maintenance/removeInvalidEmails.php',
         'RemoveUnusedAccounts' => __DIR__ . '/maintenance/removeUnusedAccounts.php',
         'RenameDbPrefix' => __DIR__ . '/maintenance/renameDbPrefix.php',
diff --git a/includes/parser/RemexStripTagHandler.php b/includes/parser/RemexStripTagHandler.php

new file mode 100644 (file)

index 0000000..2839147
--- /dev/null
+++ b/includes/parser/RemexStripTagHandler.php
@@ -0,0 +1,40 @@
+<?php
+
+use RemexHtml\Tokenizer\Attributes;
+use RemexHtml\Tokenizer\TokenHandler;
+use RemexHtml\Tokenizer\Tokenizer;
+
+/**
+ * @internal Used by Sanitizer::stripAllTags() to collect character data and discard all markup.
+ */
+class RemexStripTagHandler implements TokenHandler {
+       private $text = ''; // Character data accumulated so far, in document order
+       public function getResult() {
+               return $this->text; // Everything appended by characters()
+       }
+
+       function startDocument( Tokenizer $t, $fns, $fn ) {
+               // Nothing to set up; $text already starts out empty.
+       }
+       function endDocument( $pos ) {
+               // Nothing to flush; the text is read back through getResult().
+       }
+       function error( $text, $pos ) {
+               // Tokenizer errors are dropped; only the text output matters here.
+       }
+       function characters( $text, $start, $length, $sourceStart, $sourceLength ) {
+               $this->text .= substr( $text, $start, $length ); // Append only the $start/$length slice of $text
+       }
+       function startTag( $name, Attributes $attrs, $selfClose, $sourceStart, $sourceLength ) {
+               // Start tags (and their attributes) contribute nothing to the result.
+       }
+       function endTag( $name, $sourceStart, $sourceLength ) {
+               // End tags contribute nothing to the result.
+       }
+       function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) {
+               // Doctypes contribute nothing to the result.
+       }
+       function comment( $text, $sourceStart, $sourceLength ) {
+               // Comment contents are not included in the result.
+       }
+}
diff --git a/includes/parser/Sanitizer.php b/includes/parser/Sanitizer.php

index 4c99677..7c9f563 100644 (file)
--- a/includes/parser/Sanitizer.php
+++ b/includes/parser/Sanitizer.php
@@ -1967,17 +1967,22 @@ class Sanitizer {
          * Warning: this return value must be further escaped for literal
          * inclusion in HTML output as of 1.10!
          *
          * Warning: this return value must be further escaped for literal
          * inclusion in HTML output as of 1.10!
          *
-        * @param string $text HTML fragment
+        * @param string $html HTML fragment
          * @return string
          */
          * @return string
          */
-       static function stripAllTags( $text ) {
-               # Actual <tags>
-               $text = StringUtils::delimiterReplace( '<', '>', '', $text );
+       static function stripAllTags( $html ) {
+               // Use RemexHtml to tokenize $html and extract the text
+               $handler = new RemexStripTagHandler;
+               $tokenizer = new RemexHtml\Tokenizer\Tokenizer( $handler, $html, [
+                       'ignoreErrors' => true,
+                       // don't ignore char refs, we want them to be decoded
+                       'ignoreNulls' => true,
+                       'skipPreprocess' => true,
+               ] );
+               $tokenizer->execute();
+               $text = $handler->getResult();
  
  
-               # Normalize &entities and whitespace
-               $text = self::decodeCharReferences( $text );
                 $text = self::normalizeWhitespace( $text );
                 $text = self::normalizeWhitespace( $text );
-
                 return $text;
         }
  
                 return $text;
         }
  
diff --git a/tests/phpunit/includes/parser/SanitizerTest.php b/tests/phpunit/includes/parser/SanitizerTest.php

index 269575b..d7e72e1 100644 (file)
--- a/tests/phpunit/includes/parser/SanitizerTest.php
+++ b/tests/phpunit/includes/parser/SanitizerTest.php
@@ -530,11 +530,10 @@ class SanitizerTest extends MediaWikiTestCase {
                         [ '<p id="one">Foo</p><p id="two">Bar</p>', 'FooBar' ],
                         [ "<p>Foo</p>\n<p>Bar</p>", 'Foo Bar' ],
                         [ '<p>Hello &lt;strong&gt; wor&#x6c;&#100; caf&eacute;</p>', 'Hello <strong> world café' ],
                         [ '<p id="one">Foo</p><p id="two">Bar</p>', 'FooBar' ],
                         [ "<p>Foo</p>\n<p>Bar</p>", 'Foo Bar' ],
                         [ '<p>Hello &lt;strong&gt; wor&#x6c;&#100; caf&eacute;</p>', 'Hello <strong> world café' ],
-                       // This one is broken, see T179978
-                       //[
-                       //      '<p><small data-foo=\'bar"&lt;baz>quux\'><a href="./Foo">Bar</a></small> Whee!</p>',
-                       //      'Bar Whee!'
-                       //],
+                       [
+                               '<p><small data-foo=\'bar"&lt;baz>quux\'><a href="./Foo">Bar</a></small> Whee!</p>',
+                               'Bar Whee!'
+                       ],
                         [ '1<span class="<?php">2</span>3', '123' ],
                         [ '1<span class="<?">2</span>3', '123' ],
                 ];
                         [ '1<span class="<?php">2</span>3', '123' ],
                         [ '1<span class="<?">2</span>3', '123' ],
                 ];
author	Roan Kattouw <roan.kattouw@gmail.com>
	Tue, 14 Nov 2017 22:22:31 +0000 (14:22 -0800)
committer	James D. Forrester <jforrester@wikimedia.org>
	Thu, 16 Nov 2017 01:31:31 +0000 (17:31 -0800)
autoload.php		patch | blob | history
includes/parser/RemexStripTagHandler.php	[new file with mode: 0644]	patch | blob
includes/parser/Sanitizer.php		patch | blob | history
tests/phpunit/includes/parser/SanitizerTest.php		patch | blob | history