From ddb4913f53624c8ee0a2a91bd44bf750e378569d Mon Sep 17 00:00:00 2001 From: Roan Kattouw Date: Tue, 14 Nov 2017 14:22:31 -0800 Subject: [PATCH] Use Remex in Sanitizer::stripAllTags() Using a real HTML tokenizer fixes bugs when < or > appear in attribute values. The old implementation used delimiterReplace(), which didn't handle this case: > print Sanitizer::stripAllTags( '<p data-foo="a&lt;b>c">Hello</p>' ); c">Hello We also can't use PHP's built-in strip_tags() because it doesn't handle <?php and <? correctly: > print strip_tags('1<span class="<?php">2</span>3'); 1 > print strip_tags('1<span class="<?">2</span>3'); 1 Bug: T179978 Change-Id: I53b98e6c877c00c03ff110914168b398559c9c3e --- autoload.php | 1 + includes/parser/RemexStripTagHandler.php | 40 +++++++++++++++++++ includes/parser/Sanitizer.php | 19 +++++---- .../phpunit/includes/parser/SanitizerTest.php | 9 ++--- 4 files changed, 57 insertions(+), 12 deletions(-) create mode 100644 includes/parser/RemexStripTagHandler.php diff --git a/autoload.php b/autoload.php index 2f6fbda897..3f5a3c1413 100644 --- a/autoload.php +++ b/autoload.php @@ -1218,6 +1218,7 @@ $wgAutoloadLocalClasses = [ 'RefreshLinks' => __DIR__ . '/maintenance/refreshLinks.php', 'RefreshLinksJob' => __DIR__ . '/includes/jobqueue/jobs/RefreshLinksJob.php', 'RegexlikeReplacer' => __DIR__ . '/includes/libs/replacers/RegexlikeReplacer.php', + 'RemexStripTagHandler' => __DIR__ . '/includes/parser/RemexStripTagHandler.php', 'RemoveInvalidEmails' => __DIR__ . '/maintenance/removeInvalidEmails.php', 'RemoveUnusedAccounts' => __DIR__ . '/maintenance/removeUnusedAccounts.php', 'RenameDbPrefix' => __DIR__ . '/maintenance/renameDbPrefix.php', diff --git a/includes/parser/RemexStripTagHandler.php b/includes/parser/RemexStripTagHandler.php new file mode 100644 index 0000000000..2839147d4f --- /dev/null +++ b/includes/parser/RemexStripTagHandler.php @@ -0,0 +1,40 @@ +text; + } + + function startDocument( Tokenizer $t, $fns, $fn ) { + // Do nothing. + } + function endDocument( $pos ) { + // Do nothing. + } + function error( $text, $pos ) { + // Do nothing. + } + function characters( $text, $start, $length, $sourceStart, $sourceLength ) { + $this->text .= substr( $text, $start, $length ); + } + function startTag( $name, Attributes $attrs, $selfClose, $sourceStart, $sourceLength ) { + // Do nothing. + } + function endTag( $name, $sourceStart, $sourceLength ) { + // Do nothing. 
+ } + function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) { + // Do nothing. + } + function comment( $text, $sourceStart, $sourceLength ) { + // Do nothing. + } +} diff --git a/includes/parser/Sanitizer.php b/includes/parser/Sanitizer.php index 4c996771e8..7c9f56326b 100644 --- a/includes/parser/Sanitizer.php +++ b/includes/parser/Sanitizer.php @@ -1967,17 +1967,22 @@ class Sanitizer { * Warning: this return value must be further escaped for literal * inclusion in HTML output as of 1.10! * - * @param string $text HTML fragment + * @param string $html HTML fragment * @return string */ - static function stripAllTags( $text ) { - # Actual - $text = StringUtils::delimiterReplace( '<', '>', '', $text ); + static function stripAllTags( $html ) { + // Use RemexHtml to tokenize $html and extract the text + $handler = new RemexStripTagHandler; + $tokenizer = new RemexHtml\Tokenizer\Tokenizer( $handler, $html, [ + 'ignoreErrors' => true, + // don't ignore char refs, we want them to be decoded + 'ignoreNulls' => true, + 'skipPreprocess' => true, + ] ); + $tokenizer->execute(); + $text = $handler->getResult(); - # Normalize &entities and whitespace - $text = self::decodeCharReferences( $text ); $text = self::normalizeWhitespace( $text ); - return $text; } diff --git a/tests/phpunit/includes/parser/SanitizerTest.php b/tests/phpunit/includes/parser/SanitizerTest.php index 269575b24e..d7e72e164b 100644 --- a/tests/phpunit/includes/parser/SanitizerTest.php +++ b/tests/phpunit/includes/parser/SanitizerTest.php @@ -530,11 +530,10 @@ class SanitizerTest extends MediaWikiTestCase { [ '

Foo

Bar

', 'FooBar' ], [ "

Foo

\n

Bar

", 'Foo Bar' ], [ '

Hello <strong> world café

', 'Hello world café' ], - // This one is broken, see T179978 - //[ - // '

quux\'>Bar Whee!

', - // 'Bar Whee!' - //], + [ + '

quux\'>Bar Whee!

', + 'Bar Whee!' + ], [ '1<span class="<?php">2</span>3', '123' ], [ '1<span class="<?">2</span>3', '123' ], ]; -- 2.20.1