From bda42cef3c92e1117e63be8028ce824812c3e265 Mon Sep 17 00:00:00 2001 From: "C. Scott Ananian" Date: Tue, 23 Apr 2019 13:09:36 -0400 Subject: [PATCH] Deprecate Sanitizer::setupAttributeWhitelist/attributeWhitelist These methods should be made private in the next release, but hard-deprecate them for 1.34. Tweak the return value of the attribute whitelist to be an associative rather than a sequential array, which makes the lookup of allowed attributes more efficient and avoids an array_flip for every html element sanitized. Bug: T221677 Change-Id: I17d734937accec6c2679dbe17328cf9554bd556a --- RELEASE-NOTES-1.34 | 2 + includes/parser/Sanitizer.php | 125 ++++++++++++------ .../phpunit/includes/parser/SanitizerTest.php | 57 ++++++++ 3 files changed, 144 insertions(+), 40 deletions(-) diff --git a/RELEASE-NOTES-1.34 b/RELEASE-NOTES-1.34 index 431db3da5f..eea0ac9835 100644 --- a/RELEASE-NOTES-1.34 +++ b/RELEASE-NOTES-1.34 @@ -276,6 +276,8 @@ because of Phabricator reports. * The Preprocessor_DOM implementation has been deprecated. It will be removed in a future release. Use the Preprocessor_Hash implementation instead. +* Sanitizer::attributeWhitelist() and Sanitizer::setupAttributeWhitelist() + have been deprecated; they will be made private in the future. === Other changes in 1.34 === * … diff --git a/includes/parser/Sanitizer.php b/includes/parser/Sanitizer.php index d8e5e3e321..8e0cf5c877 100644 --- a/includes/parser/Sanitizer.php +++ b/includes/parser/Sanitizer.php @@ -790,7 +790,7 @@ class Sanitizer { */ static function validateTagAttributes( $attribs, $element ) { return self::validateAttributes( $attribs, - self::attributeWhitelist( $element ) ); + self::attributeWhitelistInternal( $element ) ); } /** @@ -802,14 +802,21 @@ class Sanitizer { * - Invalid id attributes are re-encoded * * @param array $attribs - * @param array $whitelist List of allowed attribute names + * @param array $whitelist List of allowed attribute names, + * either as a sequential array of valid attribute names or + * as an associative array where keys give valid attribute names * @return array * * @todo Check for legal values where the DTD limits things. * @todo Check for unique id attribute :P */ static function validateAttributes( $attribs, $whitelist ) { - $whitelist = array_flip( $whitelist ); + if ( isset( $whitelist[0] ) ) { + // We would like to eventually deprecate calling this + // function with a sequential array, but for now just + // convert it. + $whitelist = array_flip( $whitelist ); + } $hrefExp = '/^(' . wfUrlProtocols() . ')[^\s]+$/'; $out = []; @@ -828,10 +835,10 @@ class Sanitizer { # * Disallow data attributes used by MediaWiki code # * Ensure that the attribute is not namespaced by banning # colons. - if ( !preg_match( '/^data-[^:]*$/i', $attribute ) - && !isset( $whitelist[$attribute] ) - || self::isReservedDataAttribute( $attribute ) - ) { + if ( ( + !preg_match( '/^data-[^:]*$/i', $attribute ) && + !array_key_exists( $attribute, $whitelist ) + ) || self::isReservedDataAttribute( $attribute ) ) { continue; } @@ -1746,26 +1753,63 @@ class Sanitizer { * Fetch the whitelist of acceptable attributes for a given element name. * * @param string $element - * @return array + * @return array A sequential array of acceptable attribute names + * @deprecated since 1.34; should be private */ static function attributeWhitelist( $element ) { + wfDeprecated( __METHOD__, '1.34' ); $list = self::setupAttributeWhitelist(); return $list[$element] ?? []; } + /** + * Fetch the whitelist of acceptable attributes for a given element name. + * + * @param string $element + * @return array An associative array where keys are acceptable attribute + * names + */ + private static function attributeWhitelistInternal( $element ) { + $list = self::setupAttributeWhitelistInternal(); + return $list[$element] ?? []; + } + /** * Foreach array key (an allowed HTML element), return an array * of allowed attributes * @return array + * @deprecated since 1.34; should be private */ static function setupAttributeWhitelist() { + wfDeprecated( __METHOD__, '1.34' ); + $wlist = self::setupAttributeWhitelistInternal(); + // This method is expected to return a sequential array as the + // value for each HTML element key. + return array_map( function ( $v ) { + return array_keys( $v ); + }, $wlist ); + } + + /** + * Foreach array key (an allowed HTML element), return an array + * of allowed attributes + * @return array An associative array: keys are HTML element names; + * values are associative arrays where the keys are allowed attribute + * names. + */ + private static function setupAttributeWhitelistInternal() { static $whitelist; if ( $whitelist !== null ) { return $whitelist; } - $common = [ + // For lookup efficiency flip each attributes array so the keys are + // the valid attributes. + $merge = function ( $a, $b, $c = [] ) { + return array_merge( $a, array_flip( $b ), array_flip( $c ) ); + }; + $common = $merge( [], [ # HTML 'id', 'class', @@ -1798,9 +1842,10 @@ class Sanitizer { 'itemref', 'itemscope', 'itemtype', - ]; + ] ); + + $block = $merge( $common, [ 'align' ] ); - $block = array_merge( $common, [ 'align' ] ); $tablealign = [ 'align', 'valign' ]; $tablecell = [ 'abbr', @@ -1850,8 +1895,8 @@ class Sanitizer { # acronym # 9.2.2 - 'blockquote' => array_merge( $common, [ 'cite' ] ), - 'q' => array_merge( $common, [ 'cite' ] ), + 'blockquote' => $merge( $common, [ 'cite' ] ), + 'q' => $merge( $common, [ 'cite' ] ), # 9.2.3 'sub' => $common, @@ -1861,22 +1906,22 @@ class Sanitizer { 'p' => $block, # 9.3.2 - 'br' => array_merge( $common, [ 'clear' ] ), + 'br' => $merge( $common, [ 'clear' ] ), # https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element 'wbr' => $common, # 9.3.4 - 'pre' => array_merge( $common, [ 'width' ] ), + 'pre' => $merge( $common, [ 'width' ] ), # 9.4 - 'ins' => array_merge( $common, [ 'cite', 'datetime' ] ), - 'del' => array_merge( $common, [ 'cite', 'datetime' ] ), + 'ins' => $merge( $common, [ 'cite', 'datetime' ] ), + 'del' => $merge( $common, [ 'cite', 'datetime' ] ), # 10.2 - 'ul' => array_merge( $common, [ 'type' ] ), - 'ol' => array_merge( $common, [ 'type', 'start', 'reversed' ] ), - 'li' => array_merge( $common, [ 'type', 'value' ] ), + 'ul' => $merge( $common, [ 'type' ] ), + 'ol' => $merge( $common, [ 'type', 'start', 'reversed' ] ), + 'li' => $merge( $common, [ 'type', 'value' ] ), # 10.3 'dl' => $common, @@ -1884,7 +1929,7 @@ class Sanitizer { 'dt' => $common, # 11.2.1 - 'table' => array_merge( $common, + 'table' => $merge( $common, [ 'summary', 'width', 'border', 'frame', 'rules', 'cellspacing', 'cellpadding', 'align', 'bgcolor', @@ -1899,31 +1944,31 @@ class Sanitizer { 'tbody' => $common, # 11.2.4 - 'colgroup' => array_merge( $common, [ 'span' ] ), - 'col' => array_merge( $common, [ 'span' ] ), + 'colgroup' => $merge( $common, [ 'span' ] ), + 'col' => $merge( $common, [ 'span' ] ), # 11.2.5 - 'tr' => array_merge( $common, [ 'bgcolor' ], $tablealign ), + 'tr' => $merge( $common, [ 'bgcolor' ], $tablealign ), # 11.2.6 - 'td' => array_merge( $common, $tablecell, $tablealign ), - 'th' => array_merge( $common, $tablecell, $tablealign ), + 'td' => $merge( $common, $tablecell, $tablealign ), + 'th' => $merge( $common, $tablecell, $tablealign ), # 12.2 # NOTE: is not allowed directly, but the attrib # whitelist is used from the Parser object - 'a' => array_merge( $common, [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa + 'a' => $merge( $common, [ 'href', 'rel', 'rev' ] ), # rel/rev esp. for RDFa # 13.2 # Not usually allowed, but may be used for extension-style hooks # such as when it is rasterized, or if $wgAllowImageTag is # true - 'img' => array_merge( $common, [ 'alt', 'src', 'width', 'height', 'srcset' ] ), + 'img' => $merge( $common, [ 'alt', 'src', 'width', 'height', 'srcset' ] ), # Attributes for A/V tags added in T163583 / T133673 - 'audio' => array_merge( $common, [ 'controls', 'preload', 'width', 'height' ] ), - 'video' => array_merge( $common, [ 'poster', 'controls', 'preload', 'width', 'height' ] ), - 'source' => array_merge( $common, [ 'type', 'src' ] ), - 'track' => array_merge( $common, [ 'type', 'src', 'srclang', 'kind', 'label' ] ), + 'audio' => $merge( $common, [ 'controls', 'preload', 'width', 'height' ] ), + 'video' => $merge( $common, [ 'poster', 'controls', 'preload', 'width', 'height' ] ), + 'source' => $merge( $common, [ 'type', 'src' ] ), + 'track' => $merge( $common, [ 'type', 'src', 'srclang', 'kind', 'label' ] ), # 15.2.1 'tt' => $common, @@ -1936,11 +1981,11 @@ class Sanitizer { 'u' => $common, # 15.2.2 - 'font' => array_merge( $common, [ 'size', 'color', 'face' ] ), + 'font' => $merge( $common, [ 'size', 'color', 'face' ] ), # basefont # 15.3 - 'hr' => array_merge( $common, [ 'width' ] ), + 'hr' => $merge( $common, [ 'width' ] ), # HTML Ruby annotation text module, simple ruby only. # https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element @@ -1948,13 +1993,13 @@ class Sanitizer { # rbc 'rb' => $common, 'rp' => $common, - 'rt' => $common, # array_merge( $common, [ 'rbspan' ] ), + 'rt' => $common, # $merge( $common, [ 'rbspan' ] ), 'rtc' => $common, # MathML root element, where used for extensions # 'title' may not be 100% valid here; it's XHTML # https://www.w3.org/TR/REC-MathML/ - 'math' => [ 'class', 'style', 'id', 'title' ], + 'math' => $merge( [], [ 'class', 'style', 'id', 'title' ] ), // HTML 5 section 4.5 'figure' => $common, @@ -1966,8 +2011,8 @@ class Sanitizer { # HTML5 elements, defined by: # https://html.spec.whatwg.org/multipage/semantics.html#the-data-element - 'data' => array_merge( $common, [ 'value' ] ), - 'time' => array_merge( $common, [ 'datetime' ] ), + 'data' => $merge( $common, [ 'value' ] ), + 'time' => $merge( $common, [ 'datetime' ] ), 'mark' => $common, // meta and link are only permitted by removeHTMLtags when Microdata @@ -1975,8 +2020,8 @@ class Sanitizer { // Also meta and link are only valid in WikiText as Microdata elements // (ie: validateTag rejects tags missing the attributes needed for Microdata) // So we don't bother including $common attributes that have no purpose. - 'meta' => [ 'itemprop', 'content' ], - 'link' => [ 'itemprop', 'href', 'title' ], + 'meta' => $merge( [], [ 'itemprop', 'content' ] ), + 'link' => $merge( [], [ 'itemprop', 'href', 'title' ] ), ]; return $whitelist; diff --git a/tests/phpunit/includes/parser/SanitizerTest.php b/tests/phpunit/includes/parser/SanitizerTest.php index 1b67bbdf79..8ddd79863f 100644 --- a/tests/phpunit/includes/parser/SanitizerTest.php +++ b/tests/phpunit/includes/parser/SanitizerTest.php @@ -1,5 +1,7 @@ assertEquals( " $inputAttr", @@ -274,6 +278,59 @@ class SanitizerTest extends MediaWikiTestCase { ]; } + /** + * @dataProvider provideValidateTagAttributes + * @covers Sanitizer::validateTagAttributes + * @covers Sanitizer::validateAttributes + */ + public function testValidateTagAttributes( $element, $attribs, $expected ) { + $actual = Sanitizer::validateTagAttributes( $attribs, $element ); + $this->assertArrayEquals( $expected, $actual, false, true ); + } + + public static function provideValidateTagAttributes() { + return [ + [ 'math', + [ 'id' => 'foo bar', 'bogus' => 'stripped', 'data-foo' => 'bar' ], + [ 'id' => 'foo_bar', 'data-foo' => 'bar' ], + ], + [ 'meta', + [ 'id' => 'foo bar', 'itemprop' => 'foo', 'content' => 'bar' ], + [ 'itemprop' => 'foo', 'content' => 'bar' ], + ], + ]; + } + + /** + * @dataProvider provideAttributeWhitelist + * @covers Sanitizer::attributeWhitelist + */ + public function testAttributeWhitelist( $element, $attribs ) { + $this->hideDeprecated( 'Sanitizer::attributeWhitelist' ); + $this->hideDeprecated( 'Sanitizer::setupAttributeWhitelist' ); + $actual = Sanitizer::attributeWhitelist( $element ); + $this->assertArrayEquals( $attribs, $actual ); + } + + /** + * @dataProvider provideAttributeWhitelist + * @covers Sanitizer::attributeWhitelistInternal + */ + public function testAttributeWhitelistInternal( $element, $attribs ) { + $sanitizer = TestingAccessWrapper::newFromClass( Sanitizer::class ); + $actual = $sanitizer->attributeWhitelistInternal( $element ); + $this->assertArrayEquals( $attribs, array_keys( $actual ) ); + } + + public function provideAttributeWhitelist() { + /** [ , [ , , ...] ] */ + return [ + [ 'math', [ 'class', 'style', 'id', 'title' ] ], + [ 'meta', [ 'itemprop', 'content' ] ], + [ 'link', [ 'itemprop', 'href', 'title' ] ], + ]; + } + /** * @dataProvider provideCssCommentsFixtures * @covers Sanitizer::checkCss -- 2.20.1