From 19b739bd19c437c5a637d85f431f139da7521508 Mon Sep 17 00:00:00 2001 From: Max Semenik Date: Mon, 28 Aug 2017 14:14:44 -0700 Subject: [PATCH] EditPage: Don't allow clients that mangle unicode to edit Get rid of the hack that turns unicode into hexadecimal codes for browsers that don't support unicode, and prevent their edits entirely. And instead of relying on $wgBrowserBlacklist, use a hidden HTML form field - if the contents are mangled and don't match the original, then reject the edit. Bug: T67297 Change-Id: I20c2e396d7dfd6a3b23b94b218f94a847522576b --- RELEASE-NOTES-1.30 | 2 + includes/DefaultSettings.php | 41 +----- includes/EditPage.php | 159 ++++++------------------ includes/api/ApiEditPage.php | 1 + languages/i18n/en.json | 2 +- languages/i18n/qqq.json | 2 +- tests/phpunit/includes/EditPageTest.php | 7 +- 7 files changed, 48 insertions(+), 166 deletions(-) diff --git a/RELEASE-NOTES-1.30 b/RELEASE-NOTES-1.30 index 8517a8f0de..513dc3a09f 100644 --- a/RELEASE-NOTES-1.30 +++ b/RELEASE-NOTES-1.30 @@ -73,6 +73,7 @@ section). * (T138166) Added ability for users to prohibit other users from sending them emails with Special:Emailuser. Can be enabled by setting $wgEnableUserEmailBlacklist to true. +* $wgBrowserBlacklist is deprecated, and changing it will have no effect. === External library changes in 1.30 === @@ -221,6 +222,7 @@ changes to languages because of Phabricator reports. * wfShellExec() and related functions are deprecated, use Shell::command(). * (T138166) SpecialEmailUser::getTarget() now requires a second argument, the sending user object. Using the method without the second argument is deprecated. +* (T67297) Browsers that don't support Unicode will have their edits rejected. == Compatibility == MediaWiki 1.30 requires PHP 5.5.9 or later. There is experimental support for diff --git a/includes/DefaultSettings.php b/includes/DefaultSettings.php index cdb7ce86d1..5630dcbeee 100644 --- a/includes/DefaultSettings.php +++ b/includes/DefaultSettings.php @@ -2980,46 +2980,9 @@ $wgAllUnicodeFixes = false; $wgLegacyEncoding = false; /** - * Browser Blacklist for unicode non compliant browsers. Contains a list of - * regexps : "/regexp/" matching problematic browsers. These browsers will - * be served encoded unicode in the edit box instead of real unicode. + * @deprecated since 1.30, does nothing */ -$wgBrowserBlackList = [ - /** - * Netscape 2-4 detection - * The minor version may contain strings such as "Gold" or "SGoldC-SGI" - * Lots of non-netscape user agents have "compatible", so it's useful to check for that - * with a negative assertion. The [UIN] identifier specifies the level of security - * in a Netscape/Mozilla browser, checking for it rules out a number of fakers. - * The language string is unreliable, it is missing on NS4 Mac. - * - * Reference: http://www.psychedelix.com/agents/index.shtml - */ - '/^Mozilla\/2\.[^ ]+ [^(]*?\((?!compatible).*; [UIN]/', - '/^Mozilla\/3\.[^ ]+ [^(]*?\((?!compatible).*; [UIN]/', - '/^Mozilla\/4\.[^ ]+ [^(]*?\((?!compatible).*; [UIN]/', - - /** - * MSIE on Mac OS 9 is teh sux0r, converts þ to , ð to , - * Þ to and Ð to - * - * Known useragents: - * - Mozilla/4.0 (compatible; MSIE 5.0; Mac_PowerPC) - * - Mozilla/4.0 (compatible; MSIE 5.15; Mac_PowerPC) - * - Mozilla/4.0 (compatible; MSIE 5.23; Mac_PowerPC) - * - [...] - * - * @link https://en.wikipedia.org/w/index.php?diff=12356041&oldid=12355864 - * @link https://en.wikipedia.org/wiki/Template%3AOS9 - */ - '/^Mozilla\/4\.0 \(compatible; MSIE \d+\.\d+; Mac_PowerPC\)/', - - /** - * Google wireless transcoder, seems to eat a lot of chars alive - * https://it.wikipedia.org/w/index.php?title=Luciano_Ligabue&diff=prev&oldid=8857361 - */ - '/^Mozilla\/4\.0 \(compatible; MSIE 6.0; Windows NT 5.0; Google Wireless Transcoder;\)/' -]; +$wgBrowserBlackList = []; /** * If set to true, the MediaWiki 1.4 to 1.5 schema conversion will diff --git a/includes/EditPage.php b/includes/EditPage.php index 0ea61c07a9..b40a0543fc 100644 --- a/includes/EditPage.php +++ b/includes/EditPage.php @@ -40,6 +40,11 @@ use Wikimedia\ScopedCallback; * headaches, which may be fatal. */ class EditPage { + /** + * Used for Unicode support checks + */ + const UNICODE_CHECK = 'ℳ𝒲♥𝓊𝓃𝒾𝒸ℴ𝒹ℯ'; + /** * Status: Article successfully updated */ @@ -177,6 +182,11 @@ class EditPage { */ const AS_CANNOT_USE_CUSTOM_MODEL = 241; + /** + * Status: edit rejected because browser doesn't support Unicode. + */ + const AS_UNICODE_NOT_SUPPORTED = 242; + /** * HTML id and name for the beginning of the edit form. */ @@ -413,6 +423,11 @@ class EditPage { */ private $isOldRev = false; + /** + * @var string|null What the user submitted in the 'wpUnicodeCheck' field + */ + private $unicodeCheck; + /** * @param Article $article */ @@ -865,7 +880,7 @@ class EditPage { # These fields need to be checked for encoding. # Also remove trailing whitespace, but don't remove _initial_ # whitespace from the text boxes. This may be significant formatting. - $this->textbox1 = $this->safeUnicodeInput( $request, 'wpTextbox1' ); + $this->textbox1 = rtrim( $request->getText( 'wpTextbox1' ) ); if ( !$request->getCheck( 'wpTextbox2' ) ) { // Skip this if wpTextbox2 has input, it indicates that we came // from a conflict page with raw page text, not a custom form @@ -876,6 +891,8 @@ class EditPage { } } + $this->unicodeCheck = $request->getText( 'wpUnicodeCheck' ); + $this->summary = $request->getText( 'wpSummary' ); # If the summary consists of a heading, e.g. '==Foobar==', extract the title from the @@ -1544,6 +1561,7 @@ class EditPage { case self::AS_CANNOT_USE_CUSTOM_MODEL: case self::AS_PARSE_ERROR: + case self::AS_UNICODE_NOT_SUPPORTED: $out->addWikiText( '
' . "\n" . $status->getWikiText() . '
' ); return true; @@ -1745,6 +1763,12 @@ class EditPage { return $status; } + if ( $this->unicodeCheck !== self::UNICODE_CHECK ) { + $status->fatal( 'unicode-support-fail' ); + $status->value = self::AS_UNICODE_NOT_SUPPORTED; + return $status; + } + $request = $this->context->getRequest(); $spam = $request->getText( 'wpAntispam' ); if ( $spam !== '' ) { @@ -2673,6 +2697,9 @@ class EditPage { call_user_func_array( $formCallback, [ &$out ] ); } + // Add a check for Unicode support + $out->addHTML( Html::hidden( 'wpUnicodeCheck', self::UNICODE_CHECK ) ); + // Add an empty field to trip up spambots $out->addHTML( Xml::openElement( 'div', [ 'id' => 'antispam-container', 'style' => 'display: none;' ] ) @@ -2947,10 +2974,6 @@ class EditPage { $out->addWikiText( $this->hookError ); } - if ( !$this->checkUnicodeCompliantBrowser() ) { - $out->addWikiMsg( 'nonunicodebrowser' ); - } - if ( $this->section != 'new' ) { $revision = $this->mArticle->getRevisionFetched(); if ( $revision ) { @@ -3221,10 +3244,6 @@ class EditPage { $out->addHTML( Html::hidden( 'wpEdittime', $this->edittime ) ); $out->addHTML( Html::hidden( 'editRevId', $this->editRevId ) ); $out->addHTML( Html::hidden( 'wpScrolltop', $this->scrolltop, [ 'id' => 'wpScrolltop' ] ) ); - - if ( !$this->checkUnicodeCompliantBrowser() ) { - $out->addHTML( Html::hidden( 'safemode', '1' ) ); - } } protected function showFormAfterText() { @@ -3318,8 +3337,7 @@ class EditPage { } protected function showTextbox( $text, $name, $customAttribs = [] ) { - $wikitext = $this->safeUnicodeOutput( $text ); - $wikitext = $this->addNewLineAtEnd( $wikitext ); + $wikitext = $this->addNewLineAtEnd( $text ); $attribs = $this->buildTextboxAttribs( $name, $customAttribs, $this->context->getUser() ); @@ -4461,138 +4479,31 @@ class EditPage { $out->addReturnTo( $this->getContextTitle(), [ 'action' => 'edit' ] ); } - /** - * Check if the browser is on a blacklist of user-agents known to - * mangle UTF-8 data on form submission. Returns true if Unicode - * should make it through, false if it's known to be a problem. - * @return bool - */ - private function checkUnicodeCompliantBrowser() { - global $wgBrowserBlackList; - - $currentbrowser = $this->context->getRequest()->getHeader( 'User-Agent' ); - if ( $currentbrowser === false ) { - // No User-Agent header sent? Trust it by default... - return true; - } - - foreach ( $wgBrowserBlackList as $browser ) { - if ( preg_match( $browser, $currentbrowser ) ) { - return false; - } - } - return true; - } - /** * Filter an input field through a Unicode de-armoring process if it * came from an old browser with known broken Unicode editing issues. * + * @deprecated since 1.30, does nothing + * * @param WebRequest $request * @param string $field * @return string */ protected function safeUnicodeInput( $request, $field ) { - $text = rtrim( $request->getText( $field ) ); - return $request->getBool( 'safemode' ) - ? $this->unmakeSafe( $text ) - : $text; + return rtrim( $request->getText( $field ) ); } /** * Filter an output field through a Unicode armoring process if it is * going to an old browser with known broken Unicode editing issues. * + * @deprecated since 1.30, does nothing + * * @param string $text * @return string */ protected function safeUnicodeOutput( $text ) { - return $this->checkUnicodeCompliantBrowser() - ? $text - : $this->makeSafe( $text ); - } - - /** - * A number of web browsers are known to corrupt non-ASCII characters - * in a UTF-8 text editing environment. To protect against this, - * detected browsers will be served an armored version of the text, - * with non-ASCII chars converted to numeric HTML character references. - * - * Preexisting such character references will have a 0 added to them - * to ensure that round-trips do not alter the original data. - * - * @param string $invalue - * @return string - */ - private function makeSafe( $invalue ) { - // Armor existing references for reversibility. - $invalue = strtr( $invalue, [ "&#x" => "�" ] ); - - $bytesleft = 0; - $result = ""; - $working = 0; - $valueLength = strlen( $invalue ); - for ( $i = 0; $i < $valueLength; $i++ ) { - $bytevalue = ord( $invalue[$i] ); - if ( $bytevalue <= 0x7F ) { // 0xxx xxxx - $result .= chr( $bytevalue ); - $bytesleft = 0; - } elseif ( $bytevalue <= 0xBF ) { // 10xx xxxx - $working = $working << 6; - $working += ( $bytevalue & 0x3F ); - $bytesleft--; - if ( $bytesleft <= 0 ) { - $result .= "&#x" . strtoupper( dechex( $working ) ) . ";"; - } - } elseif ( $bytevalue <= 0xDF ) { // 110x xxxx - $working = $bytevalue & 0x1F; - $bytesleft = 1; - } elseif ( $bytevalue <= 0xEF ) { // 1110 xxxx - $working = $bytevalue & 0x0F; - $bytesleft = 2; - } else { // 1111 0xxx - $working = $bytevalue & 0x07; - $bytesleft = 3; - } - } - return $result; - } - - /** - * Reverse the previously applied transliteration of non-ASCII characters - * back to UTF-8. Used to protect data from corruption by broken web browsers - * as listed in $wgBrowserBlackList. - * - * @param string $invalue - * @return string - */ - private function unmakeSafe( $invalue ) { - $result = ""; - $valueLength = strlen( $invalue ); - for ( $i = 0; $i < $valueLength; $i++ ) { - if ( ( substr( $invalue, $i, 3 ) == "&#x" ) && ( $invalue[$i + 3] != '0' ) ) { - $i += 3; - $hexstring = ""; - do { - $hexstring .= $invalue[$i]; - $i++; - } while ( ctype_xdigit( $invalue[$i] ) && ( $i < strlen( $invalue ) ) ); - - // Do some sanity checks. These aren't needed for reversibility, - // but should help keep the breakage down if the editor - // breaks one of the entities whilst editing. - if ( ( substr( $invalue, $i, 1 ) == ";" ) && ( strlen( $hexstring ) <= 6 ) ) { - $codepoint = hexdec( $hexstring ); - $result .= UtfNormal\Utils::codepointToUtf8( $codepoint ); - } else { - $result .= "&#x" . $hexstring . substr( $invalue, $i, 1 ); - } - } else { - $result .= substr( $invalue, $i, 1 ); - } - } - // reverse the transform that we made for reversibility reasons. - return strtr( $result, [ "�" => "&#x" ] ); + return $text; } /** diff --git a/includes/api/ApiEditPage.php b/includes/api/ApiEditPage.php index 4360b4d782..94d6e97b24 100644 --- a/includes/api/ApiEditPage.php +++ b/includes/api/ApiEditPage.php @@ -266,6 +266,7 @@ class ApiEditPage extends ApiBase { 'wpIgnoreBlankArticle' => true, 'wpIgnoreSelfRedirect' => true, 'bot' => $params['bot'], + 'wpUnicodeCheck' => EditPage::UNICODE_CHECK, ]; if ( !is_null( $params['summary'] ) ) { diff --git a/languages/i18n/en.json b/languages/i18n/en.json index 47e62055f0..6d06e4039b 100644 --- a/languages/i18n/en.json +++ b/languages/i18n/en.json @@ -704,8 +704,8 @@ "explainconflict": "Someone else has changed this page since you started editing it.\nThe upper text area contains the page text as it currently exists.\nYour changes are shown in the lower text area.\nYou will have to merge your changes into the existing text.\nOnly the text in the upper text area will be saved when you press \"$1\".", "yourtext": "Your text", "storedversion": "Stored revision", - "nonunicodebrowser": "Warning: Your browser is not Unicode compliant.\nA workaround is in place to allow you to safely edit pages: Non-ASCII characters will appear in the edit box as hexadecimal codes.", "editingold": "Warning: You are editing an out-of-date revision of this page.\nIf you save it, any changes made since this revision will be lost.", + "unicode-support-fail": "It appears that your browser does not support Unicode. It is required to edit pages, so your edit was not saved.", "yourdiff": "Differences", "copyrightwarning": "Please note that all contributions to {{SITENAME}} are considered to be released under the $2 (see $1 for details).\nIf you do not want your writing to be edited mercilessly and redistributed at will, then do not submit it here.
\nYou are also promising us that you wrote this yourself, or copied it from a public domain or similar free resource.\nDo not submit copyrighted work without permission!", "copyrightwarning2": "Please note that all contributions to {{SITENAME}} may be edited, altered, or removed by other contributors.\nIf you do not want your writing to be edited mercilessly, then do not submit it here.
\nYou are also promising us that you wrote this yourself, or copied it from a public domain or similar free resource (see $1 for details).\nDo not submit copyrighted work without permission!", diff --git a/languages/i18n/qqq.json b/languages/i18n/qqq.json index 0a6e91bef6..753432279d 100644 --- a/languages/i18n/qqq.json +++ b/languages/i18n/qqq.json @@ -896,8 +896,8 @@ "explainconflict": "Appears at the top of a page when there is an edit conflict.\n\nParameters:\n* $1 – The label of the save button – one of {{msg-mw|savearticle}} or {{msg-mw|savechanges}} on save-labelled wiki, or {{msg-mw|publishpage}} or {{msg-mw|publishchanges}} on publish-labelled wikis.\n\nSee also:\n* {{msg-mw|Savearticle}}", "yourtext": "Used in Diff Preview page. The diff is between {{msg-mw|currentrev}} and {{msg-mw|yourtext}}.\n\nAlso used in Edit Conflict page; the diff between {{msg-mw|yourtext}} and {{msg-mw|storedversion}}.", "storedversion": "This is used in an edit conflict as the label for the top revision that has been stored, as opposed to your version {{msg-mw|yourtext}} that has not been stored which is shown at the bottom of the page.", - "nonunicodebrowser": "Used as warning when editing page.", "editingold": "Used as warning when editing an old revision of a page.", + "unicode-support-fail": "Error message shown to users if their browser doesn't support Unicode", "yourdiff": "Used as h2 header for the diff of the current version of a page with the user's version in case there is an edit conflict or a spam filter hit.", "copyrightwarning": "Copyright warning displayed under the edit box in editor. Parameters:\n* $1 - link\n* $2 - license name", "copyrightwarning2": "Copyright warning displayed under the edit box in editor\n*$1 - license name", diff --git a/tests/phpunit/includes/EditPageTest.php b/tests/phpunit/includes/EditPageTest.php index 9507811ec2..ad0d07a417 100644 --- a/tests/phpunit/includes/EditPageTest.php +++ b/tests/phpunit/includes/EditPageTest.php @@ -165,6 +165,10 @@ class EditPageTest extends MediaWikiLangTestCase { $edit['wpStarttime'] = wfTimestampNow(); } + if ( !isset( $edit['wpUnicodeCheck'] ) ) { + $edit['wpUnicodeCheck'] = EditPage::UNICODE_CHECK; + } + $req = new FauxRequest( $edit, true ); // session ?? $article = new Article( $title ); @@ -697,7 +701,8 @@ hello 'wpTextbox1' => serialize( 'non-text content' ), 'wpEditToken' => $user->getEditToken(), 'wpEdittime' => '', - 'wpStarttime' => wfTimestampNow() + 'wpStarttime' => wfTimestampNow(), + 'wpUnicodeCheck' => EditPage::UNICODE_CHECK, ]; $req = new FauxRequest( $edit, true ); -- 2.20.1