From 5c7b0addd5fb37fc53b0971ed22bf84487f95733 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Bartosz=20Dziewo=C5=84ski?= Date: Thu, 8 Feb 2018 22:22:34 +0100 Subject: [PATCH] Allow limiting comment length by characters rather than bytes in JS For unfortunate historical reasons, browsers' native maxlength counts the number of UTF-16 code units rather than Unicode codepoints [1], which means that codepoints outside the Basic Multilingual Plane (e.g. many emojis) count as 2 characters each. That could be good enough, but we want our software to be excellent rather than merely good enough. 1. Introduce a few new functions, added to the existing modules: * mediawiki.String: * codePointLength() counts the length of a string in Unicode codepoints (characters). * trimCodePointLength() trims a string to the desired length in Unicode codepoints (characters). * jquery.lengthLimit: * $.fn.codePointLimit() enforces the specified maximum length in codepoints of an input field. * mediawiki.widgets.visibleLengthLimit: * mw.widgets.visibleCodePointLimit() enforces the maximum length in codepoints of an OOUI widget and displays the remaining length in an inline label. 2. Add client-side mw.config variables: * wgCommentByteLimit for the old byte limit, equal to 255. * wgCommentCodePointLimit for the new codepoint limit, equal to 1000. Only one of them may be set, depending on which limit should be applied. 3. Make use of both of these new features. For the sake of an example, I updated the forms shown on action=edit (using visibleCodePointLimit) and on action=protect (using codePointLimit). Many remain to be updated. [1] https://www.w3.org/TR/html5/sec-forms.html#limiting-user-input-length-the-maxlength-attribute Bug: T185948 Change-Id: Ia1269fd898dabbcf1582618eab46cef97e10a3b1 --- includes/EditPage.php | 8 +- includes/ProtectionForm.php | 16 ++- .../ResourceLoaderStartUpModule.php | 3 + resources/src/jquery/jquery.lengthLimit.js | 105 ++++++++++------ .../mediawiki.action/mediawiki.action.edit.js | 12 +- resources/src/mediawiki.legacy/protect.js | 12 +- .../mediawiki.widgets.visibleLengthLimit.js | 25 +++- resources/src/mediawiki/mediawiki.String.js | 115 +++++++++++++----- 8 files changed, 214 insertions(+), 82 deletions(-) diff --git a/includes/EditPage.php b/includes/EditPage.php index 652b355233..86a39ea385 100644 --- a/includes/EditPage.php +++ b/includes/EditPage.php @@ -3138,11 +3138,15 @@ ERROR; * @return array */ private function getSummaryInputAttributes( array $inputAttrs = null ) { - // Note: the maxlength is overridden in JS to 255 and to make it use UTF-8 bytes, not characters. + $conf = $this->context->getConfig(); + $oldCommentSchema = $conf->get( 'CommentTableSchemaMigrationStage' ) === MIGRATION_OLD; + // HTML maxlength uses "UTF-16 code units", which means that characters outside BMP + // (e.g. emojis) count for two each. This limit is overridden in JS to instead count + // Unicode codepoints (or 255 UTF-8 bytes for old schema). return ( is_array( $inputAttrs ) ? $inputAttrs : [] ) + [ 'id' => 'wpSummary', 'name' => 'wpSummary', - 'maxlength' => '200', + 'maxlength' => $oldCommentSchema ? 200 : CommentStore::COMMENT_CHARACTER_LIMIT, 'tabindex' => 1, 'size' => 60, 'spellcheck' => 'true', diff --git a/includes/ProtectionForm.php b/includes/ProtectionForm.php index 53608e849a..51c2923385 100644 --- a/includes/ProtectionForm.php +++ b/includes/ProtectionForm.php @@ -349,7 +349,9 @@ class ProtectionForm { $user = $context->getUser(); $output = $context->getOutput(); $lang = $context->getLanguage(); - $cascadingRestrictionLevels = $context->getConfig()->get( 'CascadingRestrictionLevels' ); + $conf = $context->getConfig(); + $cascadingRestrictionLevels = $conf->get( 'CascadingRestrictionLevels' ); + $oldCommentSchema = $conf->get( 'CommentTableSchemaMigrationStage' ) === MIGRATION_OLD; $out = ''; if ( !$this->disabled ) { $output->addModules( 'mediawiki.legacy.protect' ); @@ -494,6 +496,13 @@ class ProtectionForm { $this->mReasonSelection, 'mwProtect-reason', 4 ); + // HTML maxlength uses "UTF-16 code units", which means that characters outside BMP + // (e.g. emojis) count for two each. This limit is overridden in JS to instead count + // Unicode codepoints (or 180 UTF-8 bytes for old schema). + // Subtract arbitrary 75 to leave some space for the autogenerated null edit's summary + // and other texts chosen by dropdown menus on this page. + $maxlength = $oldCommentSchema ? 180 : CommentStore::COMMENT_CHARACTER_LIMIT - 75; + $out .= Xml::openElement( 'table', [ 'id' => 'mw-protect-table3' ] ) . Xml::openElement( 'tbody' ); $out .= " @@ -511,10 +520,7 @@ class ProtectionForm { " . Xml::input( 'mwProtect-reason', 60, $this->mReason, [ 'type' => 'text', - 'id' => 'mwProtect-reason', 'maxlength' => 180 ] ) . - // Limited maxlength as the database trims at 255 bytes and other texts - // chosen by dropdown menus on this page are also included in this database field. - // The byte limit of 180 bytes is enforced in javascript + 'id' => 'mwProtect-reason', 'maxlength' => $maxlength ] ) . " "; # Disallow watching is user is not logged in diff --git a/includes/resourceloader/ResourceLoaderStartUpModule.php b/includes/resourceloader/ResourceLoaderStartUpModule.php index 8b9feeb8f0..e5fe928738 100644 --- a/includes/resourceloader/ResourceLoaderStartUpModule.php +++ b/includes/resourceloader/ResourceLoaderStartUpModule.php @@ -66,6 +66,7 @@ class ResourceLoaderStartUpModule extends ResourceLoaderModule { } $illegalFileChars = $conf->get( 'IllegalFileChars' ); + $oldCommentSchema = $conf->get( 'CommentTableSchemaMigrationStage' ) === MIGRATION_OLD; // Build list of variables $vars = [ @@ -113,6 +114,8 @@ class ResourceLoaderStartUpModule extends ResourceLoaderModule { 'wgResourceLoaderStorageEnabled' => $conf->get( 'ResourceLoaderStorageEnabled' ), 'wgForeignUploadTargets' => $conf->get( 'ForeignUploadTargets' ), 'wgEnableUploads' => $conf->get( 'EnableUploads' ), + 'wgCommentByteLimit' => $oldCommentSchema ? 255 : null, + 'wgCommentCodePointLimit' => $oldCommentSchema ? null : CommentStore::COMMENT_CHARACTER_LIMIT, ]; Hooks::run( 'ResourceLoaderGetConfigVars', [ &$vars ] ); diff --git a/resources/src/jquery/jquery.lengthLimit.js b/resources/src/jquery/jquery.lengthLimit.js index 00ed7d3edb..2738d1afa8 100644 --- a/resources/src/jquery/jquery.lengthLimit.js +++ b/resources/src/jquery/jquery.lengthLimit.js @@ -14,7 +14,8 @@ 'focus.lengthLimit', 'blur.lengthLimit' ].join( ' ' ), - trimByteLength = require( 'mediawiki.String' ).trimByteLength; + trimByteLength = require( 'mediawiki.String' ).trimByteLength, + trimCodePointLength = require( 'mediawiki.String' ).trimCodePointLength; /** * Utility function to trim down a string, based on byteLimit @@ -30,7 +31,7 @@ * function, if none, pass empty string. * @param {string} newVal New value that may have to be trimmed down. * @param {number} byteLimit Number of bytes the value may be in size. - * @param {Function} [fn] See jQuery#byteLimit. + * @param {Function} [filterFn] See jQuery#byteLimit. * @return {Object} * @return {string} return.newVal * @return {boolean} return.trimmed @@ -38,32 +39,18 @@ mw.log.deprecate( $, 'trimByteLength', trimByteLength, 'Use require( \'mediawiki.String\' ).trimByteLength instead.', '$.trimByteLength' ); - /** - * Enforces a byte limit on an input field, so that UTF-8 entries are counted as well, - * when, for example, a database field has a byte limit rather than a character limit. - * Plugin rationale: Browser has native maxlength for number of characters, this plugin - * exists to limit number of bytes instead. - * - * Can be called with a custom limit (to use that limit instead of the maxlength attribute - * value), a filter function (in case the limit should apply to something other than the - * exact input value), or both. Order of parameters is important! - * - * @param {number} [limit] Limit to enforce, fallsback to maxLength-attribute, - * called with fetched value as argument. - * @param {Function} [fn] Function to call on the string before assessing the length. - * @return {jQuery} - * @chainable - */ - $.fn.byteLimit = function ( limit, fn ) { + function lengthLimit( trimFn, limit, filterFn ) { + var allowNativeMaxlength = trimFn === trimByteLength; + // If the first argument is the function, - // set fn to the first argument's value and ignore the second argument. + // set filterFn to the first argument's value and ignore the second argument. if ( $.isFunction( limit ) ) { - fn = limit; + filterFn = limit; limit = undefined; // Either way, verify it is a function so we don't have to call // isFunction again after this. - } else if ( !fn || !$.isFunction( fn ) ) { - fn = undefined; + } else if ( !filterFn || !$.isFunction( filterFn ) ) { + filterFn = undefined; } // The following is specific to each element in the collection. @@ -72,7 +59,7 @@ $el = $( el ); - // If no limit was passed to byteLimit(), use the maxlength value. + // If no limit was passed to lengthLimit(), use the maxlength value. // Can't re-use 'limit' variable because it's in the higher scope // that would affect the next each() iteration as well. // Note that we use attribute to read the value instead of property, @@ -92,19 +79,19 @@ return; } - if ( fn ) { + if ( filterFn ) { // Save function for reference - $el.data( 'byteLimit.callback', fn ); + $el.data( 'lengthLimit.callback', filterFn ); } // Remove old event handlers (if there are any) - $el.off( '.byteLimit' ); + $el.off( '.lengthLimit' ); - if ( fn ) { + if ( filterFn || !allowNativeMaxlength ) { // Disable the native maxLength (if there is any), because it interferes - // with the (differently calculated) byte limit. - // Aside from being differently calculated (average chars with byteLimit - // is lower), we also support a callback which can make it to allow longer + // with the (differently calculated) character/byte limit. + // Aside from being differently calculated, + // we also support a callback which can make it to allow longer // values (e.g. count "Foo" from "User:Foo"). // maxLength is a strange property. Removing or setting the property to // undefined directly doesn't work. Instead, it can only be unset internally @@ -113,10 +100,12 @@ $el.removeAttr( 'maxlength' ); } else { - // If we don't have a callback the bytelimit can only be lower than the charlimit + // For $.byteLimit only, if we don't have a callback, + // the byteLimit can only be lower than the native maxLength limit // (that is, there are no characters less than 1 byte in size). So lets (re-)enforce // the native limit for efficiency when possible (it will make the while-loop below - // faster by there being less left to interate over). + // faster by there being less left to interate over). This does not work for $.codePointLimit + // (code units for surrogates represent half a character each). $el.attr( 'maxlength', elLimit ); } @@ -139,11 +128,11 @@ // See https://www.w3.org/TR/DOM-Level-3-Events/#events-keyboard-event-order for // the order and characteristics of the key events. $el.on( eventKeys, function () { - var res = trimByteLength( + var res = trimFn( prevSafeVal, this.value, elLimit, - fn + filterFn ); // Only set value property if it was trimmed, because whenever the @@ -157,11 +146,55 @@ $el.trigger( 'change' ); } // Always adjust prevSafeVal to reflect the input value. Not doing this could cause - // trimByteLength to compare the new value to an empty string instead of the + // trimFn to compare the new value to an empty string instead of the // old value, resulting in trimming always from the end (T42850). prevSafeVal = res.newVal; } ); } ); + } + + /** + * Enforces a byte limit on an input field, assuming UTF-8 encoding, for situations + * when, for example, a database field has a byte limit rather than a character limit. + * Plugin rationale: Browser has native maxlength for number of characters (technically, + * UTF-16 code units), this plugin exists to limit number of bytes instead. + * + * Can be called with a custom limit (to use that limit instead of the maxlength attribute + * value), a filter function (in case the limit should apply to something other than the + * exact input value), or both. Order of parameters is important! + * + * @param {number} [limit] Limit to enforce, fallsback to maxLength-attribute, + * called with fetched value as argument. + * @param {Function} [filterFn] Function to call on the string before assessing the length. + * @return {jQuery} + * @chainable + */ + $.fn.byteLimit = function ( limit, filterFn ) { + return lengthLimit.call( this, trimByteLength, limit, filterFn ); + }; + + /** + * Enforces a codepoint (character) limit on an input field. + * + * For unfortunate historical reasons, browsers' native maxlength counts [the number of UTF-16 + * code units rather than Unicode codepoints] [1], which means that codepoints outside the Basic + * Multilingual Plane (e.g. many emojis) count as 2 characters each. This plugin exists to + * correct this. + * + * [1]: https://www.w3.org/TR/html5/sec-forms.html#limiting-user-input-length-the-maxlength-attribute + * + * Can be called with a custom limit (to use that limit instead of the maxlength attribute + * value), a filter function (in case the limit should apply to something other than the + * exact input value), or both. Order of parameters is important! + * + * @param {number} [limit] Limit to enforce, fallsback to maxLength-attribute, + * called with fetched value as argument. + * @param {Function} [filterFn] Function to call on the string before assessing the length. + * @return {jQuery} + * @chainable + */ + $.fn.codePointLimit = function ( limit, filterFn ) { + return lengthLimit.call( this, trimCodePointLength, limit, filterFn ); }; /** diff --git a/resources/src/mediawiki.action/mediawiki.action.edit.js b/resources/src/mediawiki.action/mediawiki.action.edit.js index 087c5bce5d..a85e740a87 100644 --- a/resources/src/mediawiki.action/mediawiki.action.edit.js +++ b/resources/src/mediawiki.action/mediawiki.action.edit.js @@ -18,14 +18,20 @@ $( function () { var editBox, scrollTop, $editForm, - // TODO T6714: Once this can be adjusted, read this from config. - summaryByteLimit = 255, + summaryCodePointLimit = mw.config.get( 'wgCommentCodePointLimit' ), + summaryByteLimit = mw.config.get( 'wgCommentByteLimit' ), wpSummary = OO.ui.infuse( $( '#wpSummaryWidget' ) ); // Show a byte-counter to users with how many bytes are left for their edit summary. // TODO: This looks a bit weird, as there is no unit in the UI, just numbers; showing // 'bytes' confused users in testing, and showing 'chars' would be a lie. See T42035. - mw.widgets.visibleByteLimit( wpSummary, summaryByteLimit ); + // (Showing 'chars' is still confusing with the code point limit, since it's not obvious + // that e.g. combining diacritics or zero-width punctuation count as characters.) + if ( summaryCodePointLimit ) { + mw.widgets.visibleCodePointLimit( wpSummary, summaryCodePointLimit ); + } else if ( summaryByteLimit ) { + mw.widgets.visibleByteLimit( wpSummary, summaryByteLimit ); + } // Restore the edit box scroll state following a preview operation, // and set up a form submission handler to remember this state. diff --git a/resources/src/mediawiki.legacy/protect.js b/resources/src/mediawiki.legacy/protect.js index aa49ae138e..b96bebcb1a 100644 --- a/resources/src/mediawiki.legacy/protect.js +++ b/resources/src/mediawiki.legacy/protect.js @@ -1,6 +1,9 @@ ( function ( mw, $ ) { + var ProtectionForm, + reasonCodePointLimit = mw.config.get( 'wgCommentCodePointLimit' ), + reasonByteLimit = mw.config.get( 'wgCommentByteLimit' ); - var ProtectionForm = window.ProtectionForm = { + ProtectionForm = window.ProtectionForm = { /** * Set up the protection chaining interface (i.e. "unlock move permissions" checkbox) * on the protection form @@ -46,7 +49,12 @@ this.toggleUnchainedInputs( !this.areAllTypesMatching() ); } - $( '#mwProtect-reason' ).byteLimit( 180 ); + // Arbitrary 75 to leave some space for the autogenerated null edit's summary + if ( reasonCodePointLimit ) { + $( '#mwProtect-reason' ).codePointLimit( reasonCodePointLimit - 75 ); + } else if ( reasonByteLimit ) { + $( '#mwProtect-reason' ).byteLimit( reasonByteLimit - 75 ); + } this.updateCascadeCheckbox(); return true; diff --git a/resources/src/mediawiki.widgets.visibleLengthLimit/mediawiki.widgets.visibleLengthLimit.js b/resources/src/mediawiki.widgets.visibleLengthLimit/mediawiki.widgets.visibleLengthLimit.js index 03ffca7b95..52ebe7478e 100644 --- a/resources/src/mediawiki.widgets.visibleLengthLimit/mediawiki.widgets.visibleLengthLimit.js +++ b/resources/src/mediawiki.widgets.visibleLengthLimit/mediawiki.widgets.visibleLengthLimit.js @@ -1,6 +1,7 @@ ( function ( mw ) { - var byteLength = require( 'mediawiki.String' ).byteLength; + var byteLength = require( 'mediawiki.String' ).byteLength, + codePointLength = require( 'mediawiki.String' ).codePointLength; /** * @class mw.widgets @@ -28,4 +29,26 @@ textInputWidget.$input.byteLimit( limit ); }; + /** + * Add a visible codepoint (character) limit label to a TextInputWidget. + * + * Uses jQuery#codePointLimit to enforce the limit. + * + * @param {OO.ui.TextInputWidget} textInputWidget Text input widget + * @param {number} [limit] Byte limit, defaults to $input's maxlength + */ + mw.widgets.visibleCodePointLimit = function ( textInputWidget, limit ) { + limit = limit || +textInputWidget.$input.attr( 'maxlength' ); + + function updateCount() { + textInputWidget.setLabel( ( limit - codePointLength( textInputWidget.getValue() ) ).toString() ); + } + textInputWidget.on( 'change', updateCount ); + // Initialise value + updateCount(); + + // Actually enforce limit + textInputWidget.$input.codePointLimit( limit ); + }; + }( mediaWiki ) ); diff --git a/resources/src/mediawiki/mediawiki.String.js b/resources/src/mediawiki/mediawiki.String.js index 5e11680543..5d9bef0632 100644 --- a/resources/src/mediawiki/mediawiki.String.js +++ b/resources/src/mediawiki/mediawiki.String.js @@ -36,6 +36,19 @@ .length; } + /** + * Calculate the character length of a string (accounting for UTF-16 surrogates). + * + * @param {string} str + * @return {number} + */ + function codePointLength( str ) { + return str + // Low surrogate + high surrogate pairs represent one character (codepoint) each + .replace( /[\uD800-\uDBFF][\uDC00-\uDFFF]/g, '*' ) + .length; + } + // Like String#charAt, but return the pair of UTF-16 surrogates for characters outside of BMP. function codePointAt( string, offset, backwards ) { // We don't need to check for offsets at the beginning or end of string, @@ -50,29 +63,13 @@ } } - /** - * Utility function to trim down a string, based on byteLimit - * and given a safe start position. It supports insertion anywhere - * in the string, so "foo" to "fobaro" if limit is 4 will result in - * "fobo", not "foba". Basically emulating the native maxlength by - * reconstructing where the insertion occurred. - * - * @param {string} safeVal Known value that was previously returned by this - * function, if none, pass empty string. - * @param {string} newVal New value that may have to be trimmed down. - * @param {number} byteLimit Number of bytes the value may be in size. - * @param {Function} [fn] Function to call on the string before assessing the length. - * @return {Object} - * @return {string} return.newVal - * @return {boolean} return.trimmed - */ - function trimByteLength( safeVal, newVal, byteLimit, fn ) { + function trimLength( safeVal, newVal, length, lengthFn ) { var startMatches, endMatches, matchesLen, inpParts, chopOff, oldChar, newChar, oldVal = safeVal; // Run the hook if one was provided, but only on the length // assessment. The value itself is not to be affected by the hook. - if ( byteLength( fn ? fn( newVal ) : newVal ) <= byteLimit ) { + if ( lengthFn( newVal ) <= length ) { // Limit was not reached, just remember the new value // and let the user continue. return { @@ -125,32 +122,84 @@ // Chop off characters from the end of the "inserted content" string // until the limit is statisfied. - if ( fn ) { - // stop, when there is nothing to slice - T43450 - while ( byteLength( fn( inpParts.join( '' ) ) ) > byteLimit && inpParts[ 1 ].length > 0 ) { - // Do not chop off halves of surrogate pairs - chopOff = /[\uD800-\uDBFF][\uDC00-\uDFFF]$/.test( inpParts[ 1 ] ) ? 2 : 1; - inpParts[ 1 ] = inpParts[ 1 ].slice( 0, -chopOff ); - } - } else { - while ( byteLength( inpParts.join( '' ) ) > byteLimit ) { - // Do not chop off halves of surrogate pairs - chopOff = /[\uD800-\uDBFF][\uDC00-\uDFFF]$/.test( inpParts[ 1 ] ) ? 2 : 1; - inpParts[ 1 ] = inpParts[ 1 ].slice( 0, -chopOff ); - } + // Make sure to stop when there is nothing to slice (T43450). + while ( lengthFn( inpParts.join( '' ) ) > length && inpParts[ 1 ].length > 0 ) { + // Do not chop off halves of surrogate pairs + chopOff = /[\uD800-\uDBFF][\uDC00-\uDFFF]$/.test( inpParts[ 1 ] ) ? 2 : 1; + inpParts[ 1 ] = inpParts[ 1 ].slice( 0, -chopOff ); } return { newVal: inpParts.join( '' ), - // For pathological fn() that always returns a value longer than the limit, we might have + // For pathological lengthFn() that always returns a length greater than the limit, we might have // ended up not trimming - check for this case to avoid infinite loops trimmed: newVal !== inpParts.join( '' ) }; } + /** + * Utility function to trim down a string, based on byteLimit + * and given a safe start position. It supports insertion anywhere + * in the string, so "foo" to "fobaro" if limit is 4 will result in + * "fobo", not "foba". Basically emulating the native maxlength by + * reconstructing where the insertion occurred. + * + * @param {string} safeVal Known value that was previously returned by this + * function, if none, pass empty string. + * @param {string} newVal New value that may have to be trimmed down. + * @param {number} byteLimit Number of bytes the value may be in size. + * @param {Function} [filterFn] Function to call on the string before assessing the length. + * @return {Object} + * @return {string} return.newVal + * @return {boolean} return.trimmed + */ + function trimByteLength( safeVal, newVal, byteLimit, filterFn ) { + var lengthFn; + if ( filterFn ) { + lengthFn = function ( val ) { + return byteLength( filterFn( val ) ); + }; + } else { + lengthFn = byteLength; + } + + return trimLength( safeVal, newVal, byteLimit, lengthFn ); + } + + /** + * Utility function to trim down a string, based on codePointLimit + * and given a safe start position. It supports insertion anywhere + * in the string, so "foo" to "fobaro" if limit is 4 will result in + * "fobo", not "foba". Basically emulating the native maxlength by + * reconstructing where the insertion occurred. + * + * @param {string} safeVal Known value that was previously returned by this + * function, if none, pass empty string. + * @param {string} newVal New value that may have to be trimmed down. + * @param {number} codePointLimit Number of characters the value may be in size. + * @param {Function} [filterFn] Function to call on the string before assessing the length. + * @return {Object} + * @return {string} return.newVal + * @return {boolean} return.trimmed + */ + function trimCodePointLength( safeVal, newVal, codePointLimit, filterFn ) { + var lengthFn; + if ( filterFn ) { + lengthFn = function ( val ) { + return codePointLength( filterFn( val ) ); + }; + } else { + lengthFn = codePointLength; + } + + return trimLength( safeVal, newVal, codePointLimit, lengthFn ); + } + module.exports = { byteLength: byteLength, - trimByteLength: trimByteLength + codePointLength: codePointLength, + trimByteLength: trimByteLength, + trimCodePointLength: trimCodePointLength }; }() ); -- 2.20.1