Preserve grapheme clusters in upper corner completion suggester highlighting
author tjones <tjones@wikimedia.org>
Mon, 19 Aug 2019 18:23:33 +0000 (14:23 -0400)
committer Bartosz Dziewoński <matma.rex@gmail.com>
Thu, 29 Aug 2019 21:38:44 +0000 (23:38 +0200)
Grapheme clusters in Indic scripts can be split by highlighting,
resulting in really ugly text in the suggestions. For example,
म +  े should equal मे, but doesn't if highlighting markup comes between
the two characters. Affected scripts include Devanagari, Tamil, Telugu,
Thai, Myanmar, Khmer, and Javanese, and probably others. Latin and
Cyrillic scripts also have the problem, but the presentation is not as
bad (at least on some browser/OS combinations).

- Define a comboMarks regex (equivalent to \p{Mark} in other programming
languages) and use it in a new highlighting function,
prefixPlusComboHighlight, to include trailing combining characters in
highlighted text.

- Change the calling function to use prefixPlusComboHighlight instead of
prefixHighlight.

Note: This only fixes problems with **grapheme clusters** in
suggestions in the **search box in the upper corner** (upper right in
LTR languages, upper left in RTL languages). The main search box on the
Special:Search page uses different code. Ligatures, such as
Arabic ي + ا becoming يا, are not covered by this simple regex-based
solution.

Bug: T35242
Change-Id: I915c50180bc2196e0302f27835241624b7837f16

resources/src/jquery/jquery.highlightText.js
resources/src/jquery/jquery.suggestions.js

index 7205620..de08607 100644 (file)
                        );
                },
 
+               // match prefix plus any combining characters to prevent ugly rendering (see T35242)
+               prefixPlusComboHighlight: function ( node, prefix ) {
+
+                       // Equivalent to \p{Mark} (which is not currently available in JavaScript)
+                       var comboMarks = '[\u0300-\u036F\u0483-\u0489\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7\u0610-\u061A\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED\u0711\u0730-\u074A\u07A6-\u07B0\u07EB-\u07F3\u07FD\u0816-\u0819\u081B-\u0823\u0825-\u0827\u0829-\u082D\u0859-\u085B\u08D3-\u08E1\u08E3-\u0903\u093A-\u093C\u093E-\u094F\u0951-\u0957\u0962\u0963\u0981-\u0983\u09BC\u09BE-\u09C4\u09C7\u09C8\u09CB-\u09CD\u09D7\u09E2\u09E3\u09FE\u0A01-\u0A03\u0A3C\u0A3E-\u0A42\u0A47\u0A48\u0A4B-\u0A4D\u0A51\u0A70\u0A71\u0A75\u0A81-\u0A83\u0ABC\u0ABE-\u0AC5\u0AC7-\u0AC9\u0ACB-\u0ACD\u0AE2\u0AE3\u0AFA-\u0AFF\u0B01-\u0B03\u0B3C\u0B3E-\u0B44\u0B47\u0B48\u0B4B-\u0B4D\u0B56\u0B57\u0B62\u0B63\u0B82\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD\u0BD7\u0C00-\u0C04\u0C3E-\u0C44\u0C46-\u0C48\u0C4A-\u0C4D\u0C55\u0C56\u0C62\u0C63\u0C81-\u0C83\u0CBC\u0CBE-\u0CC4\u0CC6-\u0CC8\u0CCA-\u0CCD\u0CD5\u0CD6\u0CE2\u0CE3\u0D00-\u0D03\u0D3B\u0D3C\u0D3E-\u0D44\u0D46-\u0D48\u0D4A-\u0D4D\u0D57\u0D62\u0D63\u0D82\u0D83\u0DCA\u0DCF-\u0DD4\u0DD6\u0DD8-\u0DDF\u0DF2\u0DF3\u0E31\u0E34-\u0E3A\u0E47-\u0E4E\u0EB1\u0EB4-\u0EB9\u0EBB\u0EBC\u0EC8-\u0ECD\u0F18\u0F19\u0F35\u0F37\u0F39\u0F3E\u0F3F\u0F71-\u0F84\u0F86\u0F87\u0F8D-\u0F97\u0F99-\u0FBC\u0FC6\u102B-\u103E\u1056-\u1059\u105E-\u1060\u1062-\u1064\u1067-\u106D\u1071-\u1074\u1082-\u108D\u108F\u109A-\u109D\u135D-\u135F\u1712-\u1714\u1732-\u1734\u1752\u1753\u1772\u1773\u17B4-\u17D3\u17DD\u180B-\u180D\u1885\u1886\u18A9\u1920-\u192B\u1930-\u193B\u1A17-\u1A1B\u1A55-\u1A5E\u1A60-\u1A7C\u1A7F\u1AB0-\u1ABE\u1B00-\u1B04\u1B34-\u1B44\u1B6B-\u1B73\u1B80-\u1B82\u1BA1-\u1BAD\u1BE6-\u1BF3\u1C24-\u1C37\u1CD0-\u1CD2\u1CD4-\u1CE8\u1CED\u1CF2-\u1CF4\u1CF7-\u1CF9\u1DC0-\u1DF9\u1DFB-\u1DFF\u20D0-\u20F0\u2CEF-\u2CF1\u2D7F\u2DE0-\u2DFF\u302A-\u302F\u3099\u309A\uA66F-\uA672\uA674-\uA67D\uA69E\uA69F\uA6F0\uA6F1\uA802\uA806\uA80B\uA823-\uA827\uA880\uA881\uA8B4-\uA8C5\uA8E0-\uA8F1\uA8FF\uA926-\uA92D\uA947-\uA953\uA980-\uA983\uA9B3-\uA9C0\uA9E5\uAA29-\uAA36\uAA
43\uAA4C\uAA4D\uAA7B-\uAA7D\uAAB0\uAAB2-\uAAB4\uAAB7\uAAB8\uAABE\uAABF\uAAC1\uAAEB-\uAAEF\uAAF5\uAAF6\uABE3-\uABEA\uABEC\uABED\uFB1E\uFE00-\uFE0F\uFE20-\uFE2F]';
+
+                       $.highlightText.innerHighlight(
+                               node,
+                               new RegExp( '(^)' + mw.RegExp.escape( prefix ) + comboMarks + '*', 'i' )
+                       );
+               },
+
                // scans a node looking for the pattern and wraps a span around each match
                innerHighlight: function ( node, pat ) {
                        var i, match, pos, spannode, middlebit, middleclone;
@@ -81,6 +93,8 @@
         * @param {string} [options.method='splitAndHighlight'] Method of matching to use, one of:
         *   - 'splitAndHighlight': Split `matchString` on spaces, then match each word separately.
         *   - 'prefixHighlight': Match `matchString` at the beginning of text only.
+        *   - 'prefixPlusComboHighlight': Match `matchString` plus any combining characters at
+        *     the beginning of text only.
         * @return {jQuery}
         * @chainable
         */
index f4aea72..31afe3e 100644 (file)
                                                        }
 
                                                        if ( context.config.highlightInput ) {
-                                                               $result.highlightText( context.data.prevText, { method: 'prefixHighlight' } );
+                                                               $result.highlightText( context.data.prevText, { method: 'prefixPlusComboHighlight' } );
                                                        }
 
                                                        // Widen results box if needed (new width is only calculated here, applied later).