mw.Title: Correct handling of Unicode whitespace and bidi control characters

author Bartosz Dziewoński <matma.rex@gmail.com>

Wed, 24 Aug 2016 19:33:45 +0000 (21:33 +0200)

committer Krinkle <krinklemail@gmail.com>

Wed, 31 Aug 2016 00:04:29 +0000 (00:04 +0000)
author Bartosz Dziewoński <matma.rex@gmail.com>
Wed, 24 Aug 2016 19:33:45 +0000 (21:33 +0200)
committer Krinkle <krinklemail@gmail.com>
Wed, 31 Aug 2016 00:04:29 +0000 (00:04 +0000)
diff --git a/resources/src/mediawiki/mediawiki.Title.js b/resources/src/mediawiki/mediawiki.Title.js

index 4c57faa..e468768 100644 (file)
--- a/resources/src/mediawiki/mediawiki.Title.js
+++ b/resources/src/mediawiki/mediawiki.Title.js
@@ -164,9 +164,12 @@
                 '|&#x[0-9A-Fa-f]+;'
         ),
  
-       // From MediaWikiTitleCodec.php#L225 @26fcab1f18c568a41
-       // "Clean up whitespace" in function MediaWikiTitleCodec::splitTitleString()
-       rWhitespace = /[ _\u0009\u00A0\u1680\u180E\u2000-\u200A\u2028\u2029\u202F\u205F\u3000\s]+/g,
+       // From MediaWikiTitleCodec::splitTitleString() in PHP
+       // Note that this is not equivalent to /\s/, e.g. underscore is included, tab is not included.
+       rWhitespace = /[ _\u00A0\u1680\u180E\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]+/g,
+
+       // From MediaWikiTitleCodec::splitTitleString() in PHP
+       rUnicodeBidi = /[\u200E\u200F\u202A-\u202E]/g,
  
         /**
          * Slightly modified from Flinfo. Credit goes to Lupo and Flominator.
@@ -181,18 +184,6 @@
                         replace: '',
                         generalRule: true
                 },
-               // Space, underscore, tab, NBSP and other unusual spaces
-               {
-                       pattern: rWhitespace,
-                       replace: ' ',
-                       generalRule: true
-               },
-               // unicode bidi override characters: Implicit, Embeds, Overrides
-               {
-                       pattern: /[\u200E\u200F\u202A-\u202E]/g,
-                       replace: '',
-                       generalRule: true
-               },
                 // control characters
                 {
                         pattern: /[\x00-\x1f\x7f]/g,
@@ -261,8 +252,10 @@
                 namespace = defaultNamespace === undefined ? NS_MAIN : defaultNamespace;
  
                 title = title
+                       // Strip Unicode bidi override characters
+                       .replace( rUnicodeBidi, '' )
                         // Normalise whitespace to underscores and remove duplicates
-                       .replace( /[ _\s]+/g, '_' )
+                       .replace( rWhitespace, '_' )
                         // Trim underscores
                         .replace( rUnderscoreTrim, '' );
  
@@ -557,8 +550,8 @@
  
                 namespace = defaultNamespace === undefined ? NS_MAIN : defaultNamespace;
  
-               // Normalise whitespace and remove duplicates
-               title = $.trim( title.replace( rWhitespace, ' ' ) );
+               // Normalise additional whitespace
+               title = $.trim( title.replace( /\s/g, ' ' ) );
  
                 // Process initial colon
                 if ( title !== '' && title[ 0 ] === ':' ) {
diff --git a/tests/phpunit/includes/TitleTest.php b/tests/phpunit/includes/TitleTest.php

index 7850f24..7925c6f 100644 (file)
--- a/tests/phpunit/includes/TitleTest.php
+++ b/tests/phpunit/includes/TitleTest.php
@@ -90,6 +90,8 @@ class TitleTest extends MediaWikiTestCase {
                         [ 'A < B', 'title-invalid-characters' ],
                         [ 'A > B', 'title-invalid-characters' ],
                         [ 'A | B', 'title-invalid-characters' ],
+                       [ "A \t B", 'title-invalid-characters' ],
+                       [ "A \n B", 'title-invalid-characters' ],
                         // URL encoding
                         [ 'A%20B', 'title-invalid-characters' ],
                         [ 'A%23B', 'title-invalid-characters' ],
diff --git a/tests/qunit/suites/resources/mediawiki/mediawiki.Title.test.js b/tests/qunit/suites/resources/mediawiki/mediawiki.Title.test.js

index 991725b..886e2b6 100644 (file)
--- a/tests/qunit/suites/resources/mediawiki/mediawiki.Title.test.js
+++ b/tests/qunit/suites/resources/mediawiki/mediawiki.Title.test.js
@@ -38,6 +38,8 @@
                         'A < B',
                         'A > B',
                         'A | B',
+                       'A \t B',
+                       'A \n B',
                         // URL encoding
                         'A%20B',
                         'A%23B',
@@ -222,7 +224,7 @@
                 assert.equal( title.getPrefixedText(), '.foo' );
         } );
  
-       QUnit.test( 'Transformation', 11, function ( assert ) {
+       QUnit.test( 'Transformation', 12, function ( assert ) {
                 var title;
  
                 title = new mw.Title( 'File:quux pif.jpg' );
@@ -242,10 +244,12 @@
                 assert.equal( title.toText(), 'User:HAshAr' );
                 assert.equal( title.getNamespaceId(), 2, 'Case-insensitive namespace prefix' );
  
-               // Don't ask why, it's the way the backend works. One space is kept of each set.
-               title = new mw.Title( 'Foo  __  \t __ bar' );
+               title = new mw.Title( 'Foo \u00A0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000 bar' );
                 assert.equal( title.getMain(), 'Foo_bar', 'Merge multiple types of whitespace/underscores into a single underscore' );
  
+               title = new mw.Title( 'Foo\u200E\u200F\u202A\u202B\u202C\u202D\u202Ebar' );
+               assert.equal( title.getMain(), 'Foobar', 'Strip Unicode bidi override characters' );
+
                 // Regression test: Previously it would only detect an extension if there is no space after it
                 title = new mw.Title( 'Example.js  ' );
                 assert.equal( title.getExtension(), 'js', 'Space after an extension is stripped' );
author	Bartosz Dziewoński <matma.rex@gmail.com>
	Wed, 24 Aug 2016 19:33:45 +0000 (21:33 +0200)
committer	Krinkle <krinklemail@gmail.com>
	Wed, 31 Aug 2016 00:04:29 +0000 (00:04 +0000)
resources/src/mediawiki/mediawiki.Title.js		patch | blob | history
tests/phpunit/includes/TitleTest.php		patch | blob | history
tests/qunit/suites/resources/mediawiki/mediawiki.Title.test.js		patch | blob | history