Remove nbsp and similar characters from section IDs

author Max Semenik <maxsem.wiki@gmail.com>

Fri, 3 Nov 2017 02:35:11 +0000 (19:35 -0700)

committer Max Semenik <maxsem.wiki@gmail.com>

Fri, 3 Nov 2017 02:35:11 +0000 (19:35 -0700)
author Max Semenik <maxsem.wiki@gmail.com>
Fri, 3 Nov 2017 02:35:11 +0000 (19:35 -0700)
committer Max Semenik <maxsem.wiki@gmail.com>
Fri, 3 Nov 2017 02:35:11 +0000 (19:35 -0700)
diff --git a/RELEASE-NOTES-1.31 b/RELEASE-NOTES-1.31

index 4bfcfcb..3688163 100644 (file)
--- a/RELEASE-NOTES-1.31
+++ b/RELEASE-NOTES-1.31
@@ -41,6 +41,7 @@ production.
  * …
  
  === Bug fixes in 1.31 ===
+* (T90902) Non-breaking space in header ID breaks anchor
  * …
  
  === Action API changes in 1.31 ===
diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php

index f2e47dc..3548da9 100644 (file)
--- a/includes/parser/Parser.php
+++ b/includes/parser/Parser.php
@@ -4206,6 +4206,9 @@ class Parser {
  
                         # Decode HTML entities
                         $safeHeadline = Sanitizer::decodeCharReferences( $safeHeadline );
+
+                       $safeHeadline = $this->normalizeSectionName( $safeHeadline );
+
                         $fallbackHeadline = Sanitizer::escapeIdForAttribute( $safeHeadline, Sanitizer::ID_FALLBACK );
                         $linkAnchor = Sanitizer::escapeIdForLink( $safeHeadline );
                         $safeHeadline = Sanitizer::escapeIdForAttribute( $safeHeadline, Sanitizer::ID_PRIMARY );
@@ -5767,6 +5770,8 @@ class Parser {
                 $text = $this->stripSectionName( $text );
                 $text = Sanitizer::normalizeSectionNameWhitespace( $text );
                 $text = Sanitizer::decodeCharReferences( $text );
+               $text = $this->normalizeSectionName( $text );
+
                 return '#' . Sanitizer::escapeIdForLink( $text );
         }
  
@@ -5786,6 +5791,7 @@ class Parser {
                 $text = $this->stripSectionName( $text );
                 $text = Sanitizer::normalizeSectionNameWhitespace( $text );
                 $text = Sanitizer::decodeCharReferences( $text );
+               $text = $this->normalizeSectionName( $text );
  
                 if ( isset( $wgFragmentMode[1] ) && $wgFragmentMode[1] === 'legacy' ) {
                         // ForAttribute() and ForLink() are the same for legacy encoding
@@ -5797,6 +5803,24 @@ class Parser {
                 return "#$id";
         }
  
+       /**
+        * Normalize a section name the way the title parser normalizes a link fragment,
+        * so that heading IDs agree with the anchors of links to the section (T90902).
+        * @param string $text Section name to normalize
+        * @return string The parsed 'fragment' part, or $text unchanged if parsing throws
+        */
+       private function normalizeSectionName( $text ) {
+               # T90902: ensure the same normalization is applied for IDs as to links
+               $titleParser = MediaWikiServices::getInstance()->getTitleParser();
+               try {
+                       # Prefix with '#' so the text is handed over as a fragment-only title
+                       $parts = $titleParser->splitTitleString( "#$text" );
+               } catch ( MalformedTitleException $ex ) {
+                       return $text;
+               }
+               return $parts['fragment'];
+       }
+
         /**
          * Strips a text string of wikitext for use in a section anchor
          *
diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt

index 3c861ea..1204dbd 100644 (file)
--- a/tests/parser/parserTests.txt
+++ b/tests/parser/parserTests.txt
@@ -29536,3 +29536,17 @@ wgFragmentMode=[ 'html5' ]
  </p><p><a href="#啤酒">#啤酒</a> <a href="#啤酒">#啤酒</a>
  </p>
  !! end
+
+!! test
+T90902: Normalize weird characters in section IDs
+!! config
+wgFragmentMode=[ 'html5', 'legacy' ]
+!! wikitext
+== Foo&nbsp;bar ==
+[[#Foo&nbsp;bar]]
+
+!! html/php
+<h2><span class="mw-headline" id="Foo_bar">Foo&#160;bar</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/index.php?title=Parser_test&amp;action=edit&amp;section=1" title="Edit section: Foo bar">edit</a><span class="mw-editsection-bracket">]</span></span></h2>
+<p><a href="#Foo_bar">#Foo&#160;bar</a>
+</p>
+!! end
author	Max Semenik <maxsem.wiki@gmail.com>
	Fri, 3 Nov 2017 02:35:11 +0000 (19:35 -0700)
committer	Max Semenik <maxsem.wiki@gmail.com>
	Fri, 3 Nov 2017 02:35:11 +0000 (19:35 -0700)
RELEASE-NOTES-1.31		patch \| blob \| history
includes/parser/Parser.php		patch \| blob \| history
tests/parser/parserTests.txt		patch \| blob \| history