Don't break autolinks by stripping the final semicolon from an entity.
author C. Scott Ananian <cscott@cscott.net>
Thu, 11 Dec 2014 20:15:28 +0000 (15:15 -0500)
committer C. Scott Ananian <cscott@cscott.net>
Thu, 18 Dec 2014 22:27:55 +0000 (17:27 -0500)
Autolinking free external links is clever about making sure that trailing
punctuation isn't included in the link.  But if an HTML entity happens to
terminate the URL, the entity's final semicolon is treated as trailing
punctuation and stripped from the URL, breaking both the entity and the link.

Fix this corner case.  This also unifies autolink parsing with Parsoid.

See: I5ae8435322c78dd1df170d7a3543fff3642759b1
Change-Id: I5482782c25e12283030b0fd2150ac55092f7979b

includes/parser/Parser.php
tests/parser/parserTests.txt

index a9daa22..ecb14ed 100644 (file)
@@ -1484,7 +1484,20 @@ class Parser {
                        $sep .= ')';
                }
 
-               $numSepChars = strspn( strrev( $url ), $sep );
+               $urlRev = strrev( $url );
+               $numSepChars = strspn( $urlRev, $sep );
+               # Don't break a trailing HTML entity by moving the ; into $trail
+               # This is in hot code, so use substr_compare to avoid having to
+               # create a new string object for the comparison
+               if ( $numSepChars && substr_compare( $url, ";", -$numSepChars, 1 ) === 0) {
+                       # more optimization: instead of running preg_match with a $
+                       # anchor, which can be slow, do the match on the reversed
+                       # string starting at the desired offset.
+                       # un-reversed regexp is: /&([a-z]+|#x[\da-f]+|#\d+)$/i
+                       if ( preg_match( '/\G([a-z]+|[\da-f]+x#|\d+#)&/i', $urlRev, $m2, 0, $numSepChars ) ) {
+                               $numSepChars--;
+                       }
+               }
                if ( $numSepChars ) {
                        $trail = substr( $url, -$numSepChars ) . $trail;
                        $url = substr( $url, 0, -$numSepChars );
index c7fc380..63f6a75 100644 (file)
@@ -4171,6 +4171,13 @@ http://example.com!
 http://example.com?
 http://example.com)
 http://example.com/url_with_(brackets)
+(http://example.com/url_without_brackets)
+http://example.com/url_with_entity&nbsp;
+http://example.com/url_with_entity&#xA0;
+http://example.com/url_with_entity&#160;
+http://example.com/url_with_entity&lt;
+http://example.com/url_with_entity&#x3C;
+http://example.com/url_with_entity&#60;
 !! html
 <p><a rel="nofollow" class="external free" href="http://example.com">http://example.com</a>,
 <a rel="nofollow" class="external free" href="http://example.com">http://example.com</a>;
@@ -4181,6 +4188,13 @@ http://example.com/url_with_(brackets)
 <a rel="nofollow" class="external free" href="http://example.com">http://example.com</a>?
 <a rel="nofollow" class="external free" href="http://example.com">http://example.com</a>)
 <a rel="nofollow" class="external free" href="http://example.com/url_with_(brackets)">http://example.com/url_with_(brackets)</a>
+(<a rel="nofollow" class="external free" href="http://example.com/url_without_brackets">http://example.com/url_without_brackets</a>)
+<a rel="nofollow" class="external free" href="http://example.com/url_with_entity ">http://example.com/url_with_entity </a>
+<a rel="nofollow" class="external free" href="http://example.com/url_with_entity ">http://example.com/url_with_entity </a>
+<a rel="nofollow" class="external free" href="http://example.com/url_with_entity ">http://example.com/url_with_entity </a>
+<a rel="nofollow" class="external free" href="http://example.com/url_with_entity">http://example.com/url_with_entity</a>&lt;
+<a rel="nofollow" class="external free" href="http://example.com/url_with_entity%3C">http://example.com/url_with_entity%3C</a>
+<a rel="nofollow" class="external free" href="http://example.com/url_with_entity%3C">http://example.com/url_with_entity%3C</a>
 </p>
 !! end