Step 3: Balance the quotes directly on $text
authorPlatonides <platonides@users.mediawiki.org>
Tue, 26 Jan 2010 18:56:50 +0000 (18:56 +0000)
committerPlatonides <platonides@users.mediawiki.org>
Tue, 26 Jan 2010 18:56:50 +0000 (18:56 +0000)
Side effect: Some ' are converted to &#39; on output.

includes/parser/Parser.php
maintenance/parserTests.txt

index 647b6b9..e419a30 100644 (file)
@@ -1119,70 +1119,44 @@ class Parser
                        return $text;
                else
                {
-                       # Split in groups of 2, 3, 5 or 6 apostrophes.
-                       # If there are ever four apostrophes, assume the first is supposed to
-                       # be text, and the remaining three constitute mark-up for bold text.
-                       # If there are more than 6 apostrophes in a row, assume they're all
-                       # text except for the last 6.           
-                       $arr = preg_split( "/('{2,3}(?:''')?)(?!')/", $text, -1, PREG_SPLIT_DELIM_CAPTURE );
-
-
                        # If there is an odd number of both bold and italics, it is likely
                        # that one of the bold ones was meant to be an apostrophe followed
                        # by italics. Which one we cannot know for certain, but it is more
                        # likely to be one that has a single-letter word before it.
                        if ( ( $numbold % 2 == 1 ) && ( $numitalics % 2 == 1 ) )
                        {
-                               $i = 0;
-                               
-                               # These are indexes to the /next/ array entry than the 
-                               # one holding the text matching the condition which gives name 
-                               # to the variable.
-                               $firstsingleletterword = -1;
-                               $firstmultiletterword = -1;
-                               $firstspace = -1;
-                               
-                               foreach ( $arr as $r )
-                               {
-                                       # Filter the "'''". Separators are on odd positions. 
-                                       # $arr[0] will be an empty string if needed.
-                                       if ( ( $i % 2 == 1 ) and ( strlen( $r ) == 3 ) )
-                                       {
-                                               $x1 = substr ($arr[$i-1], -1);
-                                               $x2 = substr ($arr[$i-1], -2, 1);
-                                               if ($x1 === ' ') {
-                                                       if ($firstspace == -1) $firstspace = $i;
-                                               } elseif ($x2 === ' ') {
-                                                       if ($firstsingleletterword == -1) $firstsingleletterword = $i;
-                                               } elseif ($arr[$i-1] !== "") {
-                                                       if ($firstmultiletterword == -1) $firstmultiletterword = $i;
-                                               }
-                                       }
-                                       $i++;
-                               }
 
-                               # If there is a single-letter word, use it!
-                               if ($firstsingleletterword > -1)
-                               {
-                                       $arr [ $firstsingleletterword ] = "''";
-                                       $arr [ $firstsingleletterword-1 ] .= "'";
-                               }
-                               # If not, but there's a multi-letter word, use that one.
-                               elseif ($firstmultiletterword > -1)
-                               {
-                                       $arr [ $firstmultiletterword ] = "''";
-                                       $arr [ $firstmultiletterword-1 ] .= "'";
-                               }
-                               # ... otherwise use the first one that has neither.
-                               # (notice that it is possible for all three to be -1 if, for example,
-                               # there is only one pentuple-apostrophe in the line)
-                               elseif ($firstspace > -1)
-                               {
-                                       $arr [ $firstspace ] = "''";
-                                       $arr [ $firstspace-1 ] .= "'";
+                               # This turns the first apostrophe of one ''' into a
+                               # literal quote. Preference order: a ''' right after a
+                               # single-letter word, then after a multi-letter word,
+                               # then after a space. Otherwise, it does nothing.
+                               #
+                               # The original if-based version can be found at
+                               # http://svn.wikimedia.org/viewvc/mediawiki/trunk/phase3/includes/parser/Parser.php?revision=61519&view=markup
+                               #
+                               # Unlike the original, the literal quote is emitted
+                               # as &#39; rather than ', which shouldn't matter.
+
+                               $quoteBalancerReplacements = array( 
+                                                                                               "/(?<= [^ ])'''(?!')/"=>"&#39;''", 
+                                                                                               "/(?<=[^ '])'''(?!')/"=>"&#39;''", 
+                                                                                               "/(^|(?<=[^'])) '''(?!')/"=>" &#39;''");
+
+                               foreach( $quoteBalancerReplacements as $k => $v) {
+                                       $text = preg_replace($k, $v, $text, 1, $count);
+                                       if ($count != 0)
+                                               break;
                                }
                        }
 
+                       # Split in groups of 2, 3, 5 or 6 apostrophes.
+                       # If there are ever four apostrophes, assume the first is supposed to
+                       # be text, and the remaining three constitute mark-up for bold text.
+                       # If there are more than 6 apostrophes in a row, assume they're all
+                       # text except for the last 6.           
+                       $arr = preg_split( "/('{2,3}(?:''')?)(?!')/", $text, -1, PREG_SPLIT_DELIM_CAPTURE );
+
+
                        # Now let's actually convert our apostrophic mush to HTML!
                        $output = ''; # Processed text
                        $buffer = ''; # Content if $state is 'both'
index fabff60..fedbb85 100644 (file)
@@ -116,7 +116,7 @@ Italics and bold
 </li><li> plain<b><i>bold-italic</i>bold</b>plain
 </li><li> plain<i>italic<b>bold-italic</b></i>plain
 </li><li> plain<b>bold<i>bold-italic</i></b>plain
-</li><li> plain l'<i>italic</i>plain
+</li><li> plain l&#39;<i>italic</i>plain
 </li><li> plain l'<b>bold</b> plain
 </li></ul>
 
@@ -6415,7 +6415,7 @@ Another italics / bold test
 !! input
  ''' ''x'
 !! result
-<pre>'<i> </i>x'
+<pre>&#39;<i> </i>x'
 </pre>
 !!end
 
@@ -7585,19 +7585,19 @@ but '''This is bold'' this is bold italic''' and this is bold'''
 |}
 -->
 !! result
-<p><b>Look at <i>this edit'</i>s complicated bold/italic markup!</b>
-</p><p><b>Look at <i>this edit'</i>s complicated bold/italic markup!</b>
-</p><p><span> <b>Look at <i>this edit'</i>s complicated bold/italic markup!</b></span>
-</p><p> <b>Look at <i>this edit'</i>s complicated bold/italic markup!</b>
+<p><b>Look at <i>this edit&#39;</i>s complicated bold/italic markup!</b>
+</p><p><b>Look at <i>this edit&#39;</i>s complicated bold/italic markup!</b>
+</p><p><span> <b>Look at <i>this edit&#39;</i>s complicated bold/italic markup!</b></span>
+</p><p> <b>Look at <i>this edit&#39;</i>s complicated bold/italic markup!</b>
 </p>
-<pre><b>Look at <i>this edit'</i>s complicated bold/italic markup!</b>
+<pre><b>Look at <i>this edit&#39;</i>s complicated bold/italic markup!</b>
 </pre>
 <table>
 <tr>
-<td> <b>Look at <i>this edit'</i>s complicated bold/italic markup!</b>
+<td> <b>Look at <i>this edit&#39;</i>s complicated bold/italic markup!</b>
 </td></tr></table>
-<p><b>This was Italic<i> this was plain'</i> and this was bold</b>
-but <b>This is bold<i> this is bold italic'</i> and this is bold</b>
+<p><b>This was Italic<i> this was plain&#39;</i> and this was bold</b>
+but <b>This is bold<i> this is bold italic&#39;</i> and this is bold</b>
 </p><p><br />
 </p>
 !! end