Fixed bug with magic word fallback to en
[lhc/web/wiklou.git] / includes / Tokenizer.php
index 5412e55..ab18b10 100644 (file)
 <?php
 class Tokenizer {
-       /* private */ var $mText, $mPos, $mTextLength;
-       /* private */ var $mCount, $mM, $mMPos;
+       /* private */ var $mText,               # Text to be processed by the tokenizer
+                         $mPos,                # current position of tokenizer in text
+                         $mTextLength,         # Length of $mText
+                         $mQueuedToken;        # Tokens that were already found, but not
+                                               # returned yet.
 
        /* private */ function Tokenizer()
        {
+               global $wgLang;
+
                $this->mPos=0;
+               $this->mTokenQueue=array();
+               $this->linkPrefixExtension = $wgLang->linkPrefixExtension();
        }
 
        # factory function
        function newFromString( $s )
        {
+               $fname = "Tokenizer::newFromString";
+               wfProfileIn( $fname );
+
                $t = new Tokenizer();
                $t->mText = $s;
-               $t->preParse();
                $t->mTextLength = strlen( $s );
+
+               wfProfileOut( $fname );
                return $t;
        }
 
-       function preParse()
-       {
-               $this->mCount = preg_match_all( "/(\[\[|\]\]|\'\'\'\'\'|\'\'\'|\'\')/",
-                                               $this->mText, $this->mM,
-                                               PREG_PATTERN_ORDER|PREG_OFFSET_CAPTURE);
-               $this->mMPos=0;
-       }
 
-       function nextToken()
+       // Return the next token without consuming it: the next call to
+       // previewToken or nextToken will return the same token again.
+       // Internally the text position does advance, but the token is stored
+       // at the front of the queue, and previewToken/nextToken check that
+       // queue before reading any further text.
+       function previewToken()
        {
-               $token = $this->previewToken();
-               if ( $token ) {
-                       if ( $token["type"] == "text" ) {
-                               $this->mPos = $token["mPos"];
-                       } else {
-                               $this->mMPos = $token["mMPos"];
-                               $this->mPos = $token["mPos"];
-                       }
+               $fname = "Tokenizer::previewToken";
+               wfProfileIn( $fname );
+
+               if ( count( $this->mQueuedToken ) != 0 ) {
+                       // still one token from the last round around. Return that one first.
+                       $token = $this->mQueuedToken[0];
+               } else {
+                       $token = $this->nextToken();
+                       array_unshift( $this->mQueuedToken, $token );
                }
+
+               wfProfileOut( $fname );
                return $token;
        }
 
 
-       function previewToken()
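+       // get the next token
+       // proceeds character by character through the text, looking for characters needing
+       // special attention. Those are currently: I, R, ', [, ], newline and carriage return,
+       // !, ?, :, the digits 0-9, & and the UTF-8 guillemet bytes \302 and \273.
+       //
+       // Blanks before !, ? and :, next to guillemets, and between digits are
+       // queued as tokens of type "blank".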
+       function nextToken()
        {
-               if ( $this->mMPos <= $this->mCount  ) {
-                       $token["pos"] = $this->mPos;
-                       if ( $this->mPos < $this->mM[0][$this->mMPos][1] ) {
-                               $token["type"] = "text";
-                               $token["text"] = substr( $this->mText, $this->mPos,
-                                                        $this->mM[0][$this->mMPos][1] - $this->mPos );
-                               $token["mPos"] = $this->mM[0][$this->mMPos][1];
-                       } else {
-                               $token["type"] = $this->mM[0][$this->mMPos][0];
-                               $token["mPos"] = $this->mPos + strlen($token["type"]);
-                               $token["mMPos"] = $this->mMPos + 1;
-                       }
-               } elseif ( $this->mPos < $this->mTextLength ) {
-                       $token["type"] = "text";
-                       $token["text"] = substr( $this->mText, $this->mPos );
-                       $token["mPos"] = $this->mTextLength;
+               $fname = "Tokenizer::nextToken";
+               wfProfileIn( $fname );
+
+               if ( count( $this->mQueuedToken ) != 0 ) {
+                       // still one token from the last round around. Return that one first.
+                       $token = array_shift( $this->mQueuedToken );
+               } else if ( $this->mPos > $this->mTextLength ) {
+                       // If no text is left, return "false".
+                       $token = false;
                } else {
-                       $token = FALSE;
-               }
+
+                       $token["text"]="";
+                       $token["type"]="text";
+
+                       while ( $this->mPos <= $this->mTextLength ) {
+                               switch ( @$ch = $this->mText[$this->mPos] ) {
+                                       case 'R': // for "RFC "
+                                               if ( $this->continues("FC ") ) {
+                                                       $queueToken["type"] = $queueToken["text"] = "RFC ";
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos += 3;
+                                                       break 2; // switch + while 
+                                               }
+                                               break;
+                                       case 'I': // for "ISBN "
+                                               if ( $this->continues("SBN ") ) {
+                                                       $queueToken["type"] = $queueToken["text"] = "ISBN ";
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos += 4;
+                                                       break 2; // switch + while
+                                               }
+                                               break;
+                                       case "[": // for links "[["
+                                               if ( $this->continues("[[") ) {
+                                                       $queueToken["type"] = "[[[";
+                                                       $queueToken["text"] = "";
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos += 3;
+                                                       break 2; // switch + while
+                                               } else if ( $this->continues("[") ) {
+                                                       $queueToken["type"] = "[[";
+                                                       $queueToken["text"] = "";
+                                                       // Check for a "prefixed link", e.g. Al[[Khazar]]
+                                                       // Mostly for arabic wikipedia
+                                                       if ( $this->linkPrefixExtension ) {
+                                                               while (    $this->linkPrefixExtension
+                                                                       && ($len = strlen( $token["text"] ) ) > 0 
+                                                                       && !ctype_space( $token["text"][$len-1] ) )
+                                                               {
+                                                                       //prepend the character to the link's open tag
+                                                                       $queueToken["text"] = $token["text"][$len-1] . $queueToken["text"];
+                                                                       //remove character from the end of the text token
+                                                                       $token["text"] = substr( $token["text"], 0, -1);
+                                                               }
+                                                       }
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos += 2;
+                                                       break 2; // switch + while 
+                                               }
+                                               break;
+                                       case "]": // for end of links "]]"
+                                               if ( $this->continues("]") ) {
+                                                       $queueToken["type"] = "]]";
+                                                       $queueToken["text"] = "";
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos += 2;
+                                                       break 2; // switch + while 
+                                               }
+                                               break;
+                                       case "'": // for all kind of em's and strong's
+                                               if ( $this->continues("'") ) {
+                                                       $queueToken["type"] = "'";
+                                                       $queueToken["text"] = "";
+                                                       while(   ($this->mPos+1 < $this->mTextLength) 
+                                                              && $this->mText[$this->mPos+1] == "'" )
+                                                       {
+                                                               $queueToken["type"] .= "'";
+                                                               $queueToken["pos"] = $this->mPos;
+                                                               $this->mPos ++;
+                                                       }
+                                                       
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos ++;
+                                                       break 2; // switch + while
+                                               }
+                                               break;
+                                       case "\n": // for block levels, actually, only "----" is handled.
+                                       case "\r": // headings are detected to close any unbalanced em or strong tags in a section
+                                               if ( $this->continues( "----" ) )
+                                               {
+                                                       $queueToken["type"] = "----";
+                                                       $queueToken["text"] = "";
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos += 5;
+                                                       while (     $this->mPos<$this->mTextLength 
+                                                               and $this->mText[$this->mPos] == "-" )
+                                                       {
+                                                               $this->mPos ++;
+                                                       }
+                                                       break 2;
+                                               } else if ( 
+                                                       $this->continues( "<h" ) and (
+                                                               $this->continues( "<h1" ) or
+                                                               $this->continues( "<h2" ) or 
+                                                               $this->continues( "<h3" ) or 
+                                                               $this->continues( "<h4" ) or 
+                                                               $this->continues( "<h5" ) or
+                                                               $this->continues( "<h6" ) 
+                                                       )
+                                               ) { // heading
+                                                       $queueToken["type"] = "h";
+                                                       $queueToken["text"] = "";
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos ++;
+                                                       break 2; // switch + while
+                                               }
+                                               break;
+                                       case "!": // French spacing rules have a space before exclamation
+                                       case "?": // and question marks. Those have to become &nbsp;
+                                       case ":": // And colons, Hashar says ...
+                                               if ( $this->preceeded( " " ) )
+                                               {
+                                                       // strip blank from Token
+                                                       $token["text"] = substr( $token["text"], 0, -1 );
+                                                       $queueToken["type"] = "blank";
+                                                       $queueToken["text"] = " {$ch}";
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos ++;
+                                                       break 2; // switch + while
+                                               }
+                                               break;
+                                       case "0": // A space between two numbers is used to ease reading
+                                       case "1": // of big numbers, e.g. 1 000 000. Those spaces need
+                                       case "2": // to be unbreakable
+                                       case "3":
+                                       case "4":
+                                       case "5":
+                                       case "6":
+                                       case "7":
+                                       case "8":
+                                       case "9":
+                                               if (    ($this->mTextLength >= $this->mPos +2)
+                                                    && ($this->mText[$this->mPos+1] == " ")
+                                                    && ctype_digit( $this->mText[$this->mPos+2] ) )
+                                               {
+                                                       $queueToken["type"] = "blank";
+                                                       $queueToken["text"] = $ch . " ";
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos += 2;
+                                                       break 2; // switch + while
+                                               }
+                                               break;
+                                       case "\302": // first byte of UTF-8 Character Guillemet-left
+                                               if ( $this->continues( "\253 ") ) // second byte and a blank
+                                               {
+                                                       $queueToken["type"] = "blank";
+                                                       $queueToken["text"] = "\302\253 ";
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos += 3;
+                                                       break 2; // switch + while
+                                               }
+                                               break;
+                                       case "\273": //last byte of UTF-8 Character Guillemet-right
+                                               if ( $this->preceeded( " \302" ) )
+                                               {
+                                                       $queueToken["type"] = "blank";
+                                                       $queueToken["text"] = " \302\273";
+                                                       $token["text"] = substr( $token["text"], 0, -2 );
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos ++;
+                                                       break 2; // switch + while
+                                               }
+                                               break;
+                                       case "&": //extensions like <timeline>, since HTML stripping has already been done, 
+                                                 //those look like &lt;timeline&gt;
+                                               if ( $this->continues( "lt;timeline&gt;" ) )
+                                               {
+                                                       $queueToken["type"] = "<timeline>";
+                                                       $queueToken["text"] = "&lt;timeline&gt;";
+                                                       $this->mQueuedToken[] = $queueToken;
+                                                       $this->mPos += 16;
+                                                       break 2; // switch + while
+                                               }
+                                               break;
+
+                               } /* switch */
+                               $token["text"].=$ch;
+                               $this->mPos ++;
+                               // echo $this->mPos . "<br>\n"; 
+                       } /* while */
+               } /* if (nothing left in queue) */
+       
+               wfProfileOut( $fname );
                return $token;
        }
 
-               
+       // function continues
+       // Checks whether mText continues with $cont starting at mPos+1, i.e.
+       // directly after the character at the current position.
+       // Returns true if every byte of $cont matches, false otherwise.
+       /* private */ function continues( $cont )
+       {
+               $len = strlen( $cont );
+               // The last byte compared sits at offset mPos+$len, which must be a
+               // valid string offset (strictly below mTextLength). If it is not,
+               // $cont cannot fit into the remaining text.
+               if ( $this->mTextLength <= $this->mPos + $len )
+                       return false;
+               for ( $i=0; $i < $len; $i++ )
+               {
+                       if ( $this->mText[$this->mPos+1+$i] != $cont[$i] )
+                               return false;
+               }
+               return true;
+       }
+
+       // function preceeded
+       // checks whether the text directly before position mPos is $prec
+       /* private */ function preceeded( $prec )
+       {
+               $len = strlen( $prec );
+               // if $prec is longer than the text up to mPos, return false
+               if ( $this->mPos < $len )
+                       return false;
+               return ( 0 == strcmp( $prec, substr($this->mText, $this->mPos-$len, $len) ) );
+       }
+
+       function readAllUntil( $border )
+       {
+               $n = strpos( $this->mText, $border, $this->mPos );
+               if ( $n === false )
+                       return "";
+               $ret = substr( $this->mText, $this->mPos, $n - $this->mPos );
+               $this->mPos = $n + strlen( $border ) + 1;
+               return $ret;
+       }
+
 }