<?php
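/**
 * Text tokenizer.
 *
 * @package MediaWiki
 */

/**
 * Splits a text into a sequence of tokens: runs of plain text and markers
 * for the markup constructs recognised by nextToken().
 *
 * @package MediaWiki
 */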
class Tokenizer {
	/* private */ var $mText,   # Text to be processed by the tokenizer
		$mTextLength,           # Length of $mText in bytes, as returned by strlen()
		$mPos,                  # Current position of the tokenizer in the text
		$mQueuedToken,          # Tokens that were already found, but not returned yet
		$linkPrefixExtension;   # Result of $wgLang->linkPrefixExtension(); enables "prefixed link" handling

	/**
	 * Constructor
	 *
	 * Resets the position and the token queue and caches the language's
	 * link prefix setting. The text itself is attached by newFromString().
	 *
	 * @access private
	 */
	function __construct() {
		global $wgLang;

		$this->mPos = 0;
		// nextToken()/previewToken() read mQueuedToken, so that is the
		// property that has to start out as an empty array.
		$this->mQueuedToken = array();
		$this->linkPrefixExtension = $wgLang->linkPrefixExtension();
	}

	/**
	 * Old-style constructor, kept so that engines which only know
	 * class-named constructors still initialise the object.
	 *
	 * @access private
	 */
	function Tokenizer() {
		$this->__construct();
	}

	/**
	 * Factory function: builds a tokenizer for the given text.
	 *
	 * @param string $s text to be tokenized
	 * @return Tokenizer tokenizer positioned at the start of $s
	 */
	function newFromString( $s ) {
		$fname = 'Tokenizer::newFromString';
		wfProfileIn( $fname );

		$t = new Tokenizer();
		$t->mText = $s;
		$t->mTextLength = strlen( $s );

		wfProfileOut( $fname );
		return $t;
	}

	/**
	 * Return the next token without consuming it. The next call to
	 * previewToken or nextToken will return the same token again.
	 *
	 * Internally the text pointer does advance, but the token is pushed
	 * back onto the front of the queue, which both methods check first.
	 *
	 * @return mixed token array ('text', 'type'), or false if no text is left
	 */
	function previewToken() {
		$fname = 'Tokenizer::previewToken';
		wfProfileIn( $fname );

		if ( count( $this->mQueuedToken ) != 0 ) {
			// still one token from the last round around. Return that one first.
			$token = $this->mQueuedToken[0];
		} else {
			// Nothing queued: read a token and put it back at the head of the
			// queue so that the following call sees it again.
			$token = $this->nextToken();
			array_unshift( $this->mQueuedToken, $token );
		}

		wfProfileOut( $fname );
		return $token;
	}

	/**
	 * Get the next token.
	 *
	 * Proceeds byte by byte through the text, collecting plain text until a
	 * character sequence needing special attention is found. The special
	 * token is queued and the text collected so far is returned first (it
	 * may be empty); the queued token is returned by the following call.
	 *
	 * Sequences handled: "RFC ", "ISBN ", "[[[", "[[", "]]", runs of two or
	 * more apostrophes, a line break followed by "----" or "<h1".."<h6",
	 * French spacing (blank before ! ? : and around guillemets), a blank
	 * between digits, and the escaped timeline extension tag.
	 *
	 * @return mixed token array with keys 'text' and 'type' (apostrophe
	 *         tokens also carry 'pos'), or false if no text is left
	 */
	function nextToken() {
		$fname = 'Tokenizer::nextToken';
		wfProfileIn( $fname );

		if ( count( $this->mQueuedToken ) != 0 ) {
			// still one token from the last round around. Return that one first.
			$token = array_shift( $this->mQueuedToken );
		} else if ( $this->mPos > $this->mTextLength ) {
			// If no text is left, return 'false'.
			$token = false;
		} else {
			$token = array( 'text' => '', 'type' => 'text' );

			// The loop deliberately runs one step past the last byte: that
			// step appends nothing and leaves mPos > mTextLength, which is
			// the "no text left" state tested above.
			while ( $this->mPos <= $this->mTextLength ) {
				$ch = ( $this->mPos < $this->mTextLength ) ? $this->mText[$this->mPos] : '';

				switch ( $ch ) {
					case 'R': // for "RFC "
						if ( $this->continues( 'FC ' ) ) {
							$queueToken['type'] = $queueToken['text'] = 'RFC ';
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 3;
							break 2; // switch + while
						}
						break;

					case 'I': // for "ISBN "
						if ( $this->continues( 'SBN ' ) ) {
							$queueToken['type'] = $queueToken['text'] = 'ISBN ';
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 4;
							break 2; // switch + while
						}
						break;

					case '[': // for links "[["
						if ( $this->continues( '[[' ) ) {
							$queueToken['type'] = '[[[';
							$queueToken['text'] = '';
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 3;
							break 2; // switch + while
						} else if ( $this->continues( '[' ) ) {
							$queueToken['type'] = '[[';
							$queueToken['text'] = '';
							// Check for a "prefixed link", e.g. Al[[Khazar]]
							// Mostly for arabic wikipedia
							if ( $this->linkPrefixExtension ) {
								while ( ( $len = strlen( $token['text'] ) ) > 0
									&& !ctype_space( $token['text'][$len-1] ) )
								{
									// prepend the character to the link's open tag
									$queueToken['text'] = $token['text'][$len-1] . $queueToken['text'];
									// remove character from the end of the text token
									$token['text'] = substr( $token['text'], 0, -1 );
								}
							}
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 2;
							break 2; // switch + while
						}
						break;

					case ']': // for end of links "]]"
						if ( $this->continues( ']' ) ) {
							$queueToken['type'] = ']]';
							$queueToken['text'] = '';
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 2;
							break 2; // switch + while
						}
						break;

					case "'": // for all kind of em's and strong's
						if ( $this->continues( "'" ) ) {
							$queueToken['type'] = "'";
							$queueToken['text'] = '';
							// Swallow the whole run; the token type is the run itself.
							while ( ( $this->mPos + 1 < $this->mTextLength )
								&& $this->mText[$this->mPos+1] == "'" )
							{
								$queueToken['type'] .= "'";
								$queueToken['pos'] = $this->mPos;
								$this->mPos ++;
							}
							// Queue the run and step over its last apostrophe;
							// without this the markup would be dropped.
							$this->mQueuedToken[] = $queueToken;
							$this->mPos ++;
							break 2; // switch + while
						}
						break;

					case "\n": // for block levels, actually, only "----" is handled.
					case "\r": // headings are detected to close any unbalanced em or strong tags in a section
						if ( $this->continues( '----' ) ) {
							$queueToken['type'] = '----';
							$queueToken['text'] = '';
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 5;
							// any further dashes belong to the same rule
							while ( $this->mPos < $this->mTextLength
								&& $this->mText[$this->mPos] == '-' )
							{
								$this->mPos ++;
							}
							break 2; // switch + while
						} else if (
							$this->continues( '<h' ) && (
								$this->continues( '<h1' ) ||
								$this->continues( '<h2' ) ||
								$this->continues( '<h3' ) ||
								$this->continues( '<h4' ) ||
								$this->continues( '<h5' ) ||
								$this->continues( '<h6' )
							)
						) { // heading
							$queueToken['type'] = 'h';
							$queueToken['text'] = '';
							$this->mQueuedToken[] = $queueToken;
							// only the line break is consumed; the tag itself
							// is read as ordinary text afterwards
							$this->mPos ++;
							break 2; // switch + while
						}
						break;

					case '!': // French spacing rules have a space before exclamation
					case '?': // and question marks.
					case ':': // And colons, Hashar says ...
						if ( $this->preceeded( ' ' ) ) {
							// strip blank from Token
							$token['text'] = substr( $token['text'], 0, -1 );
							$queueToken['type'] = 'blank';
							$queueToken['text'] = ' ' . $ch;
							$this->mQueuedToken[] = $queueToken;
							$this->mPos ++;
							break 2; // switch + while
						}
						break;

					case '0': // A space between two numbers is used to ease reading
					case '1': // of big numbers, e.g. 1 000 000. Those spaces need
					case '2': // to be unbreakable
					case '3':
					case '4':
					case '5':
					case '6':
					case '7':
					case '8':
					case '9':
						// mPos+2 must be a valid index before it is inspected
						if ( ( $this->mTextLength > $this->mPos + 2 )
							&& ( $this->mText[$this->mPos+1] == ' ' )
							&& ctype_digit( $this->mText[$this->mPos+2] ) )
						{
							$queueToken['type'] = 'blank';
							$queueToken['text'] = $ch . ' ';
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 2;
							break 2; // switch + while
						}
						break;

					case "\302": // first byte of UTF-8 Character Guillemet-left
						if ( $this->continues( "\253 " ) ) { // second byte and a blank
							$queueToken['type'] = 'blank';
							$queueToken['text'] = "\302\253 ";
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 3;
							break 2; // switch + while
						}
						break;

					case "\273": // last byte of UTF-8 Character Guillemet-right
						if ( $this->preceeded( " \302" ) ) {
							$queueToken['type'] = 'blank';
							$queueToken['text'] = " \302\273";
							// the blank and the first byte were already collected as text
							$token['text'] = substr( $token['text'], 0, -2 );
							$this->mQueuedToken[] = $queueToken;
							$this->mPos ++;
							break 2; // switch + while
						}
						break;

					case '&': // extensions like <timeline>; since HTML stripping has already
					          // been done, those look like &lt;timeline&gt;
						// The escaped tag is 16 bytes long, which is exactly the
						// distance mPos is advanced by below.
						if ( $this->continues( 'lt;timeline&gt;' ) ) {
							// NOTE(review): type/text are kept as found; confirm
							// whether 'text' should carry the escaped form.
							$queueToken['type'] = '<timeline>';
							$queueToken['text'] = '<timeline>';
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 16;
							break 2; // switch + while
						}
						break;
				} /* switch */

				$token['text'] .= $ch;
				$this->mPos ++;
			} /* while */
		} /* if (nothing left in queue) */

		wfProfileOut( $fname );
		return $token;
	}

	/**
	 * Checks whether mText continues with $cont from mPos+1, i.e. directly
	 * after the current character.
	 *
	 * @param string $cont expected continuation
	 * @return bool
	 * @access private
	 */
	function continues( $cont ) {
		$len = strlen( $cont );
		// The last byte compared sits at index mPos+$len, so that index has
		// to be inside the text; otherwise the text is too short.
		if ( $this->mTextLength <= $this->mPos + $len )
			return false;
		return ( 0 == strcmp( $cont, substr( $this->mText, $this->mPos + 1, $len ) ) );
	}

	/**
	 * Checks whether the character at mPos is directly preceeded by $prec.
	 *
	 * @param string $prec expected preceding bytes
	 * @return bool
	 * @access private
	 */
	function preceeded( $prec ) {
		$len = strlen( $prec );
		// if $prec is longer than the text up to mPos, return false
		if ( $this->mPos < $len )
			return false;
		return ( 0 == strcmp( $prec, substr( $this->mText, $this->mPos - $len, $len ) ) );
	}

	/**
	 * Returns the text between the current position and the next occurrence
	 * of $border and moves the position behind that occurrence.
	 *
	 * If $border does not occur, '' is returned and the position is left
	 * untouched.
	 *
	 * @param string $border delimiter to read up to
	 * @return string text before the delimiter
	 */
	function readAllUntil( $border ) {
		// Once the text is exhausted mPos lies beyond the string, which
		// strpos() does not accept as an offset.
		if ( $this->mPos > $this->mTextLength )
			return '';
		$n = strpos( $this->mText, $border, $this->mPos );
		if ( $n === false )
			return '';
		$ret = substr( $this->mText, $this->mPos, $n - $this->mPos );
		// NOTE(review): the extra +1 skips one byte after the delimiter;
		// kept as found - confirm against the caller.
		$this->mPos = $n + strlen( $border ) + 1;
		return $ret;
	}

}