# Tokenizer state. The enclosing class header is not visible in this excerpt.
3 /* private */ var $mText, # Text to be processed by the tokenizer
4 $mPos, # current position of tokenizer in text (used as an index into $mText)
5 $mTextLength, # Length of $mText, as reported by strlen() in newFromString
6 $mQueuedToken; # Tokens that were already found, but not yet returned
// Constructor. Only the queue initialisation is visible in this excerpt.
9 /* private */ function Tokenizer()
// NOTE(review): this assigns to mTokenQueue, while the declared property and
// every other method visible here use mQueuedToken. Looks like a name
// mismatch that leaves mQueuedToken uninitialised -- confirm.
12 $this->mTokenQueue
=array();
// Prepares a tokenizer object ($t) for the string $s.
// The lines that create and return $t are not visible in this excerpt;
// presumably $t is a new Tokenizer that is handed back to the caller -- verify.
16 function newFromString( $s )
18 $fname = "Tokenizer::newFromString";
// wfProfileIn/wfProfileOut are defined elsewhere; presumably they mark the
// start and end of a profiled section named by $fname.
19 wfProfileIn( $fname );
// Cache the length of the input so the scanning code can compare mPos against it.
23 $t->mTextLength
= strlen( $s );
25 wfProfileOut( $fname );
30 // Return the next token, but do not increase the pointer. The next call
31 // to previewToken or nextToken will return the same token again.
32 // Actually, the pointer is increased, but the token is queued. The next
33 // call to previewToken or nextToken will check the queue and return the queued token.
// The return statement is not visible in this excerpt.
35 function previewToken()
37 $fname = "Tokenizer::previewToken";
38 wfProfileIn( $fname );
40 if ( count( $this->mQueuedToken
) != 0 ) {
41 // still one token from the last round around. Return that one first.
// Read the head of the queue without removing it.
42 $token = $this->mQueuedToken
[0];
// Presumably the else branch (its opening line is not visible here):
// nothing is queued, so fetch the next token and put it at the front of
// the queue, where the following previewToken/nextToken call will find it.
44 $token = $this->nextToken();
45 array_unshift( $this->mQueuedToken
, $token );
48 wfProfileOut( $fname );
// nextToken -- the `function` line itself is not visible in this excerpt; the
// name is taken from $fname below and from the call made in previewToken.
54 // proceeds character by character through the text, looking for characters needing
55 // special attention. Those are currently: I, R, ', [, ], newline
57 // TODO: prefixed links for Arabic wikipedia not implemented yet
58 // handling of French blanks not yet implemented
61 $fname = "Tokenizer::nextToken";
62 wfProfileIn( $fname );
64 if ( count( $this->mQueuedToken
) != 0 ) {
65 // still one token from the last round around. Return that one first.
// array_shift removes that token from the front of the queue.
66 $token = array_shift( $this->mQueuedToken
);
67 } else if ( $this->mPos
> $this->mTextLength
)
68 { // If no text is left, return "false".
// Otherwise start a token of type "text" and scan forward from mPos.
73 $token["type"]="text";
// NOTE(review): the bound is <=, so the loop also runs with
// mPos == mTextLength, one index past the last character of mText; the @
// on the switch expression suppresses the error from that read.
// Presumably intentional -- confirm.
75 while ( $this->mPos
<= $this->mTextLength
) {
76 switch ( @$ch = $this->mText
[$this->mPos
] ) {
// Each case below, when its lookahead matches, appends a marker token to
// mQueuedToken and leaves both the switch and the while loop via `break 2`.
// The statements that advance mPos and collect ordinary text are not
// visible in this excerpt.
77 case 'R': // for "RFC "
78 if ( $this->continues("FC ") ) {
// Type and text are both set to the literal "RFC ".
79 $queueToken["type"] = $queueToken["text"] = "RFC ";
80 $this->mQueuedToken
[] = $queueToken;
82 break 2; // switch + while
85 case 'I': // for "ISBN "
86 if ( $this->continues("SBN ") ) {
// Type and text are both set to the literal "ISBN ".
87 $queueToken["type"] = $queueToken["text"] = "ISBN ";
88 $this->mQueuedToken
[] = $queueToken;
90 break 2; // switch + while
93 case "[": // for links "[["
// Three brackets are tested before two, so "[[[" takes precedence over "[[".
94 if ( $this->continues("[[") ) {
95 $queueToken["type"] = "[[[";
96 $queueToken["text"] = "";
97 $this->mQueuedToken
[] = $queueToken;
99 break 2; // switch + while
100 } else if ( $this->continues("[") ) {
101 $queueToken["type"] = "[[";
102 $queueToken["text"] = "";
103 $this->mQueuedToken
[] = $queueToken;
105 break 2; // switch + while
108 case "]": // for end of links "]]"
109 if ( $this->continues("]") ) {
110 $queueToken["type"] = "]]";
111 $queueToken["text"] = "";
112 $this->mQueuedToken
[] = $queueToken;
114 break 2; // switch + while
117 case "'": // for all kind of em's and strong's
118 if ( $this->continues("'") ) {
// The type starts as a single apostrophe and gains one more on each
// iteration of the loop below, which runs while the following character is
// also an apostrophe. The line advancing mPos inside that loop is not
// visible in this excerpt.
119 $queueToken["type"] = "'";
120 $queueToken["text"] = "";
121 while( ($this->mPos+
1 < $this->mTextLength
)
122 && $this->mText
[$this->mPos+
1] == "'" )
124 $queueToken["type"] .= "'";
128 $this->mQueuedToken
[] = $queueToken;
130 break 2; // switch + while
133 case "\n": // for block levels, actually, only "----" is handled.
135 if ( $this->continues( "----" ) )
137 $queueToken["type"] = "----";
138 $queueToken["text"] = "";
139 $this->mQueuedToken
[] = $queueToken;
// Loops while the character at mPos is "-"; the loop body is not visible
// in this excerpt (presumably it steps over additional dashes -- verify).
141 while ( $this->mPos
<$this->mTextLength
142 and $this->mText
[$this->mPos
] == "-" )
151 // echo $this->mPos . "<br>\n";
153 } /* if (nothing left in queue) */
155 wfProfileOut( $fname );
159 // function continues
160 // checks whether the mText continues with $cont from mPos+1
// $cont is the string expected immediately after the current character.
// The return statements are not visible in this excerpt.
161 function continues( $cont )
163 // If string is not long enough to contain $cont, return false
// NOTE(review): the loop below reads indices mPos+1 .. mPos+strlen($cont).
// This guard only rejects when mTextLength < mPos+strlen($cont), so when the
// two are equal the last comparison reads index mTextLength, one past the
// end of mText. Possible off-by-one (<= may be intended) -- confirm.
164 if ( $this->mTextLength
< $this->mPos +
strlen( $cont ) )
// Compare $cont character by character against mText, starting at mPos+1.
166 for ( $i=0; $i < strlen( $cont ); $i++
)
168 if ( $this->mText
[$this->mPos+
1+
$i] != $cont[$i] )