Allow user to always preview text on editing ( implement http://bugzilla.wikipedia...
[lhc/web/wiklou.git] / includes / Tokenizer.php
1 <?php
2 /**
3 *
4 * @package MediaWiki
5 */
6
/**
 * Splits a text into a stream of tokens.
 *
 * The text is scanned byte by byte. Runs of ordinary text are returned as
 * tokens of type 'text'; byte sequences that need special attention (link
 * brackets, runs of apostrophes, "RFC " / "ISBN " markers, "----" rules,
 * headings, blanks that must not break, the timeline tag) are returned as
 * tokens of their own type.
 *
 * Every token is an array with at least the keys 'type' and 'text'.
 *
 * @package MediaWiki
 */
class Tokenizer {
	/* private */ var $mText,        # Text to be processed by the tokenizer
		$mPos,                # Current position of tokenizer in text (byte offset)
		$mTextLength,         # Length of $mText in bytes
		$mQueuedToken,        # Tokens that were already found, but not returned yet
		$linkPrefixExtension; # Result of $wgLang->linkPrefixExtension(); when true,
		                      # non-space characters directly in front of "[[" are
		                      # moved from the text token into the link token

	/**
	 * Constructor
	 *
	 * Starts with an empty text and an empty token queue; use
	 * newFromString() to obtain a tokenizer for a given text.
	 *
	 * @access private
	 */
	function __construct() {
		global $wgLang;

		$this->mText = '';
		$this->mTextLength = 0;
		$this->mPos = 0;
		// The queue must be an array from the start: previewToken() hands it
		// to array_unshift(), and count() is applied to it on every call.
		$this->mQueuedToken = array();
		$this->linkPrefixExtension = $wgLang->linkPrefixExtension();
	}

	/**
	 * Old-style (class name) constructor, kept so that engines which only
	 * know this form still initialise the object, and so that existing
	 * explicit calls keep working.
	 *
	 * @access private
	 */
	function Tokenizer() {
		$this->__construct();
	}

	/**
	 * Factory function: build a tokenizer for the given text.
	 *
	 * Does not use $this.
	 *
	 * @param string $s Text to tokenize
	 * @return Tokenizer Tokenizer positioned at the start of $s
	 */
	function newFromString( $s ) {
		$fname = 'Tokenizer::newFromString';
		wfProfileIn( $fname );

		$t = new Tokenizer();
		$t->mText = $s;
		$t->mTextLength = strlen( $s );

		wfProfileOut( $fname );
		return $t;
	}


	/**
	 * Return the next token without consuming it. The next call to
	 * previewToken() or nextToken() returns the same token again.
	 *
	 * Internally the position does advance: the token is fetched with
	 * nextToken() and then put at the front of the queue, where the next
	 * call finds it.
	 *
	 * @return mixed Token array, or false if no text is left
	 */
	function previewToken() {
		$fname = 'Tokenizer::previewToken';
		wfProfileIn( $fname );

		if ( count( $this->mQueuedToken ) != 0 ) {
			// A token is already waiting; show it but leave it queued.
			$token = $this->mQueuedToken[0];
		} else {
			$token = $this->nextToken();
			// nextToken() may itself have queued a follow-up token, so the
			// token just fetched has to go in front of it.
			array_unshift( $this->mQueuedToken, $token );
		}

		wfProfileOut( $fname );
		return $token;
	}


	/**
	 * Get the next token.
	 *
	 * Returns a queued token if there is one. Otherwise proceeds byte by
	 * byte through the text, collecting a 'text' token until a sequence
	 * needing special attention is found. That sequence is put into the
	 * queue as its own token (to be returned by the following call) and the
	 * text collected so far, possibly empty, is returned.
	 *
	 * Token types produced: 'text', 'RFC ', 'ISBN ', '[[[', '[[', ']]',
	 * a run of two or more apostrophes (the type is the run itself),
	 * '----', 'h', 'blank' and '<timeline>'.
	 *
	 * @return mixed Token array with keys 'type' and 'text' (apostrophe
	 *               tokens also carry 'pos'), or false if no text is left
	 */
	function nextToken() {
		$fname = 'Tokenizer::nextToken';
		wfProfileIn( $fname );

		if ( count( $this->mQueuedToken ) != 0 ) {
			// still one token from the last round around. Return that one first.
			$token = array_shift( $this->mQueuedToken );
		} else if ( $this->mPos >= $this->mTextLength ) {
			// If no text is left, return 'false'.
			$token = false;
		} else {

			$token['text'] = '';
			$token['type'] = 'text';

			// Every "break 2" below leaves both the switch and this loop with
			// mPos already moved past the bytes the queued token accounts for.
			// A plain "break" falls through to the bottom of the loop, where
			// the current byte is appended to the text token.
			while ( $this->mPos < $this->mTextLength ) {
				$ch = $this->mText[$this->mPos];
				switch ( $ch ) {
					case 'R': // for "RFC "
						if ( $this->continues( 'FC ' ) ) {
							$queueToken['type'] = $queueToken['text'] = 'RFC ';
							$this->mQueuedToken[] = $queueToken;
							// Only "RFC" is skipped: the blank stays in the
							// stream and begins the following text token.
							$this->mPos += 3;
							break 2; // switch + while
						}
						break;
					case 'I': // for "ISBN "
						if ( $this->continues( 'SBN ' ) ) {
							$queueToken['type'] = $queueToken['text'] = 'ISBN ';
							$this->mQueuedToken[] = $queueToken;
							// Only "ISBN" is skipped: the blank stays in the
							// stream and begins the following text token.
							$this->mPos += 4;
							break 2; // switch + while
						}
						break;
					case '[': // for links "[["
						if ( $this->continues( '[[' ) ) {
							$queueToken['type'] = '[[[';
							$queueToken['text'] = '';
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 3;
							break 2; // switch + while
						} else if ( $this->continues( '[' ) ) {
							$queueToken['type'] = '[[';
							$queueToken['text'] = '';
							// Check for a "prefixed link", e.g. Al[[Khazar]]
							// Mostly for arabic wikipedia
							if ( $this->linkPrefixExtension ) {
								while ( ( $len = strlen( $token['text'] ) ) > 0
									&& !ctype_space( $token['text'][$len-1] ) )
								{
									// prepend the character to the link's open tag
									$queueToken['text'] = $token['text'][$len-1] . $queueToken['text'];
									// remove character from the end of the text token
									$token['text'] = substr( $token['text'], 0, -1 );
								}
							}
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 2;
							break 2; // switch + while
						}
						break;
					case ']': // for end of links "]]"
						if ( $this->continues( ']' ) ) {
							$queueToken['type'] = ']]';
							$queueToken['text'] = '';
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 2;
							break 2; // switch + while
						}
						break;
					case "'": // for all kind of em's and strong's
						if ( $this->continues( "'" ) ) {
							// The token type is the whole run of apostrophes.
							$queueToken['type'] = "'";
							$queueToken['text'] = '';
							while ( ( $this->mPos + 1 < $this->mTextLength )
								&& $this->mText[$this->mPos+1] == "'" )
							{
								$queueToken['type'] .= "'";
								$queueToken['pos'] = $this->mPos;
								$this->mPos ++;
							}

							$this->mQueuedToken[] = $queueToken;
							$this->mPos ++;
							break 2; // switch + while
						}
						break;
					case "\n": // for block levels, actually, only "----" is handled.
					case "\r": // headings are detected to close any unbalanced em or strong tags in a section
						if ( $this->continues( '----' ) )
						{
							$queueToken['type'] = '----';
							$queueToken['text'] = '';
							$this->mQueuedToken[] = $queueToken;
							// line break plus the four dashes ...
							$this->mPos += 5;
							// ... plus any further dashes
							while ( $this->mPos < $this->mTextLength
								&& $this->mText[$this->mPos] == '-' )
							{
								$this->mPos ++;
							}
							break 2; // switch + while
						} else if (
							$this->continues( '<h' ) && (
								$this->continues( '<h1' ) ||
								$this->continues( '<h2' ) ||
								$this->continues( '<h3' ) ||
								$this->continues( '<h4' ) ||
								$this->continues( '<h5' ) ||
								$this->continues( '<h6' )
							)
						) { // heading
							$queueToken['type'] = 'h';
							$queueToken['text'] = '';
							$this->mQueuedToken[] = $queueToken;
							// Only the line break is skipped; the tag itself
							// stays in the stream as ordinary text.
							$this->mPos ++;
							break 2; // switch + while
						}
						break;
					case '!': // French spacing rules have a space before exclamation
					case '?': // and question marks. Those have to become &nbsp;
					case ':': // And colons, Hashar says ...
						if ( $this->preceeded( ' ' ) )
						{
							// strip blank from Token
							$token['text'] = substr( $token['text'], 0, -1 );
							$queueToken['type'] = 'blank';
							$queueToken['text'] = ' ' . $ch;
							$this->mQueuedToken[] = $queueToken;
							$this->mPos ++;
							break 2; // switch + while
						}
						break;
					case '0': // A space between two numbers is used to ease reading
					case '1': // of big numbers, e.g. 1 000 000. Those spaces need
					case '2': // to be unbreakable
					case '3':
					case '4':
					case '5':
					case '6':
					case '7':
					case '8':
					case '9':
						// mPos + 2 must be a valid offset before it is read
						if ( ( $this->mTextLength > $this->mPos + 2 )
							&& ( $this->mText[$this->mPos+1] == ' ' )
							&& ctype_digit( $this->mText[$this->mPos+2] ) )
						{
							$queueToken['type'] = 'blank';
							$queueToken['text'] = $ch . ' ';
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 2;
							break 2; // switch + while
						}
						break;
					case "\302": // first byte of UTF-8 Character Guillemet-left
						if ( $this->continues( "\253 " ) ) // second byte and a blank
						{
							$queueToken['type'] = 'blank';
							$queueToken['text'] = "\302\253 ";
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 3;
							break 2; // switch + while
						}
						break;
					case "\273": // last byte of UTF-8 Character Guillemet-right
						if ( $this->preceeded( " \302" ) )
						{
							$queueToken['type'] = 'blank';
							$queueToken['text'] = " \302\273";
							// strip the blank and the first byte from the text token
							$token['text'] = substr( $token['text'], 0, -2 );
							$this->mQueuedToken[] = $queueToken;
							$this->mPos ++;
							break 2; // switch + while
						}
						break;
					case '&': // extensions like <timeline>, since HTML stripping has already been done,
					          // those look like &lt;timeline&gt;
						if ( $this->continues( 'lt;timeline&gt;' ) )
						{
							$queueToken['type'] = '<timeline>';
							$queueToken['text'] = '&lt;timeline&gt;';
							$this->mQueuedToken[] = $queueToken;
							$this->mPos += 16;
							break 2; // switch + while
						}
						break;

				} /* switch */
				// Nothing special at this position: the byte is ordinary text.
				$token['text'] .= $ch;
				$this->mPos ++;
			} /* while */
		} /* if (nothing left in queue) */

		wfProfileOut( $fname );
		return $token;
	}

	/**
	 * function continues
	 *
	 * checks whether the mText continues with $cont from mPos+1
	 *
	 * @param string $cont Bytes expected directly after the current position
	 * @return bool
	 * @access private
	 */
	function continues( $cont ) {
		$len = strlen( $cont );
		// The last byte to compare sits at offset mPos + $len. If that is
		// not inside the text, the text is too short to contain $cont.
		if ( $this->mPos + $len >= $this->mTextLength )
			return false;
		return ( substr( $this->mText, $this->mPos + 1, $len ) === $cont );
	}

	/**
	 * function preceeded
	 *
	 * checks whether the mText is preceeded by $prec at position mPos
	 *
	 * @param string $prec Bytes expected directly before the current position
	 * @return bool
	 * @access private
	 */
	function preceeded( $prec ) {
		$len = strlen( $prec );
		// if $prec is longer than the text up to mPos, return false
		if ( $this->mPos < $len )
			return false;
		return ( 0 == strcmp( $prec, substr( $this->mText, $this->mPos - $len, $len ) ) );
	}

	/**
	 * Return everything from the current position up to the next occurrence
	 * of $border and move the position behind it.
	 *
	 * If $border does not occur, the position is left untouched.
	 *
	 * @param string $border Terminating byte sequence
	 * @return string Text in front of $border, or '' if $border was not found
	 */
	function readAllUntil( $border ) {
		// strpos() rejects an empty needle or an offset behind the end of
		// the text; neither can produce a match.
		if ( $border == '' || $this->mPos > $this->mTextLength )
			return '';
		$n = strpos( $this->mText, $border, $this->mPos );
		if ( $n === false )
			return '';
		$ret = substr( $this->mText, $this->mPos, $n - $this->mPos );
		// NOTE(review): the "+ 1" also skips the byte that follows $border;
		// kept as is because callers may rely on it - confirm intent.
		$this->mPos = $n + strlen( $border ) + 1;
		return $ret;
	}

}