handle string limits more nicely, new function 'continues'
[lhc/web/wiklou.git] / includes / Tokenizer.php
1 <?php
// Splits wiki text into a stream of tokens.
//
// A token is an array with the keys "type" and "text". Plain text is returned
// as type "text"; the markup recognised by nextToken() ("RFC ", "ISBN ",
// "[[", "[[[", "]]", runs of apostrophes and "----" after a line break) is
// returned as a token whose type is the markup itself.
// nextToken() returns false once the text is exhausted.
class Tokenizer {
	/* private */ var $mText,	# Text to be processed by the tokenizer
		$mPos,			# Current position of the tokenizer in $mText
		$mTextLength,		# Length of $mText in bytes
		$mQueuedToken;		# Tokens that were already found, but not
					# returned yet (first element is returned next).

	// Sets up an empty tokenizer. Use newFromString() to create an instance
	// that actually has text to work on.
	/* private */ function __construct()
	{
		$this->mText = "";
		$this->mTextLength = 0;
		$this->mPos = 0;
		// This must be the same property that previewToken()/nextToken()
		// read, otherwise the queue is never an array and previewed
		// tokens get lost.
		$this->mQueuedToken = array();
	}

	// Old-style constructor name, kept so that existing code calling it
	// explicitly keeps working. It only delegates to __construct().
	/* private */ function Tokenizer()
	{
		$this->__construct();
	}

	// Factory function: returns a new Tokenizer positioned at the start
	// of the string $s.
	static function newFromString( $s )
	{
		$fname = "Tokenizer::newFromString";
		wfProfileIn( $fname );

		$t = new Tokenizer();
		$t->mText = $s;
		$t->mTextLength = strlen( $s );

		wfProfileOut( $fname );
		return $t;
	}

	// Return the next token without consuming it: the next call to
	// previewToken() or nextToken() returns the same token again.
	// Internally the position is advanced, but the token is put at the
	// front of the queue, and the queue is always checked first.
	function previewToken()
	{
		$fname = "Tokenizer::previewToken";
		wfProfileIn( $fname );

		if ( count( $this->mQueuedToken ) != 0 ) {
			// Still a token left from an earlier round. Return that one first.
			$token = $this->mQueuedToken[0];
		} else {
			$token = $this->nextToken();
			// nextToken() may itself have queued a markup token; the
			// previewed token has to go in front of it.
			array_unshift( $this->mQueuedToken, $token );
		}

		wfProfileOut( $fname );
		return $token;
	}

	// Get the next token.
	// Proceeds character by character through the text, looking for characters
	// needing special attention. Those are currently: I, R, ', [, ], newline.
	// The plain text collected before such a character is returned as a "text"
	// token (possibly with empty text); the markup token is queued and is
	// returned by the following call.
	// Returns false when no text is left.
	//
	// TODO: prefixed links for Arabic wikipedia not implemented yet
	//       handling of French blanks not yet implemented
	function nextToken()
	{
		$fname = "Tokenizer::nextToken";
		wfProfileIn( $fname );

		if ( count( $this->mQueuedToken ) != 0 ) {
			// Still a token left from an earlier round. Return that one first.
			$token = array_shift( $this->mQueuedToken );
		} else if ( $this->mPos > $this->mTextLength ) {
			// No text is left.
			$token = false;
		} else {
			$token = array( "text" => "", "type" => "text" );

			while ( $this->mPos <= $this->mTextLength ) {
				if ( $this->mPos == $this->mTextLength ) {
					// End of text reached. Step past it so that the
					// next call returns false, without reading a
					// character beyond the end of the string.
					$this->mPos++;
					break;
				}

				$ch = $this->mText[$this->mPos];
				switch ( $ch ) {
					case 'R': // for "RFC "
						if ( $this->continues( "FC " ) ) {
							// Advancing by 3 leaves the position on the
							// blank, which becomes part of the next text.
							$this->queueToken( "RFC ", "RFC ", 3 );
							break 2; // switch + while
						}
						break;
					case 'I': // for "ISBN "
						if ( $this->continues( "SBN " ) ) {
							// As for "RFC ", the blank stays in the text.
							$this->queueToken( "ISBN ", "ISBN ", 4 );
							break 2; // switch + while
						}
						break;
					case "[": // for links "[[" (and "[[[")
						if ( $this->continues( "[[" ) ) {
							$this->queueToken( "[[[", "", 3 );
							break 2; // switch + while
						} else if ( $this->continues( "[" ) ) {
							$this->queueToken( "[[", "", 2 );
							break 2; // switch + while
						}
						break;
					case "]": // for end of links "]]"
						if ( $this->continues( "]" ) ) {
							$this->queueToken( "]]", "", 2 );
							break 2; // switch + while
						}
						break;
					case "'": // for all kinds of em's and strong's
						if ( $this->continues( "'" ) ) {
							// The token type is the whole run of
							// apostrophes, however long it is.
							$type = "'";
							while ( ( $this->mPos + 1 < $this->mTextLength )
								&& $this->mText[$this->mPos + 1] === "'" )
							{
								$type .= "'";
								$this->mPos++;
							}
							// Step over the last apostrophe of the run.
							$this->queueToken( $type, "", 1 );
							break 2; // switch + while
						}
						break;
					case "\n": // for block levels, actually, only "----" is handled.
					case "\r":
						if ( $this->continues( "----" ) ) {
							// Line break plus four dashes ...
							$this->queueToken( "----", "", 5 );
							// ... and any further dashes belong to the rule, too.
							while ( $this->mPos < $this->mTextLength
								&& $this->mText[$this->mPos] === "-" )
							{
								$this->mPos++;
							}
							break 2; // switch + while
						}
						break;
				} /* switch */
				$token["text"] .= $ch;
				$this->mPos++;
			} /* while */
		} /* if (nothing left in queue) */

		wfProfileOut( $fname );
		return $token;
	}

	// Appends a token of the given type and text to the queue and moves
	// the position forward by $advance characters.
	/* private */ function queueToken( $type, $text, $advance )
	{
		$this->mQueuedToken[] = array( "type" => $type, "text" => $text );
		$this->mPos += $advance;
	}

	// Checks whether mText continues with $cont, starting at the character
	// after the current position (mPos+1).
	// Returns false if the text is too short to contain $cont there.
	function continues( $cont )
	{
		$len = strlen( $cont );
		if ( $len == 0 ) {
			// Nothing to compare: matches as long as the position is valid.
			return $this->mPos <= $this->mTextLength;
		}
		// The last character compared is at mPos + $len, which has to be
		// a valid offset, i.e. smaller than the text length.
		if ( $this->mPos + $len >= $this->mTextLength ) {
			return false;
		}
		return substr( $this->mText, $this->mPos + 1, $len ) === $cont;
	}

}
175