Fix notice. The isset() check consistently produces a big ugly notice about an out...
[lhc/web/wiklou.git] / includes / Tokenizer.php
1 <?php
/**
 * Splits wiki text into a stream of tokens.
 *
 * Every token is an array with two string keys:
 *   - "type": "text" for plain text, or the marker that was recognised
 *             ("RFC ", "ISBN ", "[[", "[[[", "]]", "----", or a run of two
 *             or more apostrophes such as "''" or "'''").
 *   - "text": the plain text for "text" tokens, the marker itself for
 *             "RFC " / "ISBN ", and "" for all other markers.
 *
 * Create instances with Tokenizer::newFromString().
 */
class Tokenizer
{
    // The properties are meant to be private. They stay public so that
    // existing code which touches them directly keeps working.

    /** Text to be processed by the tokenizer. */
    public $mText;

    /** Current byte position of the tokenizer within $mText. */
    public $mPos;

    /** Length of $mText in bytes. */
    public $mTextLength;

    /** Tokens that were already found, but not returned yet (FIFO). */
    public $mQueuedToken;

    /**
     * Set up an empty tokenizer. Use newFromString() to get a usable one.
     *
     * A PHP 5 style constructor is used because a method named after the
     * class is no longer treated as a constructor by PHP 8.
     */
    public function __construct()
    {
        $this->mText = '';
        $this->mTextLength = 0;
        $this->mPos = 0;
        // This is the queue that previewToken() / nextToken() work on.
        $this->mQueuedToken = array();
    }

    /**
     * Factory function: build a tokenizer for the given text.
     *
     * @param string $s Text to tokenize.
     * @return Tokenizer
     */
    public static function newFromString($s)
    {
        $t = new Tokenizer();
        $t->mText = (string)$s;
        $t->mTextLength = strlen($t->mText);
        return $t;
    }

    /**
     * Return the next token without consuming it.
     *
     * The position pointer may advance internally, but the token is kept at
     * the head of the queue, so the next call to previewToken() or
     * nextToken() returns the same token again.
     *
     * @return array|false Token array, or false when no text is left.
     */
    public function previewToken()
    {
        if (count($this->mQueuedToken) != 0) {
            // Still a token left from an earlier round. Return that one first.
            return $this->mQueuedToken[0];
        }

        $token = $this->nextToken();
        // Put it at the FRONT: nextToken() may already have queued the
        // marker that follows this token, and order must be preserved.
        array_unshift($this->mQueuedToken, $token);
        return $token;
    }

    /**
     * Return the next token and consume it.
     *
     * Proceeds byte by byte through the text, looking for characters needing
     * special attention. Those are currently: I, R, ', [, ], newline.
     * When a marker is found, the text collected so far is returned (possibly
     * with empty "text") and the marker token is queued for the next call.
     *
     * TODO: prefixed links for Arabic wikipedia not implemented yet;
     *       handling of French blanks not yet implemented.
     *
     * @return array|false Token array, or false when no text is left.
     */
    public function nextToken()
    {
        if (count($this->mQueuedToken) != 0) {
            // Still a token left from an earlier round. Return that one first.
            return array_shift($this->mQueuedToken);
        }

        // If no text is left, return false.
        if ($this->mPos > $this->mTextLength) {
            return false;
        }

        $token = array('text' => '', 'type' => 'text');

        // The loop deliberately runs once with mPos == mTextLength (reading
        // an empty character). That moves mPos past the end, so the next
        // call reports the end of input by returning false.
        while ($this->mPos <= $this->mTextLength) {
            $ch = $this->mPos < $this->mTextLength ? $this->mText[$this->mPos] : '';

            switch ($ch) {
                case 'R': // for "RFC "
                    if ($this->continues('FC ')) {
                        $this->enqueue('RFC ', 'RFC ');
                        // NOTE(review): as in the original, the position is
                        // left on the trailing blank, so that blank also
                        // starts the following text token. Confirm callers
                        // expect this.
                        $this->mPos += 3;
                        break 2; // switch + while
                    }
                    break;

                case 'I': // for "ISBN "
                    if ($this->continues('SBN ')) {
                        $this->enqueue('ISBN ', 'ISBN ');
                        // Position is left on the trailing blank (see "RFC ").
                        $this->mPos += 4;
                        break 2; // switch + while
                    }
                    break;

                case '[': // for links "[[" and "[[["
                    if ($this->continues('[[')) {
                        $this->enqueue('[[[', '');
                        $this->mPos += 3;
                        break 2; // switch + while
                    }
                    if ($this->continues('[')) {
                        $this->enqueue('[[', '');
                        $this->mPos += 2;
                        break 2; // switch + while
                    }
                    break;

                case ']': // for end of links "]]"
                    if ($this->continues(']')) {
                        $this->enqueue(']]', '');
                        $this->mPos += 2;
                        break 2; // switch + while
                    }
                    break;

                case "'": // for all kinds of em's and strong's
                    // Length of the apostrophe run starting here. A single
                    // apostrophe is ordinary text.
                    $run = strspn($this->mText, "'", $this->mPos);
                    if ($run > 1) {
                        $this->enqueue(str_repeat("'", $run), '');
                        $this->mPos += $run;
                        break 2; // switch + while
                    }
                    break;

                case "\n": // for block levels; actually, only "----" is handled.
                case "\r":
                    if ($this->continues('----')) {
                        $this->enqueue('----', '');
                        // Skip the line break and the four dashes...
                        $this->mPos += 5;
                        // ...and any further dashes of the same rule.
                        $this->mPos += strspn($this->mText, '-', $this->mPos);
                        break 2; // switch + while
                    }
                    break;
            }

            $token['text'] .= $ch;
            $this->mPos++;
        }

        return $token;
    }

    /**
     * Check whether the text right AFTER the current position is $str.
     *
     * Never reads outside the text, so it is safe near the end of input.
     *
     * @param string $str Non-empty text expected at position mPos + 1.
     * @return bool
     */
    private function continues($str)
    {
        return substr($this->mText, $this->mPos + 1, strlen($str)) === $str;
    }

    /**
     * Append a marker token to the queue of tokens still to be returned.
     *
     * @param string $type Token type.
     * @param string $text Token text.
     */
    private function enqueue($type, $text)
    {
        $this->mQueuedToken[] = array('type' => $type, 'text' => $text);
    }
}
153