fixed bugs added recently
[lhc/web/wiklou.git] / includes / Tokenizer.php
1 <?php
# Splits a text into a flat sequence of tokens: markup tokens found by a
# regular expression (quote runs, horizontal rules, link brackets, the magic
# words "ISBN " and "RFC ", plus whatever $wgLang->tokenizerRegex() adds) and
# "text" tokens for everything in between.
#
# All markup positions are computed once in preParse(); nextToken() and
# previewToken() then walk over that match list.
#
# Every token is an array with these keys:
#   "type"      - "text", or the matched markup ("----" for any rule, "[[" for
#                 a prefixed link)
#   "text"      - only for "text" tokens (the text itself) and prefixed links
#                 (the prefix)
#   "pos"       - byte offset in the text at which the token starts
#   "mPos", "mMatchPos" - internal cursor values nextToken() switches to
class Tokenizer {
	# The defaults are given here rather than only in the constructor so that
	# the object is usable even when the old-style constructor below is not
	# run (PHP 8 no longer treats a method named after the class as one).
	/* private */ var $mText = "",	# Text to be processed by the tokenizer
		$mPos = 0,		# current (byte) position of tokenizer in text
		$mTextLength = 0,	# Length of $mText
		$mCount = 0,		# number of regex matches, computed in preParse
		$mMatch = array(),	# matches of tokenizer regex, computed in preParse
		$mMatchPos = 0;		# index of the next unconsumed match. Each match can
					# be up to two tokens: the text before it and the
					# matched token itself.

	# Old-style constructor; resets the read position.
	/* private */ function Tokenizer()
	{
		$this->mPos = 0;
	}

	# Factory function: builds a tokenizer for the string $s and runs the
	# regex over it, so the returned object is ready for nextToken().
	#
	# NOTE(review): this method does not use $this and is presumably meant to
	# be called as Tokenizer::newFromString(). It is left without the "static"
	# keyword to keep the file PHP 4 compatible; such a call needs "static"
	# on PHP 8 - confirm the supported PHP versions before changing it.
	function newFromString( $s )
	{
		$t = new Tokenizer();
		$t->mText = $s;
		$t->mTextLength = strlen( $s );
		$t->preParse();
		return $t;
	}

	# Builds the tokenizer regex and applies it to $this->mText once, filling
	# $this->mMatch / $this->mCount and rewinding $this->mMatchPos.
	function preParse()
	{
		global $wgLang;

		# build up the regex, step by step.
		# Basic features: Quotes for <em>/<strong> and hyphens for <hr>
		$regex = "\'\'\'\'\'|\'\'\'|\'\'|\n-----*";
		# Append regex for linkPrefixExtension. This alternative comes first
		# so that a run of letters directly in front of "[[" is captured as
		# the link prefix (sub-pattern 2 of the finished regex).
		if ( $wgLang->linkPrefixExtension() ) {
			$regex .= "|([a-zA-Z\x80-\xff]+)\[\[";
		}
		# Plain link start; an end tag can start with 3 [.
		# This must be present in both modes: the prefix alternative above
		# needs at least one letter, so without this one a "[[" that has no
		# prefix would never become a token.
		$regex .= "|\[\[\[?";
		# Closing link
		$regex .= "|\]\]";
		# Magic words that automatically generate links
		$regex .= "|ISBN |RFC ";
		# Language-specific additions (appended verbatim, so presumably the
		# returned string is empty or starts with "|" - verify against
		# the Language classes)
		$regex .= $wgLang->tokenizerRegex();
		# Finalize regex
		$regex = "/(" . $regex . ")/";

		# Apply the regex to the text. With PREG_OFFSET_CAPTURE each entry of
		# $matches[0] is array( matched text, byte offset ).
		$matches = array();
		$count = preg_match_all( $regex, $this->mText, $matches,
			PREG_PATTERN_ORDER|PREG_OFFSET_CAPTURE );
		if ( $count === false ) {
			# The regex could not be applied; fall back to "no markup",
			# which makes the whole text come out as one text token.
			$count = 0;
			$matches = array();
		}
		$this->mCount = $count;
		$this->mMatch = $matches;
		$this->mMatchPos = 0;
	}

	# Returns the next token and advances past it, or FALSE at end of text.
	function nextToken()
	{
		$token = $this->previewToken();
		if ( is_array( $token ) ) {
			$this->mMatchPos = $token["mMatchPos"];
			$this->mPos = $token["mPos"];
		}
		return $token;
	}

	# Returns the link prefix captured for match number $matchPos, or "" if
	# there is none.
	/* private */ function linkPrefix( $matchPos )
	{
		if ( !isset( $this->mMatch[2][$matchPos] ) ) {
			return "";
		}
		$entry = $this->mMatch[2][$matchPos];
		# A sub-pattern that did not take part in the match is normally
		# reported as array( "", -1 ); accept a bare string as well, which
		# some PHP versions appear to use for trailing unmatched groups.
		return is_array( $entry ) ? (string)$entry[0] : (string)$entry;
	}

	# Returns the token nextToken() would return, without moving the
	# tokenizer. Returns FALSE when the text is used up.
	function previewToken()
	{
		if ( $this->mMatchPos < $this->mCount ) {
			$match = $this->mMatch[0][$this->mMatchPos];
			$matchText = $match[0];
			$matchStart = $match[1];

			$token = array();
			$token["pos"] = $this->mPos;
			if ( $this->mPos < $matchStart ) {
				# There is plain text in front of the next match; hand that
				# out first and leave the match itself for the next call.
				$token["type"] = "text";
				$token["text"] = substr( $this->mText, $this->mPos,
					$matchStart - $this->mPos );
				# What the pointers would change to if this would not just be a preview
				$token["mMatchPos"] = $this->mMatchPos;
				$token["mPos"] = $matchStart;
			} else {
				# If linkPrefixExtension is set, sub-pattern 2 contains the
				# link prefix, or is empty if no link prefix exists.
				$prefix = $this->linkPrefix( $this->mMatchPos );
				if ( $prefix !== "" ) {
					# prefixed link open tag, $matchText is "prefix[["
					$token["type"] = "[[";
					$token["text"] = $prefix;
				} else {
					$token["type"] = $matchText;
					# Offset 1 skips the newline the rule match starts with.
					if ( substr( $matchText, 1, 4 ) == "----" ) {
						# any number of hyphens bigger than four is a <HR>.
						# strip down to four.
						$token["type"] = "----";
					}
				}
				# What the pointers would change to if this would not just be a preview
				$token["mPos"] = $this->mPos + strlen( $matchText );
				$token["mMatchPos"] = $this->mMatchPos + 1;
			}
			return $token;
		}

		if ( $this->mPos < $this->mTextLength ) {
			# No matches left: the rest of the text is one final text token.
			$token = array();
			$token["pos"] = $this->mPos;
			$token["type"] = "text";
			$token["text"] = substr( $this->mText, $this->mPos );
			# What the pointers would change to if this would not just be a preview
			$token["mPos"] = $this->mTextLength;
			$token["mMatchPos"] = $this->mMatchPos;
			return $token;
		}

		return false;
	}

}
112