Fixed what seems to be an off-by-one error (it tried to access one past the end of...
[lhc/web/wiklou.git] / includes / Tokenizer.php
1 <?php
# Splits wiki text into a flat stream of tokens.
#
# preParse() runs one regular expression over the whole text and records
# every markup match together with its byte offset. nextToken() and
# previewToken() then walk that match list, emitting the plain text found
# between two matches as "text" tokens and the matches themselves as markup
# tokens.
#
# A token is an associative array with these keys:
#   "type"      "text" for plain text; otherwise the matched markup string
#               ("''", "[[", "]]", "ISBN ", ...), with horizontal rules
#               normalised to "----".
#   "text"      the plain text for "text" tokens, or the link prefix for a
#               prefixed "[[" token. Not set for other markup tokens.
#   "pos"       byte offset in the text at which the token starts.
#   "mPos"      value $mPos takes once the token has been consumed.
#   "mMatchPos" value $mMatchPos takes once the token has been consumed.
class Tokenizer {
	# Defaults are given here so that an instance is usable even on PHP
	# versions that no longer treat Tokenizer() as the constructor.
	/* private */ var $mText = '',	# Text to be processed by the tokenizer
		$mPos = 0,		# current position of tokenizer in text
		$mTextLength = 0,	# Length of $mText
		$mCount = 0,		# token count, computed in preParse
		$mMatch = array(),	# matches of tokenizer regex, computed in preParse
		$mMatchPos = 0;		# current token position of tokenizer. Each match can
					# be up to two tokens: A matched token and the text after it.

	# Start reading at the beginning of the text.
	/* private */ function __construct()
	{
		$this->mPos = 0;
	}

	# Old-style constructor name, kept so that code calling it explicitly
	# keeps working. PHP 8 no longer invokes it from "new".
	/* private */ function Tokenizer()
	{
		$this->__construct();
	}

	# factory function
	#
	# Returns a Tokenizer positioned at the start of $s with the match list
	# already computed. Declared static because it is meant to be called as
	# Tokenizer::newFromString() and never touches $this; calling it on an
	# instance still works.
	static function newFromString( $s )
	{
		$t = new Tokenizer();
		# strlen() and preg_match_all() expect a string; cast so that a null
		# or numeric argument is handled like the equivalent string.
		$t->mText = (string)$s;
		$t->mTextLength = strlen( $t->mText );
		$t->preParse();
		return $t;
	}

	# Builds the tokenizer regex and applies it to $mText, filling $mMatch
	# and $mCount and rewinding $mMatchPos. Reads the global $wgLang for the
	# language-dependent parts of the regex.
	function preParse()
	{
		global $wgLang;

		# build up the regex, step by step.
		# Basic features: Quotes for <em>/<strong> and hyphens for <hr>
		$regex = "\'\'\'\'\'|\'\'\'|\'\'|\n-----*";
		# Append regex for linkPrefixExtension
		if ( $wgLang->linkPrefixExtension() ) {
			# The prefix is captured as group 2 (group 1 is the outer group
			# added below); previewToken() relies on that index.
			$regex .= "|([a-zA-Z\x80-\xff]+)\[\[";
		} else {
			$regex .= "|\[\[";
		}
		# Closing link
		$regex .= "|\]\]";
		# Magic words that automatically generate links
		$regex .= "|ISBN |RFC ";
		# Language-specific additions
		$regex .= $wgLang->tokenizerRegex();
		# Finalize regex
		$regex = "/(" . $regex . ")/";

		# Apply the regex to the text
		$count = preg_match_all( $regex, $this->mText, $this->mMatch,
			PREG_PATTERN_ORDER|PREG_OFFSET_CAPTURE );
		if ( $count === false ) {
			# The regex could not be applied. Treat this as "no markup" so
			# the whole text comes back as a single text token.
			$count = 0;
			$this->mMatch = array( array() );
		}
		$this->mCount = $count;
		$this->mMatchPos = 0;
	}

	# Returns the next token and advances past it, or FALSE at end of text.
	function nextToken()
	{
		$token = $this->previewToken();
		if ( $token ) {
			$this->mMatchPos = $token["mMatchPos"];
			$this->mPos = $token["mPos"];
		}
		return $token;
	}

	# Returns the token nextToken() would return, without advancing.
	# Returns FALSE once the text is exhausted.
	function previewToken()
	{
		if ( $this->mMatchPos < $this->mCount ) {
			$matchText = $this->mMatch[0][$this->mMatchPos][0];
			$matchStart = $this->mMatch[0][$this->mMatchPos][1];

			$token = array( "pos" => $this->mPos );
			if ( $this->mPos < $matchStart ) {
				# Plain text precedes the next match: emit that first and
				# leave the match itself for the following call.
				$token["type"] = "text";
				$token["text"] = substr( $this->mText, $this->mPos,
					$matchStart - $this->mPos );
				# What the pointers would change to if this would not just be a preview
				$token["mMatchPos"] = $this->mMatchPos;
				$token["mPos"] = $matchStart;
			} else {
				# If linkPrefixExtension is set, $this->mMatch[2][$this->mMatchPos][0]
				# contains the link prefix, or is empty if no link prefix exists.
				if ( !empty( $this->mMatch[2][$this->mMatchPos][0] ) ) {
					# prefixed link open tag, $matchText is "prefix[["
					$token["type"] = "[[";
					$token["text"] = $this->mMatch[2][$this->mMatchPos][0]; # the prefix
				} else {
					$token["type"] = $matchText;
					# Skip the leading newline of the match when testing
					# for a horizontal rule.
					if ( substr( $matchText, 1, 4 ) === "----" ) {
						# any number of hyphens bigger than four is a <HR>.
						# strip down to four.
						$token["type"] = "----";
					}
				}
				# What the pointers would change to if this would not just be a preview
				$token["mPos"] = $this->mPos + strlen( $matchText );
				$token["mMatchPos"] = $this->mMatchPos + 1;
			}
		} elseif ( $this->mPos < $this->mTextLength ) {
			# No matches left: the remainder of the text is one text token.
			$token = array( "pos" => $this->mPos );
			$token["type"] = "text";
			$token["text"] = substr( $this->mText, $this->mPos );
			# What the pointers would change to if this would not just be a preview
			$token["mPos"] = $this->mTextLength;
			$token["mMatchPos"] = $this->mMatchPos;
		} else {
			$token = FALSE;
		}
		return $token;
	}

}
111