includes/Tokenizer.php

   1 <?php
   2 class Tokenizer {
   3         /* private */ var $mText,               # Text to be processed by the tokenizer
   4                           $mPos,                # current position of tokenizer in text
   5                           $mTextLength,         # Length of $mText
   6                           $mCount,              # token count, computed in preParse
   7                           $mMatch,              # matches of tokenizer regex, computed in preParse
   8                           $mMatchPos;           # current token position of tokenizer. Each match can
   9                                                 # be up to two tokens: A matched token and the text after it.
  10
  11         /* private */ function Tokenizer()
  12         {
  13                 $this->mPos=0;
  14         }
  15
  16         # factory function
  17         function newFromString( $s )
  18         {
  19                 $t = new Tokenizer();
  20                 $t->mText = $s;
  21                 $t->preParse();
  22                 $t->mTextLength = strlen( $s );
  23                 return $t;
  24         }
  25
  26         function preParse()
  27         {
  28                 global $wgLang;
  29
  30                 # build up the regex, step by step.
  31                 # Basic features: Quotes for <em>/<strong> and hyphens for <hr>
  32                 $regex = "\'\'\'\'\'|\'\'\'|\'\'|\n-----*";
  33                 # Append regex for linkPrefixExtension
  34                 if (  $wgLang->linkPrefixExtension() ) {
  35                         $regex .= "|([a-zA-Z\x80-\xff]+)\[\[";
  36                 } else {
  37                         $regex .= "|\[\[";
  38                 }
  39                 # Closing link
  40                 $regex .= "|\]\]";
  41                 # Magic words that automatically generate links
  42                 $regex .= "|ISBN |RFC ";
  43                 # Language-specific additions
  44                 $regex .= $wgLang->tokenizerRegex();
  45                 # Finalize regex
  46                 $regex = "/(" . $regex . ")/";
  47
  48                 # Apply the regex to the text
  49                 $this->mCount = preg_match_all( $regex, $this->mText, $this->mMatch,
  50                                                 PREG_PATTERN_ORDER|PREG_OFFSET_CAPTURE);
  51                 $this->mMatchPos=0;
  52         }
  53
  54         function nextToken()
  55         {
  56                 $token = $this->previewToken();
  57                 if ( $token ) {
  58                         $this->mMatchPos = $token["mMatchPos"];
  59                         $this->mPos = $token["mPos"];
  60                 }
  61                 return $token;
  62         }
  63
  64
  65         function previewToken()
  66         {
  67                 if ( $this->mMatchPos < $this->mCount  ) {
  68                         $token["pos"] = $this->mPos;
  69                         if ( $this->mPos < $this->mMatch[0][$this->mMatchPos][1] ) {
  70                                 $token["type"] = "text";
  71                                 $token["text"] = substr( $this->mText, $this->mPos,
  72                                                          $this->mMatch[0][$this->mMatchPos][1] - $this->mPos );
  73                                 # What the pointers would change to if this would not just be a preview
  74                                 $token["mMatchPos"] = $this->mMatchPos;
  75                                 $token["mPos"] = $this->mMatch[0][$this->mMatchPos][1];
  76                         } else {
  77                                 # If linkPrefixExtension is set,  $this->mMatch[2][$this->mMatchPos][0]
  78                                 # contains the link prefix, or is null if no link prefix exist.
  79                                 if ( isset( $this->mMatch[2] ) && $this->mMatch[2][$this->mMatchPos][0] )
  80                                 {
  81                                         # prefixed link open tag, [0] is "prefix[["
  82                                         $token["type"] = "[[";
  83                                         $token["text"] = $this->mMatch[2][$this->mMatchPos][0]; # the prefix
  84                                 } else {
  85                                         $token["type"] = $this->mMatch[0][$this->mMatchPos][0];
  86                                         if ( substr($token["type"],1,4) == "----" )
  87                                         {
  88                                                 # any number of hyphens bigger than four is a <HR>.
  89                                                 # strip down to four.
  90                                                 $token["type"]="----";
  91                                         }
  92                                 }
  93                                 # What the pointers would change to if this would not just be a preview
  94                                 $token["mPos"] = $this->mPos + strlen( $this->mMatch[0][$this->mMatchPos][0] );
  95                                 $token["mMatchPos"] = $this->mMatchPos + 1;
  96                         }
  97                 } elseif ( $this->mPos < $this->mTextLength ) {
  98                         $token["type"] = "text";
  99                         $token["text"] = substr( $this->mText, $this->mPos );
 100                         # What the pointers would change to if this would not just be a preview
 101                         $token["mPos"] = $this->mTextLength;
 102                         $token["mMatchPos"] = $this->mMatchPos;
 103                 } else {
 104                         $token = FALSE;
 105                 }
 106                 return $token;
 107         }
 108
 109
 110 }
 111