X-Git-Url: http://git.heureux-cyclage.org/?a=blobdiff_plain;f=includes%2FTokenizer.php;h=84a2f06bcb44a6cf6552c78dd7d5bedbc3a651d7;hb=b19e0b60ce8e75d61cfbc3c405464ea767a6eaac;hp=03ea0199977e6e26e5062398b581fbeb59c98145;hpb=8b2d83cc53b120425ea79694152a5c29d4014822;p=lhc%2Fweb%2Fwiklou.git diff --git a/includes/Tokenizer.php b/includes/Tokenizer.php index 03ea019997..84a2f06bcb 100644 --- a/includes/Tokenizer.php +++ b/includes/Tokenizer.php @@ -1,4 +1,13 @@ mPos=0; $this->mTokenQueue=array(); + $this->linkPrefixExtension = $wgLang->linkPrefixExtension(); } - # factory function - function newFromString( $s ) - { + /** + * factory function + */ + function newFromString( $s ) { + $fname = 'Tokenizer::newFromString'; + wfProfileIn( $fname ); + $t = new Tokenizer(); $t->mText = $s; $t->mTextLength = strlen( $s ); - // echo "New tokenizer generated.
{$s}
\n"; + + wfProfileOut( $fname ); return $t; } - // Return the next token, but do not increase the pointer. The next call - // to previewToken or nextToken will return the same token again. - // Actually, the pointer is increased, but the token is queued. The next - // call to previewToken or nextToken will check the queue and return - // the stored token. - function previewToken() - { + /** + * Return the next token, but do not increase the pointer. The next call + * to previewToken or nextToken will return the same token again. + * Actually, the pointer is increased, but the token is queued. The next + * call to previewToken or nextToken will check the queue and return + * the stored token. + */ + function previewToken() { + $fname = 'Tokenizer::previewToken'; + wfProfileIn( $fname ); + if ( count( $this->mQueuedToken ) != 0 ) { // still one token from the last round around. Return that one first. $token = $this->mQueuedToken[0]; @@ -37,90 +61,99 @@ class Tokenizer { $token = $this->nextToken(); array_unshift( $this->mQueuedToken, $token ); } + + wfProfileOut( $fname ); return $token; } - // get the next token - // proceeds character by character through the text, looking for characters needing - // special attention. Those are currently: I, R, ', [, ], newline - // - // TODO: prefixed links for Arabic wikipedia not implemented yet - // handling of French blanks not yet implemented - function nextToken() - { + /** + * Get the next token. + * + * proceeds character by character through the text, looking for characters needing + * special attention. Those are currently: I, R, ', [, ], newline + * + * @todo handling of French blanks not yet implemented + */ + function nextToken() { + $fname = 'Tokenizer::nextToken'; + wfProfileIn( $fname ); + if ( count( $this->mQueuedToken ) != 0 ) { // still one token from the last round around. Return that one first. $token = array_shift( $this->mQueuedToken ); + } else if ( $this->mPos > $this->mTextLength ) { + // If no text is left, return 'false'. + $token = false; } else { - $token["text"]=""; - $token["type"]="text"; - - // If no text is left, return "false". - if ( $this->mPos > $this->mTextLength ) - return false; + $token['text']=''; + $token['type']='text'; while ( $this->mPos <= $this->mTextLength ) { switch ( @$ch = $this->mText[$this->mPos] ) { case 'R': // for "RFC " - if ( isset($this->mText[$this->mPos+4]) && - $this->mText[$this->mPos+1] == 'F' && - $this->mText[$this->mPos+2] == 'C' && - $this->mText[$this->mPos+4] == ' ' ) { - $queueToken["type"] = $queueToken["text"] = "RFC "; + if ( $this->continues('FC ') ) { + $queueToken['type'] = $queueToken['text'] = 'RFC '; $this->mQueuedToken[] = $queueToken; $this->mPos += 3; break 2; // switch + while } break; case 'I': // for "ISBN " - if ( isset($this->mText[$this->mPos+4]) && - $this->mText[$this->mPos+1] == 'S' && - $this->mText[$this->mPos+2] == 'B' && - $this->mText[$this->mPos+3] == 'N' && - $this->mText[$this->mPos+4] == ' ' ) { - $queueToken["type"] = $queueToken["text"] = "ISBN "; + if ( $this->continues('SBN ') ) { + $queueToken['type'] = $queueToken['text'] = 'ISBN '; $this->mQueuedToken[] = $queueToken; $this->mPos += 4; break 2; // switch + while } break; - case "[": // for links "[[" - if ( isset($this->mText[$this->mPos+2]) && - $this->mText[$this->mPos+1] == "[" && - $this->mText[$this->mPos+2] == "[" ) { - $queueToken["type"] = "[[["; - $queueToken["text"] = ""; + case '[': // for links "[[" + if ( $this->continues('[[') ) { + $queueToken['type'] = '[[['; + $queueToken['text'] = ''; $this->mQueuedToken[] = $queueToken; $this->mPos += 3; break 2; // switch + while - } else if ( isset($this->mText[$this->mPos+1]) && - $this->mText[$this->mPos+1] == "[" ) { - $queueToken["type"] = "[["; - $queueToken["text"] = ""; + } else if ( $this->continues('[') ) { + $queueToken['type'] = '[['; + $queueToken['text'] = ''; + // Check for a "prefixed link", e.g. Al[[Khazar]] + // Mostly for arabic wikipedia + if ( $this->linkPrefixExtension ) { + while ( $this->linkPrefixExtension + && ($len = strlen( $token['text'] ) ) > 0 + && !ctype_space( $token['text'][$len-1] ) ) + { + //prepend the character to the link's open tag + $queueToken['text'] = $token['text'][$len-1] . $queueToken['text']; + //remove character from the end of the text token + $token['text'] = substr( $token['text'], 0, -1); + } + } $this->mQueuedToken[] = $queueToken; $this->mPos += 2; break 2; // switch + while } break; - case "]": // for end of links "]]" - if ( isset($this->mText[$this->mPos+1]) && - $this->mText[$this->mPos+1] == "]" ) { - $queueToken["type"] = "]]"; - $queueToken["text"] = ""; + case ']': // for end of links "]]" + if ( $this->continues(']') ) { + $queueToken['type'] = ']]'; + $queueToken['text'] = ''; $this->mQueuedToken[] = $queueToken; $this->mPos += 2; break 2; // switch + while } break; case "'": // for all kind of em's and strong's - if ( isset($this->mText[$this->mPos+1]) && - $this->mText[$this->mPos+1] == "'" ) { - $queueToken["type"] = "'"; - $queueToken["text"] = ""; - while(isset($this->mText[$this->mPos+1]) && $this->mText[$this->mPos+1] == "'" ) { - $queueToken["type"] .= "'"; + if ( $this->continues("'") ) { + $queueToken['type'] = "'"; + $queueToken['text'] = ''; + while( ($this->mPos+1 < $this->mTextLength) + && $this->mText[$this->mPos+1] == "'" ) + { + $queueToken['type'] .= "'"; + $queueToken['pos'] = $this->mPos; $this->mPos ++; } @@ -130,30 +163,159 @@ class Tokenizer { } break; case "\n": // for block levels, actually, only "----" is handled. - case "\r": - if ( isset($this->mText[$this->mPos+4]) && - $this->mText[$this->mPos+1] == "-" && - $this->mText[$this->mPos+2] == "-" && - $this->mText[$this->mPos+3] == "-" && - $this->mText[$this->mPos+4] == "-" ) { - $queueToken["type"] = "----"; - $queueToken["text"] = ""; + case "\r": // headings are detected to close any unbalanced em or strong tags in a section + if ( $this->continues( '----' ) ) + { + $queueToken['type'] = '----'; + $queueToken['text'] = ''; $this->mQueuedToken[] = $queueToken; $this->mPos += 5; - while (isset($this->mText[$this->mPos]) and $this->mText[$this->mPos] == "-" ) { + while ( $this->mPos<$this->mTextLength + and $this->mText[$this->mPos] == '-' ) + { $this->mPos ++; } break 2; + } else if ( + $this->continues( 'continues( 'continues( 'continues( 'continues( 'continues( 'continues( 'mQueuedToken[] = $queueToken; + $this->mPos ++; + break 2; // switch + while + } + break; + case '!': // French spacing rules have a space before exclamation + case '?': // and question marks. Those have to become   + case ':': // And colons, Hashar says ... + if ( $this->preceeded( ' ' ) ) + { + // strip blank from Token + $token['text'] = substr( $token['text'], 0, -1 ); + $queueToken['type'] = 'blank'; + $queueToken['text'] = ' '.$ch; + $this->mQueuedToken[] = $queueToken; + $this->mPos ++; + break 2; // switch + while } + break; + case '0': // A space between two numbers is used to ease reading + case '1': // of big numbers, e.g. 1 000 000. Those spaces need + case '2': // to be unbreakable + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if ( ($this->mTextLength >= $this->mPos +2) + && ($this->mText[$this->mPos+1] == ' ') + && ctype_digit( $this->mText[$this->mPos+2] ) ) + { + $queueToken['type'] = 'blank'; + $queueToken['text'] = $ch . ' '; + $this->mQueuedToken[] = $queueToken; + $this->mPos += 2; + break 2; // switch + while + } + break; + case "\302": // first byte of UTF-8 Character Guillemet-left + if ( $this->continues( "\253 ") ) // second byte and a blank + { + $queueToken['type'] = 'blank'; + $queueToken['text'] = "\302\253 "; + $this->mQueuedToken[] = $queueToken; + $this->mPos += 3; + break 2; // switch + while + } + break; + case "\273": //last byte of UTF-8 Character Guillemet-right + if ( $this->preceeded( " \302" ) ) + { + $queueToken['type'] = 'blank'; + $queueToken['text'] = " \302\273"; + $token['text'] = substr( $token['text'], 0, -2 ); + $this->mQueuedToken[] = $queueToken; + $this->mPos ++; + break 2; // switch + while + } + break; + case '&': //extensions like , since HTML stripping has already been done, + //those look like <timeline> + if ( $this->continues( 'lt;timeline>' ) ) + { + $queueToken['type'] = ''; + $queueToken['text'] = '<timeline>'; + $this->mQueuedToken[] = $queueToken; + $this->mPos += 16; + break 2; // switch + while + } + break; + } /* switch */ - $token["text"].=$ch; + $token['text'].=$ch; $this->mPos ++; // echo $this->mPos . "
\n"; } /* while */ } /* if (nothing left in queue) */ + + wfProfileOut( $fname ); return $token; } - + /** + * function continues + * + * checks whether the mText continues with $cont from mPos+1 + * + * @access private + */ + function continues( $cont ) { + // If string is not long enough to contain $cont, return false + if ( $this->mTextLength < $this->mPos + strlen( $cont ) ) + return false; + for ( $i=0; $i < strlen( $cont ); $i++ ) + { + if ( $this->mText[$this->mPos+1+$i] != $cont[$i] ) + return false; + } + return true; + } + + /** + * function preceeded + * + * checks whether the mText is preceeded by $prec at position mPos + * + * @access private + */ + function preceeded( $prec ) { + $len = strlen( $prec ); + // if $prec is longer than the text up to mPos, return false + if ( $this->mPos < $len ) + return false; + return ( 0 == strcmp( $prec, substr($this->mText, $this->mPos-$len, $len) ) ); + } + + /** + * + */ + function readAllUntil( $border ) { + $n = strpos( $this->mText, $border, $this->mPos ); + if ( $n === false ) + return ''; + $ret = substr( $this->mText, $this->mPos, $n - $this->mPos ); + $this->mPos = $n + strlen( $border ) + 1; + return $ret; + } + } -