* Hence, we limit the number of inclusions of any given page, thus bringing any
* attack back to O(N).
*/
+
define( 'MAX_INCLUDE_REPEAT', 100 );
define( 'MAX_INCLUDE_SIZE', 1000000 ); // 1 Million
define( 'URL_PROTOCOLS', 'http|https|ftp|irc|gopher|news|mailto' );
define( 'HTTP_PROTOCOLS', 'http|https' );
# Everything except bracket, space, or control characters
-define( 'EXT_LINK_URL_CLASS', '[^]\\x00-\\x20\\x7F]' );
-define( 'INVERSE_EXT_LINK_URL_CLASS', '[\]\\x00-\\x20\\x7F]' );
+define( 'EXT_LINK_URL_CLASS', '[^]<>\\x00-\\x20\\x7F]' );
# Including space
define( 'EXT_LINK_TEXT_CLASS', '[^\]\\x00-\\x1F\\x7F]' );
define( 'EXT_IMAGE_FNAME_CLASS', '[A-Za-z0-9_.,~%\\-+&;#*?!=()@\\x80-\\xFF]' );
* @return ParserOutput a ParserOutput
*/
function parse( $text, &$title, $options, $linestart = true, $clearState = true ) {
- global $wgUseTidy;
+ global $wgUseTidy, $wgContLang;
$fname = 'Parser::parse';
wfProfileIn( $fname );
$stripState = NULL;
$text = $this->strip( $text, $this->mStripState );
+
$text = $this->internalParse( $text, $linestart );
$text = $this->unstrip( $text, $this->mStripState );
# Clean up special characters, only run once, next-to-last before doBlockLevels
}
# only once and last
$text = $this->doBlockLevels( $text, $linestart );
+
+ $text = $wgContLang->convert($text);
+
$text = $this->unstripNoWiki( $text, $this->mStripState );
- if($wgUseTidy) {
- $text = $this->tidy($text);
- }
$this->mOutput->setText( $text );
wfProfileOut( $fname );
return $this->mOutput;
/**
* interface with html tidy, used if $wgUseTidy = true
*
- * @access private
+ * @access public
+ * @static
*/
function tidy ( $text ) {
global $wgTidyConf, $wgTidyBin, $wgTidyOpts;
wfProfileIn( $fname );
$cleansource = '';
+ $opts = '';
switch(strtoupper($wgOutputEncoding)) {
case 'ISO-8859-1':
- $wgTidyOpts .= ($wgInputEncoding == $wgOutputEncoding)? ' -latin1':' -raw';
+ $opts .= ($wgInputEncoding == $wgOutputEncoding)? ' -latin1':' -raw';
break;
case 'UTF-8':
- $wgTidyOpts .= ($wgInputEncoding == $wgOutputEncoding)? ' -utf8':' -raw';
+ $opts .= ($wgInputEncoding == $wgOutputEncoding)? ' -utf8':' -raw';
break;
default:
- $wgTidyOpts .= ' -raw';
+ $opts .= ' -raw';
}
$wrappedtext = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"'.
1 => array('pipe', 'w'),
2 => array('file', '/dev/null', 'a')
);
- $process = proc_open("$wgTidyBin -config $wgTidyConf $wgTidyOpts", $descriptorspec, $pipes);
+ $process = proc_open("$wgTidyBin -config $wgTidyConf $wgTidyOpts$opts", $descriptorspec, $pipes);
if (is_resource($process)) {
fwrite($pipes[0], $wrappedtext);
fclose($pipes[0]);
array_push ( $ltr , $this->fixTagAttributes ( $x ) ) ;
}
else if ( '|' == $fc || '!' == $fc || '|+' == substr ( $x , 0 , 2 ) ) { # Caption
+ # $x is a table row
if ( '|+' == substr ( $x , 0 , 2 ) ) {
$fc = '+' ;
$x = substr ( $x , 1 ) ;
if ( $fc == '!' ) $after = str_replace ( '!!' , '||' , $after ) ;
$after = explode ( '||' , $after ) ;
$t[$k] = '' ;
+
+ # Loop through each table cell
foreach ( $after AS $theline )
{
$z = '' ;
else if ( $fc == '+' ) $l = 'caption' ;
else $l = '' ;
array_push ( $ltd , $l ) ;
+
+ # Cell parameters
$y = explode ( '|' , $theline , 2 ) ;
- if ( count ( $y ) == 1 ) $y = "{$z}<{$l}>{$y[0]}" ;
+ # Note that a '|' inside an invalid link should not
+ # be mistaken as delimiting cell parameters
+ if ( strpos( $y[0], '[[' ) !== false ) {
+ $y = array ($theline);
+ }
+ if ( count ( $y ) == 1 )
+ $y = "{$z}<{$l}>{$y[0]}" ;
else $y = $y = "{$z}<{$l} ".$this->fixTagAttributes($y[0]).">{$y[1]}" ;
$t[$k] .= $y ;
array_push ( $td , true ) ;
$text = $this->removeHTMLtags( $text );
$text = $this->replaceVariables( $text, $args );
- $text = $wgContLang->convert($text);
-
$text = preg_replace( '/(^|\n)-----*/', '\\1<hr />', $text );
$text = $this->doHeadings( $text );
$text = $wgDateFormatter->reformat( $this->mOptions->getDateFormat(), $text );
}
$text = $this->doAllQuotes( $text );
- $text = $this->doMagicLinks( $text );
- $text = $this->replaceInternalLinks ( $text );
- # Another call to replace links and images inside captions of images
$text = $this->replaceInternalLinks ( $text );
$text = $this->replaceExternalLinks( $text );
+ $text = $this->doMagicLinks( $text );
$text = $this->doTableStuff( $text );
$text = $this->formatHeadings( $text, $isMain );
$sk =& $this->mOptions->getSkin();
# replacing any non-bracketed links
$trail = $this->replaceFreeExternalLinks( $trail );
- $la = $sk->getExternalLinkAttributes( $url, $text );
-
# Use the encoded URL
# This means that users can paste URLs directly into the text
# Funny characters like ö aren't valid in URLs anyway
# This was changed in August 2004
- $s .= "<a href=\"{$url}\"{$la}>{$text}</a>{$dtrail}{$paren}{$trail}";
+ $s .= $sk->makeExternalLink( $url, $text, false ) . $dtrail. $paren . $trail;
}
wfProfileOut( $fname );
*
* @access private
*/
+
function replaceInternalLinks( $s ) {
global $wgLang, $wgContLang, $wgLinkCache;
- global $wgNamespacesWithSubpages;
static $fname = 'Parser::replaceInternalLinks' ;
+ # use a counter to prevent too many unknown links from
+ # being checked for different language variants.
+ static $convertCount;
wfProfileIn( $fname );
wfProfileIn( $fname.'-setup' );
$redirect = MagicWord::get ( MAG_REDIRECT ) ;
+ #split the entire text string on occurrences of [[
$a = explode( '[[', ' ' . $s );
+ #get the first element (all text up to first [[), and remove the space we added
$s = array_shift( $a );
$s = substr( $s, 1 );
# Match a link having the form [[namespace:link|alternate]]trail
static $e1 = FALSE;
if ( !$e1 ) { $e1 = "/^([{$tc}]+)(?:\\|([^]]+))?]](.*)\$/sD"; }
+ # Match cases where there is no "]]", which might still be images
+ static $e1_img = FALSE;
+ if ( !$e1_img ) { $e1_img = "/^([{$tc}]+)\\|(.*)\$/sD"; }
# Match the end of a line for a word that's not followed by whitespace,
# e.g. in the case of 'The Arab al[[Razi]]', 'al' will be matched
static $e2 = '/^(.*?)([a-zA-Z\x80-\xff]+)$/sD';
$useLinkPrefixExtension = $wgContLang->linkPrefixExtension();
- # Special and Media are pseudo-namespaces; no pages actually exist in them
$nottalk = !Namespace::isTalk( $this->mTitle->getNamespace() );
wfProfileOut( $fname.'-setup' );
- # start procedeeding each line
- foreach ( $a as $line ) {
+ # Loop for each link
+ for ($k = 0; isset( $a[$k] ); $k++) {
+ $line = $a[$k];
wfProfileIn( $fname.'-prefixhandling' );
if ( $useLinkPrefixExtension ) {
if ( preg_match( $e2, $s, $m ) ) {
}
wfProfileOut( $fname.'-prefixhandling' );
+ $might_be_img = false;
+
if ( preg_match( $e1, $line, $m ) ) { # page with normal text or alt
$text = $m[2];
# fix up urlencoded title texts
if(preg_match('/%/', $m[1] )) $m[1] = urldecode($m[1]);
$trail = $m[3];
+ } elseif( preg_match($e1_img, $line, $m) ) { # Invalid, but might be an image with a link in its caption
+ $might_be_img = true;
+ $text = $m[2];
+ if(preg_match('/%/', $m[1] )) $m[1] = urldecode($m[1]);
+ $trail = "";
} else { # Invalid form; output directly
$s .= $prefix . '[[' . $line ;
continue;
continue;
}
- # Valid link forms:
- # Foobar -- normal
- # :Foobar -- override special treatment of prefix (images, language links)
- # /Foobar -- convert to CurrentPage/Foobar
- # /Foobar/ -- convert to CurrentPage/Foobar, strip the initial / from text
-
- # Look at the first character
- $c = substr($m[1],0,1);
- $noforce = ($c != ':');
-
- # subpage
- if( $c == '/' ) {
- # / at end means we don't want the slash to be shown
- if(substr($m[1],-1,1)=='/') {
- $m[1]=substr($m[1],1,strlen($m[1])-2);
- $noslash=$m[1];
- } else {
- $noslash=substr($m[1],1);
- }
-
- # Some namespaces don't allow subpages
- if(!empty($wgNamespacesWithSubpages[$this->mTitle->getNamespace()])) {
- # subpages allowed here
- $link = $this->mTitle->getPrefixedText(). '/' . trim($noslash);
- if( '' == $text ) {
- $text= $m[1];
- } # this might be changed for ugliness reasons
- } else {
- # no subpage allowed, use standard link
- $link = $noslash;
- }
-
- } elseif( $noforce ) { # no subpage
- $link = $m[1];
- } else {
- # We don't want to keep the first character
- $link = substr( $m[1], 1 );
+ # Make subpage if necessary
+ $link = $this->maybeDoSubpageLink( $m[1], $text );
+
+ $noforce = (substr($m[1], 0, 1) != ':');
+ if (!$noforce) {
+ # Strip off leading ':'
+ $link = substr($link, 1);
}
- $wasblank = ( '' == $text );
- if( $wasblank ) $text = $link;
-
$nt = Title::newFromText( $link );
if( !$nt ) {
$s .= $prefix . '[[' . $line;
continue;
}
-
+
+ //check other language variants of the link
+ //if the article does not exist
+ global $wgContLang;
+ $variants = $wgContLang->getVariants();
+
+ if(sizeof($variants) > 1 && $convertCount < 200) {
+ $varnt = false;
+ if($nt->getArticleID() == 0) {
+ foreach ( $variants as $v ) {
+ if($v == $wgContLang->getPreferredVariant())
+ continue;
+ $convertCount ++;
+ $varlink = $wgContLang->autoConvert($link, $v);
+ $varnt = Title::newFromText($varlink);
+ if($varnt && $varnt->getArticleID()>0) {
+ break;
+ }
+ }
+ }
+ if($varnt && $varnt->getArticleID()>0) {
+ $nt = $varnt;
+ $link = $varlink;
+ }
+ }
+
$ns = $nt->getNamespace();
$iw = $nt->getInterWiki();
+ if ($might_be_img) { # if this is actually an invalid link
+ if ($ns == NS_IMAGE && $noforce) { #but might be an image
+ $found = false;
+ while (isset ($a[$k+1]) ) {
+ #look at the next 'line' to see if we can close it there
+ $next_line = array_shift(array_splice( $a, $k + 1, 1) );
+ if( preg_match("/^(.*?]].*?)]](.*)$/sD", $next_line, $m) ) {
+ # the first ]] closes the inner link, the second the image
+ $found = true;
+ $text .= '[[' . $m[1];
+ $trail = $m[2];
+ break;
+ } elseif( preg_match("/^.*?]].*$/sD", $next_line, $m) ) {
+ #if there's exactly one ]] that's fine, we'll keep looking
+ $text .= '[[' . $m[0];
+ } else {
+ #if $next_line is invalid too, we need look no further
+ $text .= '[[' . $next_line;
+ break;
+ }
+ }
+ if ( !$found ) {
+ # we couldn't find the end of this imageLink, so output it raw
+ #but don't ignore what might be perfectly normal links in the text we've examined
+ $text = $this->replaceInternalLinks($text);
+ $s .= $prefix . '[[' . $link . '|' . $text;
+ # note: no $trail, because without an end, there *is* no trail
+ continue;
+ }
+ } else { #it's not an image, so output it raw
+ $s .= $prefix . '[[' . $link . '|' . $text;
+ # note: no $trail, because without an end, there *is* no trail
+ continue;
+ }
+ }
+
+ $wasblank = ( '' == $text );
+ if( $wasblank ) $text = $link;
+
+
# Link not escaped by : , create the various objects
if( $noforce ) {
}
if ( $ns == NS_IMAGE ) {
- $s .= $prefix . $sk->makeImageLinkObj( $nt, $text ) . $trail;
+ # recursively parse links inside the image caption
+ # actually, this will parse them in any other parameters, too,
+ # but it might be hard to fix that, and it doesn't matter ATM
+ $text = $this->replaceExternalLinks($text);
+ $text = $this->replaceInternalLinks($text);
+
+ # replace the image with a link-holder so that replaceExternalLinks() can't mess with it
+ $s .= $prefix . $this->insertStripItem( $sk->makeImageLinkObj( $nt, $text ), $this->mStripState ) . $trail;
$wgLinkCache->addImageLinkObj( $nt );
continue;
}
if ( $ns == NS_CATEGORY ) {
$t = $nt->getText() ;
- $nnt = Title::newFromText ( Namespace::getCanonicalName(NS_CATEGORY).':'.$t ) ;
$wgLinkCache->suspend(); # Don't save in links/brokenlinks
$pPLC=$sk->postParseLinkColour();
$sk->postParseLinkColour( false );
- $t = $sk->makeLinkObj( $nnt, $t, '', '' , $prefix );
+ $t = $sk->makeLinkObj( $nt, $t, '', '' , $prefix );
$sk->postParseLinkColour( $pPLC );
$wgLinkCache->resume();
continue;
}
}
-
+
+ $text = $wgContLang->convert($text);
+
if( ( $nt->getPrefixedText() === $this->mTitle->getPrefixedText() ) &&
( strpos( $link, '#' ) === FALSE ) ) {
# Self-links are handled specially; generally de-link and change to bold.
continue;
}
+ # Special and Media are pseudo-namespaces; no pages actually exist in them
if( $ns == NS_MEDIA ) {
$s .= $prefix . $sk->makeMediaLinkObj( $nt, $text ) . $trail;
$wgLinkCache->addImageLinkObj( $nt );
return $s;
}
+	/**
+	 * Resolve a link target against the current page when it uses the
+	 * leading-slash subpage syntax.
+	 *
+	 * Recognised target forms:
+	 *   Foobar    -- returned unchanged
+	 *   :Foobar   -- returned unchanged
+	 *   /Foobar   -- converted to CurrentPage/Foobar
+	 *   /Foobar/  -- converted to CurrentPage/Foobar, and the slashes are
+	 *                stripped from the default link text
+	 *
+	 * The conversion to CurrentPage/... only happens when
+	 * $wgNamespacesWithSubpages has a non-empty entry for the namespace of
+	 * $this->mTitle. Otherwise the target is returned as given, except that
+	 * the "/Foobar/" form has already lost its surrounding slashes.
+	 *
+	 * @param $target string the source of the link; may be empty
+	 * @param &$text string the link text; when it is exactly '' and a subpage
+	 *               link is produced, it is set to the (possibly
+	 *               slash-stripped) target
+	 * @return string the full name of the link
+	 * @access private
+	 */
+	function maybeDoSubpageLink($target, &$text) {
+		global $wgNamespacesWithSubpages;
+
+		$fname = 'Parser::maybeDoSubpageLink';
+		wfProfileIn( $fname );
+
+		# Default: not a subpage link, hand the target back untouched
+		$ret = $target;
+
+		# Look at the first character. substr() is used instead of a direct
+		# string offset so that an empty target does not raise an
+		# "uninitialized string offset" notice.
+		if( substr( $target, 0, 1 ) === '/' ) {
+			# / at end means we don't want the slash to be shown
+			if( substr( $target, -1, 1 ) == '/' ) {
+				$target = substr( $target, 1, -1 );
+				$noslash = $target;
+			} else {
+				$noslash = substr( $target, 1 );
+			}
+
+			# Some namespaces don't allow subpages
+			if( !empty( $wgNamespacesWithSubpages[$this->mTitle->getNamespace()] ) ) {
+				# subpages allowed here
+				$ret = $this->mTitle->getPrefixedText() . '/' . trim( $noslash );
+				if( '' === $text ) {
+					$text = $target;
+				} # this might be changed for ugliness reasons
+			} else {
+				# no subpage allowed, use standard link; note that $target
+				# only differs from the input in the "/Foobar/" case
+				$ret = $target;
+			}
+		}
+
+		wfProfileOut( $fname );
+		return $ret;
+	}
+
+
/**#@+
* Used by doBlockLevels()
* @access private
# ; title : definition text
# So we check for : in the remainder text to split up the
# title and definition, without b0rking links.
- # FIXME: This is not foolproof. Something better in Tokenizer might help.
- if( preg_match( '/^(.*?(?:\s| )):(.*)$/', $t, $match ) ) {
- $term = $match[1];
+ if ($this->findColonNoLinks($t, $term, $t2) !== false) {
+ $t = $t2;
$output .= $term . $this->nextItem( ':' );
- $t = $match[2];
}
}
} elseif( $prefixLength || $lastPrefixLength ) {
if ( ';' == $char ) {
# FIXME: This is dupe of code above
- if( preg_match( '/^(.*?(?:\s| )):(.*)$/', $t, $match ) ) {
- $term = $match[1];
+ if ($this->findColonNoLinks($t, $term, $t2) !== false) {
+ $t = $t2;
$output .= $term . $this->nextItem( ':' );
- $t = $match[2];
}
}
++$commonPrefixLength;
return $output;
}
+	/**
+	 * Find the first ':' in a string that is not preceded by an unclosed
+	 * <a> or <span> element, and split the string around it.
+	 *
+	 * Only '<a' / '</a>' and '<span' / '</span>' are counted; a ':' inside
+	 * other formatting (italics and the like) is still accepted. The
+	 * original author (Wil) left open whether all tags should be counted.
+	 * The opening-tag test is a plain substring count, so '<a' also matches
+	 * any other tag whose name begins with "a".
+	 *
+	 * $before and $after are overwritten for every ':' that is examined, so
+	 * when false is returned they may hold the split around the last
+	 * rejected ':' (or be left untouched if $str has no ':' at all).
+	 *
+	 * @param $str string the string to split
+	 * @param &$before string set to everything before the ':'
+	 * @param &$after string set to everything after the ':'
+	 * @return mixed the offset of the ':' in $str, or false if none found
+	 */
+	function findColonNoLinks($str, &$before, &$after) {
+		$fname = 'Parser::findColonNoLinks';
+		wfProfileIn( $fname );
+
+		$searchFrom = 0;
+		while ( ( $colon = strpos( $str, ':', $searchFrom ) ) !== false ) {
+			$before = substr( $str, 0, $colon );
+			$after = substr( $str, $colon + 1 );
+
+			# Tally opening and closing tags seen so far
+			$anchorsOpened = substr_count( $before, '<a' );
+			$spansOpened = substr_count( $before, '<span' );
+			$anchorsClosed = substr_count( $before, '</a>' );
+			$spansClosed = substr_count( $before, '</span>' );
+
+			$insideAnchor = ( $anchorsOpened > $anchorsClosed );
+			$insideSpan = ( $spansOpened > $spansClosed );
+			if ( !$insideAnchor && !$insideSpan ) {
+				# Every <a> and <span> before this ':' is closed; accept it
+				break;
+			}
+
+			# This ':' sits inside a link or span; resume just past it
+			$searchFrom = $colon + 1;
+		}
+
+		wfProfileOut( $fname );
+		return $colon;
+	}
+
+
/**
* Return value of a magic variable (like PAGENAME)
*
* @access private
*/
function initialiseVariables() {
+ # Rebuild $this->mVariables from scratch: it is reset to an empty array
+ # and then passed to addToArray() once per ID in the global
+ # $wgVariableIDs. The work is bracketed by wfProfileIn()/wfProfileOut().
+ $fname = 'Parser::initialiseVariables';
+ wfProfileIn( $fname );
global $wgVariableIDs;
$this->mVariables = array();
foreach ( $wgVariableIDs as $id ) {
+ # Give each magic word the array plus the current value for its ID;
+ # presumably addToArray() registers the word's synonyms -- TODO confirm
$mw =& MagicWord::get( $id );
$mw->addToArray( $this->mVariables, $this->getVariableValue( $id ) );
}
+ wfProfileOut( $fname );
}
/**
}
# Load from database
+ $itcamefromthedatabase = false;
if ( !$found ) {
- $title = Title::newFromText( $part1, NS_TEMPLATE );
+ $ns = NS_TEMPLATE;
+ $part1 = $this->maybeDoSubpageLink( $part1, $subpage='' );
+ if ($subpage !== '') {
+ $ns = $this->mTitle->getNamespace();
+ }
+ $title = Title::newFromText( $part1, $ns );
if ( !is_null( $title ) && !$title->isExternal() ) {
# Check for excessive inclusion
$dbk = $title->getPrefixedDBkey();
if ( $articleContent !== false ) {
$found = true;
$text = $linestart . $articleContent;
+ $itcamefromthedatabase = true;
}
}
$wgLinkCache->addLinkObj( $title );
}
+ # If the template begins with a table or block-level
+ # element, it should be treated as beginning a new line.
+ if ($linestart !== '\n' && preg_match('/^({\\||:|;|#|\*)/', $text)) {
+ $text = "\n" . $text;
+ }
+ }
+
+ # Empties the template path
+ $this->mTemplatePath = array();
+ if ( !$found ) {
+ return $matches[0];
+ } else {
# replace ==section headers==
# XXX this needs to go away once we have a better parser.
- if ( $this->mOutputType == OT_HTML ) {
+ if ( $this->mOutputType != OT_WIKI && $itcamefromthedatabase ) {
if( !is_null( $title ) )
$encodedname = base64_encode($title->getPrefixedDBkey());
else
$nsec++;
}
}
-
- # If the template begins with a table or block-level
- # element, it should be treated as beginning a new line.
- if ($linestart !== '\n' && preg_match('/^({\\||:|;|#|\*)/', $text)) {
- $text = "\n" . $text;
- }
}
# Empties the template path