* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
- * @package MediaWiki
- * @subpackage Parser
+ * @addtogroup Parser
*/
/**
'zwj' => 8205,
'zwnj' => 8204 );
-/** @package MediaWiki */
class Sanitizer {
/**
* Cleans up HTML, removes dangerous tags and attributes, and
* @param array $args for the processing callback
* @return string
*/
- function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
+ static function removeHTMLtags( $text, $processCallback = null, $args = array() ) {
global $wgUseTidy, $wgUserHtml;
- $fname = 'Parser::removeHTMLtags';
- wfProfileIn( $fname );
-
- if( $wgUserHtml ) {
- $htmlpairs = array( # Tags that must be closed
- 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
- 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
- 'strike', 'strong', 'tt', 'var', 'div', 'center',
- 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
- 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span'
- );
- $htmlsingle = array(
- 'br', 'hr', 'li', 'dt', 'dd'
- );
- $htmlsingleonly = array( # Elements that cannot have close tags
- 'br', 'hr'
- );
- $htmlnest = array( # Tags that can be nested--??
- 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
- 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
- );
- $tabletags = array( # Can only appear inside table
- 'td', 'th', 'tr'
- );
- } else {
- $htmlpairs = array();
- $htmlsingle = array();
- $htmlnest = array();
- $tabletags = array();
- }
- $htmlsingle = array_merge( $tabletags, $htmlsingle );
- $htmlelements = array_merge( $htmlsingle, $htmlpairs );
+ static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
+ $htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;
+
+ wfProfileIn( __METHOD__ );
+
+ if ( !$staticInitialised ) {
+ if( $wgUserHtml ) {
+ $htmlpairs = array( # Tags that must be closed
+ 'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
+ 'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
+ 'strike', 'strong', 'tt', 'var', 'div', 'center',
+ 'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
+ 'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
+ );
+ $htmlsingle = array(
+ 'br', 'hr', 'li', 'dt', 'dd'
+ );
+ $htmlsingleonly = array( # Elements that cannot have close tags
+ 'br', 'hr'
+ );
+ $htmlnest = array( # Tags that can be nested--??
+ 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
+ 'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
+ );
+ $tabletags = array( # Can only appear inside table, we will close them
+ 'td', 'th', 'tr',
+ );
+ $htmllist = array( # Tags used by list
+ 'ul','ol',
+ );
+ $listtags = array( # Tags that can appear in a list
+ 'li',
+ );
+
+ } else {
+ $htmlpairs = array();
+ $htmlsingle = array();
+ $htmlnest = array();
+ $tabletags = array();
+ }
+
+ $htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
+ $htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );
+
+ # Convert them all to hashtables for faster lookup
+ $vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
+ 'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
+ foreach ( $vars as $var ) {
+ $$var = array_flip( $$var );
+ }
+ $staticInitialised = true;
+ }
# Remove HTML comments
$text = Sanitizer::removeHTMLcomments( $text );
-
$bits = explode( '<', $text );
- $text = array_shift( $bits );
+ $text = str_replace( '>', '>', array_shift( $bits ) );
if(!$wgUseTidy) {
- $tagstack = array(); $tablestack = array();
+ $tagstack = $tablestack = array();
foreach ( $bits as $x ) {
- $prev = error_reporting( E_ALL & ~( E_NOTICE | E_WARNING ) );
- preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
- $x, $regs );
- list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
- error_reporting( $prev );
+ $regs = array();
+ if( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
+ list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
+ } else {
+ $slash = $t = $params = $brace = $rest = null;
+ }
$badtag = 0 ;
- if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
+ if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
# Check our stack
if ( $slash ) {
# Closing a tag...
- if( in_array( $t, $htmlsingleonly ) ) {
+ if( isset( $htmlsingleonly[$t] ) ) {
$badtag = 1;
} elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
- @array_push( $tagstack, $ot );
- $badtag = 1;
+ if ( isset( $htmlsingleallowed[$ot] ) ) {
+ # Pop all elements with an optional close tag
+ # and see if we find a match below them
+ $optstack = array();
+ array_push ($optstack, $ot);
+ while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
+ isset( $htmlsingleallowed[$ot] ) )
+ {
+ array_push ($optstack, $ot);
+ }
+ if ( $t != $ot ) {
+ # No match. Push the optinal elements back again
+ $badtag = 1;
+ while ( $ot = @array_pop( $optstack ) ) {
+ array_push( $tagstack, $ot );
+ }
+ }
+ } else {
+ @array_push( $tagstack, $ot );
+ # <li> can be nested in <ul> or <ol>, skip those cases:
+ if(!(isset( $htmllist[$ot] ) && isset( $listtags[$t] ) )) {
+ $badtag = 1;
+ }
+ }
} else {
if ( $t == 'table' ) {
$tagstack = array_pop( $tablestack );
}
- $newparams = '';
}
+ $newparams = '';
} else {
# Keep track for later
- if ( in_array( $t, $tabletags ) &&
+ if ( isset( $tabletags[$t] ) &&
! in_array( 'table', $tagstack ) ) {
$badtag = 1;
} else if ( in_array( $t, $tagstack ) &&
- ! in_array ( $t , $htmlnest ) ) {
+ ! isset( $htmlnest [$t ] ) ) {
$badtag = 1 ;
# Is it a self closed htmlpair ? (bug 5487)
} else if( $brace == '/>' &&
- in_array($t, $htmlpairs) ) {
+ isset( $htmlpairs[$t] ) ) {
$badtag = 1;
- } elseif( in_array( $t, $htmlsingleonly ) ) {
+ } elseif( isset( $htmlsingleonly[$t] ) ) {
# Hack to force empty tag for uncloseable elements
$brace = '/>';
- } else if( in_array( $t, $htmlsingle ) ) {
+ } else if( isset( $htmlsingle[$t] ) ) {
# Hack to not close $htmlsingle tags
$brace = NULL;
+ } else if( isset( $tabletags[$t] )
+ && in_array($t ,$tagstack) ) {
+ // New table tag but forgot to close the previous one
+ $text .= "</$t>";
} else {
if ( $t == 'table' ) {
array_push( $tablestack, $tagstack );
}
if ( ! $badtag ) {
$rest = str_replace( '>', '>', $rest );
- $close = ( $brace == '/>' ) ? ' /' : '';
+ $close = ( $brace == '/>' && !$slash ) ? ' /' : '';
$text .= "<$slash$t$newparams$close>$rest";
continue;
}
foreach ( $bits as $x ) {
preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
$x, $regs );
- @list( $qbar, $slash, $t, $params, $brace, $rest ) = $regs;
- if ( in_array( $t = strtolower( $t ), $htmlelements ) ) {
+ @list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
+ if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
if( is_callable( $processCallback ) ) {
call_user_func_array( $processCallback, array( &$params, $args ) );
}
}
}
}
- wfProfileOut( $fname );
+ wfProfileOut( __METHOD__ );
return $text;
}
* @param string $text
* @return string
*/
- function removeHTMLcomments( $text ) {
- $fname='Parser::removeHTMLcomments';
- wfProfileIn( $fname );
+ static function removeHTMLcomments( $text ) {
+ wfProfileIn( __METHOD__ );
while (($start = strpos($text, '<!--')) !== false) {
$end = strpos($text, '-->', $start + 4);
if ($end === false) {
$text = substr_replace($text, '', $start, $end - $start);
}
}
- wfProfileOut( $fname );
+ wfProfileOut( __METHOD__ );
return $text;
}
/**
- * Take a tag soup fragment listing an HTML element's attributes
- * and normalize it to well-formed XML, discarding unwanted attributes.
+ * Take an array of attribute names and values and normalize or discard
+ * illegal values for the given element type.
*
- * - Normalizes attribute names to lowercase
* - Discards attributes not on a whitelist for the given element
- * - Turns broken or invalid entities into plaintext
- * - Double-quotes all attribute values
- * - Attributes without values are given the name as attribute
- * - Double attributes are discarded
* - Unsafe style attributes are discarded
- * - Prepends space if there are attributes.
*
- * @param string $text
+ * @param array $attribs
* @param string $element
- * @return string
+ * @return array
*
* @todo Check for legal values where the DTD limits things.
* @todo Check for unique id attribute :P
*/
- function fixTagAttributes( $text, $element ) {
- if( trim( $text ) == '' ) {
- return '';
- }
-
- # Unquoted attribute
- # Since we quote this later, this can be anything distinguishable
- # from the end of the attribute
- $pairs = array();
- if( !preg_match_all(
- MW_ATTRIBS_REGEX,
- $text,
- $pairs,
- PREG_SET_ORDER ) ) {
- return '';
- }
-
+ static function validateTagAttributes( $attribs, $element ) {
$whitelist = array_flip( Sanitizer::attributeWhitelist( $element ) );
- $attribs = array();
- foreach( $pairs as $set ) {
- $attribute = strtolower( $set[1] );
+ $out = array();
+ foreach( $attribs as $attribute => $value ) {
if( !isset( $whitelist[$attribute] ) ) {
continue;
}
-
- $raw = Sanitizer::getTagAttributeCallback( $set );
- $value = Sanitizer::normalizeAttributeValue( $raw );
-
# Strip javascript "expression" from stylesheets.
# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
if( $attribute == 'style' ) {
- $stripped = Sanitizer::decodeCharReferences( $value );
-
- // Remove any comments; IE gets token splitting wrong
- $stripped = preg_replace( '!/\\*.*?\\*/!S', ' ', $stripped );
- $value = htmlspecialchars( $stripped );
-
- // ... and continue checks
- $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
- 'codepointToUtf8(hexdec("$1"))', $stripped );
- $stripped = str_replace( '\\', '', $stripped );
- if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
- $stripped ) ) {
+ $value = Sanitizer::checkCss( $value );
+ if( $value === false ) {
# haxx0r
continue;
}
if ( $attribute === 'id' )
$value = Sanitizer::escapeId( $value );
- # Templates and links may be expanded in later parsing,
- # creating invalid or dangerous output. Suppress this.
- $value = strtr( $value, array(
- '<' => '<', // This should never happen,
- '>' => '>', // we've received invalid input
- '"' => '"', // which should have been escaped.
- '{' => '{',
- '[' => '[',
- "''" => '''',
- 'ISBN' => 'ISBN',
- 'RFC' => 'RFC',
- 'PMID' => 'PMID',
- ) );
-
- # Stupid hack
- $value = preg_replace_callback(
- '/(' . wfUrlProtocols() . ')/',
- array( 'Sanitizer', 'armorLinksCallback' ),
- $value );
-
// If this attribute was previously set, override it.
// Output should only have one attribute of each name.
- $attribs[$attribute] = "$attribute=\"$value\"";
+ $out[$attribute] = $value;
+ }
+ return $out;
+ }
+
+ /**
+ * Pick apart some CSS and check it for forbidden or unsafe structures.
+ * Returns a sanitized string, or false if it was just too evil.
+ *
+ * Currently URL references, 'expression', 'tps' are forbidden.
+ *
+ * @param string $value
+ * @return mixed
+ */
+ static function checkCss( $value ) {
+ $stripped = Sanitizer::decodeCharReferences( $value );
+
+ // Remove any comments; IE gets token splitting wrong
+ $stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );
+
+ $value = $stripped;
+
+ // ... and continue checks
+ $stripped = preg_replace( '!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!e',
+ 'codepointToUtf8(hexdec("$1"))', $stripped );
+ $stripped = str_replace( '\\', '', $stripped );
+ if( preg_match( '/(expression|tps*:\/\/|url\\s*\().*/is',
+ $stripped ) ) {
+ # haxx0r
+ return false;
}
+
+ return $value;
+ }
+ /**
+ * Take a tag soup fragment listing an HTML element's attributes
+ * and normalize it to well-formed XML, discarding unwanted attributes.
+ * Output is safe for further wikitext processing, with escaping of
+ * values that could trigger problems.
+ *
+ * - Normalizes attribute names to lowercase
+ * - Discards attributes not on a whitelist for the given element
+ * - Turns broken or invalid entities into plaintext
+ * - Double-quotes all attribute values
+ * - Attributes without values are given the name as attribute
+ * - Double attributes are discarded
+ * - Unsafe style attributes are discarded
+ * - Prepends space if there are attributes.
+ *
+ * @param string $text
+ * @param string $element
+ * @return string
+ */
+ static function fixTagAttributes( $text, $element ) {
+ if( trim( $text ) == '' ) {
+ return '';
+ }
+
+ $stripped = Sanitizer::validateTagAttributes(
+ Sanitizer::decodeTagAttributes( $text ), $element );
+
+ $attribs = array();
+ foreach( $stripped as $attribute => $value ) {
+ $encAttribute = htmlspecialchars( $attribute );
+ $encValue = Sanitizer::safeEncodeAttribute( $value );
+
+ $attribs[] = "$encAttribute=\"$encValue\"";
+ }
return count( $attribs ) ? ' ' . implode( ' ', $attribs ) : '';
}
+ /**
+ * Encode an attribute value for HTML output.
+ * @param $text
+ * @return HTML-encoded text fragment
+ */
+ static function encodeAttribute( $text ) {
+ $encValue = htmlspecialchars( $text );
+
+ // Whitespace is normalized during attribute decoding,
+ // so if we've been passed non-spaces we must encode them
+ // ahead of time or they won't be preserved.
+ $encValue = strtr( $encValue, array(
+ "\n" => ' ',
+ "\r" => ' ',
+ "\t" => '	',
+ ) );
+
+ return $encValue;
+ }
+
+ /**
+ * Encode an attribute value for HTML tags, with extra armoring
+ * against further wiki processing.
+ * @param $text
+ * @return HTML-encoded text fragment
+ */
+ static function safeEncodeAttribute( $text ) {
+ $encValue = Sanitizer::encodeAttribute( $text );
+
+ # Templates and links may be expanded in later parsing,
+ # creating invalid or dangerous output. Suppress this.
+ $encValue = strtr( $encValue, array(
+ '<' => '<', // This should never happen,
+ '>' => '>', // we've received invalid input
+ '"' => '"', // which should have been escaped.
+ '{' => '{',
+ '[' => '[',
+ "''" => '''',
+ 'ISBN' => 'ISBN',
+ 'RFC' => 'RFC',
+ 'PMID' => 'PMID',
+ '|' => '|',
+ '__' => '__',
+ ) );
+
+ # Stupid hack
+ $encValue = preg_replace_callback(
+ '/(' . wfUrlProtocols() . ')/',
+ array( 'Sanitizer', 'armorLinksCallback' ),
+ $encValue );
+ return $encValue;
+ }
+
/**
* Given a value escape it so that it can be used in an id attribute and
* return it, this does not validate the value however (see first link)
* @param string $id
* @return string
*/
- function escapeId( $id ) {
+ static function escapeId( $id ) {
static $replace = array(
'%3A' => ':',
'%' => '.'
return str_replace( array_keys( $replace ), array_values( $replace ), $id );
}
+ /**
+ * Given a value, escape it so that it can be used as a CSS class and
+ * return it.
+ *
+ * @todo For extra validity, input should be validated UTF-8.
+ *
+ * @link http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
+ *
+ * @param string $class
+ * @return string
+ */
+ static function escapeClass( $class ) {
+ // Convert ugly stuff to underscores and kill underscores in ugly places
+ return rtrim(preg_replace(
+ array('/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/','/_+/'),
+ '_',
+ $class ), '_');
+ }
+
/**
* Regex replace callback for armoring links against further processing.
* @param array $matches
* @return string
* @private
*/
- function armorLinksCallback( $matches ) {
+ private static function armorLinksCallback( $matches ) {
return str_replace( ':', ':', $matches[1] );
}
* @param string
* @return array
*/
- function decodeTagAttributes( $text ) {
+ static function decodeTagAttributes( $text ) {
$attribs = array();
if( trim( $text ) == '' ) {
foreach( $pairs as $set ) {
$attribute = strtolower( $set[1] );
$value = Sanitizer::getTagAttributeCallback( $set );
+
+ // Normalize whitespace
+ $value = preg_replace( '/[\t\r\n ]+/', ' ', $value );
+ $value = trim( $value );
+
+ // Decode character references
$attribs[$attribute] = Sanitizer::decodeCharReferences( $value );
}
return $attribs;
* @return string
* @private
*/
- function getTagAttributeCallback( $set ) {
+ private static function getTagAttributeCallback( $set ) {
if( isset( $set[6] ) ) {
# Illegal #XXXXXX color with no quotes.
return $set[6];
# For 'reduced' form, return explicitly the attribute name here.
return $set[1];
} else {
- wfDebugDieBacktrace( "Tag conditions not met. This should never happen and is a bug." );
+ throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
}
}
* @return string
* @private
*/
- function normalizeAttributeValue( $text ) {
+ private static function normalizeAttributeValue( $text ) {
return str_replace( '"', '"',
preg_replace(
'/\r\n|[\x20\x0d\x0a\x09]/',
* @return string
* @private
*/
- function normalizeCharReferences( $text ) {
+ static function normalizeCharReferences( $text ) {
return preg_replace_callback(
MW_CHAR_REFS_REGEX,
array( 'Sanitizer', 'normalizeCharReferencesCallback' ),
* @param string $matches
* @return string
*/
- function normalizeCharReferencesCallback( $matches ) {
+ static function normalizeCharReferencesCallback( $matches ) {
$ret = null;
if( $matches[1] != '' ) {
$ret = Sanitizer::normalizeEntity( $matches[1] );
*
* @param string $name
* @return string
+ * @static
*/
- function normalizeEntity( $name ) {
+ static function normalizeEntity( $name ) {
global $wgHtmlEntities;
if( isset( $wgHtmlEntities[$name] ) ) {
return "&$name;";
}
}
- function decCharReference( $codepoint ) {
+ static function decCharReference( $codepoint ) {
$point = intval( $codepoint );
if( Sanitizer::validateCodepoint( $point ) ) {
return sprintf( '&#%d;', $point );
}
}
- function hexCharReference( $codepoint ) {
+ static function hexCharReference( $codepoint ) {
$point = hexdec( $codepoint );
if( Sanitizer::validateCodepoint( $point ) ) {
return sprintf( '&#x%x;', $point );
* @param int $codepoint
* @return bool
*/
- function validateCodepoint( $codepoint ) {
+ private static function validateCodepoint( $codepoint ) {
return ($codepoint == 0x09)
|| ($codepoint == 0x0a)
|| ($codepoint == 0x0d)
* @param string $text
* @return string
* @public
+ * @static
*/
- function decodeCharReferences( $text ) {
+ public static function decodeCharReferences( $text ) {
return preg_replace_callback(
MW_CHAR_REFS_REGEX,
array( 'Sanitizer', 'decodeCharReferencesCallback' ),
* @param string $matches
* @return string
*/
- function decodeCharReferencesCallback( $matches ) {
+ static function decodeCharReferencesCallback( $matches ) {
if( $matches[1] != '' ) {
return Sanitizer::decodeEntity( $matches[1] );
} elseif( $matches[2] != '' ) {
* @return string
* @private
*/
- function decodeChar( $codepoint ) {
+ static function decodeChar( $codepoint ) {
if( Sanitizer::validateCodepoint( $codepoint ) ) {
return codepointToUtf8( $codepoint );
} else {
* @param string $name
* @return string
*/
- function decodeEntity( $name ) {
+ static function decodeEntity( $name ) {
global $wgHtmlEntities;
if( isset( $wgHtmlEntities[$name] ) ) {
return codepointToUtf8( $wgHtmlEntities[$name] );
* @param string $element
* @return array
*/
- function attributeWhitelist( $element ) {
+ static function attributeWhitelist( $element ) {
static $list;
if( !isset( $list ) ) {
$list = Sanitizer::setupAttributeWhitelist();
}
/**
+ * @todo Document it a bit
* @return array
*/
- function setupAttributeWhitelist() {
+ static function setupAttributeWhitelist() {
$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
$block = array_merge( $common, array( 'align' ) );
$tablealign = array( 'align', 'char', 'charoff', 'valign' );
# 11.2.1
'table' => array_merge( $common,
array( 'summary', 'width', 'border', 'frame',
- 'rules', 'cellspacing', 'cellpadding',
- 'align', 'bgcolor', 'frame', 'rules',
- 'border' ) ),
+ 'rules', 'cellspacing', 'cellpadding',
+ 'align', 'bgcolor',
+ ) ),
# 11.2.2
'caption' => array_merge( $common, array( 'align' ) ),
* @param string $text HTML fragment
* @return string
*/
- function stripAllTags( $text ) {
+ static function stripAllTags( $text ) {
# Actual <tags>
- $text = preg_replace( '/ < .*? > /x', '', $text );
+ $text = StringUtils::delimiterReplace( '<', '>', '', $text );
# Normalize &entities and whitespace
$text = Sanitizer::normalizeAttributeValue( $text );
* @return string
* @static
*/
- function hackDocType() {
+ static function hackDocType() {
global $wgHtmlEntities;
$out = "<!DOCTYPE html [\n";
foreach( $wgHtmlEntities as $entity => $codepoint ) {
return $out;
}
+ static function cleanUrl( $url, $hostname=true ) {
+ # Normalize any HTML entities in input. They will be
+ # re-escaped by makeExternalLink().
+ $url = Sanitizer::decodeCharReferences( $url );
+
+ # Escape any control characters introduced by the above step
+ $url = preg_replace( '/[\][<>"\\x00-\\x20\\x7F]/e', "urlencode('\\0')", $url );
+
+ # Validate hostname portion
+ $matches = array();
+ if( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
+ list( /* $whole */, $protocol, $host, $rest ) = $matches;
+
+ // Characters that will be ignored in IDNs.
+ // http://tools.ietf.org/html/3454#section-3.1
+ // Strip them before further processing so blacklists and such work.
+ $strip = "/
+ \\s| # general whitespace
+ \xc2\xad| # 00ad SOFT HYPHEN
+ \xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
+ \xe2\x80\x8b| # 200b ZERO WIDTH SPACE
+ \xe2\x81\xa0| # 2060 WORD JOINER
+ \xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
+ \xcd\x8f| # 034f COMBINING GRAPHEME JOINER
+ \xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
+ \xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
+ \xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
+ \xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
+ \xe2\x80\x8d| # 200d ZERO WIDTH JOINER
+ [\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
+ /xuD";
+
+ $host = preg_replace( $strip, '', $host );
+
+ // @fixme: validate hostnames here
+
+ return $protocol . $host . $rest;
+ } else {
+ return $url;
+ }
+ }
+
}
?>