|&\#[xX]([0-9A-Fa-f]+);
|(&)/x';
+ /**
+ * Blacklist for evil uris like javascript:
+ * WARNING: DO NOT use this in any place that actually requires blacklisting
+ * for security reasons. There are NUMEROUS[1] ways to bypass blacklisting, the
+ * only way to be secure from javascript: uri based xss vectors is to whitelist
+ * things that you know are safe and deny everything else.
+ * [1]: http://ha.ckers.org/xss.html
+ */
const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i';
+ /**
+  * Pattern matching a string made of the literal prefix "xmlns:" followed by
+  * one or more colons, ASCII letters, underscores, hyphens, dots or digits.
+  */
const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/";
/**
* List of all named character entities defined in HTML 4.01
* http://www.w3.org/TR/html4/sgml/entities.html
+ * As well as &apos; which is only defined starting in XHTML1.
* @private
*/
static $htmlEntities = array(
'amp' => 38,
'and' => 8743,
'ang' => 8736,
+ 'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE.
'Aring' => 197,
'aring' => 229,
'asymp' => 8776,
'strike', 'strong', 'tt', 'var', 'div', 'center',
'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'abbr', 'dfn',
- 'kbd', 'samp', 'thead', 'tbody', 'tfoot'
+ 'kbd', 'samp'
);
$htmlsingle = array(
'br', 'hr', 'li', 'dt', 'dd'
return $text;
}
+ /**
+  * Take an array of attribute names and values and convert deprecated
+  * presentational attributes into inline css for the given element type.
+  *
+  * Each recognised attribute (align, clear, height, nowrap, size, type,
+  * valign, width) that is present on an element it applies to is removed
+  * from the attribute list and re-expressed as a declaration in the "style"
+  * attribute. Generated declarations are placed before any existing "style"
+  * value so that user css can still override them.
+  *
+  * This does not validate properties: attribute values are copied verbatim
+  * into the style string, so you should ensure that you call
+  * validateTagAttributes AFTER this to ensure that the resulting style rule
+  * this may add is safe.
+  *
+  * Nothing is changed unless both $wgHtml5 and
+  * $wgCleanupPresentationalAttributes are enabled.
+  *
+  * @param $attribs Array Attribute names mapped to their values
+  * @param $element String Name of the element the attributes belong to
+  * @return Array The attributes, with presentational ones folded into "style"
+  */
+ static function fixDeprecatedAttributes( $attribs, $element ) {
+     global $wgHtml5, $wgCleanupPresentationalAttributes;
+
+     // Presentational attributes were removed from html5; we can leave them
+     // in when html5 (or this cleanup) is turned off.
+     if ( !$wgHtml5 || !$wgCleanupPresentationalAttributes ) {
+         return $attribs;
+     }
+
+     $table = array( 'table' );
+     $cells = array( 'td', 'th' );
+     $colls = array( 'col', 'colgroup' );
+     $tblocks = array( 'tbody', 'tfoot', 'thead' );
+     $h = array( 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' );
+
+     // attribute => array( css property, elements the attribute applies to )
+     $presentationalAttribs = array(
+         'align' => array( 'text-align', array_merge( array( 'caption', 'hr', 'div', 'p', 'tr' ), $table, $cells, $colls, $tblocks, $h ) ),
+         'clear' => array( 'clear', array( 'br' ) ),
+         'height' => array( 'height', $cells ),
+         'nowrap' => array( 'white-space', $cells ),
+         'size' => array( 'height', array( 'hr' ) ),
+         'type' => array( 'list-style-type', array( 'li', 'ol', 'ul' ) ),
+         'valign' => array( 'vertical-align', array_merge( $cells, $colls, $tblocks ) ),
+         'width' => array( 'width', array_merge( array( 'hr', 'pre' ), $table, $cells, $colls ) ),
+     );
+
+     // Size based attributes: a value without a unit gets "px" appended.
+     $sizeAttribs = array( 'height', 'width', 'size' );
+
+     $style = '';
+     foreach ( $presentationalAttribs as $attribute => $info ) {
+         list( $property, $elements ) = $info;
+
+         // Skip if this attribute is not relevant to this element
+         if ( !in_array( $element, $elements, true ) ) {
+             continue;
+         }
+
+         // Skip if the attribute is not used
+         if ( !array_key_exists( $attribute, $attribs ) ) {
+             continue;
+         }
+
+         $value = $attribs[$attribute];
+
+         if ( $attribute === 'nowrap' ) {
+             // For nowrap the value should be nowrap instead of whatever
+             // text is in the value
+             $value = 'nowrap';
+         } elseif ( in_array( $attribute, $sizeAttribs, true )
+             // \z instead of $ so that a value ending in a newline is not
+             // mistaken for a bare number and turned into "10\npx"
+             && preg_match( '/^[\d.]+\z/', $value )
+         ) {
+             $value = "{$value}px";
+         }
+
+         // The presentational attribute is consumed whether or not it
+         // yields a declaration.
+         unset( $attribs[$attribute] );
+
+         // An empty value would only produce an invalid declaration such as
+         // "text-align: ;", so emit nothing for it.
+         if ( trim( (string)$value ) === '' ) {
+             continue;
+         }
+
+         $style .= " $property: $value;";
+     }
+
+     if ( $style !== '' ) {
+         // Prepend our style rules so that they can be overridden by user css
+         if ( isset( $attribs['style'] ) ) {
+             $style .= " " . $attribs['style'];
+         }
+         $attribs['style'] = trim( $style );
+     }
+
+     return $attribs;
+ }
+
/**
* Take an array of attribute names and values and normalize or discard
* illegal values for the given element type.
return $value;
}
+ /**
+ * @param $matches array
+ * @return String
+ */
static function cssDecodeCallback( $matches ) {
if ( $matches[1] !== '' ) {
// Line continuation
return '';
}
- $stripped = Sanitizer::validateTagAttributes(
- Sanitizer::decodeTagAttributes( $text ), $element );
+ $decoded = Sanitizer::decodeTagAttributes( $text );
+ $decoded = Sanitizer::fixDeprecatedAttributes( $decoded, $element );
+ $stripped = Sanitizer::validateTagAttributes( $decoded, $element );
$attribs = array();
foreach( $stripped as $attribute => $value ) {
Sanitizer::normalizeCharReferences( $text ) ) );
}
+ /**
+ * @param $text string
+ * @return mixed
+ */
private static function normalizeWhitespace( $text ) {
return preg_replace(
'/\r\n|[\x20\x0d\x0a\x09]/',
}
}
+ /**
+ * @param $codepoint
+ * @return null|string
+ */
static function decCharReference( $codepoint ) {
$point = intval( $codepoint );
if( Sanitizer::validateCodepoint( $point ) ) {
}
}
+ /**
+ * @param $codepoint
+ * @return null|string
+ */
static function hexCharReference( $codepoint ) {
$point = hexdec( $codepoint );
if( Sanitizer::validateCodepoint( $point ) ) {
* return the UTF-8 encoding of that character. Otherwise, returns
* pseudo-entity source (eg &foo;)
*
- * @param $name Strings
+ * @param $name String
* @return String
*/
static function decodeEntity( $name ) {
return $out;
}
+ /**
+ * @param $url string
+ * @return mixed|string
+ */
static function cleanUrl( $url ) {
# Normalize any HTML entities in input. They will be
# re-escaped by makeExternalLink().
$host = preg_replace( $strip, '', $host );
- // @todo Fixme: validate hostnames here
+ // @todo FIXME: Validate hostnames here
return $protocol . $host . $rest;
} else {
}
}
+ /**
+  * Percent-encode the first element of a match array.
+  *
+  * @param $matches array Match array; only element 0 is read
+  * @return string $matches[0] passed through urlencode()
+  */
static function cleanUrlCallback( $matches ) {
    $matched = $matches[0];
    return urlencode( $matched );
}
+
+ /**
+  * Does a string look like an e-mail address?
+  *
+  * This validates an email address using an HTML5 specification found at:
+  * http://www.whatwg.org/specs/web-apps/current-work/multipage/states-of-the-type-attribute.html#valid-e-mail-address
+  * Which as of 2011-01-24 says:
+  *
+  *     A valid e-mail address is a string that matches the ABNF production
+  *     1*( atext / "." ) "@" ldh-str *( "." ldh-str ) where atext is defined
+  *     in RFC 5322 section 3.2.3, and ldh-str is defined in RFC 1034 section
+  *     3.5.
+  *
+  * This function is an implementation of the specification as requested in
+  * bug 22449.
+  *
+  * Client-side forms will use the same standard validation rules via JS or
+  * HTML 5 validation; additional restrictions can be enforced server-side
+  * by extensions via the 'isValidEmailAddr' hook.
+  *
+  * Note that this validation doesn't 100% match RFC 2822, but is believed
+  * to be liberal enough for wide use. Some invalid addresses will still
+  * pass validation here.
+  *
+  * The whole string must match: nothing, not even a single trailing newline,
+  * may follow the last domain label.
+  *
+  * @since 1.18
+  *
+  * @param $addr String E-mail address
+  * @return Bool True if the address matches the pattern. When an
+  *   'isValidEmailAddr' hook handler aborts the hook, whatever that handler
+  *   stored in $result is returned instead (null if it stored nothing).
+  */
+ public static function validateEmail( $addr ) {
+     // Let extensions decide first; a handler that aborts the hook supplies
+     // the verdict through $result.
+     $result = null;
+     if ( !wfRunHooks( 'isValidEmailAddr', array( $addr, &$result ) ) ) {
+         return $result;
+     }
+
+     // Please note the strings below are spliced into bracket expressions
+     // [], where a bare hyphen "-" would act as a range indicator. Hence it
+     // is double backslashed below.
+     // See bug 26948
+     $rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
+     $rfc1034_ldh_str = "a-z0-9\\-";
+
+     // The pattern ends with \z rather than a dollar anchor, because the
+     // latter also matches just before a final newline and would let
+     // "user@example.org\n" through.
+     $HTML5_email_regexp = "/
+         ^                          # start of string
+         [$rfc5322_atext\\.]+       # user part which is liberal :p
+         @                          # at sign between user part and domain
+         [$rfc1034_ldh_str]+        # First domain part
+         (\\.[$rfc1034_ldh_str]+)*  # Following part prefixed with a dot
+         \\z                        # very end of string, no trailing newline
+     /ix"; // case Insensitive, eXtended
+
+     return (bool)preg_match( $HTML5_email_regexp, $addr );
+ }
}