Accept 'OK' status results from search engine

[lhc/web/wiklou.git] / includes / Sanitizer.php
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php

index d321e9f..44e4e3e 100644 (file)
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -41,7 +41,7 @@ class Sanitizer {
  
         /**
          * Acceptable tag name charset from HTML5 parsing spec
-        * http://www.w3.org/TR/html5/syntax.html#tag-open-state
+        * https://www.w3.org/TR/html5/syntax.html#tag-open-state
          */
         const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!';
  
@@ -58,7 +58,7 @@ class Sanitizer {
  
         /**
          * List of all named character entities defined in HTML 4.01
-        * http://www.w3.org/TR/html4/sgml/entities.html
+        * https://www.w3.org/TR/html4/sgml/entities.html
          * As well as &apos; which is only defined starting in XHTML1.
          */
         private static $htmlEntities = [
@@ -333,7 +333,7 @@ class Sanitizer {
         /**
          * Regular expression to match HTML/XML attribute pairs within a tag.
          * Allows some... latitude. Based on,
-        * http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
+        * https://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
          * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
          * @return string
          */
@@ -381,14 +381,17 @@ class Sanitizer {
                                 'kbd', 'samp', 'data', 'time', 'mark'
                         ];
                         $htmlsingle = [
-                               'br', 'wbr', 'hr', 'li', 'dt', 'dd'
-                       ];
-                       $htmlsingleonly = [ # Elements that cannot have close tags
-                               'br', 'wbr', 'hr'
+                               'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
                         ];
  
-                       $htmlsingle[] = $htmlsingleonly[] = 'meta';
-                       $htmlsingle[] = $htmlsingleonly[] = 'link';
+                       # Elements that cannot have close tags. This is (not coincidentally)
+                       # also the list of tags for which the HTML 5 parsing algorithm
+                       # requires you to "acknowledge the token's self-closing flag", i.e.
+                       # a self-closing tag like <br/> is not an HTML 5 parse error only
+                       # for this list.
+                       $htmlsingleonly = [
+                               'br', 'wbr', 'hr', 'meta', 'link'
+                       ];
  
                         $htmlnest = [ # Tags that can be nested--??
                                 'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
@@ -450,10 +453,14 @@ class Sanitizer {
          * @param array|bool $args Arguments for the processing callback
          * @param array $extratags For any extra tags to include
          * @param array $removetags For any tags (default or extra) to exclude
+        * @param callable $warnCallback (Deprecated) Callback allowing the
+        *   addition of a tracking category when bad input is encountered.
+        *   DO NOT ADD NEW PARAMETERS AFTER $warnCallback, since it will be
+        *   removed shortly.
          * @return string
          */
         public static function removeHTMLtags( $text, $processCallback = null,
-               $args = [], $extratags = [], $removetags = []
+               $args = [], $extratags = [], $removetags = [], $warnCallback = null
         ) {
                 extract( self::getRecognizedTagData( $extratags, $removetags ) );
  
@@ -540,6 +547,14 @@ class Sanitizer {
                                                         $badtag = true;
                                                 #  Is it a self closed htmlpair ? (bug 5487)
                                                 } elseif ( $brace == '/>' && isset( $htmlpairs[$t] ) ) {
+                                                       // Eventually we'll just remove the self-closing
+                                                       // slash, in order to be consistent with HTML5
+                                                       // semantics.
+                                                       // $brace = '>';
+                                                       // For now, let's just warn authors to clean up.
+                                                       if ( is_callable( $warnCallback ) ) {
+                                                               call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
+                                                       }
                                                         $badtag = true;
                                                 } elseif ( isset( $htmlsingleonly[$t] ) ) {
                                                         # Hack to force empty tag for unclosable elements
@@ -604,12 +619,29 @@ class Sanitizer {
                                                         call_user_func_array( $processCallback, [ &$params, $args ] );
                                                 }
  
+                                               if ( $brace == '/>' && !( isset( $htmlsingle[$t] ) || isset( $htmlsingleonly[$t] ) ) ) {
+                                                       // Eventually we'll just remove the self-closing
+                                                       // slash, in order to be consistent with HTML5
+                                                       // semantics.
+                                                       // $brace = '>';
+                                                       // For now, let's just warn authors to clean up.
+                                                       if ( is_callable( $warnCallback ) ) {
+                                                               call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
+                                                       }
+                                               }
                                                 if ( !Sanitizer::validateTag( $params, $t ) ) {
                                                         $badtag = true;
                                                 }
  
                                                 $newparams = Sanitizer::fixTagAttributes( $params, $t );
                                                 if ( !$badtag ) {
+                                                       if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) {
+                                                               # Interpret self-closing tags as empty tags even when
+                                                               # HTML 5 would interpret them as start tags. Such input
+                                                               # is commonly seen on Wikimedia wikis with this intention.
+                                                               $brace = "></$t>";
+                                                       }
+
                                                         $rest = str_replace( '>', '&gt;', $rest );
                                                         $text .= "<$slash$t$newparams$brace$rest";
                                                         continue;
@@ -983,6 +1015,7 @@ class Sanitizer {
                                 | url\s*\(
                                 | image\s*\(
                                 | image-set\s*\(
+                               | attr\s*\([^)]+[\s,]+url
                         !ix', $value ) ) {
                         return '/* insecure input */';
                 }
@@ -1028,12 +1061,14 @@ class Sanitizer {
          * - Double attributes are discarded
          * - Unsafe style attributes are discarded
          * - Prepends space if there are attributes.
+        * - (Optionally) Sorts attributes by name.
          *
          * @param string $text
          * @param string $element
+        * @param bool $sorted Whether to sort the attributes (default: false)
          * @return string
          */
-       static function fixTagAttributes( $text, $element ) {
+       static function fixTagAttributes( $text, $element, $sorted = false ) {
                 if ( trim( $text ) == '' ) {
                         return '';
                 }
@@ -1041,6 +1076,10 @@ class Sanitizer {
                 $decoded = Sanitizer::decodeTagAttributes( $text );
                 $stripped = Sanitizer::validateTagAttributes( $decoded, $element );
  
+               if ( $sorted ) {
+                       ksort( $stripped );
+               }
+
                 return Sanitizer::safeEncodeTagAttributes( $stripped );
         }
  
@@ -1110,11 +1149,11 @@ class Sanitizer {
          * ambiguous if it's part of something that looks like a percent escape
          * (which don't work reliably in fragments cross-browser).
          *
-        * @see http://www.w3.org/TR/html401/types.html#type-name Valid characters
+        * @see https://www.w3.org/TR/html401/types.html#type-name Valid characters
          *   in the id and name attributes
-        * @see http://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with
+        * @see https://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with
          *   the id attribute
-        * @see http://www.whatwg.org/html/elements.html#the-id-attribute
+        * @see https://www.w3.org/TR/html5/dom.html#the-id-attribute
          *   HTML5 definition of id attribute
          *
          * @param string $id Id to escape
@@ -1200,7 +1239,7 @@ class Sanitizer {
          *
          * @todo For extra validity, input should be validated UTF-8.
          *
-        * @see http://www.w3.org/TR/CSS21/syndata.html Valid characters/format
+        * @see https://www.w3.org/TR/CSS21/syndata.html Valid characters/format
          *
          * @param string $class
          * @return string
@@ -1313,7 +1352,7 @@ class Sanitizer {
                 } elseif ( !isset( $set[2] ) ) {
                         # In XHTML, attributes must have a value so return an empty string.
                         # See "Empty attribute syntax",
-                       # http://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
+                       # https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
                         return "";
                 } else {
                         throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
@@ -1583,7 +1622,7 @@ class Sanitizer {
  
                         # RDFa
                         # These attributes are specified in section 9 of
-                       # http://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
+                       # https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
                         'about',
                         'property',
                         'resource',
@@ -1591,7 +1630,7 @@ class Sanitizer {
                         'typeof',
  
                         # Microdata. These are specified by
-                       # http://www.whatwg.org/html/microdata.html#the-microdata-model
+                       # https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
                         'itemid',
                         'itemprop',
                         'itemref',
@@ -1615,7 +1654,7 @@ class Sanitizer {
                 ];
  
                 # Numbers refer to sections in HTML 4.01 standard describing the element.
-               # See: http://www.w3.org/TR/html4/
+               # See: https://www.w3.org/TR/html4/
                 $whitelist = [
                         # 7.5.4
                         'div'        => $block,
@@ -1662,7 +1701,7 @@ class Sanitizer {
                         # 9.3.2
                         'br'         => array_merge( $common, [ 'clear' ] ),
  
-                       # http://www.whatwg.org/html/text-level-semantics.html#the-wbr-element
+                       # https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
                         'wbr'        => $common,
  
                         # 9.3.4
@@ -1737,7 +1776,7 @@ class Sanitizer {
                         'hr'         => array_merge( $common, [ 'width' ] ),
  
                         # HTML Ruby annotation text module, simple ruby only.
-                       # http://www.whatwg.org/html/text-level-semantics.html#the-ruby-element
+                       # https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
                         'ruby'       => $common,
                         # rbc
                         'rb'         => $common,
@@ -1747,14 +1786,14 @@ class Sanitizer {
  
                         # MathML root element, where used for extensions
                         # 'title' may not be 100% valid here; it's XHTML
-                       # http://www.w3.org/TR/REC-MathML/
+                       # https://www.w3.org/TR/REC-MathML/
                         'math'       => [ 'class', 'style', 'id', 'title' ],
  
                         # HTML 5 section 4.6
                         'bdi' => $common,
  
                         # HTML5 elements, defined by:
-                       # http://www.whatwg.org/html/
+                       # https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
                         'data' => array_merge( $common, [ 'value' ] ),
                         'time' => array_merge( $common, [ 'datetime' ] ),
                         'mark' => $common,
@@ -1829,7 +1868,7 @@ class Sanitizer {
                         list( /* $whole */, $protocol, $host, $rest ) = $matches;
  
                         // Characters that will be ignored in IDNs.
-                       // http://tools.ietf.org/html/3454#section-3.1
+                       // https://tools.ietf.org/html/rfc3454#section-3.1
                         // Strip them before further processing so blacklists and such work.
                         $strip = "/
                                 \\s|          # general whitespace