Merge "RCFilters UI: Anchor the highlight popup"
author jenkins-bot <jenkins-bot@gerrit.wikimedia.org>
Wed, 8 Mar 2017 17:26:05 +0000 (17:26 +0000)
committer Gerrit Code Review <gerrit@wikimedia.org>
Wed, 8 Mar 2017 17:26:05 +0000 (17:26 +0000)
autoload.php
composer.json
includes/export/XmlDumpWriter.php
includes/tidy/RemexCompatFormatter.php [new file with mode: 0644]
includes/tidy/RemexCompatMunger.php [new file with mode: 0644]
includes/tidy/RemexDriver.php [new file with mode: 0644]
includes/tidy/RemexMungerData.php [new file with mode: 0644]
includes/title/NamespaceAwareForeignTitleFactory.php
tests/phpunit/includes/tidy/RemexDriverTest.php [new file with mode: 0644]
tests/phpunit/includes/title/NamespaceAwareForeignTitleFactoryTest.php

index a16451d..f96d898 100644 (file)
@@ -918,6 +918,10 @@ $wgAutoloadLocalClasses = [
        'MediaWiki\\Tidy\\RaggettInternalHHVM' => __DIR__ . '/includes/tidy/RaggettInternalHHVM.php',
        'MediaWiki\\Tidy\\RaggettInternalPHP' => __DIR__ . '/includes/tidy/RaggettInternalPHP.php',
        'MediaWiki\\Tidy\\RaggettWrapper' => __DIR__ . '/includes/tidy/RaggettWrapper.php',
+       'MediaWiki\\Tidy\\RemexCompatFormatter' => __DIR__ . '/includes/tidy/RemexCompatFormatter.php',
+       'MediaWiki\\Tidy\\RemexCompatMunger' => __DIR__ . '/includes/tidy/RemexCompatMunger.php',
+       'MediaWiki\\Tidy\\RemexDriver' => __DIR__ . '/includes/tidy/RemexDriver.php',
+       'MediaWiki\\Tidy\\RemexMungerData' => __DIR__ . '/includes/tidy/RemexMungerData.php',
        'MediaWiki\\Tidy\\TidyDriverBase' => __DIR__ . '/includes/tidy/TidyDriverBase.php',
        'MediaWiki\\Widget\\ComplexNamespaceInputWidget' => __DIR__ . '/includes/widget/ComplexNamespaceInputWidget.php',
        'MediaWiki\\Widget\\ComplexTitleInputWidget' => __DIR__ . '/includes/widget/ComplexTitleInputWidget.php',
index 6a09d1a..ce38914 100644 (file)
@@ -38,6 +38,7 @@
                "wikimedia/ip-set": "1.1.0",
                "wikimedia/php-session-serializer": "1.0.4",
                "wikimedia/relpath": "1.0.3",
+               "wikimedia/remex-html": "1.0.0",
                "wikimedia/running-stat": "1.1.0",
                "wikimedia/scoped-callback": "1.0.0",
                "wikimedia/utfnormal": "1.1.0",
index 52bf0f0..5a1f92c 100644 (file)
@@ -433,6 +433,9 @@ class XmlDumpWriter {
                global $wgContLang;
                $prefix = $wgContLang->getFormattedNsText( $title->getNamespace() );
 
+               // @todo Emit some kind of warning to the user if $title->getNamespace() !==
+               // NS_MAIN and $prefix === '' (viz. pages in an unregistered namespace)
+
                if ( $prefix !== '' ) {
                        $prefix .= ':';
                }
diff --git a/includes/tidy/RemexCompatFormatter.php b/includes/tidy/RemexCompatFormatter.php
new file mode 100644 (file)
index 0000000..3dc727b
--- /dev/null
@@ -0,0 +1,71 @@
+<?php
+
+namespace MediaWiki\Tidy;
+
+use RemexHtml\HTMLData;
+use RemexHtml\Serializer\HtmlFormatter;
+use RemexHtml\Serializer\SerializerNode;
+use RemexHtml\Tokenizer\PlainAttributes;
+
+/**
+ * HtmlFormatter variant used by the Remex tidy driver. It leaves "&"
+ * unescaped, writes U+00A0 as a numeric character reference, serialises
+ * p-wrapper nodes, and tags empty li/p/tr elements with mw-empty-elt.
+ *
+ * @internal
+ */
+class RemexCompatFormatter extends HtmlFormatter {
+       /** @var bool[] Element names which get class="mw-empty-elt" when empty */
+       private static $markedEmptyElements = [
+               'li' => true,
+               'p' => true,
+               'tr' => true,
+       ];
+
+       /**
+        * @param array $options Passed through to the parent constructor
+        */
+       public function __construct( $options = [] ) {
+               parent::__construct( $options );
+
+               // Ampersands pass through untouched; non-breaking spaces become
+               // numeric character references, in text and attribute values alike.
+               unset( $this->textEscapes["&"], $this->attributeEscapes["&"] );
+               $this->textEscapes["\xc2\xa0"] = '&#160;';
+               $this->attributeEscapes["\xc2\xa0"] = '&#160;';
+       }
+
+       /**
+        * Return an empty string for the start of the document.
+        *
+        * @param string|null $fragmentNamespace
+        * @param string|null $fragmentName
+        * @return string
+        */
+       public function startDocument( $fragmentNamespace, $fragmentName ) {
+               return '';
+       }
+
+       /**
+        * Serialise one element and its already-serialised contents.
+        *
+        * @param SerializerNode $parent
+        * @param SerializerNode $node
+        * @param string $contents
+        * @return string
+        */
+       public function element( SerializerNode $parent, SerializerNode $node, $contents ) {
+               $mungerData = $node->snData;
+               if ( $mungerData && $mungerData->isPWrapper ) {
+                       // A p-wrapper only becomes a real <p> if it has non-blank content
+                       return $mungerData->nonblankNodeCount ? "<p>$contents</p>" : $contents;
+               }
+
+               $name = $node->name;
+               $attrs = $node->attrs;
+               $isHtml = $node->namespace === HTMLData::NS_HTML;
+
+               // Attribute-less li/p/tr with whitespace-only contents get a marker class
+               if ( isset( self::$markedEmptyElements[$name] )
+                       && $attrs->count() === 0
+                       && strspn( $contents, "\t\n\f\r " ) === strlen( $contents )
+               ) {
+                       return "<{$name} class=\"mw-empty-elt\">$contents</{$name}>";
+               }
+
+               $openTag = "<$name";
+               foreach ( $attrs->getValues() as $attrName => $attrValue ) {
+                       $openTag .= " $attrName=\"" . strtr( $attrValue, $this->attributeEscapes ) . '"';
+               }
+               if ( $isHtml && isset( $this->voidElements[$name] ) ) {
+                       return $openTag . ' />';
+               }
+
+               // Elements listed in prefixLfElements get an extra newline emitted
+               // after the start tag when their contents begin with one.
+               $needsExtraLf = $isHtml
+                       && isset( $contents[0] ) && $contents[0] === "\n"
+                       && isset( $this->prefixLfElements[$name] );
+               return $openTag . '>' . ( $needsExtraLf ? "\n" : '' ) . "$contents</$name>";
+       }
+}
diff --git a/includes/tidy/RemexCompatMunger.php b/includes/tidy/RemexCompatMunger.php
new file mode 100644 (file)
index 0000000..d5f5c28
--- /dev/null
@@ -0,0 +1,468 @@
+<?php
+
+namespace MediaWiki\Tidy;
+
+use RemexHtml\HTMLData;
+use RemexHtml\Serializer\Serializer;
+use RemexHtml\Serializer\SerializerNode;
+use RemexHtml\Tokenizer\Attributes;
+use RemexHtml\Tokenizer\PlainAttributes;
+use RemexHtml\TreeBuilder\TreeBuilder;
+use RemexHtml\TreeBuilder\TreeHandler;
+use RemexHtml\TreeBuilder\Element;
+
+/**
+ * @internal
+ */
+class RemexCompatMunger implements TreeHandler {
+       /**
+        * Names of elements which insertElement() treats as inline. Any element
+        * whose name is not a key of this array is handled as a block.
+        *
+        * @var bool[]
+        */
+       private static $onlyInlineElements = [
+               "a" => true,
+               "abbr" => true,
+               "acronym" => true,
+               "applet" => true,
+               "b" => true,
+               "basefont" => true,
+               "bdo" => true,
+               "big" => true,
+               "br" => true,
+               "button" => true,
+               "cite" => true,
+               "code" => true,
+               "dfn" => true,
+               "em" => true,
+               "font" => true,
+               "i" => true,
+               "iframe" => true,
+               "img" => true,
+               "input" => true,
+               "kbd" => true,
+               "label" => true,
+               "legend" => true,
+               "map" => true,
+               "object" => true,
+               "param" => true,
+               "q" => true,
+               "rb" => true,
+               "rbc" => true,
+               "rp" => true,
+               "rt" => true,
+               "rtc" => true,
+               "ruby" => true,
+               "s" => true,
+               "samp" => true,
+               "select" => true,
+               "small" => true,
+               "span" => true,
+               "strike" => true,
+               "strong" => true,
+               "sub" => true,
+               "sup" => true,
+               "textarea" => true,
+               "tt" => true,
+               "u" => true,
+               "var" => true,
+       ];
+
+       /**
+        * Names of formatting elements. insertElement() marks such an element as
+        * splittable when it is inserted under a p-wrapper or under another
+        * splittable node.
+        *
+        * @var bool[]
+        */
+       private static $formattingElements = [
+               'a' => true,
+               'b' => true,
+               'big' => true,
+               'code' => true,
+               'em' => true,
+               'font' => true,
+               'i' => true,
+               'nobr' => true,
+               's' => true,
+               'small' => true,
+               'strike' => true,
+               'strong' => true,
+               'tt' => true,
+               'u' => true,
+       ];
+
+       /**
+        * The downstream Serializer which receives the munged tree events.
+        *
+        * @var Serializer
+        */
+       private $serializer;
+
+       /**
+        * Constructor
+        *
+        * @param Serializer $serializer The serializer to forward events to
+        */
+       public function __construct( Serializer $serializer ) {
+               $this->serializer = $serializer;
+       }
+
+       /**
+        * Start the document downstream and flag the serializer's root node as
+        * needing p-wrapping for its children.
+        *
+        * @param string|null $fragmentNamespace
+        * @param string|null $fragmentName
+        */
+       public function startDocument( $fragmentNamespace, $fragmentName ) {
+               $this->serializer->startDocument( $fragmentNamespace, $fragmentName );
+
+               $rootData = new RemexMungerData;
+               $rootData->needsPWrapping = true;
+               $this->serializer->getRootNode()->snData = $rootData;
+       }
+
+       /**
+        * Forward the end-of-document event to the serializer unchanged.
+        *
+        * @param integer $pos
+        */
+       public function endDocument( $pos ) {
+               $this->serializer->endDocument( $pos );
+       }
+
+       /**
+        * Work out where an insertion should really go, following any active
+        * clone or p-wrapper diversion on the reference element.
+        *
+        * @param integer $preposition One of the TreeBuilder preposition constants
+        * @param Element|null $refElement
+        * @return array Two items: the parent node of the insertion, and the
+        *   reference node to pass to the serializer (null for ROOT)
+        */
+       private function getParentForInsert( $preposition, $refElement ) {
+               if ( $preposition === TreeBuilder::ROOT ) {
+                       return [ $this->serializer->getRootNode(), null ];
+               }
+
+               $refNode = $refElement->userData;
+               if ( $preposition === TreeBuilder::BEFORE ) {
+                       return [ $this->serializer->getParentNode( $refNode ), $refNode ];
+               }
+
+               $refData = $refNode->snData;
+               if ( $refData->currentCloneElement ) {
+                       // Walk to the end of the chain of clone links
+                       $firstData = $refData;
+                       do {
+                               $refElement = $refData->currentCloneElement;
+                               $refNode = $refElement->userData;
+                               $refData = $refNode->snData;
+                       } while ( $refData->currentCloneElement );
+                       // Remember the end of the chain on the element we were asked about
+                       $firstData->currentCloneElement = $refElement;
+               } elseif ( $refData->childPElement ) {
+                       // Divert into the p-wrapper which is open under this node
+                       $refNode = $refData->childPElement->userData;
+               }
+               return [ $refNode, $refNode ];
+       }
+
+       /**
+        * Create an mw:p-wrap element under $parent and register it as the
+        * parent's active p-wrapper.
+        *
+        * @param SerializerNode $parent The wrap base node
+        * @param integer $sourceStart
+        * @return SerializerNode The serializer node of the new p-wrapper
+        */
+       private function insertPWrapper( SerializerNode $parent, $sourceStart ) {
+               $wrapElement = new Element( HTMLData::NS_HTML, 'mw:p-wrap', new PlainAttributes );
+               // Inserting downstream is what populates $wrapElement->userData
+               $this->serializer->insertElement( TreeBuilder::UNDER, $parent, $wrapElement,
+                       false, $sourceStart, 0 );
+               $wrapNode = $wrapElement->userData;
+
+               $wrapData = new RemexMungerData;
+               $wrapData->isPWrapper = true;
+               $wrapData->wrapBaseNode = $parent;
+               $wrapNode->snData = $wrapData;
+
+               $parent->snData->childPElement = $wrapElement;
+               return $wrapNode;
+       }
+
+       /**
+        * Insert text, first creating a p-wrapper or splitting the tag stack if
+        * the insertion point requires it.
+        *
+        * @param integer $preposition
+        * @param Element|null $refElement
+        * @param string $text
+        * @param integer $start
+        * @param integer $length
+        * @param integer $sourceStart
+        * @param integer $sourceLength
+        */
+       public function characters( $preposition, $refElement, $text, $start, $length,
+               $sourceStart, $sourceLength
+       ) {
+               list( $parent, $refNode ) = $this->getParentForInsert( $preposition, $refElement );
+               $parentData = $parent->snData;
+               $hasNonBlank = strspn( $text, "\t\n\f\r ", $start, $length ) !== $length;
+
+               if ( $preposition === TreeBuilder::UNDER ) {
+                       $divertedNode = null;
+                       if ( $parentData->needsPWrapping && $hasNonBlank ) {
+                               // Bare text under body/blockquote goes into a new p-wrapper
+                               $divertedNode = $this->insertPWrapper( $refNode, $sourceStart );
+                       } elseif ( $parentData->isSplittable && !$parentData->ancestorPNode ) {
+                               // Splittable parent in block mode: split the tag stack
+                               $divertedNode = $this->splitTagStack( $refNode, true, $sourceStart );
+                       }
+                       if ( $divertedNode !== null ) {
+                               $refNode = $divertedNode;
+                               $parentData = $divertedNode->snData;
+                       }
+               }
+
+               if ( $hasNonBlank ) {
+                       // Non-whitespace text counts towards the parent's content
+                       $parentData->nonblankNodeCount++;
+               }
+               $this->serializer->characters( $preposition, $refNode, $text, $start,
+                       $length, $sourceStart, $sourceLength );
+       }
+
+       /**
+        * Insert or reparent an element. Create p-wrappers or split the tag stack
+        * as necessary.
+        *
+        * Consider the following insertion locations. The parent may be:
+        *
+        *   - A: A body or blockquote (!!needsPWrapping)
+        *   - B: A p-wrapper (!!isPWrapper)
+        *   - C: A descendant of a p-wrapper (!!ancestorPNode)
+        *     - CS: With splittable formatting elements in the stack region up to
+        *       the p-wrapper
+        *     - CU: With one or more unsplittable elements in the stack region up
+        *       to the p-wrapper
+        *   - D: Not a descendant of a p-wrapper (!ancestorPNode)
+        *     - DS: With splittable formatting elements in the stack region up to
+        *       the body or blockquote
+        *     - DU: With one or more unsplittable elements in the stack region up
+        *       to the body or blockquote
+        *
+        * And consider that we may insert two types of element:
+        *   - b: block
+        *   - i: inline
+        *
+        * We handle the insertion as follows:
+        *
+        *   - A/i: Create a p-wrapper, insert under it
+        *   - A/b: Insert as normal
+        *   - B/i: Insert as normal
+        *   - B/b: Close the p-wrapper, insert under the body/blockquote (wrap
+        *     base) instead
+        *   - C/i: Insert as normal
+        *   - CS/b: Split the tag stack, insert the block under cloned formatting
+        *     elements which have the wrap base (the parent of the p-wrap) as
+        *     their ultimate parent.
+        *   - CU/b: Disable the p-wrap, by reparenting the currently open child
+        *     of the p-wrap under the p-wrap's parent. Then insert the block as
+        *     normal.
+        *   - D/b: Insert as normal
+        *   - DS/i: Split the tag stack, creating a new p-wrapper as the ultimate
+        *     parent of the formatting elements thus cloned. The parent of the
+        *     p-wrapper is the body or blockquote.
+        *   - DU/i: Insert as normal
+        *
+        * FIXME: fostering ($preposition == BEFORE) is mostly done by inserting as
+        * normal, the full algorithm is not followed.
+        *
+        * @param integer $preposition
+        * @param Element|SerializerNode|null $refElement
+        * @param Element $element
+        * @param bool $void
+        * @param integer $sourceStart
+        * @param integer $sourceLength
+        */
+       public function insertElement( $preposition, $refElement, Element $element, $void,
+               $sourceStart, $sourceLength
+       ) {
+               list( $parent, $newRef ) = $this->getParentForInsert( $preposition, $refElement );
+               $parentData = $parent->snData;
+               $elementName = $element->htmlName;
+
+               // Anything not listed in $onlyInlineElements is treated as a block
+               $inline = isset( self::$onlyInlineElements[$elementName] );
+               $under = $preposition === TreeBuilder::UNDER;
+
+               if ( $under && $parentData->isPWrapper && !$inline ) {
+                       // [B/b] The element is non-inline and the parent is a p-wrapper,
+                       // close the parent and insert into its parent instead
+                       $newParent = $this->serializer->getParentNode( $parent );
+                       $parent = $newParent;
+                       $parentData = $parent->snData;
+                       $parentData->childPElement = null;
+                       $newRef = $refElement->userData;
+                       // FIXME cannot call endTag() since we don't have an Element
+               } elseif ( $under && $parentData->isSplittable
+                       && (bool)$parentData->ancestorPNode !== $inline
+               ) {
+                       // [CS/b, DS/i] The parent is splittable and the current element is
+                       // inline in block context, or if the current element is a block
+                       // under a p-wrapper, split the tag stack.
+                       $newRef = $this->splitTagStack( $newRef, $inline, $sourceStart );
+                       $parent = $newRef;
+                       $parentData = $parent->snData;
+               } elseif ( $under && $parentData->needsPWrapping && $inline ) {
+                       // [A/i] If the element is inline and we are in body/blockquote,
+                       // we need to create a p-wrapper
+                       $newRef = $this->insertPWrapper( $newRef, $sourceStart );
+                       $parent = $newRef;
+                       $parentData = $parent->snData;
+               } elseif ( $parentData->ancestorPNode && !$inline ) {
+                       // [CU/b] If the element is non-inline and (despite attempting to
+                       // split above) there is still an ancestor p-wrap, disable that
+                       // p-wrap
+                       $this->disablePWrapper( $parent, $sourceStart );
+               }
+               // else [A/b, B/i, C/i, D/b, DU/i] insert as normal
+
+               // An element with element children is a non-blank element
+               $parentData->nonblankNodeCount++;
+
+               // Insert the element downstream and so initialise its userData
+               $this->serializer->insertElement( $preposition, $newRef,
+                       $element, $void, $sourceStart, $sourceLength );
+
+               // Initialise snData
+               if ( !$element->userData->snData ) {
+                       $elementData = $element->userData->snData = new RemexMungerData;
+               } else {
+                       $elementData = $element->userData->snData;
+               }
+               // Formatting elements inherit splittability from a p-wrapper or
+               // splittable parent
+               if ( ( $parentData->isPWrapper || $parentData->isSplittable )
+                       && isset( self::$formattingElements[$elementName] )
+               ) {
+                       $elementData->isSplittable = true;
+               }
+               // Propagate the enclosing p-wrapper, if any, down to the new element
+               if ( $parentData->isPWrapper ) {
+                       $elementData->ancestorPNode = $parent;
+               } elseif ( $parentData->ancestorPNode ) {
+                       $elementData->ancestorPNode = $parentData->ancestorPNode;
+               }
+               // Propagate the wrap base, or start one at a p-wrapping parent
+               if ( $parentData->wrapBaseNode ) {
+                       $elementData->wrapBaseNode = $parentData->wrapBaseNode;
+               } elseif ( $parentData->needsPWrapping ) {
+                       $elementData->wrapBaseNode = $parent;
+               }
+               // These elements activate p-wrapping for their own children
+               if ( $elementName === 'body'
+                       || $elementName === 'blockquote'
+                       || $elementName === 'html'
+               ) {
+                       $elementData->needsPWrapping = true;
+               }
+       }
+
+       /**
+        * Clone nodes in a stack range and return the new parent
+        *
+        * Walks up from $parentNode to the end of the clone range (the ancestor
+        * p-wrapper if there is one, otherwise the wrap base). The collected
+        * elements are then re-created, outermost first, under either a new
+        * p-wrapper ($inline true) or the wrap base itself ($inline false). Each
+        * original node records its clone in currentCloneElement, and originals
+        * which had no non-blank children are removed from the serializer.
+        *
+        * @param SerializerNode $parentNode
+        * @param bool $inline
+        * @param integer $pos The source position
+        * @return SerializerNode The innermost clone, i.e. the new insertion parent
+        * @throws \Exception If a node whose parent is the root node is reached
+        *   before the end of the clone range
+        */
+       private function splitTagStack( SerializerNode $parentNode, $inline, $pos ) {
+               $parentData = $parentNode->snData;
+               $wrapBase = $parentData->wrapBaseNode;
+               $pWrap = $parentData->ancestorPNode;
+               // The clone range ends at the enclosing p-wrapper, or at the wrap
+               // base when no p-wrapper is open
+               $cloneEnd = $pWrap ?: $wrapBase;
+
+               $serializer = $this->serializer;
+               $node = $parentNode;
+               $root = $serializer->getRootNode();
+               $nodes = [];
+               $removableNodes = [];
+               while ( $node !== $cloneEnd ) {
+                       $nextParent = $serializer->getParentNode( $node );
+                       if ( $nextParent === $root ) {
+                               throw new \Exception( 'Did not find end of clone range' );
+                       }
+                       $nodes[] = $node;
+                       if ( $node->snData->nonblankNodeCount === 0 ) {
+                               // The original is empty, so it will be removed once cloned
+                               $removableNodes[] = $node;
+                               $nextParent->snData->nonblankNodeCount--;
+                       }
+                       $node = $nextParent;
+               }
+
+               if ( $inline ) {
+                       $pWrap = $this->insertPWrapper( $wrapBase, $pos );
+                       $node = $pWrap;
+               } else {
+                       if ( $pWrap ) {
+                               // End the p-wrap which was open, cancel the diversion
+                               $wrapBase->snData->childPElement = null;
+                       }
+                       $pWrap = null;
+                       $node = $wrapBase;
+               }
+
+               // $nodes runs innermost to outermost, so iterate backwards to
+               // rebuild the chain from the outside in
+               for ( $i = count( $nodes ) - 1; $i >= 0; $i-- ) {
+                       $oldNode = $nodes[$i];
+                       $oldData = $oldNode->snData;
+                       $nodeParent = $node;
+                       $element = new Element( $oldNode->namespace, $oldNode->name, $oldNode->attrs );
+                       $this->serializer->insertElement( TreeBuilder::UNDER, $nodeParent,
+                               $element, false, $pos, 0 );
+                       $oldData->currentCloneElement = $element;
+
+                       $newNode = $element->userData;
+                       $newData = $newNode->snData = new RemexMungerData;
+                       if ( $pWrap ) {
+                               $newData->ancestorPNode = $pWrap;
+                       }
+                       $newData->isSplittable = true;
+                       $newData->wrapBaseNode = $wrapBase;
+                       $newData->isPWrapper = $oldData->isPWrapper;
+
+                       $nodeParent->snData->nonblankNodeCount++;
+
+                       $node = $newNode;
+               }
+               foreach ( $removableNodes as $rNode ) {
+                       // removeNode() takes an Element, so wrap the node in a fake one
+                       $fakeElement = new Element( $rNode->namespace, $rNode->name, $rNode->attrs );
+                       $fakeElement->userData = $rNode;
+                       $this->serializer->removeNode( $fakeElement, $pos );
+               }
+               return $node;
+       }
+
+       /**
+        * Find the ancestor of $node which is a child of a p-wrapper, and
+        * reparent that node so that it is placed after the end of the p-wrapper
+        *
+        * Does nothing if the p-wrapper is not the last child of its parent.
+        *
+        * @param SerializerNode $node A node with ancestorPNode set
+        * @param integer $sourceStart
+        */
+       private function disablePWrapper( SerializerNode $node, $sourceStart ) {
+               $nodeData = $node->snData;
+               $pWrapNode = $nodeData->ancestorPNode;
+               $newParent = $this->serializer->getParentNode( $pWrapNode );
+               if ( $pWrapNode !== $this->serializer->getLastChild( $newParent ) ) {
+                       // Fostering or something? Abort!
+                       return;
+               }
+
+               // Climb from $node to the direct child of the p-wrapper, clearing
+               // ancestorPNode on every node visited on the way
+               $nextParent = $node;
+               do {
+                       $victim = $nextParent;
+                       $victim->snData->ancestorPNode = null;
+                       $nextParent = $this->serializer->getParentNode( $victim );
+               } while ( $nextParent !== $pWrapNode );
+
+               // Make a fake Element to use in a reparenting operation
+               $victimElement = new Element( $victim->namespace, $victim->name, $victim->attrs );
+               $victimElement->userData = $victim;
+
+               // Reparent
+               $this->serializer->insertElement( TreeBuilder::UNDER, $newParent, $victimElement,
+                       false, $sourceStart, 0 );
+
+               // Decrement nonblank node count
+               $pWrapNode->snData->nonblankNodeCount--;
+
+               // Cancel the diversion so that no more elements are inserted under this p-wrap
+               $newParent->snData->childPElement = null;
+       }
+
+       /**
+        * Forward the end tag event to the serializer unchanged.
+        *
+        * @param Element $element
+        * @param integer $sourceStart
+        * @param integer $sourceLength
+        */
+       public function endTag( Element $element, $sourceStart, $sourceLength ) {
+               $this->serializer->endTag( $element, $sourceStart, $sourceLength );
+       }
+
+       /**
+        * Forward the doctype event to the serializer unchanged.
+        */
+       public function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) {
+               $this->serializer->doctype( $name, $public,  $system, $quirks,
+                       $sourceStart, $sourceLength );
+       }
+
+       /**
+        * Insert a comment at the resolved insertion point. Comments never
+        * trigger p-wrapping or tag stack splitting here; only clone and
+        * p-wrapper diversions on the reference element are followed.
+        *
+        * @param integer $preposition
+        * @param Element|null $refElement
+        * @param string $text
+        * @param integer $sourceStart
+        * @param integer $sourceLength
+        */
+       public function comment( $preposition, $refElement, $text, $sourceStart, $sourceLength ) {
+               // Only the reference node is needed, the parent is not used
+               list( , $refNode ) = $this->getParentForInsert( $preposition, $refElement );
+               $this->serializer->comment( $preposition, $refNode, $text,
+                       $sourceStart, $sourceLength );
+       }
+
+       /**
+        * Forward a parse error to the serializer unchanged.
+        *
+        * @param string $text
+        * @param integer $pos
+        */
+       public function error( $text, $pos ) {
+               $this->serializer->error( $text, $pos );
+       }
+
+       /**
+        * Forward the attribute merge to the serializer unchanged.
+        *
+        * @param Element $element
+        * @param Attributes $attrs
+        * @param integer $sourceStart
+        */
+       public function mergeAttributes( Element $element, Attributes $attrs, $sourceStart ) {
+               $this->serializer->mergeAttributes( $element, $attrs, $sourceStart );
+       }
+
+       /**
+        * Forward the node removal to the serializer unchanged.
+        *
+        * @param Element $element
+        * @param integer $sourceStart
+        */
+       public function removeNode( Element $element, $sourceStart ) {
+               $this->serializer->removeNode( $element, $sourceStart );
+       }
+
+       /**
+        * Move all children of $element under $newParent, and insert $newParent
+        * as the only child of $element. The insertion goes through this class's
+        * own insertElement(), so p-wrapping rules apply to $newParent.
+        *
+        * @param Element $element
+        * @param Element $newParent
+        * @param integer $sourceStart
+        */
+       public function reparentChildren( Element $element, Element $newParent, $sourceStart ) {
+               $self = $element->userData;
+               // Detach the existing children before inserting the new parent
+               $children = $self->children;
+               $self->children = [];
+               $this->insertElement( TreeBuilder::UNDER, $element, $newParent, false, $sourceStart, 0 );
+               $newParentNode = $newParent->userData;
+               $newParentId = $newParentNode->id;
+               foreach ( $children as $child ) {
+                       // Only object children carry a parentId to update
+                       if ( is_object( $child ) ) {
+                               $child->parentId = $newParentId;
+                       }
+               }
+               $newParentNode->children = $children;
+       }
+}
diff --git a/includes/tidy/RemexDriver.php b/includes/tidy/RemexDriver.php
new file mode 100644 (file)
index 0000000..e02af88
--- /dev/null
@@ -0,0 +1,57 @@
+<?php
+
+namespace MediaWiki\Tidy;
+
+use RemexHtml\Serializer\Serializer;
+use RemexHtml\Tokenizer\Tokenizer;
+use RemexHtml\TreeBuilder\Dispatcher;
+use RemexHtml\TreeBuilder\TreeBuilder;
+use RemexHtml\TreeBuilder\TreeMutationTracer;
+
+/**
+ * Tidy driver which passes HTML through the RemexHtml tokenizer and tree
+ * builder and serialises the result with RemexCompatFormatter.
+ */
+class RemexDriver extends TidyDriverBase {
+       /** @var bool Whether to log tree mutations via wfDebug() */
+       private $trace;
+       /** @var bool Whether to route tree events through RemexCompatMunger */
+       private $pwrap;
+
+       /**
+        * @param array $config Recognised keys: 'treeMutationTrace' (default
+        *   false) and 'pwrap' (default true)
+        */
+       public function __construct( array $config ) {
+               // Array union only fills in keys the caller did not supply
+               $config += [
+                       'treeMutationTrace' => false,
+                       'pwrap' => true
+               ];
+               $this->trace = $config['treeMutationTrace'];
+               $this->pwrap = $config['pwrap'];
+               parent::__construct( $config );
+       }
+
+       /**
+        * @param string $text The HTML to tidy
+        * @return string The serializer's result
+        */
+       public function tidy( $text ) {
+               $serializer = new Serializer( new RemexCompatFormatter );
+
+               // Build the handler chain from the serializer outwards
+               $handler = $this->pwrap ? new RemexCompatMunger( $serializer ) : $serializer;
+               if ( $this->trace ) {
+                       $handler = new TreeMutationTracer( $handler, function ( $msg ) {
+                               wfDebug( "RemexHtml: $msg" );
+                       } );
+               }
+
+               $treeBuilderOptions = [
+                       'ignoreErrors' => true,
+                       'ignoreNulls' => true,
+               ];
+               $tokenizerOptions = [
+                       'ignoreErrors' => true,
+                       'ignoreCharRefs' => true,
+                       'ignoreNulls' => true,
+                       'skipPreprocess' => true,
+               ];
+               $treeBuilder = new TreeBuilder( $handler, $treeBuilderOptions );
+               $tokenizer = new Tokenizer( new Dispatcher( $treeBuilder ), $text, $tokenizerOptions );
+
+               // Parse as a fragment in the context of an HTML body element
+               $tokenizer->execute( [
+                       'fragmentNamespace' => \RemexHtml\HTMLData::NS_HTML,
+                       'fragmentName' => 'body'
+               ] );
+               return $serializer->getResult();
+       }
+}
diff --git a/includes/tidy/RemexMungerData.php b/includes/tidy/RemexMungerData.php
new file mode 100644 (file)
index 0000000..d614a38
--- /dev/null
@@ -0,0 +1,78 @@
+<?php
+
+namespace MediaWiki\Tidy;
+
+/**
+ * Per-node state which RemexCompatMunger attaches to serializer nodes via
+ * their snData property.
+ *
+ * @internal
+ */
+class RemexMungerData {
+       /**
+        * The Element for the mw:p-wrap which is a child of the current node. If
+        * this is set, inline insertions into this node will be diverted so that
+        * they insert into the p-wrap.
+        *
+        * @var \RemexHtml\TreeBuilder\Element|null
+        */
+       public $childPElement;
+
+       /**
+        * This tracks the mw:p-wrap node in the Serializer stack which is an
+        * ancestor of this node. If there is no mw:p-wrap ancestor, it is null.
+        *
+        * @var \RemexHtml\Serializer\SerializerNode|null
+        */
+       public $ancestorPNode;
+
+       /**
+        * The wrap base node is the body or blockquote node which is the parent
+        * of active p-wrappers. This is set if there is an ancestor p-wrapper,
+        * or if a p-wrapper was closed due to a block element being encountered
+        * inside it.
+        *
+        * @var \RemexHtml\Serializer\SerializerNode|null
+        */
+       public $wrapBaseNode;
+
+       /**
+        * Stack splitting (essentially our idea of AFE reconstruction) can clone
+        * formatting elements which are split over multiple paragraphs.
+        * TreeBuilder is not aware of the cloning, and continues to insert into
+        * the original element. This is set to the newer clone if this node was
+        * cloned, i.e. if there is an active diversion of the insertion location.
+        *
+        * @var \RemexHtml\TreeBuilder\Element|null
+        */
+       public $currentCloneElement;
+
+       /**
+        * Is the node a p-wrapper, with name mw:p-wrap?
+        *
+        * @var bool
+        */
+       public $isPWrapper = false;
+
+       /**
+        * Is the node splittable, i.e. a formatting element or a node with a
+        * formatting element ancestor which is under an active or deactivated
+        * p-wrapper.
+        *
+        * @var bool
+        */
+       public $isSplittable = false;
+
+       /**
+        * This is true if the node is a body or blockquote, which activates
+        * p-wrapping of child nodes.
+        *
+        * @var bool
+        */
+       public $needsPWrapping = false;
+
+       /**
+        * The number of child nodes, not counting whitespace-only text nodes or
+        * comments.
+        *
+        * @var int
+        */
+       public $nonblankNodeCount = 0;
+
+       /**
+        * Guard against typos: PHP calls __set() only for properties which are
+        * not declared or not accessible, so any such write is rejected.
+        *
+        * @param string $name
+        * @param mixed $value
+        * @throws \Exception Always
+        */
+       public function __set( $name, $value ) {
+               throw new \Exception( "Cannot set property \"$name\"" );
+       }
+}
index 2d67a28..4d24cb8 100644 (file)
@@ -115,15 +115,23 @@ class NamespaceAwareForeignTitleFactory implements ForeignTitleFactory {
        protected function parseTitleWithNs( $title, $ns ) {
                $pieces = explode( ':', $title, 2 );
 
+               // Is $title of the form Namespace:Title (true), or just Title (false)?
+               $titleIncludesNamespace = ( $ns != '0' && count( $pieces ) === 2 );
+
                if ( isset( $this->foreignNamespaces[$ns] ) ) {
                        $namespaceName = $this->foreignNamespaces[$ns];
                } else {
-                       $namespaceName = $ns == '0' ? '' : $pieces[0];
+                       // If the foreign wiki is misconfigured, XML dumps can contain a page with
+                       // a non-zero namespace ID, but whose title doesn't contain a colon
+                       // (T114115). In those cases, output a made-up namespace name to avoid
+                       // collisions. The ImportTitleFactory might replace this with something
+                       // more appropriate.
+                       $namespaceName = $titleIncludesNamespace ? $pieces[0] : "Ns$ns";
                }
 
                // We assume that the portion of the page title before the colon is the
-               // namespace name, except in the case of namespace 0
-               if ( $ns != '0' ) {
+               // namespace name, except in the case of namespace 0.
+               if ( $titleIncludesNamespace ) {
                        $pageName = $pieces[1];
                } else {
                        $pageName = $title;
diff --git a/tests/phpunit/includes/tidy/RemexDriverTest.php b/tests/phpunit/includes/tidy/RemexDriverTest.php
new file mode 100644 (file)
index 0000000..6b16cbf
--- /dev/null
@@ -0,0 +1,297 @@
+<?php
+
+class RemexDriverTest extends MediaWikiTestCase {
+       static private $remexTidyTestData = [
+               // Tests from Html5Depurate
+               [
+                       'Empty string',
+                       "",
+                       ""
+               ],
+               [
+                       'Simple p-wrap',
+                       "x",
+                       "<p>x</p>"
+               ],
+               [
+                       'No p-wrap of blank node',
+                       " ",
+                       " "
+               ],
+               [
+                       'p-wrap terminated by div',
+                       "x<div></div>",
+                       "<p>x</p><div></div>"
+               ],
+               [
+                       'p-wrap not terminated by span',
+                       "x<span></span>",
+                       "<p>x<span></span></p>"
+               ],
+               [
+                       'An element is non-blank and so gets p-wrapped',
+                       "<span></span>",
+                       "<p><span></span></p>"
+               ],
+               [
+                       'The blank flag is set after a block-level element',
+                       "<div></div> ",
+                       "<div></div> "
+               ],
+               [
+                       'Blank detection between two block-level elements',
+                       "<div></div> <div></div>",
+                       "<div></div> <div></div>"
+               ],
+               [
+                       'But p-wrapping of non-blank content works after an element',
+                       "<div></div>x",
+                       "<div></div><p>x</p>"
+               ],
+               [
+                       'p-wrapping between two block-level elements',
+                       "<div></div>x<div></div>",
+                       "<div></div><p>x</p><div></div>"
+               ],
+               [
+                       'p-wrap inside blockquote',
+                       "<blockquote>x</blockquote>",
+                       "<blockquote><p>x</p></blockquote>"
+               ],
+               [
+                       'A comment is blank for p-wrapping purposes',
+                       "<!-- x -->",
+                       "<!-- x -->"
+               ],
+               [
+                       'A comment is blank even when a p-wrap was opened by a text node',
+                       " <!-- x -->",
+                       " <!-- x -->"
+               ],
+               [
+                       'A comment does not open a p-wrap',
+                       "<!-- x -->x",
+                       "<!-- x --><p>x</p>"
+               ],
+               [
+                       'A comment does not close a p-wrap',
+                       "x<!-- x -->",
+                       "<p>x<!-- x --></p>"
+               ],
+               [
+                       'Empty li',
+                       "<ul><li></li></ul>",
+                       "<ul><li class=\"mw-empty-elt\"></li></ul>"
+               ],
+               [
+                       'li with element',
+                       "<ul><li><span></span></li></ul>",
+                       "<ul><li><span></span></li></ul>"
+               ],
+               [
+                       'li with text',
+                       "<ul><li>x</li></ul>",
+                       "<ul><li>x</li></ul>"
+               ],
+               [
+                       'Empty tr',
+                       "<table><tbody><tr></tr></tbody></table>",
+                       "<table><tbody><tr class=\"mw-empty-elt\"></tr></tbody></table>"
+               ],
+               [
+                       'Empty p',
+                       "<p>\n</p>",
+                       "<p class=\"mw-empty-elt\">\n</p>"
+               ],
+               [
+                       'No p-wrapping of an inline element which contains a block element (T150317)',
+                       "<small><div>x</div></small>",
+                       "<small><div>x</div></small>"
+               ],
+               [
+                       'p-wrapping of an inline element which contains an inline element',
+                       "<small><b>x</b></small>",
+                       "<p><small><b>x</b></small></p>"
+               ],
+               [
+                       'p-wrapping is enabled in a blockquote in an inline element',
+                       "<small><blockquote>x</blockquote></small>",
+                       "<small><blockquote><p>x</p></blockquote></small>"
+               ],
+               [
+                       'All bare text should be p-wrapped even when surrounded by block tags',
+                       "<small><blockquote>x</blockquote></small>y<div></div>z",
+                       "<small><blockquote><p>x</p></blockquote></small><p>y</p><div></div><p>z</p>"
+               ],
+               [
+                       'Split tag stack 1',
+                       "<small>x<div>y</div>z</small>",
+                       "<p><small>x</small></p><small><div>y</div></small><p><small>z</small></p>"
+               ],
+               [
+                       'Split tag stack 2',
+                       "<small><div>y</div>z</small>",
+                       "<small><div>y</div></small><p><small>z</small></p>"
+               ],
+               [
+                       'Split tag stack 3',
+                       "<small>x<div>y</div></small>",
+                       "<p><small>x</small></p><small><div>y</div></small>"
+               ],
+               [
+                       'Split tag stack 4 (modified to use splittable tag)',
+                       "a<code>b<i>c<div>d</div></i>e</code>",
+                       "<p>a<code>b<i>c</i></code></p><code><i><div>d</div></i></code><p><code>e</code></p>"
+               ],
+               [
+                       "Split tag stack regression check 1",
+                       "x<span><div>y</div></span>",
+                       "<p>x</p><span><div>y</div></span>"
+               ],
+               [
+                       "Split tag stack regression check 2 (modified to use splittable tag)",
+                       "a<code><i><div>d</div></i>e</code>",
+                       "<p>a</p><code><i><div>d</div></i></code><p><code>e</code></p>"
+               ],
+               // Simple tests from pwrap.js
+               [
+                       'Simple pwrap test 1',
+                       'a',
+                       '<p>a</p>'
+               ],
+               [
+                       '<span> is not a splittable tag, but gets p-wrapped in simple wrapping scenarios',
+                       '<span>a</span>',
+                       '<p><span>a</span></p>'
+               ],
+               [
+                       'Simple pwrap test 3',
+                       'x <div>a</div> <div>b</div> y',
+                       '<p>x </p><div>a</div> <div>b</div><p> y</p>'
+               ],
+               [
+                       'Simple pwrap test 4',
+                       'x<!--c--> <div>a</div> <div>b</div> <!--c-->y',
+                       '<p>x<!--c--> </p><div>a</div> <div>b</div> <!--c--><p>y</p>'
+               ],
+               // Complex tests from pwrap.js
+               [
+                       'Complex pwrap test 1',
+                       '<i>x<div>a</div>y</i>',
+                       '<p><i>x</i></p><i><div>a</div></i><p><i>y</i></p>'
+               ],
+               [
+                       'Complex pwrap test 2',
+                       'a<small>b</small><i>c<div>d</div>e</i>f',
+                       '<p>a<small>b</small><i>c</i></p><i><div>d</div></i><p><i>e</i>f</p>'
+               ],
+               [
+                       'Complex pwrap test 3',
+                       'a<small>b<i>c<div>d</div></i>e</small>',
+                       '<p>a<small>b<i>c</i></small></p><small><i><div>d</div></i></small><p><small>e</small></p>'
+               ],
+               [
+                       'Complex pwrap test 4',
+                       'x<small><div>y</div></small>',
+                       '<p>x</p><small><div>y</div></small>'
+               ],
+               [
+                       'Complex pwrap test 5',
+                       'a<small><i><div>d</div></i>e</small>',
+                       '<p>a</p><small><i><div>d</div></i></small><p><small>e</small></p>'
+               ],
+               [
+                       'Complex pwrap test 6',
+                       '<i>a<div>b</div>c<b>d<div>e</div>f</b>g</i>',
+                       // @codingStandardsIgnoreStart Generic.Files.LineLength.TooLong
+                       // PHP 5 does not allow concatenation in initialisation of a class static variable
+                       '<p><i>a</i></p><i><div>b</div></i><p><i>c<b>d</b></i></p><i><b><div>e</div></b></i><p><i><b>f</b>g</i></p>'
+                       // @codingStandardsIgnoreEnd
+               ],
+               /* FIXME the second <b> causes a stack split which clones the <i> even
+                * though no <p> is actually generated
+               [
+                       'Complex pwrap test 7',
+                       '<i><b><font><div>x</div></font></b><div>y</div><b><font><div>z</div></font></b></i>',
+                       '<i><b><font><div>x</div></font></b><div>y</div><b><font><div>z</div></font></b></i>'
+               ],
+                */
+               // New local tests
+               [
+                       'Blank text node after block end',
+                       '<small>x<div>y</div> <b>z</b></small>',
+                       '<p><small>x</small></p><small><div>y</div></small><p><small> <b>z</b></small></p>'
+               ],
+               [
+                       'Text node fostering (FIXME: wrap missing)',
+                       '<table>x</table>',
+                       'x<table></table>'
+               ],
+               [
+                       'Blockquote fostering',
+                       '<table><blockquote>x</blockquote></table>',
+                       '<blockquote><p>x</p></blockquote><table></table>'
+               ],
+               [
+                       'Block element fostering',
+                       '<table><div>x',
+                       '<div>x</div><table></table>'
+               ],
+               [
+                       'Formatting element fostering (FIXME: wrap missing)',
+                       '<table><b>x',
+                       '<b>x</b><table></table>'
+               ],
+               [
+                       'AAA clone of p-wrapped element (FIXME: empty b)',
+                       '<b>x<p>y</b>z</p>',
+                       '<p><b>x</b></p><b></b><p><b>y</b>z</p>',
+               ],
+               [
+                       'AAA with fostering (FIXME: wrap missing)',
+                       '<table><b>1<p>2</b>3</p>',
+                       '<b>1</b><p><b>2</b>3</p><table></table>'
+               ],
+       ];
+
+       /**
+        * Data provider for testTidy().
+        *
+        * Declared static because it only hands out static class data and needs
+        * no test-case instance; newer PHPUnit releases require static providers.
+        *
+        * @return array[] List of [ description, input HTML, expected tidied HTML ]
+        */
+       public static function provider() {
+               return self::$remexTidyTestData;
+       }
+
+       /**
+        * Run one input through the Remex tidy driver and compare the result with
+        * the expected HTML.
+        *
+        * @dataProvider provider
+        * @covers MediaWiki\Tidy\RemexCompatFormatter
+        * @covers MediaWiki\Tidy\RemexCompatMunger
+        * @covers MediaWiki\Tidy\RemexDriver
+        * @covers MediaWiki\Tidy\RemexMungerData
+        */
+       public function testTidy( $desc, $input, $expected ) {
+               $driver = new MediaWiki\Tidy\RemexDriver( [] );
+               $this->assertEquals( $expected, $driver->tidy( $input ), $desc );
+       }
+
+       /**
+        * Data provider for testHtml5Lib(), built from the html5lib-tests.json
+        * fixture that lives next to this file.
+        *
+        * @return array[] List of [ "file:index" label, input HTML ]
+        * @throws RuntimeException If the fixture cannot be read or does not
+        *   decode to an array
+        */
+       public static function html5libProvider() {
+               $path = __DIR__ . '/html5lib-tests.json';
+               $json = file_get_contents( $path );
+               if ( $json === false ) {
+                       throw new RuntimeException( "Unable to read test fixture $path" );
+               }
+               $files = json_decode( $json, true );
+               // json_decode() returns null on malformed input; fail loudly here rather
+               // than letting the loop below choke on a non-array
+               if ( !is_array( $files ) ) {
+                       throw new RuntimeException( "Invalid JSON in test fixture $path" );
+               }
+               $tests = [];
+               foreach ( $files as $file => $fileTests ) {
+                       foreach ( $fileTests as $i => $test ) {
+                               $tests[] = [ "$file:$i", $test['data'] ];
+                       }
+               }
+               return $tests;
+       }
+
+       /**
+        * Smoke test: push every html5lib input through the tidy driver to make
+        * sure none of them generates an exception. The expected output is not
+        * known, so the result itself is not inspected.
+        *
+        * @dataProvider html5libProvider
+        * @coversNothing
+        */
+       public function testHtml5Lib( $desc, $input ) {
+               $driver = new MediaWiki\Tidy\RemexDriver( [] );
+               $driver->tidy( $input );
+               // Reaching this point means tidy() did not throw
+               $this->assertTrue( true, $desc );
+       }
+}
index 76cedc6..520108a 100644 (file)
@@ -36,10 +36,18 @@ class NamespaceAwareForeignTitleFactoryTest extends MediaWikiTestCase {
                                'MainNamespaceArticle', null,
                                new ForeignTitle( 0, '', 'MainNamespaceArticle' ),
                        ],
+                       [
+                               'Magic:_The_Gathering', 0,
+                               new ForeignTitle( 0, '', 'Magic:_The_Gathering' ),
+                       ],
                        [
                                'Talk:Nice_talk', 1,
                                new ForeignTitle( 1, 'Talk', 'Nice_talk' ),
                        ],
+                       [
+                               'Talk:Magic:_The_Gathering', 1,
+                               new ForeignTitle( 1, 'Talk', 'Magic:_The_Gathering' ),
+                       ],
                        [
                                'Bogus:Nice_talk', 0,
                                new ForeignTitle( 0, '', 'Bogus:Nice_talk' ),
@@ -56,6 +64,11 @@ class NamespaceAwareForeignTitleFactoryTest extends MediaWikiTestCase {
                                'Bogus:Nice_talk', 1,
                                new ForeignTitle( 1, 'Talk', 'Nice_talk' ),
                        ],
+                       // Misconfigured wiki with unregistered namespace (T114115)
+                       [
+                               'Nice_talk', 1234,
+                               new ForeignTitle( 1234, 'Ns1234', 'Nice_talk' ),
+                       ],
                ];
        }