Add support for xml dump schema 0.11
authordaniel <dkinzler@wikimedia.org>
Fri, 5 Oct 2018 08:36:37 +0000 (10:36 +0200)
committerDaniel Kinzler <dkinzler@wikimedia.org>
Thu, 27 Jun 2019 21:56:01 +0000 (21:56 +0000)
Bug: T174031
Change-Id: I2717019ea7efe36694bd2b2fba4dc2952a987cfc

docs/export-0.11.xsd [new file with mode: 0644]
docs/hooks.txt
includes/Defines.php
includes/export/WikiExporter.php
includes/export/XmlDumpWriter.php
maintenance/includes/TextPassDumper.php
tests/phpunit/maintenance/DumpAsserter.php
tests/phpunit/maintenance/backup_PageTest.php

diff --git a/docs/export-0.11.xsd b/docs/export-0.11.xsd
new file mode 100644 (file)
index 0000000..6dbc63b
--- /dev/null
@@ -0,0 +1,335 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+       This is an XML Schema description of the format
+       output by MediaWiki's Special:Export system.
+
+       Version 0.2 adds optional basic file upload info support,
+       which is used by our OAI export/import submodule.
+
+       Version 0.3 adds some site configuration information such
+       as a list of defined namespaces.
+
+       Version 0.4 adds per-revision delete flags, log exports,
+       discussion threading data, a per-page redirect flag, and
+       per-namespace capitalization.
+
+       Version 0.5 adds byte count per revision.
+
+       Version 0.6 adds a separate namespace tag, and resolves the
+       redirect target and adds a separate sha1 tag for each revision.
+
+       Version 0.7 adds a unique identity constraint for both page and
+       revision identifiers. See also bug 4220.
+       Fix type for <ns> from "positiveInteger" to "nonNegativeInteger" to allow 0
+       Moves <logitem> to its right location.
+       Add parentid to revision.
+       Fix type for <id> within <contributor> to "nonNegativeInteger"
+
+       Version 0.8 adds support for a <model> and a <format> tag for
+       each revision. See contenthandler.txt.
+
+       Version 0.9 adds the database name to the site information.
+
+       Version 0.10 moved the <model> and <format> tags before the <text> tag.
+
+       Version 0.11 introduced <content> tag.
+
+       The canonical URL to the schema document is:
+       http://www.mediawiki.org/xml/export-0.11.xsd
+
+       Use the namespace:
+       http://www.mediawiki.org/xml/export-0.11/
+-->
+<schema xmlns="http://www.w3.org/2001/XMLSchema"
+               xmlns:mw="http://www.mediawiki.org/xml/export-0.11/"
+               targetNamespace="http://www.mediawiki.org/xml/export-0.11/"
+               elementFormDefault="qualified">
+
+       <annotation>
+               <documentation xml:lang="en">
+                       MediaWiki's page export format
+               </documentation>
+       </annotation>
+
+       <!-- Need this to reference xml:lang -->
+       <import namespace="http://www.w3.org/XML/1998/namespace"
+                       schemaLocation="http://www.w3.org/2001/xml.xsd" />
+
+       <!-- Our root element -->
+       <element name="mediawiki" type="mw:MediaWikiType">
+               <!-- Page ID constraint, see bug 4220 -->
+               <unique name="PageIDConstraint">
+                       <selector xpath="mw:page" />
+                       <field xpath="mw:id" />
+               </unique>
+               <!-- Revision ID constraint, see bug 4220 -->
+               <unique name="RevIDConstraint">
+                       <selector xpath="mw:page/mw:revision" />
+                       <field xpath="mw:id" />
+               </unique>
+       </element>
+
+       <complexType name="MediaWikiType">
+               <sequence>
+                       <element name="siteinfo" type="mw:SiteInfoType"
+                                        minOccurs="0" maxOccurs="1" />
+                       <element name="page" type="mw:PageType"
+                                        minOccurs="0" maxOccurs="unbounded" />
+                       <element name="logitem" type="mw:LogItemType"
+                                        minOccurs="0" maxOccurs="unbounded" />
+               </sequence>
+               <attribute name="version" type="string" use="required" />
+               <attribute ref="xml:lang" use="required" />
+       </complexType>
+
+       <complexType name="SiteInfoType">
+               <sequence>
+                       <element name="sitename" type="string" minOccurs="0" />
+            <element name="dbname" type="string" minOccurs="0" />
+                       <element name="base" type="anyURI" minOccurs="0" />
+                       <element name="generator" type="string" minOccurs="0" />
+                       <element name="case" type="mw:CaseType" minOccurs="0" />
+                       <element name="namespaces" type="mw:NamespacesType" minOccurs="0" />
+               </sequence>
+       </complexType>
+
+       <simpleType name="CaseType">
+               <restriction base="NMTOKEN">
+                       <!-- Cannot have two titles differing only by case of first letter. -->
+                       <!-- Default behavior through 1.5, $wgCapitalLinks = true -->
+                       <enumeration value="first-letter" />
+
+                       <!-- Complete title is case-sensitive -->
+                       <!-- Behavior when $wgCapitalLinks = false -->
+                       <enumeration value="case-sensitive" />
+
+                       <!-- Cannot have non-case-sensitive titles, e.g. [[FOO]] == [[Foo]] -->
+                       <!-- Not yet implemented as of MediaWiki 1.18 -->
+                       <enumeration value="case-insensitive" />
+               </restriction>
+       </simpleType>
+
+       <simpleType name="DeletedFlagType">
+               <restriction base="NMTOKEN">
+                       <enumeration value="deleted" />
+               </restriction>
+       </simpleType>
+
+       <complexType name="NamespacesType">
+               <sequence>
+                       <element name="namespace" type="mw:NamespaceType"
+                                        minOccurs="0" maxOccurs="unbounded" />
+               </sequence>
+       </complexType>
+
+       <complexType name="NamespaceType">
+               <simpleContent>
+                       <extension base="string">
+                               <attribute name="key" type="integer" />
+                               <attribute name="case" type="mw:CaseType" />
+                       </extension>
+               </simpleContent>
+       </complexType>
+
+       <complexType name="RedirectType">
+               <simpleContent>
+                       <extension base="string">
+                               <attribute name="title" type="string" />
+                       </extension>
+               </simpleContent>
+       </complexType>
+
+       <simpleType name="ContentModelType">
+               <restriction base="string">
+                       <pattern value="[a-zA-Z][-+./a-zA-Z0-9]*" />
+               </restriction>
+       </simpleType>
+
+       <simpleType name="ContentFormatType">
+               <restriction base="string">
+                       <pattern value="[a-zA-Z][-+.a-zA-Z0-9]*/[a-zA-Z][-+.a-zA-Z0-9]*" />
+               </restriction>
+       </simpleType>
+
+       <complexType name="PageType">
+               <sequence>
+                       <!-- Title in text form. (Using spaces, not underscores; with namespace ) -->
+                       <element name="title" type="string" />
+
+                       <!-- Namespace in canonical form -->
+                       <element name="ns" type="nonNegativeInteger" />
+
+                       <!-- page ID number (required by this schema: the element has no minOccurs="0") -->
+                       <element name="id" type="positiveInteger" />
+
+                       <!-- flag if the current revision is a redirect -->
+                       <element name="redirect" type="mw:RedirectType" minOccurs="0" maxOccurs="1" />
+
+                       <!-- comma-separated list of string tokens, if present -->
+                       <element name="restrictions" type="string" minOccurs="0" />
+
+                       <!-- Zero or more sets of revision or upload data -->
+                       <choice minOccurs="0" maxOccurs="unbounded">
+                               <element name="revision" type="mw:RevisionType" />
+                               <element name="upload" type="mw:UploadType" />
+                       </choice>
+
+                       <!-- Zero or One sets of discussion threading data -->
+                       <element name="discussionthreadinginfo" minOccurs="0" maxOccurs="1" type="mw:DiscussionThreadingInfo" />
+               </sequence>
+       </complexType>
+
+       <complexType name="RevisionType">
+               <sequence>
+                       <element name="id" type="positiveInteger" />
+                       <element name="parentid" type="positiveInteger" minOccurs="0" maxOccurs="1"/>
+                       <element name="timestamp" type="dateTime" />
+                       <element name="contributor" type="mw:ContributorType" />
+                       <element name="minor" minOccurs="0" maxOccurs="1"/>
+                       <element name="comment" type="mw:CommentType"/>
+                       <!-- corresponds to slot origin for the main slot -->
+                       <element name="origin" type="positiveInteger" />
+                       <!-- the main slot's content model -->
+                       <element name="model" type="mw:ContentModelType" />
+                       <!-- the main slot's serialization format -->
+                       <element name="format" type="mw:ContentFormatType" />
+                       <!-- the main slot's serialized content -->
+                       <element name="text" type="mw:TextType"/>
+                       <element name="content" type="mw:ContentType" minOccurs="0" maxOccurs="unbounded"/>
+                       <!-- sha1 of the revision, a combined sha1 of content in all slots -->
+                       <element name="sha1" type="string" />
+               </sequence>
+       </complexType>
+
+       <complexType name="ContentType">
+               <sequence>
+                       <!-- corresponds to slot role_name -->
+                       <element name="role" type="mw:SlotRoleType" />
+                       <!-- corresponds to slot origin -->
+                       <element name="origin" type="positiveInteger" />
+                       <element name="model" type="mw:ContentModelType" />
+                       <element name="format" type="mw:ContentFormatType" />
+                       <element name="text" type="mw:ContentTextType" />
+               </sequence>
+       </complexType>
+
+       <simpleType name="SlotRoleType">
+               <restriction base="string">
+                       <pattern value="[a-zA-Z][-+./a-zA-Z0-9]*" />
+               </restriction>
+       </simpleType>
+
+       <complexType name="ContentTextType">
+               <simpleContent>
+                       <extension base="string">
+                               <attribute ref="xml:space" default="preserve" />
+                               <!-- This allows deleted=deleted on non-empty elements, but XSD is not omnipotent -->
+                               <attribute name="deleted" type="mw:DeletedFlagType" />
+                               <attribute name="location" type="anyURI" />
+                               <attribute name="bytes" type="nonNegativeInteger" />
+                       </extension>
+               </simpleContent>
+       </complexType>
+
+       <complexType name="LogItemType">
+               <sequence>
+                       <element name="id" type="positiveInteger" />
+                       <element name="timestamp" type="dateTime" />
+                       <element name="contributor" type="mw:ContributorType" />
+                       <element name="comment" type="mw:CommentType" minOccurs="0" />
+                       <element name="type" type="string" />
+                       <element name="action" type="string" />
+                       <element name="text" type="mw:LogTextType" minOccurs="0" maxOccurs="1" />
+                       <element name="logtitle" type="string" minOccurs="0" maxOccurs="1" />
+                       <element name="params" type="mw:LogParamsType" minOccurs="0" maxOccurs="1" />
+               </sequence>
+       </complexType>
+
+       <complexType name="CommentType">
+               <simpleContent>
+                       <extension base="string">
+                               <!-- This allows deleted=deleted on non-empty elements, but XSD is not omnipotent -->
+                               <attribute name="deleted" type="mw:DeletedFlagType" />
+                       </extension>
+               </simpleContent>
+       </complexType>
+
+       <complexType name="TextType">
+               <simpleContent>
+                       <extension base="string">
+                               <attribute ref="xml:space" default="preserve" />
+                               <!-- This allows deleted=deleted on non-empty elements, but XSD is not omnipotent -->
+                               <attribute name="deleted" type="mw:DeletedFlagType" />
+                               <!-- This isn't a good idea; we should be using "ID" instead of "NMTOKEN" -->
+                               <!-- However, "NMTOKEN" is strictest definition that is both compatible with existing -->
+                               <!-- usage ([0-9]+) and with the "ID" type. -->
+                               <attribute name="id" type="NMTOKEN" />
+                               <attribute name="location" type="anyURI" />
+                               <attribute name="sha1" type="string"/>
+                               <attribute name="bytes" type="nonNegativeInteger" />
+                       </extension>
+               </simpleContent>
+       </complexType>
+
+       <complexType name="LogTextType">
+               <simpleContent>
+                       <extension base="string">
+                               <!-- This allows deleted=deleted on non-empty elements, but XSD is not omnipotent -->
+                               <attribute name="deleted" type="mw:DeletedFlagType" />
+                       </extension>
+               </simpleContent>
+       </complexType>
+
+       <complexType name="LogParamsType">
+               <simpleContent>
+                       <extension base="string">
+                               <attribute ref="xml:space" default="preserve" />
+                       </extension>
+               </simpleContent>
+       </complexType>
+
+       <complexType name="ContributorType">
+               <sequence>
+                       <element name="username" type="string" minOccurs="0" />
+                       <element name="id" type="nonNegativeInteger" minOccurs="0" />
+
+                       <element name="ip" type="string" minOccurs="0" />
+               </sequence>
+               <!-- This allows deleted=deleted on non-empty elements, but XSD is not omnipotent -->
+               <attribute name="deleted" type="mw:DeletedFlagType" />
+       </complexType>
+
+       <complexType name="UploadType">
+               <sequence>
+                       <!-- Revision-style data... -->
+                       <element name="timestamp" type="dateTime" />
+                       <element name="contributor" type="mw:ContributorType" />
+                       <element name="comment" type="string" minOccurs="0" />
+
+                       <!-- Filename. (Using underscores, not spaces. No 'File:' namespace marker.) -->
+                       <element name="filename" type="string" />
+
+                       <!-- URI at which this resource can be obtained -->
+                       <element name="src" type="anyURI" />
+
+                       <element name="size" type="positiveInteger" />
+
+                       <!-- TODO: add other metadata fields -->
+               </sequence>
+       </complexType>
+
+       <!-- Discussion threading data for LiquidThreads -->
+       <complexType name="DiscussionThreadingInfo">
+               <sequence>
+                       <element name="ThreadSubject" type="string" />
+                       <element name="ThreadParent" type="positiveInteger" />
+                       <element name="ThreadAncestor" type="positiveInteger" />
+                       <element name="ThreadPage" type="string" />
+                       <element name="ThreadID" type="positiveInteger" />
+                       <element name="ThreadAuthor" type="string" />
+                       <element name="ThreadEditStatus" type="string" />
+                       <element name="ThreadType" type="string" />
+               </sequence>
+       </complexType>
+
+</schema>
index 99a3d1a..6f8c4ca 100644 (file)
@@ -3985,8 +3985,9 @@ $title: The title of the page.
 add extra metadata.
 &$obj: The XmlDumpWriter object.
 &$out: The text being output.
-$row: The database row for the revision.
-$text: The revision text.
+$row: The database row for the revision being dumped. DEPRECATED, use $rev instead.
+$text: The revision text to be dumped. DEPRECATED, use $rev instead.
+$rev: The RevisionRecord that is being dumped to XML
 
 More hooks might be available but undocumented, you can execute
 "php maintenance/findHooks.php" to find hidden ones.
index e5cd5ed..648e493 100644 (file)
@@ -322,4 +322,5 @@ define( 'MIGRATION_NEW', 0x30000000 | SCHEMA_COMPAT_NEW );
  * were already unsupported at the time these constants were introduced.
  */
 define( 'XML_DUMP_SCHEMA_VERSION_10', '0.10' );
+define( 'XML_DUMP_SCHEMA_VERSION_11', '0.11' );
 /**@}*/
index 0b0c801..f834fb1 100644 (file)
@@ -53,8 +53,8 @@ class WikiExporter {
        const LOGS = 8;
        const RANGE = 16;
 
-       const TEXT = 0;
-       const STUB = 1;
+       const TEXT = XmlDumpWriter::WRITE_CONTENT;
+       const STUB = XmlDumpWriter::WRITE_STUB;
 
        const BATCH_SIZE = 50000;
 
index d1b993d..bedfe13 100644 (file)
  * @file
  */
 use MediaWiki\MediaWikiServices;
+use MediaWiki\Revision\RevisionRecord;
 use MediaWiki\Revision\RevisionStore;
+use MediaWiki\Revision\SlotRecord;
+use MediaWiki\Revision\SuppressedDataException;
 use MediaWiki\Storage\SqlBlobStore;
+use Wikimedia\Assert\Assert;
 
 /**
  * @ingroup Dump
  */
 class XmlDumpWriter {
+
+       /** Output serialized revision content. */
+       const WRITE_CONTENT = 0;
+
+       /** Only output stubs for revision content. */
+       const WRITE_STUB = 1;
+
+       /**
+        * Only output stubs for revision content, indicating that the content has been
+        * deleted/suppressed. For internal use only.
+        */
+       const WRITE_STUB_DELETED = 2;
+
        /**
         * @var string[] the schema versions supported for output
         * @final
         */
        public static $supportedSchemas = [
                XML_DUMP_SCHEMA_VERSION_10,
+               XML_DUMP_SCHEMA_VERSION_11
        ];
 
+       /**
+        * @var string which schema version the generated XML should comply to.
+        * One of the values from self::$supportedSchemas, using the XML_DUMP_SCHEMA_VERSION_XX
+        * constants.
+        */
+       private $schemaVersion;
+
        /**
         * Title of the currently processed page
         *
@@ -45,6 +70,40 @@ class XmlDumpWriter {
         */
        private $currentTitle = null;
 
+       /**
+        * @var int Whether to output revision content or just stubs. WRITE_CONTENT or WRITE_STUB.
+        */
+       private $contentMode;
+
+       /**
+        * XmlDumpWriter constructor.
+        *
+        * Both arguments are validated with Assert::parameter() before being stored;
+        * a value outside the allowed set is rejected rather than silently accepted.
+        *
+        * @param int $contentMode WRITE_CONTENT or WRITE_STUB. WRITE_STUB_DELETED is
+        * intentionally not accepted here.
+        * @param string $schemaVersion which schema version the generated XML should comply to.
+        * One of the values from self::$supportedSchemas, using the XML_DUMP_SCHEMA_VERSION_XX
+        * constants.
+        */
+       public function __construct(
+               $contentMode = self::WRITE_CONTENT,
+               $schemaVersion = XML_DUMP_SCHEMA_VERSION_11
+       ) {
+               // Strict comparison: with loose in_array(), null or (on PHP < 8) any
+               // non-numeric string compares equal to WRITE_CONTENT (0) and would pass
+               // validation, only to fail the later "=== self::WRITE_CONTENT" checks.
+               Assert::parameter(
+                       in_array( $contentMode, [ self::WRITE_CONTENT, self::WRITE_STUB ], true ),
+                       '$contentMode',
+                       'must be one of the following constants: WRITE_CONTENT or WRITE_STUB.'
+               );
+
+               // Schema versions are strings such as '0.10'; loose comparison would treat
+               // numerically-equal strings (e.g. '0.1') as a supported version.
+               Assert::parameter(
+                       in_array( $schemaVersion, self::$supportedSchemas, true ),
+                       '$schemaVersion',
+                       'must be one of the following schema versions: '
+                               . implode( ',', self::$supportedSchemas )
+               );
+
+               $this->contentMode = $contentMode;
+               $this->schemaVersion = $schemaVersion;
+       }
+
        /**
         * Opens the XML output stream's root "<mediawiki>" element.
         * This does not include an xml directive, so is safe to include
@@ -56,7 +115,7 @@ class XmlDumpWriter {
         * @return string
         */
        function openStream() {
-               $ver = WikiExporter::schemaVersion();
+               $ver = $this->schemaVersion;
                return Xml::element( 'mediawiki', [
                        'xmlns'              => "http://www.mediawiki.org/xml/export-$ver/",
                        'xmlns:xsi'          => "http://www.w3.org/2001/XMLSchema-instance",
@@ -253,137 +312,188 @@ class XmlDumpWriter {
                );
 
                $out = "    <revision>\n";
-               $out .= "      " . Xml::element( 'id', null, strval( $row->rev_id ) ) . "\n";
-               if ( isset( $row->rev_parent_id ) && $row->rev_parent_id ) {
-                       $out .= "      " . Xml::element( 'parentid', null, strval( $row->rev_parent_id ) ) . "\n";
+               $out .= "      " . Xml::element( 'id', null, strval( $rev->getId() ) ) . "\n";
+
+               if ( $rev->getParentId() ) {
+                       $out .= "      " . Xml::element( 'parentid', null, strval( $rev->getParentId() ) ) . "\n";
                }
 
-               $out .= $this->writeTimestamp( $row->rev_timestamp );
+               $out .= $this->writeTimestamp( $rev->getTimestamp() );
 
-               if ( isset( $row->rev_deleted ) && ( $row->rev_deleted & Revision::DELETED_USER ) ) {
+               if ( $rev->isDeleted( Revision::DELETED_USER ) ) {
                        $out .= "      " . Xml::element( 'contributor', [ 'deleted' => 'deleted' ] ) . "\n";
                } else {
                        // empty values get written out as uid 0, see T224221
-                       $out .= $this->writeContributor( $row->rev_user ?: 0, $row->rev_user_text );
+                       $user = $rev->getUser();
+                       $out .= $this->writeContributor(
+                               $user ? $user->getId() : 0,
+                               $user ? $user->getName() : ''
+                       );
                }
 
-               if ( isset( $row->rev_minor_edit ) && $row->rev_minor_edit ) {
+               if ( $rev->isMinor() ) {
                        $out .= "      <minor/>\n";
                }
-               if ( isset( $row->rev_deleted ) && ( $row->rev_deleted & Revision::DELETED_COMMENT ) ) {
+               if ( $rev->isDeleted( Revision::DELETED_COMMENT ) ) {
                        $out .= "      " . Xml::element( 'comment', [ 'deleted' => 'deleted' ] ) . "\n";
                } else {
-                       $comment = CommentStore::getStore()->getComment( 'rev_comment', $row )->text;
-                       if ( $comment != '' ) {
-                               $out .= "      " . Xml::elementClean( 'comment', [], strval( $comment ) ) . "\n";
-                       }
+                       $out .= "      "
+                               . Xml::elementClean( 'comment', [], strval( $rev->getComment()->text ) )
+                               . "\n";
+               }
+
+               $contentMode = $rev->isDeleted( Revision::DELETED_TEXT ) ? self::WRITE_STUB_DELETED
+                       : $this->contentMode;
+
+               foreach ( $rev->getSlots()->getSlots() as $slot ) {
+                       $out .= $this->writeSlot( $slot, $contentMode );
                }
 
-               // TODO: rev_content_model no longer exists with MCR, see T174031
-               if ( isset( $row->rev_content_model ) && !is_null( $row->rev_content_model ) ) {
-                       $content_model = strval( $row->rev_content_model );
+               if ( $rev->isDeleted( Revision::DELETED_TEXT ) ) {
+                       $out .= "      <sha1/>\n";
                } else {
-                       // probably using $wgContentHandlerUseDB = false;
-                       $content_model = ContentHandler::getDefaultModelFor( $this->currentTitle );
+                       $out .= "      " . Xml::element( 'sha1', null, strval( $rev->getSha1() ) ) . "\n";
                }
 
-               $content_handler = ContentHandler::getForModelID( $content_model );
+               // Avoid PHP 7.1 warning from passing $this by reference
+               $writer = $this;
+               $text = $rev->getContent( SlotRecord::MAIN, RevisionRecord::RAW );
+               Hooks::run( 'XmlDumpWriterWriteRevision', [ &$writer, &$out, $row, $text, $rev ] );
 
-               // TODO: rev_content_format no longer exists with MCR, see T174031
-               if ( isset( $row->rev_content_format ) && !is_null( $row->rev_content_format ) ) {
-                       $content_format = strval( $row->rev_content_format );
-               } else {
-                       // probably using $wgContentHandlerUseDB = false;
-                       $content_format = $content_handler->getDefaultFormat();
+               $out .= "    </revision>\n";
+
+               return $out;
+       }
+
+       /**
+        * @param SlotRecord $slot
+        * @param int $contentMode see the WRITE_XXX constants
+        *
+        * @return string
+        */
+       private function writeSlot( SlotRecord $slot, $contentMode ) {
+               $isMain = $slot->getRole() === SlotRecord::MAIN;
+               $isV11 = $this->schemaVersion >= XML_DUMP_SCHEMA_VERSION_11;
+
+               if ( !$isV11 && !$isMain ) {
+                       // ignore extra slots
+                       return '';
                }
 
-               $out .= "      " . Xml::element( 'model', null, strval( $content_model ) ) . "\n";
-               $out .= "      " . Xml::element( 'format', null, strval( $content_format ) ) . "\n";
+               $out = '';
+               $indent = '      ';
 
-               $text = '';
-               if ( isset( $row->rev_deleted ) && ( $row->rev_deleted & Revision::DELETED_TEXT ) ) {
-                       $out .= "      " . Xml::element( 'text', [ 'deleted' => 'deleted' ] ) . "\n";
-               } elseif ( isset( $row->old_text ) ) {
-                       // Raw text from the database may have invalid chars
-                       $text = strval( Revision::getRevisionText( $row ) );
-                       try {
-                               $text = $content_handler->exportTransform( $text, $content_format );
-                       }
-                       catch ( Exception $ex ) {
-                               if ( $ex instanceof MWException || $ex instanceof RuntimeException ) {
-                                       // leave text as is; that's the way it goes
-                                       wfLogWarning( 'exportTransform failed on text for revid ' . $row->rev_id . "\n" );
-                               } else {
-                                       throw $ex;
-                               }
-                       }
-                       $out .= "      " . Xml::elementClean( 'text',
-                               [ 'xml:space' => 'preserve', 'bytes' => intval( $row->rev_len ) ],
-                               strval( $text ) ) . "\n";
-               } elseif ( isset( $row->_load_content ) ) {
-                       // TODO: make this fully MCR aware, see T174031
-                       $slot = $rev->getSlot( 'main' );
-                       try {
-                               $content = $slot->getContent();
+               if ( !$isMain ) {
+                       // non-main slots are wrapped into an additional element.
+                       $out .= '      ' . Xml::openElement( 'content' ) . "\n";
+                       $indent .= '  ';
+                       $out .= $indent . Xml::element( 'role', null, strval( $slot->getRole() ) ) . "\n";
+               }
 
-                               if ( $content instanceof TextContent ) {
-                                       // HACK: For text based models, bypass the serialization step.
-                                       // This allows extensions (like Flow)that use incompatible combinations
-                                       // of serialization format and content model.
-                                       $text = $content->getNativeData();
-                               } else {
-                                       $text = $content->serialize( $content_format );
-                               }
-                               $text = $content_handler->exportTransform( $text, $content_format );
-                               $out .= "      " . Xml::elementClean( 'text',
-                                       [ 'xml:space' => 'preserve', 'bytes' => intval( $slot->getSize() ) ],
-                                       strval( $text ) ) . "\n";
+               if ( $isV11 ) {
+                       $out .= $indent . Xml::element( 'origin', null, strval( $slot->getOrigin() ) ) . "\n";
+               }
+
+               $contentModel = $slot->getModel();
+               $contentHandler = ContentHandler::getForModelID( $contentModel );
+               $contentFormat = $contentHandler->getDefaultFormat();
+
+               // XXX: The content format is only relevant when actually outputting serialized content.
+               // It should probably be an attribute on the text tag.
+               $out .= $indent . Xml::element( 'model', null, strval( $contentModel ) ) . "\n";
+               $out .= $indent . Xml::element( 'format', null, strval( $contentFormat ) ) . "\n";
+
+               $textAttributes = [
+                       'xml:space' => 'preserve',
+                       'bytes' => $slot->getSize(),
+               ];
+
+               if ( $isV11 ) {
+                       $textAttributes['sha1'] = $slot->getSha1();
+               }
+
+               if ( $contentMode === self::WRITE_CONTENT ) {
+                       try {
+                               // write <text> tag
+                               $out .= $this->writeText( $slot->getContent(), $textAttributes, $indent );
+                       } catch ( SuppressedDataException $ex ) {
+                               // NOTE: this shouldn't happen, since the caller is supposed to have checked
+                               // for suppressed content!
+                               // write <text> placeholder tag
+                               $textAttributes['deleted'] = 'deleted';
+                               $out .= $indent . Xml::element( 'text', $textAttributes ) . "\n";
                        }
                        catch ( Exception $ex ) {
                                if ( $ex instanceof MWException || $ex instanceof RuntimeException ) {
-                                       // there's no provsion in the schema for an attribute that will let
+                                       // there's no provision in the schema for an attribute that will let
                                        // the user know this element was unavailable due to error; an empty
                                        // tag is the best we can do
-                                       $out .= "      " . Xml::element( 'text' ) . "\n";
-                                       wfLogWarning( 'failed to load content for revid ' . $row->rev_id . "\n" );
+                                       $out .= $indent . Xml::element( 'text' ) . "\n";
+                                       wfLogWarning(
+                                               'failed to load content slot ' . $slot->getRole() . ' for revision '
+                                               . $slot->getRevision() . "\n"
+                                       );
                                } else {
                                        throw $ex;
                                }
                        }
-               } elseif ( isset( $row->rev_text_id ) ) {
-                       // Stub output for pre-MCR schema
-                       // TODO: MCR: rev_text_id only exists in the pre-MCR schema. Remove this when
-                       // we drop support for the old schema.
-                       $out .= "      " . Xml::element( 'text',
-                               [ 'id' => $row->rev_text_id, 'bytes' => intval( $row->rev_len ) ],
-                               "" ) . "\n";
+               } elseif ( $contentMode === self::WRITE_STUB_DELETED ) {
+                       // write <text> placeholder tag
+                       $textAttributes['deleted'] = 'deleted';
+                       $out .= $indent . Xml::element( 'text', $textAttributes ) . "\n";
                } else {
-                       // Backwards-compatible stub output for MCR aware schema
-                       // TODO: MCR: emit content addresses instead of text ids, see T174031, T199121
-                       $slot = $rev->getSlot( 'main' );
+                       // write <text> stub tag
+                       if ( $isV11 ) {
+                               $textAttributes['location'] = $slot->getAddress();
+                       }
 
+                       // Output the numerical text ID if possible, for backwards compatibility.
                        // Note that this is currently the ONLY reason we have a BlobStore here at all.
                        // When removing this line, check whether the BlobStore has become unused.
                        $textId = $this->getBlobStore()->getTextIdFromAddress( $slot->getAddress() );
-                       $out .= "      " . Xml::element( 'text',
-                                       [ 'id' => $textId, 'bytes' => intval( $slot->getSize() ) ],
-                                       "" ) . "\n";
+                       if ( $textId ) {
+                               $textAttributes['id'] = $textId;
+                       } elseif ( !$isV11 ) {
+                               throw new InvalidArgumentException(
+                                       'Cannot produce stubs for non-text-table content blobs with schema version '
+                                       . $this->schemaVersion
+                               );
+                       }
+
+                       $out .= $indent . Xml::element( 'text', $textAttributes ) . "\n";
                }
 
-               if ( isset( $row->rev_sha1 )
-                       && $row->rev_sha1
-                       && !( $row->rev_deleted & Revision::DELETED_TEXT )
-               ) {
-                       $out .= "      " . Xml::element( 'sha1', null, strval( $row->rev_sha1 ) ) . "\n";
-               } else {
-                       $out .= "      <sha1/>\n";
+               if ( !$isMain ) {
+                       $out .= '      ' . Xml::closeElement( 'content' ) . "\n";
                }
 
-               // Avoid PHP 7.1 warning from passing $this by reference
-               $writer = $this;
-               Hooks::run( 'XmlDumpWriterWriteRevision', [ &$writer, &$out, $row, $text ] );
+               return $out;
+       }
 
-               $out .= "    </revision>\n";
+       /**
+        * Builds the <text> element carrying the serialized form of a content object.
+        *
+        * TextContent instances are emitted from their native data without going through
+        * serialize(); any other content is serialized in its handler's default format.
+        * The result is passed through the handler's exportTransform(), and the 'bytes'
+        * attribute is overwritten with the byte length of the data actually written.
+        *
+        * @param Content $content Content to write out
+        * @param array $textAttributes Attributes for the <text> element (name => value);
+        *   any 'bytes' entry supplied by the caller is replaced
+        * @param string $indent Whitespace prefix put in front of the element
+        *
+        * @return string The <text> element followed by a newline
+        */
+       private function writeText( Content $content, $textAttributes, $indent ) {
+               $out = '';
+
+               $contentHandler = $content->getContentHandler();
+               $contentFormat = $contentHandler->getDefaultFormat();
+
+               if ( $content instanceof TextContent ) {
+                       // HACK: For text based models, bypass the serialization step. This allows extensions (like Flow)
+                       // that use incompatible combinations of serialization format and content model.
+                       $data = $content->getNativeData();
+               } else {
+                       $data = $content->serialize( $contentFormat );
+               }
+
+               $data = $contentHandler->exportTransform( $data, $contentFormat );
+               $textAttributes['bytes'] = strlen( $data ); // make sure to use the actual size
+               $out .= $indent . Xml::elementClean( 'text', $textAttributes, strval( $data ) ) . "\n";
 
                return $out;
        }
index eaed7ed..b37fec1 100644 (file)
@@ -281,7 +281,7 @@ TEXT
                $this->finalOptionCheck();
 
                // we only want this so we know how to close a stream :-P
-               $this->xmlwriterobj = new XmlDumpWriter();
+               $this->xmlwriterobj = new XmlDumpWriter( XmlDumpWriter::WRITE_CONTENT, $this->schemaVersion );
 
                $input = fopen( $this->input, "rt" );
                $this->readDump( $input );
index ad33f6e..e8c1cd6 100644 (file)
@@ -137,6 +137,34 @@ class DumpAsserter {
                }
        }
 
+       /**
+        * Asserts that the xml reader is at an element of given name, and that element
+        * is an empty tag.
+        *
+        * The element start is checked with assertNodeStart(). When skipping, the reader
+        * is advanced once, and once more if that lands on the closing tag of the same
+        * name, which covers the <name></name> form in addition to <name/>.
+        *
+        * NOTE(review): emptiness is only checked through XMLReader::$hasValue while the
+        * reader is positioned on the element itself; presumably that is false for any
+        * element node, in which case content inside the element would not make this
+        * assertion fail here. Confirm against the XMLReader documentation.
+        *
+        * @param string $name The name of the element to check for
+        *   (e.g.: "text" for <text/>)
+        * @param bool $skip (optional) if true, skip past the found element
+        * @param bool $skip_ws (optional) if true, also skip past white spaces that trail the
+        *   closing element. Only has an effect when $skip is true.
+        */
+       public function assertEmptyNode( $name, $skip = true, $skip_ws = true ) {
+               $this->assertNodeStart( $name, false );
+               Assert::assertFalse( $this->xml->hasValue, "$name tag has content" );
+
+               if ( $skip ) {
+                       Assert::assertTrue( $this->xml->read(), "Skipping $name tag" );
+                       if ( ( $this->xml->nodeType == XMLReader::END_ELEMENT )
+                               && ( $this->xml->name == $name )
+                       ) {
+                               $this->xml->read();
+                       }
+
+                       if ( $skip_ws ) {
+                               $this->skipWhitespace();
+                       }
+               }
+       }
+
        /**
         * Asserts that the xml reader is at a closing element of given name, and optionally
         * skips past it.
@@ -246,6 +274,11 @@ class DumpAsserter {
                $this->assertTextNode( "comment", $summary );
                $this->skipWhitespace();
 
+               if ( $this->schemaVersion >= XML_DUMP_SCHEMA_VERSION_11 ) {
+                       $this->assertTextNode( "origin", false );
+                       $this->skipWhitespace();
+               }
+
                $this->assertTextNode( "model", $model );
                $this->skipWhitespace();
 
@@ -258,9 +291,16 @@ class DumpAsserter {
                        $this->assertText( $id, $text_id, $text_bytes, $text );
                } else {
                        $text_found = false;
+                       if ( $this->schemaVersion >= XML_DUMP_SCHEMA_VERSION_11 ) {
+                               Assert::fail( 'Missing text node' );
+                       }
                }
 
-               $this->assertTextNode( "sha1", $text_sha1 );
+               if ( $text_sha1 ) {
+                       $this->assertTextNode( "sha1", $text_sha1 );
+               } else {
+                       $this->assertEmptyNode( "sha1" );
+               }
 
                if ( !$text_found ) {
                        $this->assertText( $id, $text_id, $text_bytes, $text );
@@ -278,17 +318,9 @@ class DumpAsserter {
                }
 
                if ( $text === false ) {
-                       // Testing for a stub
                        Assert::assertEquals( $this->xml->getAttribute( "id" ), $text_id,
                                "Text id of revision " . $id );
-                       Assert::assertFalse( $this->xml->hasValue, "Revision has text" );
-                       Assert::assertTrue( $this->xml->read(), "Skipping text start tag" );
-                       if ( ( $this->xml->nodeType == XMLReader::END_ELEMENT )
-                               && ( $this->xml->name == "text" )
-                       ) {
-                               $this->xml->read();
-                       }
-                       $this->skipWhitespace();
+                       $this->assertEmptyNode( "text" );
                } else {
                        // Testing for a real dump
                        Assert::assertTrue( $this->xml->read(), "Skipping text start tag" );
index 17c8757..7a78e52 100644 (file)
@@ -5,8 +5,11 @@ namespace MediaWiki\Tests\Maintenance;
 use DumpBackup;
 use Exception;
 use MediaWiki\MediaWikiServices;
+use MediaWiki\Revision\RevisionRecord;
 use MediaWikiTestCase;
 use MWException;
+use RequestContext;
+use RevisionDeleter;
 use Title;
 use WikiExporter;
 use Wikimedia\Rdbms\IDatabase;
@@ -77,6 +80,17 @@ class BackupDumperPageTest extends DumpTestCase {
                                "BackupDumperTestP2Summary4 extra " );
                        $this->pageId2 = $page->getId();
 
+                       $revDel = RevisionDeleter::createList(
+                               'revision',
+                               RequestContext::getMain(),
+                               $this->pageTitle2,
+                               [ $this->revId2_2 ]
+                       );
+                       $revDel->setVisibility( [
+                               'value' => [ RevisionRecord::DELETED_TEXT => 1 ],
+                               'comment' => 'testing!'
+                       ] );
+
                        $this->pageTitle3 = Title::newFromText( 'BackupDumperTestP3', $this->namespace );
                        $page = WikiPage::factory( $this->pageTitle3 );
                        list( $this->revId3_1, $this->textId3_1 ) = $this->addRevision( $page,
@@ -232,10 +246,10 @@ class BackupDumperPageTest extends DumpTestCase {
                $asserter->assertRevision(
                        $this->revId2_2,
                        "BackupDumperTestP2Summary2",
-                       $this->textId2_2,
-                       23,
-                       "b7vj5ks32po5m1z1t1br4o7scdwwy95",
-                       "BackupDumperTestP2Text2",
+                       null, // deleted!
+                       false, // deleted!
+                       null, // deleted!
+                       false, // deleted!
                        $this->revId2_1
                );
                $asserter->assertRevision(
@@ -346,10 +360,10 @@ class BackupDumperPageTest extends DumpTestCase {
                $asserter->assertRevision(
                        $this->revId2_2,
                        "BackupDumperTestP2Summary2",
-                       $this->textId2_2,
-                       23,
-                       "b7vj5ks32po5m1z1t1br4o7scdwwy95",
-                       false,
+                       null, // deleted!
+                       false, // deleted!
+                       null, // deleted!
+                       false, // deleted!
                        $this->revId2_1
                );
                $asserter->assertRevision(
@@ -622,10 +636,10 @@ class BackupDumperPageTest extends DumpTestCase {
                $asserter->assertRevision(
                        $this->revId2_2,
                        "BackupDumperTestP2Summary2",
-                       $this->textId2_2,
-                       23,
-                       "b7vj5ks32po5m1z1t1br4o7scdwwy95",
-                       false,
+                       null, // deleted!
+                       false, // deleted!
+                       null, // deleted!
+                       false, // deleted!
                        $this->revId2_1
                );
                $asserter->assertRevision(