Merge "Make DBAccessBase use DBConnRef, rename $wiki, and hide getLoadBalancer()"
[lhc/web/wiklou.git] / maintenance / importDump.php
1 <?php
2 /**
3 * Import XML dump files into the current wiki.
4 *
5 * Copyright © 2005 Brion Vibber <brion@pobox.com>
6 * https://www.mediawiki.org/
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @file
24 * @ingroup Maintenance
25 */
26
27 use MediaWiki\MediaWikiServices;
28
29 require_once __DIR__ . '/Maintenance.php';
30
/**
 * Maintenance script that imports XML dump files into the current wiki.
 *
 * The dump is read either from the file given as the first argument or from
 * stdin, fed through a WikiImporter, and progress is written to stderr.
 *
 * @ingroup Maintenance
 */
class BackupReader extends Maintenance {
	/** @var int Number of pages between two progress reports */
	public $reportingInterval = 100;
	/** @var int Pages counted so far (pre-set from --skip-to, incremented by reportPage()) */
	public $pageCount = 0;
	/** @var int Revisions and log items counted so far */
	public $revCount = 0;
	/** @var bool Whether --dry-run was given (parse only, import nothing) */
	public $dryRun = false;
	/** @var bool Whether --uploads was given */
	public $uploads = false;
	/** @var int Uploads counted by handleUpload() */
	protected $uploadCount = 0;
	/** @var string|false Value of --image-base-path, or false when not given */
	public $imageBasePath = false;
	/** @var array|false Namespace indexes to import, or false for no filtering */
	public $nsFilter = false;
	/** @var resource|false Handle on php://stderr that progress() writes to */
	public $stderr;
	/** @var float|null Value of microtime( true ) taken when the import started */
	public $startTime;
	/** @var callable|null Value returned by WikiImporter::setRevisionCallback() */
	public $importCallback;
	/** @var callable|null Value returned by WikiImporter::setUploadCallback() */
	public $uploadCallback;
	/** @var callable|null Value returned by WikiImporter::setLogItemCallback() */
	public $logItemCallback;

	/**
	 * Declare the script description, its options and its single optional argument.
	 */
	public function __construct() {
		parent::__construct();
		// The description advertises which compressed formats this PHP build can read.
		$gz = in_array( 'compress.zlib', stream_get_wrappers() )
			? 'ok'
			: '(disabled; requires PHP zlib module)';
		$bz2 = in_array( 'compress.bzip2', stream_get_wrappers() )
			? 'ok'
			: '(disabled; requires PHP bzip2 module)';

		$this->addDescription(
			<<<TEXT
This script reads pages from an XML file as produced from Special:Export or
dumpBackup.php, and saves them into the current wiki.

Compressed XML files may be read directly:
.gz $gz
.bz2 $bz2
.7z (if 7za executable is in PATH)

Note that for very large data sets, importDump.php may be slow; there are
alternate methods which can be much faster for full site restoration:
<https://www.mediawiki.org/wiki/Manual:Importing_XML_dumps>
TEXT
		);
		$this->stderr = fopen( "php://stderr", "wt" );
		$this->addOption( 'report',
			'Report position and speed after every n pages processed', false, true );
		$this->addOption( 'namespaces',
			'Import only the pages from namespaces belonging to the list of ' .
			'pipe-separated namespace names or namespace indexes', false, true );
		$this->addOption( 'rootpage', 'Pages will be imported as subpages of the specified page',
			false, true );
		$this->addOption( 'dry-run', 'Parse dump without actually importing pages' );
		$this->addOption( 'debug', 'Output extra verbose debug information' );
		$this->addOption( 'uploads', 'Process file upload data if included (experimental)' );
		$this->addOption(
			'no-updates',
			'Disable link table updates. Is faster but leaves the wiki in an inconsistent state'
		);
		$this->addOption( 'image-base-path', 'Import files from a specified path', false, true );
		$this->addOption( 'skip-to', 'Start from nth page by skipping first n-1 pages', false, true );
		$this->addOption( 'username-prefix', 'Prefix for interwiki usernames', false, true );
		$this->addOption( 'no-local-users',
			'Treat all usernames as interwiki. ' .
			'The default is to assign edits to local users where they exist.',
			false, false
		);
		$this->addArg( 'file', 'Dump file to import [else use stdin]', false );
	}

	/**
	 * Entry point: read the options, run the import and print follow-up hints.
	 */
	public function execute() {
		if ( wfReadOnly() ) {
			$this->fatalError( "Wiki is in read-only mode; you'll need to disable it for import to work." );
		}

		$this->reportingInterval = intval( $this->getOption( 'report', 100 ) );
		if ( !$this->reportingInterval ) {
			$this->reportingInterval = 100; // avoid division by zero
		}

		$this->dryRun = $this->hasOption( 'dry-run' );
		$this->uploads = $this->hasOption( 'uploads' ); // experimental!
		if ( $this->hasOption( 'image-base-path' ) ) {
			$this->imageBasePath = $this->getOption( 'image-base-path' );
		}
		if ( $this->hasOption( 'namespaces' ) ) {
			$this->setNsfilter( explode( '|', $this->getOption( 'namespaces' ) ) );
		}

		// With a file argument read that file, otherwise consume stdin.
		if ( $this->hasArg( 0 ) ) {
			$this->importFromFile( $this->getArg( 0 ) );
		} else {
			$this->importFromStdin();
		}

		$this->output( "Done!\n" );
		$this->output( "You might want to run rebuildrecentchanges.php to regenerate RecentChanges,\n" );
		$this->output( "and initSiteStats.php to update page and revision counts\n" );
	}

	/**
	 * Restrict the import to the given namespaces.
	 *
	 * @param array $namespaces Namespace names or indexes; an empty array
	 *   disables filtering
	 */
	public function setNsfilter( array $namespaces ) {
		if ( count( $namespaces ) == 0 ) {
			$this->nsFilter = false;

			return;
		}
		$this->nsFilter = array_unique( array_map( [ $this, 'getNsIndex' ], $namespaces ) );
	}

	/**
	 * Resolve a namespace given by name or by numeric index.
	 *
	 * The content language is asked for a namespace of that name first; if
	 * that fails the value is accepted as an index when it is a canonical
	 * integer string and the content language has a text for it. Anything
	 * else ends in fatalError().
	 *
	 * @param string $namespace
	 * @return int
	 */
	private function getNsIndex( $namespace ) {
		$contLang = MediaWikiServices::getInstance()->getContentLanguage();
		$result = $contLang->getNsIndex( $namespace );
		if ( $result !== false ) {
			return $result;
		}
		$ns = intval( $namespace );
		// The round trip through strval() rejects input such as "1abc" or "01".
		if ( strval( $ns ) === $namespace && $contLang->getNsText( $ns ) !== false ) {
			return $ns;
		}
		$this->fatalError( "Unknown namespace text / index specified: $namespace" );
	}

	/**
	 * Tell whether an object lies outside the namespaces selected for import.
	 *
	 * @param Title|Revision|WikiRevision $obj
	 * @throws MWException When $obj is none of the supported types
	 * @return bool True when a namespace filter is set and does not contain
	 *   the object's namespace
	 */
	private function skippedNamespace( $obj ) {
		if ( $obj instanceof Title ) {
			$title = $obj;
		} elseif ( $obj instanceof Revision ) {
			$title = $obj->getTitle();
		} elseif ( $obj instanceof WikiRevision ) {
			$title = $obj->title;
		} else {
			throw new MWException( "Cannot get namespace of object in " . __METHOD__ );
		}

		if ( $title === null ) {
			// Probably a log entry
			return false;
		}

		$ns = $title->getNamespace();

		return is_array( $this->nsFilter ) && !in_array( $ns, $this->nsFilter );
	}

	/**
	 * Page callback registered with the importer; counts one more page.
	 *
	 * @param mixed $page Unused
	 */
	public function reportPage( $page ) {
		$this->pageCount++;
	}

	/**
	 * Revision callback registered with the importer.
	 *
	 * Counts the revision, possibly prints a progress report and, unless this
	 * is a dry run, hands the revision on to the callback that
	 * setRevisionCallback() returned.
	 *
	 * @param Revision $rev
	 */
	public function handleRevision( $rev ) {
		$title = $rev->getTitle();
		if ( !$title ) {
			$this->progress( "Got bogus revision with null title!" );

			return;
		}

		if ( $this->skippedNamespace( $title ) ) {
			return;
		}

		$this->revCount++;
		$this->report();

		if ( !$this->dryRun ) {
			call_user_func( $this->importCallback, $rev );
		}
	}

	/**
	 * Upload callback registered with the importer.
	 *
	 * NOTE(review): the methods called on $revision (getFilename(),
	 * importUpload()) suggest the importer actually passes a WikiRevision
	 * here - confirm before tightening the type.
	 *
	 * @param Revision $revision
	 * @return bool False when uploads are disabled, the namespace is filtered
	 *   out or this is a dry run; otherwise the result of the import attempt
	 */
	public function handleUpload( $revision ) {
		if ( $this->uploads ) {
			if ( $this->skippedNamespace( $revision ) ) {
				return false;
			}
			$this->uploadCount++;
			// $this->report();
			// @phan-suppress-next-line PhanUndeclaredMethod
			$this->progress( "upload: " . $revision->getFilename() );

			if ( !$this->dryRun ) {
				// bluuuh hack
				// call_user_func( $this->uploadCallback, $revision );
				$dbw = $this->getDB( DB_MASTER );

				return $dbw->deadlockLoop( [ $revision, 'importUpload' ] );
			}
		}

		return false;
	}

	/**
	 * Log item callback registered with the importer.
	 *
	 * @param mixed $rev Log item handed over by the importer
	 */
	public function handleLogItem( $rev ) {
		if ( $this->skippedNamespace( $rev ) ) {
			return;
		}
		$this->revCount++;
		$this->report();

		if ( !$this->dryRun ) {
			call_user_func( $this->logItemCallback, $rev );
		}
	}

	/**
	 * Show a progress report when one is due.
	 *
	 * A regular call reports when the page count is a multiple of the
	 * reporting interval; a final call reports only when it is not.
	 *
	 * @param bool $final
	 */
	public function report( $final = false ) {
		if ( $final xor ( $this->pageCount % $this->reportingInterval == 0 ) ) {
			$this->showReport();
		}
	}

	/**
	 * Print the current counts and rates (unless quiet), then call wfWaitForSlaves().
	 */
	public function showReport() {
		if ( !$this->mQuiet ) {
			$delta = microtime( true ) - $this->startTime;
			if ( $delta ) {
				$rate = sprintf( "%.2f", $this->pageCount / $delta );
				$revrate = sprintf( "%.2f", $this->revCount / $delta );
			} else {
				// No measurable time has elapsed yet, so no rate can be computed.
				$rate = '-';
				$revrate = '-';
			}
			# Logs dumps don't have page tallies
			if ( $this->pageCount ) {
				$this->progress( "$this->pageCount ($rate pages/sec $revrate revs/sec)" );
			} else {
				$this->progress( "$this->revCount ($revrate revs/sec)" );
			}
		}
		wfWaitForSlaves();
	}

	/**
	 * Write one line to the stderr handle opened in the constructor.
	 *
	 * @param string $string
	 */
	public function progress( $string ) {
		fwrite( $this->stderr, $string . "\n" );
	}

	/**
	 * Import a dump file, choosing a decompression stream wrapper by extension.
	 *
	 * @param string $filename
	 * @return mixed Result of importFromHandle()
	 */
	public function importFromFile( $filename ) {
		if ( preg_match( '/\.gz$/', $filename ) ) {
			$filename = 'compress.zlib://' . $filename;
		} elseif ( preg_match( '/\.bz2$/', $filename ) ) {
			$filename = 'compress.bzip2://' . $filename;
		} elseif ( preg_match( '/\.7z$/', $filename ) ) {
			$filename = 'mediawiki.compress.7z://' . $filename;
		}

		$file = fopen( $filename, 'rt' );
		if ( $file === false ) {
			// Stop here instead of handing an invalid handle to the importer.
			$this->fatalError( "Unable to open dump file for reading: $filename" );
		}

		return $this->importFromHandle( $file );
	}

	/**
	 * Import a dump from stdin; shows the help text when stdin is a terminal.
	 *
	 * @return mixed Result of importFromHandle()
	 */
	public function importFromStdin() {
		$file = fopen( 'php://stdin', 'rt' );
		if ( self::posix_isatty( $file ) ) {
			$this->maybeHelp( true );
		}

		return $this->importFromHandle( $file );
	}

	/**
	 * Configure a WikiImporter from the command line options and run it.
	 *
	 * @param resource $handle Open stream to read the XML dump from
	 * @return mixed Result of WikiImporter::doImport()
	 */
	public function importFromHandle( $handle ) {
		$this->startTime = microtime( true );

		$source = new ImportStreamSource( $handle );
		$importer = new WikiImporter( $source, $this->getConfig() );

		// Updating statistics require a lot of time so disable it
		$importer->disableStatisticsUpdate();

		if ( $this->hasOption( 'debug' ) ) {
			$importer->setDebug( true );
		}
		if ( $this->hasOption( 'no-updates' ) ) {
			$importer->setNoUpdates( true );
		}
		if ( $this->hasOption( 'username-prefix' ) ) {
			$importer->setUsernamePrefix(
				$this->getOption( 'username-prefix' ),
				!$this->hasOption( 'no-local-users' )
			);
		}
		if ( $this->hasOption( 'rootpage' ) ) {
			$statusRootPage = $importer->setTargetRootPage( $this->getOption( 'rootpage' ) );
			if ( !$statusRootPage->isGood() ) {
				// Die here so that it doesn't print "Done!"
				$this->fatalError( $statusRootPage->getMessage()->text() );
				return false;
			}
		}
		if ( $this->hasOption( 'skip-to' ) ) {
			$nthPage = (int)$this->getOption( 'skip-to' );
			$importer->setPageOffset( $nthPage );
			// Start the tally as if the skipped pages had already been counted.
			$this->pageCount = $nthPage - 1;
		}
		$importer->setPageCallback( [ $this, 'reportPage' ] );
		$importer->setNoticeCallback( function ( $msg, $params ) {
			echo wfMessage( $msg, $params )->text() . "\n";
		} );
		// Keep whatever each setter returns; the handle*() methods call it
		// after their own bookkeeping.
		$this->importCallback = $importer->setRevisionCallback(
			[ $this, 'handleRevision' ] );
		$this->uploadCallback = $importer->setUploadCallback(
			[ $this, 'handleUpload' ] );
		$this->logItemCallback = $importer->setLogItemCallback(
			[ $this, 'handleLogItem' ] );
		if ( $this->uploads ) {
			$importer->setImportUploads( true );
		}
		if ( $this->imageBasePath ) {
			$importer->setImageBasePath( $this->imageBasePath );
		}

		if ( $this->dryRun ) {
			$importer->setPageOutCallback( null );
		}

		return $importer->doImport();
	}
}
352
// Name the class for the maintenance bootstrap, then include the runner
// file identified by the RUN_MAINTENANCE_IF_MAIN constant.
$maintClass = BackupReader::class;
require_once RUN_MAINTENANCE_IF_MAIN;