Rewrite importImages.php to use Maintenance infrastructure
[lhc/web/wiklou.git] / maintenance / importImages.php
1 <?php
2 /**
3 * Import one or more images from the local file system into the wiki without
4 * using the web-based interface.
5 *
6 * "Smart import" additions:
7 * - aim: preserve the essential metadata (user, description) when importing media
8 * files from an existing wiki.
9 * - process:
10 * - interface with the source wiki, don't use bare files only (see --source-wiki-url).
11 * - fetch metadata from source wiki for each file to import.
12 * - commit the fetched metadata to the destination wiki while submitting.
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License along
25 * with this program; if not, write to the Free Software Foundation, Inc.,
26 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
27 * http://www.gnu.org/copyleft/gpl.html
28 *
29 * @file
30 * @ingroup Maintenance
31 * @author Rob Church <robchur@gmail.com>
32 * @author Mij <mij@bitchx.it>
33 */
34
35 require_once __DIR__ . '/Maintenance.php';
36
37 class ImportImages extends Maintenance {
38
/**
 * Register the script description, the positional 'dir' argument and every
 * supported command-line option.
 *
 * NOTE(review): the trailing `false, true` pair passed to addOption()
 * presumably means "not required, takes a value" -- confirm against
 * Maintenance::addOption(). Options declared without that pair are used
 * as plain flags by execute().
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( 'Imports images and other media files into the wiki' );
	$this->addArg( 'dir', 'Path to the directory containing images to be imported' );

	$this->addOption( 'extensions',
		'Comma-separated list of allowable extensions, defaults to $wgFileExtensions',
		false,
		true
	);
	$this->addOption( 'overwrite',
		'Overwrite existing images with the same name (default is to skip them)' );
	$this->addOption( 'limit',
		'Limit the number of images to process. Ignored or skipped images are not counted',
		false,
		true
	);
	$this->addOption( 'from',
		"Ignore all files until the one with the given name. Useful for resuming aborted "
			. "imports. The name should be the file's canonical database form.",
		false,
		true
	);
	$this->addOption( 'skip-dupes',
		'Skip images that were already uploaded under a different name (check SHA1)' );
	$this->addOption( 'search-recursively', 'Search recursively for files in subdirectories' );
	$this->addOption( 'sleep',
		'Sleep between files. Useful mostly for debugging',
		false,
		true
	);
	$this->addOption( 'user',
		"Set username of uploader, default 'Maintenance script'",
		false,
		true
	);
	// This parameter can optionally have an argument. If none specified, getOption()
	// returns 1 which is precisely what we need (execute() casts the value to int and
	// uses it as the block-check interval).
	$this->addOption( 'check-userblock', 'Check if the user got blocked during import' );
	$this->addOption( 'comment',
		"Set file description, default 'Importing file'",
		false,
		true
	);
	$this->addOption( 'comment-file',
		'Set description to the content of this file',
		false,
		true
	);
	$this->addOption( 'comment-ext',
		'Causes the description for each file to be loaded from a file with the same name, but '
			. 'the extension provided. If a global description is also given, it is appended.',
		false,
		true
	);
	$this->addOption( 'summary',
		'Upload summary, description will be used if not provided',
		false,
		true
	);
	$this->addOption( 'license',
		'Use an optional license template',
		false,
		true
	);
	$this->addOption( 'timestamp',
		'Override upload time/date, all MediaWiki timestamp formats are accepted',
		false,
		true
	);
	$this->addOption( 'protect',
		'Specify the protect value (autoconfirmed,sysop)',
		false,
		true
	);
	$this->addOption( 'unprotect', 'Unprotects all uploaded images' );
	// The example URL previously lacked its closing double quote.
	$this->addOption( 'source-wiki-url',
		'If specified, take User and Comment data for each imported file from this URL. '
			. 'For example, --source-wiki-url="http://en.wikipedia.org/"',
		false,
		true
	);
	$this->addOption( 'dry', "Dry run, don't import anything" );
}
124
/**
 * Import every matching file found under the directory given as the first
 * positional argument.
 *
 * For each candidate the method builds a File-namespace title, applies the
 * --from / --overwrite / --skip-dupes filters, resolves the description text
 * (and, with --source-wiki-url, the uploading user), then publishes the file
 * and records the upload unless --dry is given. With --protect / --unprotect
 * the page restrictions are updated afterwards. Counters are printed at the
 * end of the run.
 */
public function execute() {
	global $wgFileExtensions, $wgUser, $wgRestrictionLevels;

	// Number of files that reached the end of the loop body (drives --limit
	// and the --check-userblock interval).
	$processed = 0;
	// Per-outcome counters, reported once the loop has finished.
	$stats = [
		'ignored' => 0,
		'added' => 0,
		'skipped' => 0,
		'overwritten' => 0,
		'failed' => 0,
	];

	$this->output( "Import Images\n\n" );

	$dir = $this->getArg( 0 );

	# Check Protection
	if ( $this->hasOption( 'protect' ) && $this->hasOption( 'unprotect' ) ) {
		$this->error( "Cannot specify both protect and unprotect. Only 1 is allowed.\n", 1 );
	}

	// Reject --protect only when its value is empty. The previous condition was
	// inverted and aborted whenever a protection level WAS supplied.
	if ( $this->hasOption( 'protect' )
		&& trim( (string)$this->getOption( 'protect' ) ) === ''
	) {
		$this->error( "You must specify a protection option.\n", 1 );
	}

	# Prepare the list of allowed extensions
	$extensions = $this->hasOption( 'extensions' )
		? explode( ',', strtolower( $this->getOption( 'extensions' ) ) )
		: $wgFileExtensions;

	# Search the path provided for candidates for import
	$files = $this->findFiles( $dir, $extensions, $this->hasOption( 'search-recursively' ) );

	# Initialise the user for this operation
	$user = $this->hasOption( 'user' )
		? User::newFromName( $this->getOption( 'user' ) )
		: User::newSystemUser( 'Maintenance script', [ 'steal' => true ] );
	if ( !$user instanceof User ) {
		$user = User::newSystemUser( 'Maintenance script', [ 'steal' => true ] );
	}
	$wgUser = $user;

	# Get block check. If a value is given, this specifies how often the check is performed
	$checkUserBlock = (int)$this->getOption( 'check-userblock' );

	$from = $this->getOption( 'from' );
	$sleep = (int)$this->getOption( 'sleep' );
	$limit = (int)$this->getOption( 'limit' );
	$timestamp = $this->getOption( 'timestamp', false );
	$dryRun = $this->hasOption( 'dry' );

	# Get the upload comment. Provide a default one in case there's no comment given.
	$commentFile = $this->getOption( 'comment-file' );
	if ( $commentFile !== null ) {
		$comment = file_get_contents( $commentFile );
		if ( $comment === false || $comment === null ) {
			$this->error( "failed to read comment file: {$commentFile}\n", 1 );
		}
	} else {
		$comment = $this->getOption( 'comment', 'Importing file' );
	}
	$commentExt = $this->getOption( 'comment-ext' );
	$summary = $this->getOption( 'summary', '' );

	$license = $this->getOption( 'license', '' );

	$sourceWikiUrl = $this->getOption( 'source-wiki-url' );

	# Batch "upload" operation
	$count = count( $files );
	if ( $count === 0 ) {
		$this->output( "No suitable files could be found for import.\n" );

		return;
	}

	foreach ( $files as $file ) {
		$base = UtfNormal\Validator::cleanUp( wfBaseName( $file ) );

		# Validate a title
		$title = Title::makeTitleSafe( NS_FILE, $base );
		if ( !is_object( $title ) ) {
			$this->output(
				"{$base} could not be imported; a valid title cannot be produced\n" );
			continue;
		}

		// --from: ignore everything until the named file is reached, then
		// clear the marker so all later files are handled normally.
		if ( $from ) {
			if ( $from == $title->getDBkey() ) {
				$from = null;
			} else {
				$stats['ignored']++;
				continue;
			}
		}

		if ( $checkUserBlock && ( ( $processed % $checkUserBlock ) == 0 ) ) {
			$user->clearInstanceCache( 'name' ); // reload from DB!
			if ( $user->isBlocked() ) {
				$this->output( $user->getName() . " was blocked! Aborting.\n" );
				break;
			}
		}

		# Check existence
		$image = wfLocalFile( $title );
		if ( $image->exists() ) {
			if ( $this->hasOption( 'overwrite' ) ) {
				$this->output( "{$base} exists, overwriting..." );
				$svar = 'overwritten';
			} else {
				$this->output( "{$base} exists, skipping\n" );
				$stats['skipped']++;
				continue;
			}
		} else {
			if ( $this->hasOption( 'skip-dupes' ) ) {
				$repo = $image->getRepo();
				# XXX: we end up calculating this again when actually uploading. that sucks.
				$sha1 = FSFile::getSha1Base36FromPath( $file );

				$dupes = $repo->findBySha1( $sha1 );

				if ( $dupes ) {
					$this->output(
						"{$base} already exists as {$dupes[0]->getName()}, skipping\n" );
					$stats['skipped']++;
					continue;
				}
			}

			$this->output( "Importing {$base}..." );
			$svar = 'added';
		}

		if ( $sourceWikiUrl ) {
			/* find comment text directly from source wiki, through MW's API */
			$real_comment = $this->getFileCommentFromSourceWiki( $sourceWikiUrl, $base );
			$commentText = ( $real_comment === false ) ? $comment : $real_comment;

			/* find user directly from source wiki, through MW's API */
			$real_user = $this->getFileUserFromSourceWiki( $sourceWikiUrl, $base );
			if ( $real_user === false ) {
				$wgUser = $user;
			} else {
				// Resolve into a temporary first so that a failed lookup does not
				// leave the global $wgUser set to false.
				$sourceUser = User::newFromName( $real_user );
				if ( $sourceUser === false ) {
					# user does not exist in target wiki
					// Terminate the progress line and count the file as failed,
					// in line with the other failure paths.
					$this->output(
						"failed: user '$real_user' does not exist in target wiki.\n" );
					$stats['failed']++;
					continue;
				}
				$wgUser = $sourceUser;
			}
		} else {
			# Find comment text
			$commentText = false;

			if ( $commentExt ) {
				$f = $this->findAuxFile( $file, $commentExt );
				if ( !$f ) {
					$this->output( " No comment file with extension {$commentExt} found "
						. "for {$file}, using default comment. " );
				} else {
					$commentText = file_get_contents( $f );
					if ( !$commentText ) {
						$this->output(
							" Failed to load comment file {$f}, using default comment. " );
					}
				}
			}

			if ( !$commentText ) {
				$commentText = $comment;
			}
		}

		# Import the file
		if ( $dryRun ) {
			$this->output(
				" publishing {$file} by '{$wgUser->getName()}', comment '$commentText'... "
			);
		} else {
			$mwProps = new MWFileProps( MimeMagic::singleton() );
			$props = $mwProps->getPropsFromPath( $file, true );
			$flags = 0;
			$publishOptions = [];
			$handler = MediaHandler::getHandler( $props['mime'] );
			if ( $handler ) {
				$publishOptions['headers'] = $handler->getStreamHeaders( $props['metadata'] );
			} else {
				$publishOptions['headers'] = [];
			}
			$archive = $image->publish( $file, $flags, $publishOptions );
			if ( !$archive->isGood() ) {
				$this->output( "failed. (" .
					$archive->getWikiText( false, false, 'en' ) .
					")\n" );
				// A publish failure is counted but does not count towards
				// $processed, so --limit and --sleep are skipped for it.
				$stats['failed']++;
				continue;
			}
		}

		$commentText = SpecialUpload::getInitialPageText( $commentText, $license );
		if ( !$this->hasOption( 'summary' ) ) {
			$summary = $commentText;
		}

		if ( $dryRun ) {
			$this->output( "done.\n" );
		} elseif ( $image->recordUpload2(
			$archive->value,
			$summary,
			$commentText,
			$props,
			$timestamp
		) ) {
			# We're done!
			$this->output( "done.\n" );

			$doProtect = false;

			$protectLevel = $this->getOption( 'protect' );

			if ( $protectLevel && in_array( $protectLevel, $wgRestrictionLevels ) ) {
				$doProtect = true;
			}
			if ( $this->hasOption( 'unprotect' ) ) {
				// An empty level is applied to every restriction type below.
				$protectLevel = '';
				$doProtect = true;
			}

			if ( $doProtect ) {
				# Protect the file
				$this->output( "\nWaiting for replica DBs...\n" );
				// Wait for replica DBs.
				sleep( 2 ); # Why this sleep?
				wfWaitForSlaves();

				$this->output( "\nSetting image restrictions ... " );

				$cascade = false;
				$restrictions = [];
				foreach ( $title->getRestrictionTypes() as $type ) {
					$restrictions[$type] = $protectLevel;
				}

				$page = WikiPage::factory( $title );
				$status = $page->doUpdateRestrictions( $restrictions, [], $cascade, '', $user );
				$this->output( ( $status->isOK() ? 'done' : 'failed' ) . "\n" );
			}
		} else {
			$this->output( "failed. (at recordUpload stage)\n" );
			$svar = 'failed';
		}

		// $svar is one of 'added', 'overwritten' or 'failed' at this point.
		$stats[$svar]++;
		$processed++;

		if ( $limit && $processed >= $limit ) {
			break;
		}

		if ( $sleep ) {
			sleep( $sleep );
		}
	}

	# Print out some statistics
	$this->output( "\n" );
	$report = [
		'Found' => $count,
		'Limit' => $limit,
		'Ignored' => $stats['ignored'],
		'Added' => $stats['added'],
		'Skipped' => $stats['skipped'],
		'Overwritten' => $stats['overwritten'],
		'Failed' => $stats['failed'],
	];
	foreach ( $report as $desc => $value ) {
		// Only non-zero figures are reported.
		if ( $value > 0 ) {
			$this->output( "{$desc}: {$value}\n" );
		}
	}
}
405
/**
 * Search a directory for files with one of a set of extensions
 *
 * @param string $dir Path to directory to search
 * @param array $exts Array of (lower-case) extensions to search for
 * @param bool $recurse Search subdirectories recursively
 * @return array Array of matching file paths; empty when $dir is not a
 *  directory or cannot be opened
 */
private function findFiles( $dir, $exts, $recurse = false ) {
	if ( !is_dir( $dir ) ) {
		return [];
	}

	$dhl = opendir( $dir );
	if ( !$dhl ) {
		return [];
	}

	$files = [];
	while ( ( $file = readdir( $dhl ) ) !== false ) {
		$path = $dir . '/' . $file;
		if ( is_file( $path ) ) {
			list( /* $name */, $ext ) = $this->splitFilename( $path );
			if ( in_array( strtolower( $ext ), $exts, true ) ) {
				$files[] = $path;
			}
		} elseif ( $recurse && $file !== '.' && $file !== '..' && is_dir( $path ) ) {
			// Descend, skipping the self/parent entries to avoid infinite recursion.
			$files = array_merge( $files, $this->findFiles( $path, $exts, true ) );
		}
	}
	// Release the directory handle; it was previously left open, which leaked
	// one handle per directory visited.
	closedir( $dhl );

	return $files;
}
438
/**
 * Split a filename at its last dot.
 *
 * When the name contains no dot at all, the whole string is returned as the
 * extension and the name part is empty.
 *
 * @param string $filename Filename
 * @return array Two elements: [ name without extension, extension ]
 */
private function splitFilename( $filename ) {
	$segments = explode( '.', $filename );
	// The last dot-separated segment is the extension; the rest is the name.
	$extension = array_pop( $segments );

	return [ implode( '.', $segments ), $extension ];
}
453
/**
 * Find an auxiliary file with the given extension, matching the given base
 * file path. $maxStrip determines how many extensions may be stripped from
 * the original file name before appending the new extension. For example,
 * with $maxStrip = 1 (the default), the files acme.foo.bar.txt and
 * acme.foo.txt would be auxiliary files for acme.foo.bar and the extension
 * ".txt". With $maxStrip = 2, acme.txt would also be acceptable.
 *
 * @param string $file Base path
 * @param string $auxExtension The extension to be appended to the base path
 *  (a leading dot is added when missing)
 * @param int $maxStrip The maximum number of extensions to strip from the base path (default: 1)
 * @return string|bool Path of the first existing candidate, or false if none exists
 */
private function findAuxFile( $file, $auxExtension, $maxStrip = 1 ) {
	$suffix = ( strpos( $auxExtension, '.' ) === 0 )
		? $auxExtension
		: '.' . $auxExtension;

	$directory = dirname( $file );
	$stem = basename( $file );

	for ( $stripsLeft = $maxStrip; $stripsLeft >= 0; $stripsLeft-- ) {
		$candidate = $directory . '/' . $stem . $suffix;
		if ( file_exists( $candidate ) ) {
			return $candidate;
		}

		// Stop once there is no dot left to strip, or the only dot is the
		// leading one.
		$dotPos = strrpos( $stem, '.' );
		if ( $dotPos === false || $dotPos === 0 ) {
			break;
		}

		$stem = substr( $stem, 0, $dotPos );
	}

	return false;
}
494
# @todo FIXME: Access the api in a saner way and performing just one query
# (preferably batching files too).
/**
 * Fetch the upload comment of a file from the source wiki.
 *
 * Requests the imageinfo "comment" property for File:<name> from the wiki's
 * api.php in XML format and extracts the attribute with a regular expression.
 *
 * @param string $wiki_host Base URL of the source wiki; '/api.php' is appended to it
 * @param string $file File name, without the "File:" prefix
 * @return string|bool The entity-decoded comment, or false when the response
 *  contains no matching <ii comment="..." /> element
 */
private function getFileCommentFromSourceWiki( $wiki_host, $file ) {
	$encodedName = rawurlencode( $file );
	$requestUrl = $wiki_host . '/api.php?action=query&format=xml&titles=File:'
		. $encodedName . '&prop=imageinfo&&iiprop=comment';

	$response = Http::get( $requestUrl, [], __METHOD__ );

	$found = [];
	if ( !preg_match( '#<ii comment="([^"]*)" />#', $response, $found ) ) {
		return false;
	}

	return html_entity_decode( $found[1] );
}
507
/**
 * Fetch the name of the user who uploaded a file on the source wiki.
 *
 * Requests the imageinfo "user" property for File:<name> from the wiki's
 * api.php in XML format and extracts the attribute with a regular expression.
 *
 * @param string $wiki_host Base URL of the source wiki; '/api.php' is appended to it
 * @param string $file File name, without the "File:" prefix
 * @return string|bool The entity-decoded user name, or false when the response
 *  contains no matching <ii user="..." /> element
 */
private function getFileUserFromSourceWiki( $wiki_host, $file ) {
	$encodedName = rawurlencode( $file );
	$requestUrl = $wiki_host . '/api.php?action=query&format=xml&titles=File:'
		. $encodedName . '&prop=imageinfo&&iiprop=user';

	$response = Http::get( $requestUrl, [], __METHOD__ );

	$found = [];
	if ( !preg_match( '#<ii user="([^"]*)" />#', $response, $found ) ) {
		return false;
	}

	return html_entity_decode( $found[1] );
}
518
519 }
520
// Name the maintenance class for the runner, then include the runner file
// referenced by RUN_MAINTENANCE_IF_MAIN.
$maintClass = ImportImages::class;
require_once RUN_MAINTENANCE_IF_MAIN;