7f8e16a6b13c95c284f0b8e501a128eba46a02d5
[lhc/web/wiklou.git] / maintenance / importImages.php
1 <?php
2 /**
3 * Import one or more images from the local file system into the wiki without
4 * using the web-based interface.
5 *
6 * "Smart import" additions:
7 * - aim: preserve the essential metadata (user, description) when importing media
8 * files from an existing wiki.
9 * - process:
10 * - interface with the source wiki, don't use bare files only (see --source-wiki-url).
11 * - fetch metadata from source wiki for each file to import.
12 * - commit the fetched metadata to the destination wiki while submitting.
13 *
14 * This program is free software; you can redistribute it and/or modify
15 * it under the terms of the GNU General Public License as published by
16 * the Free Software Foundation; either version 2 of the License, or
17 * (at your option) any later version.
18 *
19 * This program is distributed in the hope that it will be useful,
20 * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 * GNU General Public License for more details.
23 *
24 * You should have received a copy of the GNU General Public License along
25 * with this program; if not, write to the Free Software Foundation, Inc.,
26 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
27 * http://www.gnu.org/copyleft/gpl.html
28 *
29 * @file
30 * @ingroup Maintenance
31 * @author Rob Church <robchur@gmail.com>
32 * @author Mij <mij@bitchx.it>
33 */
34
35 use MediaWiki\MediaWikiServices;
36
37 require_once __DIR__ . '/Maintenance.php';
38
39 class ImportImages extends Maintenance {
40
/**
 * Declare the script description, its single positional argument and all
 * command line options understood by execute().
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( 'Imports images and other media files into the wiki' );
	$this->addArg( 'dir', 'Path to the directory containing images to be imported' );

	$this->addOption( 'extensions',
		'Comma-separated list of allowable extensions, defaults to $wgFileExtensions',
		false,
		true
	);
	$this->addOption( 'overwrite',
		'Overwrite existing images with the same name (default is to skip them)' );
	$this->addOption( 'limit',
		'Limit the number of images to process. Ignored or skipped images are not counted',
		false,
		true
	);
	$this->addOption( 'from',
		"Ignore all files until the one with the given name. Useful for resuming aborted "
		. "imports. The name should be the file's canonical database form.",
		false,
		true
	);
	$this->addOption( 'skip-dupes',
		'Skip images that were already uploaded under a different name (check SHA1)' );
	$this->addOption( 'search-recursively', 'Search recursively for files in subdirectories' );
	$this->addOption( 'sleep',
		'Sleep between files. Useful mostly for debugging',
		false,
		true
	);
	$this->addOption( 'user',
		"Set username of uploader, default 'Maintenance script'",
		false,
		true
	);
	// This parameter can optionally have an argument. If none specified, getOption()
	// returns 1 which is precisely what we need.
	$this->addOption( 'check-userblock', 'Check if the user got blocked during import' );
	$this->addOption( 'comment',
		"Set file description, default 'Importing file'",
		false,
		true
	);
	$this->addOption( 'comment-file',
		'Set description to the content of this file',
		false,
		true
	);
	$this->addOption( 'comment-ext',
		'Causes the description for each file to be loaded from a file with the same name, but '
		. 'the extension provided. If a global description is also given, it is appended.',
		false,
		true
	);
	$this->addOption( 'summary',
		'Upload summary, description will be used if not provided',
		false,
		true
	);
	$this->addOption( 'license',
		'Use an optional license template',
		false,
		true
	);
	$this->addOption( 'timestamp',
		'Override upload time/date, all MediaWiki timestamp formats are accepted',
		false,
		true
	);
	$this->addOption( 'protect',
		'Specify the protect value (autoconfirmed,sysop)',
		false,
		true
	);
	$this->addOption( 'unprotect', 'Unprotects all uploaded images' );
	// The example URL is quoted on both sides; the closing quote used to be missing.
	$this->addOption( 'source-wiki-url',
		'If specified, take User and Comment data for each imported file from this URL. '
		. 'For example, --source-wiki-url="http://en.wikipedia.org/"',
		false,
		true
	);
	$this->addOption( 'dry', "Dry run, don't import anything" );
}
126
/**
 * Run the import.
 *
 * Collects candidate files below the directory given as first argument and,
 * for each of them: resolves a File: title, optionally skips existing files or
 * SHA1 duplicates, works out the description text and uploader, publishes the
 * file to the local repo, records the upload and optionally (un)protects the
 * page. With --dry nothing is published or recorded. A summary of the tallies
 * is printed at the end.
 */
public function execute() {
	global $wgFileExtensions, $wgUser, $wgRestrictionLevels;

	$processed = 0;
	# Outcome tallies reported at the end of the run
	$stats = [
		'ignored' => 0,
		'added' => 0,
		'skipped' => 0,
		'overwritten' => 0,
		'failed' => 0,
	];

	$this->output( "Importing Files\n\n" );

	$dir = $this->getArg( 0 );

	# Check Protection
	if ( $this->hasOption( 'protect' ) && $this->hasOption( 'unprotect' ) ) {
		$this->fatalError( "Cannot specify both protect and unprotect. Only 1 is allowed.\n" );
	}

	# Abort only when --protect was passed with an empty value. (The test used to be
	# inverted and rejected every non-empty protection level.)
	if ( $this->hasOption( 'protect' ) && trim( (string)$this->getOption( 'protect' ) ) === '' ) {
		$this->fatalError( "You must specify a protection option.\n" );
	}

	# Prepare the list of allowed extensions
	$extensions = $this->hasOption( 'extensions' )
		? explode( ',', strtolower( $this->getOption( 'extensions' ) ) )
		: $wgFileExtensions;

	# Search the path provided for candidates for import
	$files = $this->findFiles( $dir, $extensions, $this->hasOption( 'search-recursively' ) );

	# Initialise the user for this operation
	$user = $this->hasOption( 'user' )
		? User::newFromName( $this->getOption( 'user' ) )
		: User::newSystemUser( 'Maintenance script', [ 'steal' => true ] );
	if ( !$user instanceof User ) {
		$user = User::newSystemUser( 'Maintenance script', [ 'steal' => true ] );
	}
	$wgUser = $user;

	# Get block check. If a value is given, this specified how often the check is performed
	$checkUserBlock = (int)$this->getOption( 'check-userblock' );

	$from = $this->getOption( 'from' );
	$sleep = (int)$this->getOption( 'sleep' );
	$limit = (int)$this->getOption( 'limit' );
	$timestamp = $this->getOption( 'timestamp', false );

	# Get the upload comment. Provide a default one in case there's no comment given.
	$commentFile = $this->getOption( 'comment-file' );
	if ( $commentFile !== null ) {
		$comment = file_get_contents( $commentFile );
		if ( $comment === false ) {
			$this->fatalError( "failed to read comment file: {$commentFile}\n" );
		}
	} else {
		$comment = $this->getOption( 'comment', 'Importing file' );
	}
	$commentExt = $this->getOption( 'comment-ext' );
	$summary = $this->getOption( 'summary', '' );

	$license = $this->getOption( 'license', '' );

	$sourceWikiUrl = $this->getOption( 'source-wiki-url' );

	$isDryRun = $this->hasOption( 'dry' );

	# Batch "upload" operation
	$count = count( $files );
	if ( $count === 0 ) {
		$this->output( "No suitable files could be found for import.\n" );
		return;
	}

	foreach ( $files as $file ) {
		if ( $sleep && ( $processed > 0 ) ) {
			sleep( $sleep );
		}

		$base = UtfNormal\Validator::cleanUp( wfBaseName( $file ) );

		# Validate a title
		$title = Title::makeTitleSafe( NS_FILE, $base );
		if ( !is_object( $title ) ) {
			$this->output(
				"{$base} could not be imported; a valid title cannot be produced\n" );
			continue;
		}

		# Honour --from: ignore everything until the named file is reached
		if ( $from ) {
			if ( (string)$from === $title->getDBkey() ) {
				$from = null;
			} else {
				$stats['ignored']++;
				continue;
			}
		}

		if ( $checkUserBlock && ( ( $processed % $checkUserBlock ) == 0 ) ) {
			$user->clearInstanceCache( 'name' ); // reload from DB!
			// @TODO Use PermissionManager::isBlockedFrom() instead.
			if ( $user->getBlock() ) {
				$this->output( $user->getName() . " was blocked! Aborting.\n" );
				break;
			}
		}

		# Check existence
		$image = MediaWikiServices::getInstance()->getRepoGroup()->getLocalRepo()
			->newFile( $title );
		if ( $image->exists() ) {
			if ( $this->hasOption( 'overwrite' ) ) {
				$this->output( "{$base} exists, overwriting..." );
				$outcome = 'overwritten';
			} else {
				$this->output( "{$base} exists, skipping\n" );
				$stats['skipped']++;
				continue;
			}
		} else {
			if ( $this->hasOption( 'skip-dupes' ) ) {
				$repo = $image->getRepo();
				# XXX: we end up calculating this again when actually uploading. that sucks.
				$sha1 = FSFile::getSha1Base36FromPath( $file );

				$dupes = $repo->findBySha1( $sha1 );

				if ( $dupes ) {
					$this->output(
						"{$base} already exists as {$dupes[0]->getName()}, skipping\n" );
					$stats['skipped']++;
					continue;
				}
			}

			$this->output( "Importing {$base}..." );
			$outcome = 'added';
		}

		if ( $sourceWikiUrl ) {
			/* find comment text directly from source wiki, through MW's API */
			$real_comment = $this->getFileCommentFromSourceWiki( $sourceWikiUrl, $base );
			$commentText = ( $real_comment === false ) ? $comment : $real_comment;

			/* find user directly from source wiki, through MW's API */
			$real_user = $this->getFileUserFromSourceWiki( $sourceWikiUrl, $base );
			if ( $real_user === false ) {
				$wgUser = $user;
			} else {
				# Resolve into a local first so the global is never set to false
				$sourceUser = User::newFromName( $real_user );
				if ( $sourceUser === false ) {
					# user name from the source wiki cannot be used in the target wiki
					$this->output(
						"failed: user '$real_user' does not exist in target wiki.\n" );
					$stats['failed']++;
					continue;
				}
				$wgUser = $sourceUser;
			}
		} else {
			# Find comment text
			$commentText = false;

			if ( $commentExt ) {
				$f = $this->findAuxFile( $file, $commentExt );
				if ( !$f ) {
					$this->output( " No comment file with extension {$commentExt} found "
						. "for {$file}, using default comment. " );
				} else {
					$commentText = file_get_contents( $f );
					if ( !$commentText ) {
						$this->output(
							" Failed to load comment file {$f}, using default comment. " );
					}
				}
			}

			if ( !$commentText ) {
				$commentText = $comment;
			}
		}

		# Import the file
		$props = null;
		$archive = null;
		if ( $isDryRun ) {
			$this->output(
				" publishing {$file} by '{$wgUser->getName()}', comment '$commentText'... "
			);
		} else {
			$mwProps = new MWFileProps( MediaWikiServices::getInstance()->getMimeAnalyzer() );
			$props = $mwProps->getPropsFromPath( $file, true );
			$flags = 0;
			$publishOptions = [];
			$handler = MediaHandler::getHandler( $props['mime'] );
			if ( $handler ) {
				$metadata = \Wikimedia\AtEase\AtEase::quietCall( 'unserialize', $props['metadata'] );

				$publishOptions['headers'] = $handler->getContentHeaders( $metadata );
			} else {
				$publishOptions['headers'] = [];
			}
			$archive = $image->publish( $file, $flags, $publishOptions );
			if ( !$archive->isGood() ) {
				$this->output( "failed. (" .
					$archive->getWikiText( false, false, 'en' ) .
					")\n" );
				$stats['failed']++;
				continue;
			}
		}

		$commentText = SpecialUpload::getInitialPageText( $commentText, $license );
		if ( !$this->hasOption( 'summary' ) ) {
			$summary = $commentText;
		}

		if ( $isDryRun ) {
			$this->output( "done.\n" );
			// @phan-suppress-next-line PhanUndeclaredMethod
		} elseif ( $image->recordUpload2(
			$archive->value,
			$summary,
			$commentText,
			$props,
			$timestamp
		)->isOK() ) {
			$this->output( "done.\n" );

			$doProtect = false;

			$protectLevel = $this->getOption( 'protect' );

			if ( $protectLevel && in_array( $protectLevel, $wgRestrictionLevels, true ) ) {
				$doProtect = true;
			}
			if ( $this->hasOption( 'unprotect' ) ) {
				$protectLevel = '';
				$doProtect = true;
			}

			if ( $doProtect ) {
				# Protect the file
				$this->output( "\nWaiting for replica DBs...\n" );
				// Wait for replica DBs.
				sleep( 2 ); # Why this sleep?
				wfWaitForSlaves();

				$this->output( "\nSetting image restrictions ... " );

				$cascade = false;
				$restrictions = [];
				foreach ( $title->getRestrictionTypes() as $type ) {
					$restrictions[$type] = $protectLevel;
				}

				$page = WikiPage::factory( $title );
				$status = $page->doUpdateRestrictions( $restrictions, [], $cascade, '', $user );
				$this->output( ( $status->isOK() ? 'done' : 'failed' ) . "\n" );
			}
		} else {
			$this->output( "failed. (at recordUpload stage)\n" );
			$outcome = 'failed';
		}

		$stats[$outcome]++;
		$processed++;

		if ( $limit && $processed >= $limit ) {
			break;
		}
	}

	# Print out some statistics; zero values are omitted
	$this->output( "\n" );
	$report = [
		'Found' => $count,
		'Limit' => $limit,
		'Ignored' => $stats['ignored'],
		'Added' => $stats['added'],
		'Skipped' => $stats['skipped'],
		'Overwritten' => $stats['overwritten'],
		'Failed' => $stats['failed'],
	];
	foreach ( $report as $desc => $value ) {
		if ( $value > 0 ) {
			$this->output( "{$desc}: {$value}\n" );
		}
	}
}
410
/**
 * Search a directory for files with one of a set of extensions.
 *
 * The extension of each file is lower-cased before it is compared, so the
 * entries of $exts have to be lower-case to match.
 *
 * @param string $dir Path to directory to search
 * @param array $exts Array of extensions to search for
 * @param bool $recurse Search subdirectories recursively
 * @return array Array of file paths; empty if $dir is not a directory or
 *  cannot be opened
 */
private function findFiles( $dir, $exts, $recurse = false ) {
	if ( !is_dir( $dir ) ) {
		return [];
	}

	$dhl = opendir( $dir );
	if ( !$dhl ) {
		return [];
	}

	$files = [];
	while ( ( $file = readdir( $dhl ) ) !== false ) {
		$path = $dir . '/' . $file;
		if ( is_file( $path ) ) {
			$ext = pathinfo( $file, PATHINFO_EXTENSION );
			if ( in_array( strtolower( $ext ), $exts, true ) ) {
				$files[] = $path;
			}
		} elseif ( $recurse && $file !== '.' && $file !== '..' && is_dir( $path ) ) {
			$files = array_merge( $files, $this->findFiles( $path, $exts, true ) );
		}
	}
	// Release the handle; in recursive mode one is opened per directory.
	closedir( $dhl );

	return $files;
}
443
/**
 * Locate an auxiliary file that belongs to a base file.
 *
 * The candidate name is the base file name with $auxExtension appended. If no
 * such file exists, the last extension of the base name is removed and the
 * lookup is repeated, at most $maxStrip times. For example, with
 * $maxStrip = 1 (the default), the files acme.foo.bar.txt and acme.foo.txt
 * are both accepted as auxiliary files for acme.foo.bar and the extension
 * ".txt". With $maxStrip = 2, acme.txt would be accepted as well.
 *
 * @param string $file Base path
 * @param string $auxExtension The extension to be appended to the base path;
 *  a leading dot is added when it is missing
 * @param int $maxStrip The maximum number of extensions to strip from the base path (default: 1)
 * @return string|bool Path of the first existing candidate, or false if none exists
 */
private function findAuxFile( $file, $auxExtension, $maxStrip = 1 ) {
	$suffix = ( strpos( $auxExtension, '.' ) === 0 )
		? $auxExtension
		: '.' . $auxExtension;

	$directory = dirname( $file );
	$stem = basename( $file );

	for ( $remaining = $maxStrip; $remaining >= 0; $remaining-- ) {
		$candidate = $directory . '/' . $stem . $suffix;
		if ( file_exists( $candidate ) ) {
			return $candidate;
		}

		// Stop when there is no dot left, or only a leading one (dot-file).
		$dotAt = strrpos( $stem, '.' );
		if ( $dotAt === false || $dotAt === 0 ) {
			return false;
		}

		$stem = substr( $stem, 0, $dotAt );
	}

	return false;
}
484
# @todo FIXME: Access the api in a saner way and performing just one query
# (preferably batching files too).

/**
 * Fetch the upload comment of a file from the source wiki's API.
 *
 * @param string $wiki_host Base URL of the source wiki; '/api.php' is appended to it
 * @param string $file File name without the "File:" prefix
 * @return string|bool The comment, or false if it could not be determined
 */
private function getFileCommentFromSourceWiki( $wiki_host, $file ) {
	return $this->fetchImageInfoAttribute( $wiki_host, $file, 'comment', __METHOD__ );
}

/**
 * Fetch the name of the uploader of a file from the source wiki's API.
 *
 * @param string $wiki_host Base URL of the source wiki; '/api.php' is appended to it
 * @param string $file File name without the "File:" prefix
 * @return string|bool The user name, or false if it could not be determined
 */
private function getFileUserFromSourceWiki( $wiki_host, $file ) {
	return $this->fetchImageInfoAttribute( $wiki_host, $file, 'user', __METHOD__ );
}

/**
 * Query a single imageinfo property of a file and pull its value out of the
 * XML response.
 *
 * @param string $wiki_host Base URL of the source wiki; '/api.php' is appended to it
 * @param string $file File name without the "File:" prefix
 * @param string $attribute Name of the iiprop to request, which is also the
 *  name of the attribute looked for on the <ii> element
 * @param string $caller Caller name handed to Http::get()
 * @return string|bool Entity-decoded attribute value, or false when the
 *  request failed or the response holds no matching <ii> element
 */
private function fetchImageInfoAttribute( $wiki_host, $file, $attribute, $caller ) {
	$url = $wiki_host . '/api.php?action=query&format=xml&titles=File:'
		. rawurlencode( $file ) . '&prop=imageinfo&iiprop=' . $attribute;

	$body = Http::get( $url, [], $caller );
	if ( !is_string( $body ) ) {
		// Request failed; there is nothing to parse.
		return false;
	}

	$pattern = '#<ii ' . preg_quote( $attribute, '#' ) . '="([^"]*)" />#';
	if ( !preg_match( $pattern, $body, $matches ) ) {
		return false;
	}

	// ENT_QUOTES so that &#039; is decoded on every PHP version; before PHP 8.1
	// the default flags left single-quote entities untouched.
	return html_entity_decode( $matches[1], ENT_QUOTES );
}
508
509 }
510
// Name the class defined above in $maintClass, then include the file that the
// RUN_MAINTENANCE_IF_MAIN constant points at.
// NOTE(review): the constant is presumably defined by Maintenance.php, and the
// included file presumably runs $maintClass when this script is the entry point — confirm.
$maintClass = ImportImages::class;
require_once RUN_MAINTENANCE_IF_MAIN;