Merge "Drop index oi_name_archive_name on table oldimage"
[lhc/web/wiklou.git] / includes / export / WikiExporter.php
1 <?php
2 /**
3 * Base class for exporting
4 *
5 * Copyright © 2003, 2005, 2006 Brion Vibber <brion@pobox.com>
6 * https://www.mediawiki.org/
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License along
19 * with this program; if not, write to the Free Software Foundation, Inc.,
20 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
21 * http://www.gnu.org/copyleft/gpl.html
22 *
23 * @file
24 */
25
26 /**
27 * @defgroup Dump Dump
28 */
29
30 use Wikimedia\Rdbms\ResultWrapper;
31
32 /**
33 * @ingroup SpecialPage Dump
34 */
class WikiExporter {
	/** @var bool Return distinct author list (when not returning full history) */
	public $list_authors = false;

	/** @var bool When true, the writer's upload records are emitted before each page is closed */
	public $dumpUploads = false;

	/** @var bool Passed through to the writer together with each upload record */
	public $dumpUploadFileContents = false;

	/** @var string XML fragment built by do_list_authors(), appended to the last page written */
	public $author_list = "";

	const FULL = 1;
	const CURRENT = 2;
	const STABLE = 4; // extension defined
	const LOGS = 8;
	const RANGE = 16;

	const BUFFER = 0;
	const STREAM = 1;

	const TEXT = 0;
	const STUB = 1;

	/** @var int One of WikiExporter::BUFFER or WikiExporter::STREAM */
	public $buffer;

	/** @var int One of WikiExporter::TEXT or WikiExporter::STUB */
	public $text;

	/** @var DumpOutput */
	public $sink;

	/**
	 * Declared explicitly (it used to be created dynamically by the constructor);
	 * kept public so existing external access keeps working.
	 * @var IDatabase Connection all dump queries are run against
	 */
	public $db;

	/**
	 * Declared explicitly (it used to be created dynamically by the constructor).
	 * @var int|array History mode flags, or an offset/limit/dir array
	 */
	public $history;

	/**
	 * Declared explicitly (it used to be created dynamically by the constructor).
	 * @var XmlDumpWriter Produces the XML fragments handed to the sink
	 */
	public $writer;

	/**
	 * Returns the export schema version.
	 * @return string
	 */
	public static function schemaVersion() {
		return "0.10";
	}

	/**
	 * If using WikiExporter::STREAM to stream a large amount of data,
	 * provide a database connection which is not managed by
	 * LoadBalancer to read from: some history blob types will
	 * make additional queries to pull source data while the
	 * main query is still running.
	 *
	 * @param IDatabase $db
	 * @param int|array $history One of WikiExporter::FULL, WikiExporter::CURRENT,
	 *   WikiExporter::RANGE or WikiExporter::STABLE, or an associative array:
	 *   - offset: non-inclusive offset at which to start the query
	 *   - limit: maximum number of rows to return
	 *   - dir: "asc" or "desc" timestamp order
	 * @param int $buffer One of WikiExporter::BUFFER or WikiExporter::STREAM
	 * @param int $text One of WikiExporter::TEXT or WikiExporter::STUB
	 */
	public function __construct( $db, $history = self::CURRENT,
		$buffer = self::BUFFER, $text = self::TEXT
	) {
		$this->db = $db;
		$this->history = $history;
		$this->buffer = $buffer;
		$this->writer = new XmlDumpWriter();
		$this->sink = new DumpOutput();
		$this->text = $text;
	}

	/**
	 * Set the DumpOutput or DumpFilter object which will receive
	 * various row objects and XML output for filtering. Filters
	 * can be chained or used as callbacks.
	 *
	 * @param DumpOutput &$sink
	 */
	public function setOutputSink( &$sink ) {
		$this->sink =& $sink;
	}

	/**
	 * Passes the writer's stream-opening output to the sink.
	 */
	public function openStream() {
		$output = $this->writer->openStream();
		$this->sink->writeOpenStream( $output );
	}

	/**
	 * Passes the writer's stream-closing output to the sink.
	 */
	public function closeStream() {
		$output = $this->writer->closeStream();
		$this->sink->writeCloseStream( $output );
	}

	/**
	 * Dumps a series of page and revision records for all pages
	 * in the database, either including complete history or only
	 * the most recent version.
	 */
	public function allPages() {
		$this->dumpFrom( '' );
	}

	/**
	 * Dumps a series of page and revision records for those pages
	 * in the database falling within the page_id range given.
	 * @param int $start Inclusive lower limit (this id is included)
	 * @param int $end Exclusive upper limit (this id is not included)
	 *   If 0, no upper limit.
	 * @param bool $orderRevs order revisions within pages in ascending order
	 */
	public function pagesByRange( $start, $end, $orderRevs ) {
		// The range is expressed on rev_page when revisions are ordered and on
		// page_id otherwise; both name the same page id range.
		$field = $orderRevs ? 'rev_page' : 'page_id';
		$condition = $field . ' >= ' . intval( $start );
		if ( $end ) {
			$condition .= ' AND ' . $field . ' < ' . intval( $end );
		}
		$this->dumpFrom( $condition, $orderRevs );
	}

	/**
	 * Dumps a series of page and revision records for those pages
	 * in the database with revisions falling within the rev_id range given.
	 * @param int $start Inclusive lower limit (this id is included)
	 * @param int $end Exclusive upper limit (this id is not included)
	 *   If 0, no upper limit.
	 */
	public function revsByRange( $start, $end ) {
		$condition = 'rev_id >= ' . intval( $start );
		if ( $end ) {
			$condition .= ' AND rev_id < ' . intval( $end );
		}
		$this->dumpFrom( $condition );
	}

	/**
	 * Dumps the page identified by the given title.
	 * @param Title $title
	 */
	public function pageByTitle( $title ) {
		$this->dumpFrom(
			'page_namespace=' . $title->getNamespace() .
			' AND page_title=' . $this->db->addQuotes( $title->getDBkey() ) );
	}

	/**
	 * Dumps the page with the given name.
	 * @param string $name
	 * @throws MWException If the name cannot be turned into a Title
	 */
	public function pageByName( $name ) {
		$title = Title::newFromText( $name );
		if ( $title === null ) {
			throw new MWException( "Can't export invalid title" );
		}
		$this->pageByTitle( $title );
	}

	/**
	 * Dumps each of the named pages in turn.
	 * @param array $names
	 */
	public function pagesByName( $names ) {
		foreach ( $names as $name ) {
			$this->pageByName( $name );
		}
	}

	/**
	 * Runs an unconditioned dump. Log entries are only produced when the
	 * history mode includes WikiExporter::LOGS (see dumpFrom()).
	 */
	public function allLogs() {
		$this->dumpFrom( '' );
	}

	/**
	 * Runs a dump restricted to the given log_id range.
	 * @param int $start Inclusive lower limit
	 * @param int $end Exclusive upper limit. If 0, no upper limit.
	 */
	public function logsByRange( $start, $end ) {
		$condition = 'log_id >= ' . intval( $start );
		if ( $end ) {
			$condition .= ' AND log_id < ' . intval( $end );
		}
		$this->dumpFrom( $condition );
	}

	/**
	 * Generates the distinct list of authors of an article
	 * Not called by default (depends on $this->list_authors)
	 * Can be set by Special:Export when not exporting whole history
	 *
	 * The result is stored in $this->author_list.
	 *
	 * @param string $cond SQL condition selecting the page(s)
	 */
	protected function do_list_authors( $cond ) {
		$this->author_list = "<contributors>";
		// rev_deleted

		$res = $this->db->select(
			[ 'page', 'revision' ],
			[ 'DISTINCT rev_user_text', 'rev_user' ],
			[
				$this->db->bitAnd( 'rev_deleted', Revision::DELETED_USER ) . ' = 0',
				$cond,
				// Revisions belong to a page through rev_page; rev_id is the
				// revision's own id and must not be compared to page_id.
				'page_id = rev_page',
			],
			__METHOD__
		);

		foreach ( $res as $row ) {
			// htmlspecialchars() rather than htmlentities(): the latter emits HTML
			// named entities (e.g. &eacute;) that are not defined in XML.
			$this->author_list .= "<contributor>" .
				"<username>" .
				htmlspecialchars( $row->rev_user_text ) .
				"</username>" .
				"<id>" .
				(int)$row->rev_user .
				"</id>" .
				"</contributor>";
		}
		$this->author_list .= "</contributors>";
	}

	/**
	 * Runs the dump query and streams its rows to the sink.
	 *
	 * Dumps log entries when the history mode is an integer containing the
	 * WikiExporter::LOGS flag, and page/revision records otherwise.
	 *
	 * @param string $cond Extra SQL condition; empty for no restriction
	 * @param bool $orderRevs Order revisions within pages in ascending order
	 *   (only affects full-history stub dumps)
	 * @throws MWException
	 * @throws Exception
	 */
	protected function dumpFrom( $cond = '', $orderRevs = false ) {
		// The array form of $history never selects logs; checking is_array()
		// first also avoids "array & int", which is a TypeError on PHP 8.
		if ( !is_array( $this->history ) && ( $this->history & self::LOGS ) ) {
			$this->dumpLogs( $cond );
		} else {
			$this->dumpPages( $cond, $orderRevs );
		}
	}

	/**
	 * Queries the logging table (joined to user) and streams the rows out.
	 *
	 * @param string $cond Extra SQL condition; empty for no restriction
	 * @throws Exception
	 */
	private function dumpLogs( $cond ) {
		$where = [ 'user_id = log_user' ];
		# Hide private logs
		$hideLogs = LogEventsList::getExcludeClause( $this->db );
		if ( $hideLogs ) {
			$where[] = $hideLogs;
		}
		# Add on any caller specified conditions
		if ( $cond ) {
			$where[] = $cond;
		}
		# Get logging table name for logging.* clause
		$logging = $this->db->tableName( 'logging' );

		$prev = null;
		if ( $this->buffer == self::STREAM ) {
			$prev = $this->db->bufferResults( false );
		}
		$result = null; // Assuring $result is not undefined, if exception occurs early
		try {
			$result = $this->db->select( [ 'logging', 'user' ],
				[ "{$logging}.*", 'user_name' ], // grab the user name
				$where,
				__METHOD__,
				[ 'ORDER BY' => 'log_id', 'USE INDEX' => [ 'logging' => 'PRIMARY' ] ]
			);
			$this->outputLogStream( $result );
			if ( $this->buffer == self::STREAM ) {
				$this->db->bufferResults( $prev );
			}
		} catch ( Exception $e ) {
			$this->cleanUpFailedDump( $result, $prev );

			// Inform caller about problem
			throw $e;
		}
	}

	/**
	 * Builds the page/revision query for the configured history mode,
	 * runs it and streams the rows out.
	 *
	 * @param string $cond Extra SQL condition; empty for no restriction
	 * @param bool $orderRevs Order revisions within pages in ascending order
	 * @throws MWException If the history mode is not recognised
	 * @throws Exception
	 */
	private function dumpPages( $cond, $orderRevs ) {
		$tables = [ 'page', 'revision' ];
		$opts = [ 'ORDER BY' => 'page_id ASC' ];
		$opts['USE INDEX'] = [];
		$join = [];
		if ( is_array( $this->history ) ) {
			# Time offset/limit for all pages/history...
			$revJoin = 'page_id=rev_page';
			# Set time order
			if ( $this->history['dir'] == 'asc' ) {
				$op = '>';
				$opts['ORDER BY'] = 'rev_timestamp ASC';
			} else {
				$op = '<';
				$opts['ORDER BY'] = 'rev_timestamp DESC';
			}
			# Set offset
			if ( !empty( $this->history['offset'] ) ) {
				$revJoin .= " AND rev_timestamp $op " .
					$this->db->addQuotes( $this->db->timestamp( $this->history['offset'] ) );
			}
			$join['revision'] = [ 'INNER JOIN', $revJoin ];
			# Set query limit
			if ( !empty( $this->history['limit'] ) ) {
				$opts['LIMIT'] = intval( $this->history['limit'] );
			}
		} elseif ( $this->history & self::FULL ) {
			# Full history dumps...
			# query optimization for history stub dumps
			if ( $this->text == self::STUB && $orderRevs ) {
				$tables = [ 'revision', 'page' ];
				$opts[] = 'STRAIGHT_JOIN';
				$opts['ORDER BY'] = [ 'rev_page ASC', 'rev_id ASC' ];
				$opts['USE INDEX']['revision'] = 'rev_page_id';
				$join['page'] = [ 'INNER JOIN', 'rev_page=page_id' ];
			} else {
				$join['revision'] = [ 'INNER JOIN', 'page_id=rev_page' ];
			}
		} elseif ( $this->history & self::CURRENT ) {
			# Latest revision dumps...
			if ( $this->list_authors && $cond != '' ) { // List authors, if so desired
				$this->do_list_authors( $cond );
			}
			$join['revision'] = [ 'INNER JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
		} elseif ( $this->history & self::STABLE ) {
			# "Stable" revision dumps...
			# Default JOIN, to be overridden...
			$join['revision'] = [ 'INNER JOIN', 'page_id=rev_page AND page_latest=rev_id' ];
			# One, and only one hook should set this, and return false
			if ( Hooks::run( 'WikiExporter::dumpStableQuery', [ &$tables, &$opts, &$join ] ) ) {
				throw new MWException( __METHOD__ . " given invalid history dump type." );
			}
		} elseif ( $this->history & self::RANGE ) {
			# Dump of revisions within a specified range
			$join['revision'] = [ 'INNER JOIN', 'page_id=rev_page' ];
			$opts['ORDER BY'] = [ 'rev_page ASC', 'rev_id ASC' ];
		} else {
			# Unknown history specification parameter?
			throw new MWException( __METHOD__ . " given invalid history dump type." );
		}
		# Query optimization hacks
		if ( $cond == '' ) {
			$opts[] = 'STRAIGHT_JOIN';
			$opts['USE INDEX']['page'] = 'PRIMARY';
		}
		# Build text join options
		if ( $this->text != self::STUB ) { // 1-pass
			$tables[] = 'text';
			$join['text'] = [ 'INNER JOIN', 'rev_text_id=old_id' ];
		}

		$prev = null;
		if ( $this->buffer == self::STREAM ) {
			$prev = $this->db->bufferResults( false );
		}
		$result = null; // Assuring $result is not undefined, if exception occurs early
		try {
			// Query parts are passed by reference so hook handlers can alter them
			Hooks::run( 'ModifyExportQuery',
				[ $this->db, &$tables, &$cond, &$opts, &$join ] );

			# Do the query!
			$result = $this->db->select( $tables, '*', $cond, __METHOD__, $opts, $join );
			# Output dump results
			$this->outputPageStream( $result );

			if ( $this->buffer == self::STREAM ) {
				$this->db->bufferResults( $prev );
			}
		} catch ( Exception $e ) {
			$this->cleanUpFailedDump( $result, $prev );

			// Inform caller about problem
			throw $e;
		}
	}

	/**
	 * Best-effort cleanup after a dump query failed.
	 *
	 * Throwing the exception does not reliably free the resultset, and
	 * would also leave the connection in unbuffered mode. Errors raised
	 * during cleanup are deliberately ignored: the caller rethrows the
	 * original exception, which has higher priority.
	 *
	 * @param ResultWrapper|null $result Result to free, if the query got that far
	 * @param mixed $prev Value returned by bufferResults( false ), to be restored
	 */
	private function cleanUpFailedDump( $result, $prev ) {
		// Freeing result
		try {
			if ( $result ) {
				$result->free();
			}
		} catch ( Exception $e2 ) {
			// Already in panic mode -> ignoring $e2
		}

		// Putting database back in previous buffer mode
		try {
			if ( $this->buffer == self::STREAM ) {
				$this->db->bufferResults( $prev );
			}
		} catch ( Exception $e2 ) {
			// Already in panic mode -> ignoring $e2
		}
	}

	/**
	 * Runs through a query result set dumping page and revision records.
	 * The result set should be sorted/grouped by page to avoid duplicate
	 * page records in the output.
	 *
	 * Should be safe for
	 * streaming (non-buffered) queries, as long as it was made on a
	 * separate database connection not managed by LoadBalancer; some
	 * blob storage types will make queries to pull source data.
	 *
	 * @param ResultWrapper $resultset
	 */
	protected function outputPageStream( $resultset ) {
		$last = null;
		foreach ( $resultset as $row ) {
			// A change of namespace or title marks the start of a new page
			if ( $last === null ||
				$last->page_namespace != $row->page_namespace ||
				$last->page_title != $row->page_title
			) {
				if ( $last !== null ) {
					$this->finishPage( $last );
				}
				$output = $this->writer->openPage( $row );
				$this->sink->writeOpenPage( $row, $output );
				$last = $row;
			}
			$output = $this->writer->writeRevision( $row );
			$this->sink->writeRevision( $row, $output );
		}
		if ( $last !== null ) {
			// Only the final page carries the author list
			$this->finishPage( $last, $this->author_list );
		}
	}

	/**
	 * Writes the closing output for a page: upload records (when
	 * $this->dumpUploads is set), then $extra, then the page terminator.
	 *
	 * @param stdClass $row First row seen for the page being closed
	 * @param string $extra Additional XML placed before the page terminator
	 */
	private function finishPage( $row, $extra = '' ) {
		$output = '';
		if ( $this->dumpUploads ) {
			$output .= $this->writer->writeUploads( $row, $this->dumpUploadFileContents );
		}
		$output .= $extra;
		$output .= $this->writer->closePage();
		$this->sink->writeClosePage( $output );
	}

	/**
	 * Runs through a query result set, handing each row's log item
	 * output to the sink.
	 *
	 * @param ResultWrapper $resultset
	 */
	protected function outputLogStream( $resultset ) {
		foreach ( $resultset as $row ) {
			$output = $this->writer->writeLogItem( $row );
			$this->sink->writeLogItem( $row, $output );
		}
	}
}