Fix namespace and notalk filters for dump
[lhc/web/wiklou.git] / includes / Export.php
1 <?php
2 # Copyright (C) 2003, 2005 Brion Vibber <brion@pobox.com>
3 # http://www.mediawiki.org/
4 #
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
9 #
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License along
16 # with this program; if not, write to the Free Software Foundation, Inc.,
17 # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 # http://www.gnu.org/copyleft/gpl.html
19 /**
20 *
21 * @package MediaWiki
22 * @subpackage SpecialPage
23 */
24
25 /** */
26 require_once( 'Revision.php' );
27
// History modes checked by WikiExporter::dumpFrom(): FULL joins every
// revision of a page, CURRENT joins only the revision named by page_latest.
define( 'MW_EXPORT_FULL', 0 );
define( 'MW_EXPORT_CURRENT', 1 );

// Result-set modes checked by WikiExporter::dumpFrom(): STREAM switches the
// database connection to unbuffered results for the duration of the query.
define( 'MW_EXPORT_BUFFER', 0 );
define( 'MW_EXPORT_STREAM', 1 );
33
34
/**
 * Selects page and revision rows from the database, renders each row
 * through an XmlDumpWriter and hands the rendered text, together with the
 * row it came from, to an output sink (a DumpOutput or DumpFilter).
 *
 * @package MediaWiki
 * @subpackage SpecialPage
 */
class WikiExporter {
	/** Database connection the dump query is run on */
	var $db;
	/** One of MW_EXPORT_FULL or MW_EXPORT_CURRENT */
	var $history;
	/** One of MW_EXPORT_BUFFER or MW_EXPORT_STREAM */
	var $buffer;
	/** XmlDumpWriter producing the XML fragments */
	var $writer;
	/** Sink object receiving rows and XML fragments */
	var $sink;

	/**
	 * If using MW_EXPORT_STREAM to stream a large amount of data,
	 * provide a database connection which is not managed by
	 * LoadBalancer to read from: some history blob types will
	 * make additional queries to pull source data while the
	 * main query is still running.
	 *
	 * @param Database $db
	 * @param int $history one of MW_EXPORT_FULL or MW_EXPORT_CURRENT
	 * @param int $buffer one of MW_EXPORT_BUFFER or MW_EXPORT_STREAM
	 */
	function WikiExporter( &$db, $history = MW_EXPORT_CURRENT,
			$buffer = MW_EXPORT_BUFFER ) {
		$this->db =& $db;
		$this->history = $history;
		$this->buffer = $buffer;
		$this->writer = new XmlDumpWriter();
		$this->sink = new DumpOutput();
	}

	/**
	 * Set the DumpOutput or DumpFilter object which will receive
	 * various row objects and XML output for filtering. Filters
	 * can be chained or used as callbacks.
	 *
	 * @param mixed $sink
	 */
	function setOutputSink( &$sink ) {
		$this->sink =& $sink;
	}

	/**
	 * Sends the writer's stream-opening output to the sink.
	 */
	function openStream() {
		$output = $this->writer->openStream();
		$this->sink->writeOpenStream( $output );
	}

	/**
	 * Sends the writer's stream-closing output to the sink.
	 */
	function closeStream() {
		$output = $this->writer->closeStream();
		$this->sink->writeCloseStream( $output );
	}

	/**
	 * Dumps a series of page and revision records for all pages
	 * in the database, either including complete history or only
	 * the most recent version.
	 */
	function allPages() {
		return $this->dumpFrom( '' );
	}

	/**
	 * Dumps a series of page and revision records for those pages
	 * in the database falling within the page_id range given.
	 * @param int $start Inclusive lower limit (this id is included)
	 * @param int $end   Exclusive upper limit (this id is not included)
	 *                   If 0, no upper limit.
	 */
	function pagesByRange( $start, $end ) {
		$condition = 'page_id >= ' . intval( $start );
		if( $end ) {
			$condition .= ' AND page_id < ' . intval( $end );
		}
		return $this->dumpFrom( $condition );
	}

	/**
	 * Dumps the single page identified by the given title.
	 * @param Title $title
	 */
	function pageByTitle( $title ) {
		// intval() keeps the namespace numeric in the SQL text, matching
		// the handling of page ids in pagesByRange().
		return $this->dumpFrom(
			'page_namespace=' . intval( $title->getNamespace() ) .
			' AND page_title=' . $this->db->addQuotes( $title->getDbKey() ) );
	}

	/**
	 * Dumps the single page with the given name.
	 * @param string $name
	 * @return mixed a WikiError if the name is not a valid title,
	 *               otherwise whatever pageByTitle() returns
	 */
	function pageByName( $name ) {
		$title = Title::newFromText( $name );
		if( is_null( $title ) ) {
			return new WikiError( "Can't export invalid title" );
		} else {
			return $this->pageByTitle( $title );
		}
	}

	/**
	 * Dumps every page in the given list of names. All names are
	 * attempted even if one of them fails.
	 *
	 * @param array $names
	 * @return mixed the first error object reported for any name,
	 *               or null when none was reported
	 */
	function pagesByName( $names ) {
		$firstError = null;
		foreach( $names as $name ) {
			$result = $this->pageByName( $name );
			// pageByName() and dumpFrom() only ever return an object to
			// signal a WikiError; keep the first one instead of dropping it.
			if( is_null( $firstError ) && is_object( $result ) ) {
				$firstError = $result;
			}
		}
		return $firstError;
	}


	// -------------------- private implementation below --------------------

	/**
	 * Runs the page/revision/text query restricted by $cond and passes
	 * the result set to outputStream().
	 *
	 * @param string $cond SQL condition on the page table, or '' for all pages
	 * @return mixed a WikiError if the history mode is not recognized,
	 *               otherwise nothing
	 * @access private
	 */
	function dumpFrom( $cond = '' ) {
		$fname = 'WikiExporter::dumpFrom';
		wfProfileIn( $fname );

		$page     = $this->db->tableName( 'page' );
		$revision = $this->db->tableName( 'revision' );
		$text     = $this->db->tableName( 'text' );

		if( $this->history == MW_EXPORT_FULL ) {
			$join = 'page_id=rev_page';
		} elseif( $this->history == MW_EXPORT_CURRENT ) {
			$join = 'page_id=rev_page AND page_latest=rev_id';
		} else {
			wfProfileOut( $fname );
			return new WikiError( "$fname given invalid history dump type." );
		}
		$where = ( $cond == '' ) ? '' : "$cond AND";

		if( $this->buffer == MW_EXPORT_STREAM ) {
			// Remember the previous buffering mode so it can be restored below.
			$prev = $this->db->bufferResults( false );
		}
		if( $cond == '' ) {
			// Optimization hack for full-database dump
			$pageindex = 'FORCE INDEX (PRIMARY)';
			$revindex = 'FORCE INDEX(page_timestamp)';
		} else {
			$pageindex = '';
			$revindex = '';
		}
		$result = $this->db->query(
			"SELECT * FROM
				$page $pageindex,
				$revision $revindex,
				$text
				WHERE $where $join AND rev_text_id=old_id
				ORDER BY page_id", $fname );
		$wrapper = $this->db->resultObject( $result );
		$this->outputStream( $wrapper );

		if( $this->buffer == MW_EXPORT_STREAM ) {
			$this->db->bufferResults( $prev );
		}

		wfProfileOut( $fname );
	}

	/**
	 * Runs through a query result set dumping page and revision records.
	 * The result set should be sorted/grouped by page to avoid duplicate
	 * page records in the output.
	 *
	 * The result set will be freed once complete. Should be safe for
	 * streaming (non-buffered) queries, as long as it was made on a
	 * separate database connection not managed by LoadBalancer; some
	 * blob storage types will make queries to pull source data.
	 *
	 * @param ResultWrapper $resultset
	 * @access private
	 */
	function outputStream( $resultset ) {
		$last = null;
		while( $row = $resultset->fetchObject() ) {
			// A change of namespace or title marks the start of a new page.
			if( is_null( $last ) ||
				$last->page_namespace != $row->page_namespace ||
				$last->page_title     != $row->page_title ) {
				if( isset( $last ) ) {
					$output = $this->writer->closePage();
					$this->sink->writeClosePage( $output );
				}
				$output = $this->writer->openPage( $row );
				$this->sink->writeOpenPage( $row, $output );
				$last = $row;
			}
			$output = $this->writer->writeRevision( $row );
			$this->sink->writeRevision( $row, $output );
		}
		if( isset( $last ) ) {
			// Close the final page, if any rows were seen at all.
			$output = $this->writer->closePage();
			$this->sink->writeClosePage( $output );
		}
		$resultset->free();
	}
}
215
/**
 * Renders the XML fragments of an export: the root element with site
 * information, and the page and revision sections built from database rows.
 * Every method returns its fragment as a string; nothing is printed here.
 */
class XmlDumpWriter {

	/**
	 * Returns the export schema version.
	 * @return string
	 */
	function schemaVersion() {
		return "0.3";
	}

	/**
	 * Opens the XML output stream's root <mediawiki> element.
	 * This does not include an xml directive, so is safe to include
	 * as a subelement in a larger XML stream. Namespace and XML Schema
	 * references are included.
	 *
	 * Output will be encoded in UTF-8.
	 *
	 * @return string
	 */
	function openStream() {
		global $wgContLanguageCode;
		$ver = $this->schemaVersion();
		return wfElement( 'mediawiki', array(
			'xmlns'              => "http://www.mediawiki.org/xml/export-$ver/",
			'xmlns:xsi'          => "http://www.w3.org/2001/XMLSchema-instance",
			'xsi:schemaLocation' => "http://www.mediawiki.org/xml/export-$ver/ " .
			                        "http://www.mediawiki.org/xml/export-$ver.xsd",
			'version'            => $ver,
			'xml:lang'           => $wgContLanguageCode ),
			null ) .
			"\n" .
			$this->siteInfo();
	}

	/**
	 * Builds the <siteinfo> section from the individual site elements.
	 * @return string
	 */
	function siteInfo() {
		$info = array(
			$this->sitename(),
			$this->homelink(),
			$this->generator(),
			$this->caseSetting(),
			$this->namespaces() );
		return " <siteinfo>\n " .
			implode( "\n ", $info ) .
			"\n </siteinfo>\n";
	}

	/**
	 * @return string <sitename> element holding $wgSitename
	 */
	function sitename() {
		global $wgSitename;
		return wfElement( 'sitename', array(), $wgSitename );
	}

	/**
	 * @return string <generator> element holding the MediaWiki version
	 */
	function generator() {
		global $wgVersion;
		return wfElement( 'generator', array(), "MediaWiki $wgVersion" );
	}

	/**
	 * @return string <base> element holding the full URL of the page
	 *                named by the 'mainpage' message; the element is
	 *                left empty when that message is not a usable title
	 */
	function homelink() {
		$page = Title::newFromText( wfMsgForContent( 'mainpage' ) );
		// Title::newFromText() can return null for an invalid title;
		// calling getFullUrl() on it would abort the whole dump.
		$url = is_null( $page ) ? '' : $page->getFullUrl();
		return wfElement( 'base', array(), $url );
	}

	/**
	 * @return string <case> element describing title case handling
	 */
	function caseSetting() {
		global $wgCapitalLinks;
		// "case-insensitive" option is reserved for future
		$sensitivity = $wgCapitalLinks ? 'first-letter' : 'case-sensitive';
		return wfElement( 'case', array(), $sensitivity );
	}

	/**
	 * @return string <namespaces> section with one <namespace> element
	 *                per entry of the content language's namespace list
	 */
	function namespaces() {
		global $wgContLang;
		$spaces = " <namespaces>\n";
		foreach( $wgContLang->getFormattedNamespaces() as $ns => $title ) {
			$spaces .= ' ' . wfElement( 'namespace', array( 'key' => $ns ), $title ) . "\n";
		}
		$spaces .= " </namespaces>";
		return $spaces;
	}

	/**
	 * Closes the output stream with the closing root element.
	 * Call when finished dumping things.
	 *
	 * @return string
	 */
	function closeStream() {
		return "</mediawiki>\n";
	}


	/**
	 * Opens a <page> section on the output stream, with data
	 * from the given database row.
	 *
	 * @param object $row
	 * @return string
	 * @access private
	 */
	function openPage( $row ) {
		$out = " <page>\n";
		$title = Title::makeTitle( $row->page_namespace, $row->page_title );
		$out .= ' ' . wfElementClean( 'title', array(), $title->getPrefixedText() ) . "\n";
		$out .= ' ' . wfElement( 'id', array(), strval( $row->page_id ) ) . "\n";
		// The <restrictions> element is only written when the row has any.
		if( '' != $row->page_restrictions ) {
			$out .= ' ' . wfElement( 'restrictions', array(),
				strval( $row->page_restrictions ) ) . "\n";
		}
		return $out;
	}

	/**
	 * Closes a <page> section on the output stream.
	 *
	 * @return string
	 * @access private
	 */
	function closePage() {
		return " </page>\n";
	}

	/**
	 * Dumps a <revision> section on the output stream, with
	 * data filled in from the given database row.
	 *
	 * @param object $row
	 * @return string
	 * @access private
	 */
	function writeRevision( $row ) {
		// Profile under this method's own name rather than a stale label.
		$fname = 'XmlDumpWriter::writeRevision';
		wfProfileIn( $fname );

		$out  = " <revision>\n";
		$out .= " " . wfElement( 'id', null, strval( $row->rev_id ) ) . "\n";

		$ts = wfTimestamp2ISO8601( strval( $row->rev_timestamp ) );
		$out .= " " . wfElement( 'timestamp', null, $ts ) . "\n";

		$out .= " <contributor>\n";
		if( $row->rev_user ) {
			// Non-zero user id: write the user name and id.
			$out .= " " . wfElementClean( 'username', null, strval( $row->rev_user_text ) ) . "\n";
			$out .= " " . wfElement( 'id', null, strval( $row->rev_user ) ) . "\n";
		} else {
			// No user id: the user text is written as an <ip> element.
			$out .= " " . wfElementClean( 'ip', null, strval( $row->rev_user_text ) ) . "\n";
		}
		$out .= " </contributor>\n";

		if( $row->rev_minor_edit ) {
			$out .= " <minor/>\n";
		}
		if( $row->rev_comment != '' ) {
			$out .= " " . wfElementClean( 'comment', null, strval( $row->rev_comment ) ) . "\n";
		}

		$text = strval( Revision::getRevisionText( $row ) );
		$out .= " " . wfElementClean( 'text',
			array( 'xml:space' => 'preserve' ),
			$text ) . "\n";

		$out .= " </revision>\n";

		wfProfileOut( $fname );
		return $out;
	}

}
379
380
/**
 * Default output sink. Each write*() hook hands its text to write(),
 * which prints it; subclasses override write() to send the text elsewhere.
 */
class DumpOutput {
	/** @param string $string text opening the stream */
	function writeOpenStream( $string ) {
		$this->write( $string );
	}

	/** @param string $string text closing the stream */
	function writeCloseStream( $string ) {
		$this->write( $string );
	}

	/**
	 * @param object $page row the page text was built from (unused here)
	 * @param string $string text opening the page
	 */
	function writeOpenPage( $page, $string ) {
		$this->write( $string );
	}

	/** @param string $string text closing the page */
	function writeClosePage( $string ) {
		$this->write( $string );
	}

	/**
	 * @param object $rev row the revision text was built from (unused here)
	 * @param string $string revision text
	 */
	function writeRevision( $rev, $string ) {
		$this->write( $string );
	}

	/**
	 * Emits one chunk of output. Override to write to a different
	 * stream type.
	 * @param string $string
	 */
	function write( $string ) {
		echo $string;
	}
}
413
/**
 * Output sink that writes every chunk to a file opened at construction.
 * NOTE(review): the result of fopen() is not checked and the handle is
 * never explicitly closed here.
 */
class DumpFileOutput extends DumpOutput {
	var $handle;

	/**
	 * @param string $file path of the file to open for writing
	 */
	function DumpFileOutput( $file ) {
		$mode = "wt";
		$this->handle = fopen( $file, $mode );
	}

	/**
	 * @param string $string chunk to write to the open handle
	 */
	function write( $string ) {
		fwrite( $this->handle, $string );
	}
}
428
/**
 * Output sink that pipes the dump into an external filter program.
 * Running the filter as its own process lets it use another processor
 * on a multi-processor system, even where a library could do the same job.
 */
class DumpPipeOutput extends DumpFileOutput {
	/**
	 * @param string $command shell command to pipe the output into
	 * @param string $file optional file to redirect the command's
	 *                     standard output to
	 */
	function DumpPipeOutput( $command, $file = null ) {
		$redirect = is_null( $file )
			? ''
			: " > " . wfEscapeShellArg( $file );
		$this->handle = popen( $command . $redirect, "w" );
	}
}
442
/**
 * Output sink that pipes the dump through the gzip program into a file.
 */
class DumpGZipOutput extends DumpPipeOutput {
	/**
	 * @param string $file file receiving the compressed output
	 */
	function DumpGZipOutput( $file ) {
		$compressor = "gzip";
		parent::DumpPipeOutput( $compressor, $file );
	}
}
451
/**
 * Output sink that pipes the dump through the bzip2 program into a file.
 */
class DumpBZip2Output extends DumpPipeOutput {
	/**
	 * @param string $file file receiving the compressed output
	 */
	function DumpBZip2Output( $file ) {
		$compressor = "bzip2";
		parent::DumpPipeOutput( $compressor, $file );
	}
}
460
/**
 * Output sink that pipes the dump into the 7za program. The archive file
 * name is passed as a command argument, so no output redirection is set up.
 */
class Dump7ZipOutput extends DumpPipeOutput {
	/**
	 * @param string $file archive file handed to 7za
	 */
	function Dump7ZipOutput( $file ) {
		parent::DumpPipeOutput( "7za a -si " . wfEscapeShellArg( $file ) );
	}
}
470
471
472
/**
 * Dump output filter class.
 * This just does output filtering and streaming; XML formatting is done
 * higher up, so be careful in what you do.
 *
 * Stream open/close is always forwarded to the wrapped sink. A page is
 * forwarded only when pass() accepts it, and its revisions and closing
 * text follow the same decision.
 */
class DumpFilter {
	/** Wrapped sink receiving whatever this filter lets through */
	var $sink;
	/**
	 * Whether the page currently being written was accepted by pass().
	 * Starts out false so that a revision or page close arriving before
	 * any page was opened is dropped instead of reading an unset property.
	 */
	var $sendingThisPage = false;

	/**
	 * @param mixed $sink DumpOutput or DumpFilter to forward to
	 */
	function DumpFilter( &$sink ) {
		$this->sink =& $sink;
	}

	/** @param string $string text opening the stream */
	function writeOpenStream( $string ) {
		$this->sink->writeOpenStream( $string );
	}

	/** @param string $string text closing the stream */
	function writeCloseStream( $string ) {
		$this->sink->writeCloseStream( $string );
	}

	/**
	 * @param object $page row the page text was built from
	 * @param string $string text opening the page
	 */
	function writeOpenPage( $page, $string ) {
		$this->sendingThisPage = $this->pass( $page, $string );
		if( $this->sendingThisPage ) {
			$this->sink->writeOpenPage( $page, $string );
		}
	}

	/** @param string $string text closing the page */
	function writeClosePage( $string ) {
		if( $this->sendingThisPage ) {
			$this->sink->writeClosePage( $string );
			$this->sendingThisPage = false;
		}
	}

	/**
	 * @param object $rev row the revision text was built from
	 * @param string $string revision text
	 */
	function writeRevision( $rev, $string ) {
		if( $this->sendingThisPage ) {
			$this->sink->writeRevision( $rev, $string );
		}
	}

	/**
	 * Override for page-based filter types.
	 * @param object $page row the page text was built from
	 * @param string $string text opening the page
	 * @return bool true to forward the page, false to drop it
	 */
	function pass( $page, $string ) {
		return true;
	}
}
519
/**
 * Simple dump output filter to exclude all talk pages.
 */
class DumpNotalkFilter extends DumpFilter {
	/**
	 * @param object $page row carrying page_namespace
	 * @param string $string text opening the page; unused, accepted so the
	 *                       signature stays compatible with DumpFilter::pass()
	 * @return bool false for pages in a talk namespace
	 */
	function pass( $page, $string = null ) {
		return !Namespace::isTalk( $page->page_namespace );
	}
}
528
/**
 * Dump output filter to include or exclude pages in a given set of namespaces.
 */
class DumpNamespaceFilter extends DumpFilter {
	/** True when the listed namespaces are to be excluded instead of included */
	var $invert = false;
	/** Selected namespace numbers, stored as array keys */
	var $namespaces = array();

	/**
	 * @param mixed $sink DumpOutput or DumpFilter to forward to
	 * @param string $param comma-separated list of NS_* constant names
	 *                      and/or namespace numbers; a leading '!' inverts
	 *                      the selection. An unrecognized entry ends the
	 *                      script with an error message.
	 */
	function DumpNamespaceFilter( &$sink, $param ) {
		parent::DumpFilter( $sink );

		$constants = array(
			"NS_MAIN"           => NS_MAIN,
			"NS_TALK"           => NS_TALK,
			"NS_USER"           => NS_USER,
			"NS_USER_TALK"      => NS_USER_TALK,
			"NS_PROJECT"        => NS_PROJECT,
			"NS_PROJECT_TALK"   => NS_PROJECT_TALK,
			"NS_IMAGE"          => NS_IMAGE,
			"NS_IMAGE_TALK"     => NS_IMAGE_TALK,
			"NS_MEDIAWIKI"      => NS_MEDIAWIKI,
			"NS_MEDIAWIKI_TALK" => NS_MEDIAWIKI_TALK,
			"NS_TEMPLATE"       => NS_TEMPLATE,
			"NS_TEMPLATE_TALK"  => NS_TEMPLATE_TALK,
			"NS_HELP"           => NS_HELP,
			"NS_HELP_TALK"      => NS_HELP_TALK,
			"NS_CATEGORY"       => NS_CATEGORY,
			"NS_CATEGORY_TALK"  => NS_CATEGORY_TALK );

		// substr() instead of a string offset: an empty $param then simply
		// fails the comparison rather than raising an offset notice.
		if( substr( $param, 0, 1 ) == '!' ) {
			$this->invert = true;
			$param = substr( $param, 1 );
		}

		foreach( explode( ',', $param ) as $key ) {
			$key = trim( $key );
			if( isset( $constants[$key] ) ) {
				$ns = $constants[$key];
			} elseif( is_numeric( $key ) ) {
				$ns = intval( $key );
			} else {
				die( "Unrecognized namespace key '$key'\n" );
			}
			$this->namespaces[$ns] = true;
		}
	}

	/**
	 * @param object $page row carrying page_namespace
	 * @param string $string text opening the page; unused, accepted so the
	 *                       signature stays compatible with DumpFilter::pass()
	 * @return bool true when the page's namespace is selected, or, with an
	 *              inverted filter, when it is not
	 */
	function pass( $page, $string = null ) {
		$match = isset( $this->namespaces[$page->page_namespace] );
		return $this->invert xor $match;
	}
}
581
582
/**
 * Dump output filter to include only the last revision in each page sequence.
 *
 * Nothing is forwarded while a page is being read: the page opening and the
 * revision whose id equals the page's page_latest are held back, and sent
 * to the sink together once the page is closed.
 */
class DumpLatestFilter extends DumpFilter {
	var $page, $pageString, $rev, $revString;

	/**
	 * Holds back the page opening until the page is closed.
	 * @param object $page row the page text was built from
	 * @param string $string text opening the page
	 */
	function writeOpenPage( $page, $string ) {
		$this->pageString = $string;
		$this->page = $page;
	}

	/**
	 * Sends the held page opening, the held revision and the closing text
	 * if a matching revision was seen, then clears the held state.
	 * @param string $string text closing the page
	 */
	function writeClosePage( $string ) {
		if( $this->rev ) {
			$this->sink->writeOpenPage( $this->page, $this->pageString );
			$this->sink->writeRevision( $this->rev, $this->revString );
			$this->sink->writeClosePage( $string );
		}
		$this->rev = $this->revString = null;
		$this->page = $this->pageString = null;
	}

	/**
	 * Keeps the revision only when it is the page's latest one.
	 * @param object $rev row the revision text was built from
	 * @param string $string revision text
	 */
	function writeRevision( $rev, $string ) {
		$isLatest = ( $rev->rev_id == $this->page->page_latest );
		if( !$isLatest ) {
			return;
		}
		$this->rev = $rev;
		$this->revString = $string;
	}
}
613
/**
 * Output sink that fans every call out to a list of other sinks, so a
 * single export run can feed several outputs at once.
 */
class DumpMultiWriter {
	/** Sinks receiving a copy of every call */
	var $sinks = array();
	/** Number of sinks given at construction */
	var $count = 0;

	/**
	 * @param array $sinks DumpOutput/DumpFilter objects; any array keys
	 *                     are accepted, not only 0..n-1
	 */
	function DumpMultiWriter( $sinks ) {
		$this->sinks = $sinks;
		$this->count = count( $sinks );
	}

	/** @param string $string text opening the stream */
	function writeOpenStream( $string ) {
		// Iterate over the actual keys so gaps or string keys cannot
		// lead to a call on a missing element.
		foreach( array_keys( $this->sinks ) as $key ) {
			$this->sinks[$key]->writeOpenStream( $string );
		}
	}

	/** @param string $string text closing the stream */
	function writeCloseStream( $string ) {
		foreach( array_keys( $this->sinks ) as $key ) {
			$this->sinks[$key]->writeCloseStream( $string );
		}
	}

	/**
	 * @param object $page row the page text was built from
	 * @param string $string text opening the page
	 */
	function writeOpenPage( $page, $string ) {
		foreach( array_keys( $this->sinks ) as $key ) {
			$this->sinks[$key]->writeOpenPage( $page, $string );
		}
	}

	/** @param string $string text closing the page */
	function writeClosePage( $string ) {
		foreach( array_keys( $this->sinks ) as $key ) {
			$this->sinks[$key]->writeClosePage( $string );
		}
	}

	/**
	 * @param object $rev row the revision text was built from
	 * @param string $string revision text
	 */
	function writeRevision( $rev, $string ) {
		foreach( array_keys( $this->sinks ) as $key ) {
			$this->sinks[$key]->writeRevision( $rev, $string );
		}
	}
}
653
654
655
/**
 * Reformats a timestamp as ISO 8601, e.g. 2003-08-05T18:30:02Z.
 * The input is first converted with wfTimestamp( TS_MW, ... ); if that
 * result is not exactly 14 characters it is returned unchanged.
 *
 * @param mixed $ts timestamp in a form wfTimestamp() accepts
 * @return string
 */
function wfTimestamp2ISO8601( $ts ) {
	$mwStamp = wfTimestamp( TS_MW, $ts );
	$pattern = '/^(....)(..)(..)(..)(..)(..)$/';
	$layout = '$1-$2-$3T$4:$5:$6Z';
	return preg_replace( $pattern, $layout, $mwStamp );
}
660
/**
 * Prepares a string for inclusion in XML output.
 *
 * The page may contain old data which has not been properly normalized.
 * Invalid UTF-8 sequences or forbidden control characters would make the
 * XML output invalid, so the text goes through UtfNormal::cleanUp() before
 * its markup characters are escaped with htmlspecialchars().
 *
 * @param string $string
 * @return string
 */
function xmlsafe( $string ) {
	$fname = 'xmlsafe';
	wfProfileIn( $fname );

	$normalized = UtfNormal::cleanUp( $string );
	$escaped = htmlspecialchars( $normalized );

	wfProfileOut( $fname );
	return $escaped;
}
676
677 ?>