Another fix.
[lhc/web/wiklou.git] / includes / Export.php
1 <?php
2 # Copyright (C) 2003, 2005, 2006 Brion Vibber <brion@pobox.com>
3 # http://www.mediawiki.org/
4 #
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
9 #
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License along
16 # with this program; if not, write to the Free Software Foundation, Inc.,
17 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 # http://www.gnu.org/copyleft/gpl.html
19
20 /**
21 *
22 * @addtogroup SpecialPage
23 */
24
class WikiExporter {
	var $list_authors = false ; # Return distinct author list (when not returning full history)
	var $author_list = "" ;     # XML <contributors> fragment built by do_list_authors()

	# Collaborators and settings assigned by the constructor; declared here
	# so that they are not created implicitly on first assignment.
	var $db;      # database connection used for every query
	var $history; # WikiExporter::FULL, WikiExporter::CURRENT, or an offset/limit/dir array
	var $buffer;  # WikiExporter::BUFFER or WikiExporter::STREAM
	var $writer;  # XmlDumpWriter that renders rows into XML strings
	var $sink;    # DumpOutput/DumpFilter that receives the rendered strings
	var $text;    # WikiExporter::TEXT or WikiExporter::STUB

	const FULL = 0;
	const CURRENT = 1;

	const BUFFER = 0;
	const STREAM = 1;

	const TEXT = 0;
	const STUB = 1;

	/**
	 * If using WikiExporter::STREAM to stream a large amount of data,
	 * provide a database connection which is not managed by
	 * LoadBalancer to read from: some history blob types will
	 * make additional queries to pull source data while the
	 * main query is still running.
	 *
	 * @param Database $db
	 * @param mixed $history one of WikiExporter::FULL or WikiExporter::CURRENT, or an
	 *                       associative array:
	 *                         offset: non-inclusive offset at which to start the query
	 *                         limit: maximum number of rows to return
	 *                         dir: "asc" or "desc" timestamp order
	 * @param int $buffer one of WikiExporter::BUFFER or WikiExporter::STREAM
	 * @param int $text one of WikiExporter::TEXT or WikiExporter::STUB; STUB leaves
	 *                  the text table out of the query
	 */
	function __construct( &$db, $history = WikiExporter::CURRENT,
			$buffer = WikiExporter::BUFFER, $text = WikiExporter::TEXT ) {
		$this->db =& $db;
		$this->history = $history;
		$this->buffer = $buffer;
		$this->writer = new XmlDumpWriter();
		$this->sink = new DumpOutput();
		$this->text = $text;
	}

	/**
	 * Set the DumpOutput or DumpFilter object which will receive
	 * various row objects and XML output for filtering. Filters
	 * can be chained or used as callbacks.
	 *
	 * @param mixed $sink
	 */
	function setOutputSink( &$sink ) {
		$this->sink =& $sink;
	}

	/**
	 * Sends the writer's stream-opening XML to the sink.
	 */
	function openStream() {
		$output = $this->writer->openStream();
		$this->sink->writeOpenStream( $output );
	}

	/**
	 * Sends the writer's stream-closing XML to the sink.
	 */
	function closeStream() {
		$output = $this->writer->closeStream();
		$this->sink->writeCloseStream( $output );
	}

	/**
	 * Dumps a series of page and revision records for all pages
	 * in the database, either including complete history or only
	 * the most recent version.
	 */
	function allPages() {
		return $this->dumpFrom( '' );
	}

	/**
	 * Dumps a series of page and revision records for those pages
	 * in the database falling within the page_id range given.
	 * @param int $start Inclusive lower limit (this id is included)
	 * @param int $end   Exclusive upper limit (this id is not included)
	 *                   If 0, no upper limit.
	 */
	function pagesByRange( $start, $end ) {
		$condition = 'page_id >= ' . intval( $start );
		if( $end ) {
			$condition .= ' AND page_id < ' . intval( $end );
		}
		return $this->dumpFrom( $condition );
	}

	/**
	 * Dumps the page identified by the given title.
	 * @param Title $title
	 */
	function pageByTitle( $title ) {
		return $this->dumpFrom(
			'page_namespace=' . $title->getNamespace() .
			' AND page_title=' . $this->db->addQuotes( $title->getDbKey() ) );
	}

	/**
	 * Dumps the page with the given name.
	 * @param string $name
	 * @return mixed a WikiError when Title::newFromText() rejects the name,
	 *               otherwise the result of pageByTitle()
	 */
	function pageByName( $name ) {
		$title = Title::newFromText( $name );
		if( is_null( $title ) ) {
			return new WikiError( "Can't export invalid title" );
		} else {
			return $this->pageByTitle( $title );
		}
	}

	/**
	 * Dumps each named page in turn. The return value of pageByName()
	 * is not inspected, so invalid names are skipped silently.
	 * @param array $names
	 */
	function pagesByName( $names ) {
		foreach( $names as $name ) {
			$this->pageByName( $name );
		}
	}


	// -------------------- private implementation below --------------------

	# Generates the distinct list of authors of an article
	# Not called by default (depends on $this->list_authors)
	# Can be set by Special:Export when not exporting whole history
	#
	# The result is stored in $this->author_list as a <contributors>
	# XML fragment. User names are escaped with xmlsafe() rather than
	# htmlentities(): the latter produces HTML-only named entities
	# (e.g. &eacute;) which are not defined in XML.
	function do_list_authors ( $page , $revision , $cond ) {
		$fname = "do_list_authors" ;
		wfProfileIn( $fname );
		$list = "<contributors>";
		$sql = "SELECT DISTINCT rev_user_text,rev_user FROM {$page},{$revision} WHERE page_id=rev_page AND " . $cond ;
		$result = $this->db->query( $sql, $fname );
		$resultset = $this->db->resultObject( $result );
		while( $row = $resultset->fetchObject() ) {
			$list .= "<contributor>" .
				"<username>" .
				xmlsafe( $row->rev_user_text ) .
				"</username>" .
				"<id>" .
				$row->rev_user .
				"</id>" .
				"</contributor>";
		}
		// Release the result before the main dump query is issued.
		$resultset->free();
		$this->author_list = $list . "</contributors>";
		wfProfileOut( $fname );
	}

	/**
	 * Builds and runs the page/revision query for the given page
	 * condition and streams every row through outputStream().
	 *
	 * @param string $cond SQL condition restricting the pages, or '' for all pages
	 * @return mixed a WikiError when $this->history is not a recognised
	 *               value; nothing otherwise
	 */
	function dumpFrom( $cond = '' ) {
		$fname = 'WikiExporter::dumpFrom';
		wfProfileIn( $fname );

		$page     = $this->db->tableName( 'page' );
		$revision = $this->db->tableName( 'revision' );
		$text     = $this->db->tableName( 'text' );

		$order = 'ORDER BY page_id';
		$limit = '';

		if( $this->history == WikiExporter::FULL ) {
			$join = 'page_id=rev_page';
		} elseif( $this->history == WikiExporter::CURRENT ) {
			if ( $this->list_authors && $cond != '' )  { // List authors, if so desired
				$this->do_list_authors ( $page , $revision , $cond );
			}
			$join = 'page_id=rev_page AND page_latest=rev_id';
		} elseif ( is_array( $this->history ) ) {
			$join = 'page_id=rev_page';
			// A missing 'dir' key is treated like any non-"asc" value: descending.
			if ( isset( $this->history['dir'] ) && $this->history['dir'] == 'asc' ) {
				$op = '>';
				$order .= ', rev_timestamp';
			} else {
				$op = '<';
				$order .= ', rev_timestamp DESC';
			}
			if ( !empty( $this->history['offset'] ) ) {
				$join .= " AND rev_timestamp $op " . $this->db->addQuotes(
					$this->db->timestamp( $this->history['offset'] ) );
			}
			if ( !empty( $this->history['limit'] ) ) {
				$limitNum = intval( $this->history['limit'] );
				if ( $limitNum > 0 ) {
					$limit = "LIMIT $limitNum";
				}
			}
		} else {
			wfProfileOut( $fname );
			return new WikiError( "$fname given invalid history dump type." );
		}
		$where = ( $cond == '' ) ? '' : "$cond AND";

		if( $this->buffer == WikiExporter::STREAM ) {
			// Remember the previous setting so it can be restored below.
			$prev = $this->db->bufferResults( false );
		}
		if( $cond == '' ) {
			// Optimization hack for full-database dump
			$revindex = $pageindex = $this->db->useIndexClause("PRIMARY");
			$straight = ' /*! STRAIGHT_JOIN */ ';
		} else {
			$pageindex = '';
			$revindex = '';
			$straight = '';
		}
		if( $this->text == WikiExporter::STUB ) {
			$sql = "SELECT $straight * FROM
					$page $pageindex,
					$revision $revindex
					WHERE $where $join
					$order $limit";
		} else {
			$sql = "SELECT $straight * FROM
					$page $pageindex,
					$revision $revindex,
					$text
					WHERE $where $join AND rev_text_id=old_id
					$order $limit";
		}
		$result = $this->db->query( $sql, $fname );
		$wrapper = $this->db->resultObject( $result );

		// outputStream() consumes and frees the result set, and already
		// emits $this->author_list with the final page, so it must be
		// called exactly once per query.
		$this->outputStream( $wrapper );

		if( $this->buffer == WikiExporter::STREAM ) {
			$this->db->bufferResults( $prev );
		}

		wfProfileOut( $fname );
	}

	/**
	 * Runs through a query result set dumping page and revision records.
	 * The result set should be sorted/grouped by page to avoid duplicate
	 * page records in the output.
	 *
	 * The result set will be freed once complete. Should be safe for
	 * streaming (non-buffered) queries, as long as it was made on a
	 * separate database connection not managed by LoadBalancer; some
	 * blob storage types will make queries to pull source data.
	 *
	 * $this->author_list is written immediately before the closing tag
	 * of the last page in the result set.
	 *
	 * @param ResultWrapper $resultset
	 * @access private
	 */
	function outputStream( $resultset ) {
		$last = null;
		while( $row = $resultset->fetchObject() ) {
			if( is_null( $last ) ||
				$last->page_namespace != $row->page_namespace ||
				$last->page_title     != $row->page_title ) {
				// A new page starts here; close the previous one first.
				if( isset( $last ) ) {
					$output = $this->writer->closePage();
					$this->sink->writeClosePage( $output );
				}
				$output = $this->writer->openPage( $row );
				$this->sink->writeOpenPage( $row, $output );
				$last = $row;
			}
			$output = $this->writer->writeRevision( $row );
			$this->sink->writeRevision( $row, $output );
		}
		if( isset( $last ) ) {
			$output = $this->author_list . $this->writer->closePage();
			$this->sink->writeClosePage( $output );
		}
		$resultset->free();
	}
}
280
class XmlDumpWriter {

	/**
	 * Reports which export schema version this writer produces.
	 * @return string
	 */
	function schemaVersion() {
		// FIXME: upgrade to 0.4 when updated XSD is ready, for the revision deletion bits
		return "0.3";
	}

	/**
	 * Builds the opening of the root <mediawiki> element followed by
	 * the <siteinfo> section. No xml directive is emitted, so the
	 * result may be embedded as a subelement of a larger XML stream.
	 * Namespace and XML Schema references are included.
	 *
	 * Output will be encoded in UTF-8.
	 *
	 * @return string
	 */
	function openStream() {
		global $wgContLanguageCode;
		$version = $this->schemaVersion();
		$schemaLocation = "http://www.mediawiki.org/xml/export-$version/ " .
			"http://www.mediawiki.org/xml/export-$version.xsd";
		$attribs = array(
			'xmlns'              => "http://www.mediawiki.org/xml/export-$version/",
			'xmlns:xsi'          => "http://www.w3.org/2001/XMLSchema-instance",
			'xsi:schemaLocation' => $schemaLocation,
			'version'            => $version,
			'xml:lang'           => $wgContLanguageCode );
		$root = wfElement( 'mediawiki', $attribs, null );
		return $root . "\n" . $this->siteInfo();
	}

	/**
	 * Assembles the <siteinfo> section from the individual site
	 * property elements.
	 * @return string
	 */
	function siteInfo() {
		$parts = array();
		$parts[] = $this->sitename();
		$parts[] = $this->homelink();
		$parts[] = $this->generator();
		$parts[] = $this->caseSetting();
		$parts[] = $this->namespaces();
		$body = implode( "\n ", $parts );
		return " <siteinfo>\n " . $body . "\n </siteinfo>\n";
	}

	/**
	 * <sitename> element holding $wgSitename.
	 * @return string
	 */
	function sitename() {
		global $wgSitename;
		$element = wfElement( 'sitename', array(), $wgSitename );
		return $element;
	}

	/**
	 * <generator> element naming the MediaWiki version in use.
	 * @return string
	 */
	function generator() {
		global $wgVersion;
		$label = "MediaWiki $wgVersion";
		return wfElement( 'generator', array(), $label );
	}

	/**
	 * <base> element holding the full URL of the main page.
	 * @return string
	 */
	function homelink() {
		$url = Title::newMainPage()->getFullUrl();
		return wfElement( 'base', array(), $url );
	}

	/**
	 * <case> element derived from $wgCapitalLinks.
	 * @return string
	 */
	function caseSetting() {
		global $wgCapitalLinks;
		// "case-insensitive" option is reserved for future
		if( $wgCapitalLinks ) {
			$sensitivity = 'first-letter';
		} else {
			$sensitivity = 'case-sensitive';
		}
		return wfElement( 'case', array(), $sensitivity );
	}

	/**
	 * <namespaces> section with one <namespace> element per entry
	 * returned by $wgContLang->getFormattedNamespaces().
	 * @return string
	 */
	function namespaces() {
		global $wgContLang;
		$xml = " <namespaces>\n";
		foreach( $wgContLang->getFormattedNamespaces() as $index => $name ) {
			$element = wfElement( 'namespace', array( 'key' => $index ), $name );
			$xml .= ' ' . $element . "\n";
		}
		return $xml . " </namespaces>";
	}

	/**
	 * Produces the closing tag of the root element.
	 * Call when finished dumping things.
	 * @return string
	 */
	function closeStream() {
		return "</mediawiki>\n";
	}


	/**
	 * Starts a <page> section: title, id and (when non-empty)
	 * restrictions, all taken from the given database row.
	 *
	 * @param object $row
	 * @return string
	 * @access private
	 */
	function openPage( $row ) {
		$title = Title::makeTitle( $row->page_namespace, $row->page_title );

		$xml = " <page>\n";
		$xml .= ' ' . wfElementClean( 'title', array(), $title->getPrefixedText() ) . "\n";
		$xml .= ' ' . wfElement( 'id', array(), strval( $row->page_id ) ) . "\n";
		if( $row->page_restrictions != '' ) {
			$restrictions = strval( $row->page_restrictions );
			$xml .= ' ' . wfElement( 'restrictions', array(), $restrictions ) . "\n";
		}
		return $xml;
	}

	/**
	 * Ends a <page> section.
	 *
	 * @return string
	 * @access private
	 */
	function closePage() {
		return " </page>\n";
	}

	/**
	 * Renders one <revision> section from the given database row:
	 * id, timestamp, contributor, minor flag, comment and text.
	 * Fields flagged in rev_deleted are replaced by an empty element
	 * carrying deleted="deleted".
	 *
	 * @param object $row
	 * @return string
	 * @access private
	 */
	function writeRevision( $row ) {
		$fname = 'WikiExporter::dumpRev';
		wfProfileIn( $fname );

		$deletedAttribs = array( 'deleted' => 'deleted' );

		$xml = " <revision>\n";
		$xml .= " " . wfElement( 'id', null, strval( $row->rev_id ) ) . "\n";

		$isoTime = wfTimestamp( TS_ISO_8601, $row->rev_timestamp );
		$xml .= " " . wfElement( 'timestamp', null, $isoTime ) . "\n";

		// Contributor: hidden, registered user, or anonymous IP.
		if( !( $row->rev_deleted & Revision::DELETED_USER ) ) {
			$xml .= " <contributor>\n";
			if( !$row->rev_user ) {
				$xml .= " " . wfElementClean( 'ip', null, strval( $row->rev_user_text ) ) . "\n";
			} else {
				$xml .= " " . wfElementClean( 'username', null, strval( $row->rev_user_text ) ) . "\n";
				$xml .= " " . wfElement( 'id', null, strval( $row->rev_user ) ) . "\n";
			}
			$xml .= " </contributor>\n";
		} else {
			$xml .= " " . wfElement( 'contributor', $deletedAttribs ) . "\n";
		}

		if( $row->rev_minor_edit ) {
			$xml .= " <minor/>\n";
		}

		// Comment: hidden, present, or omitted entirely when empty.
		if( $row->rev_deleted & Revision::DELETED_COMMENT ) {
			$xml .= " " . wfElement( 'comment', $deletedAttribs ) . "\n";
		} else {
			if( $row->rev_comment != '' ) {
				$xml .= " " . wfElementClean( 'comment', null, strval( $row->rev_comment ) ) . "\n";
			}
		}

		// Text: hidden, full text, or a stub referring to the text id.
		if( $row->rev_deleted & Revision::DELETED_TEXT ) {
			$xml .= " " . wfElement( 'text', $deletedAttribs ) . "\n";
		} else {
			if( isset( $row->old_text ) ) {
				// Raw text from the database may have invalid chars
				$content = strval( Revision::getRevisionText( $row ) );
				$xml .= " " . wfElementClean( 'text',
					array( 'xml:space' => 'preserve' ),
					$content ) . "\n";
			} else {
				// Stub output
				$xml .= " " . wfElement( 'text',
					array( 'id' => $row->rev_text_id ),
					"" ) . "\n";
			}
		}

		$xml .= " </revision>\n";

		wfProfileOut( $fname );
		return $xml;
	}

}
459
460
/**
 * Base class for output stream; prints to stdout or buffer or wherever.
 *
 * Every event handler funnels its XML string through write(), so a
 * subclass only needs to override write() to redirect the output.
 */
class DumpOutput {
	/**
	 * @param string $string XML opening the stream
	 */
	function writeOpenStream( $string ) {
		$this->write( $string );
	}

	/**
	 * @param string $string XML closing the stream
	 */
	function writeCloseStream( $string ) {
		$this->write( $string );
	}

	/**
	 * @param object $page row the page was rendered from (unused here)
	 * @param string $string XML opening the page
	 */
	function writeOpenPage( $page, $string ) {
		$this->write( $string );
	}

	/**
	 * @param string $string XML closing the page
	 */
	function writeClosePage( $string ) {
		$this->write( $string );
	}

	/**
	 * @param object $rev row the revision was rendered from (unused here)
	 * @param string $string XML for the revision
	 */
	function writeRevision( $rev, $string ) {
		$this->write( $string );
	}

	/**
	 * Override to write to a different stream type.
	 * This implementation sends the string to standard output.
	 * @param string $string
	 */
	function write( $string ) {
		echo $string;
	}
}
493
/**
 * Stream outputter to send data to a file.
 */
class DumpFileOutput extends DumpOutput {
	var $handle;

	/**
	 * Opens the target file for writing.
	 *
	 * @param string $file path of the file to write
	 * @throws MWException when the file cannot be opened; without this
	 *         check every later write() would be attempted on a failed
	 *         handle and the dump would be lost
	 */
	function DumpFileOutput( $file ) {
		$handle = fopen( $file, "wt" );
		if( $handle === false ) {
			throw new MWException( "Unable to open dump output file '$file' for writing\n" );
		}
		$this->handle = $handle;
	}

	/**
	 * Writes the string to the open handle.
	 * @param string $string
	 */
	function write( $string ) {
		fputs( $this->handle, $string );
	}
}
508
/**
 * Stream outputter to send data to a file via some filter program.
 * Even if compression is available in a library, using a separate
 * program can allow us to make use of a multi-processor system.
 */
class DumpPipeOutput extends DumpFileOutput {
	/**
	 * Starts the filter program and keeps a write pipe to it as the
	 * output handle.
	 *
	 * @param string $command shell command to pipe the dump through
	 * @param string $file optional file the command's standard output
	 *                     is redirected to (shell-escaped here)
	 * @throws MWException when the pipe cannot be opened
	 */
	function DumpPipeOutput( $command, $file = null ) {
		if( !is_null( $file ) ) {
			$command .=  " > " . wfEscapeShellArg( $file );
		}
		$handle = popen( $command, "w" );
		if( $handle === false ) {
			throw new MWException( "Unable to open pipe to dump output command '$command'\n" );
		}
		$this->handle = $handle;
	}
}
522
/**
 * Sends dump output via the gzip compressor.
 */
class DumpGZipOutput extends DumpPipeOutput {
	/**
	 * @param string $file file receiving the compressed output
	 */
	function DumpGZipOutput( $file ) {
		$compressor = "gzip";
		parent::DumpPipeOutput( $compressor, $file );
	}
}
531
/**
 * Sends dump output via the bzip2 compressor.
 */
class DumpBZip2Output extends DumpPipeOutput {
	/**
	 * @param string $file file receiving the compressed output
	 */
	function DumpBZip2Output( $file ) {
		$compressor = "bzip2";
		parent::DumpPipeOutput( $compressor, $file );
	}
}
540
/**
 * Sends dump output via the p7zip compressor.
 */
class Dump7ZipOutput extends DumpPipeOutput {
	/**
	 * @param string $file archive that 7za adds the piped data to
	 */
	function Dump7ZipOutput( $file ) {
		$archive = wfEscapeShellArg( $file );
		// Suppress annoying useless crap from p7zip
		// Unfortunately this could suppress real error messages too
		$silence = " >/dev/null 2>&1";
		parent::DumpPipeOutput( "7za a -bd -si " . $archive . $silence );
	}
}
553
554
555
/**
 * Dump output filter class.
 * This just does output filtering and streaming; XML formatting is done
 * higher up, so be careful in what you do.
 */
class DumpFilter {
	var $sink;                    # downstream DumpOutput or DumpFilter
	var $sendingThisPage = false; # whether the page currently open passed pass()

	/**
	 * @param mixed $sink object that receives whatever passes the filter
	 */
	function DumpFilter( &$sink ) {
		$this->sink =& $sink;
	}

	/**
	 * Stream opening is always forwarded.
	 * @param string $string
	 */
	function writeOpenStream( $string ) {
		$this->sink->writeOpenStream( $string );
	}

	/**
	 * Stream closing is always forwarded.
	 * @param string $string
	 */
	function writeCloseStream( $string ) {
		$this->sink->writeCloseStream( $string );
	}

	/**
	 * Asks pass() whether this page is wanted, remembers the answer
	 * for the page's revisions and closing tag, and forwards the
	 * opening only if it is.
	 * @param object $page
	 * @param string $string
	 */
	function writeOpenPage( $page, $string ) {
		$this->sendingThisPage = $this->pass( $page, $string );
		if( $this->sendingThisPage ) {
			$this->sink->writeOpenPage( $page, $string );
		}
	}

	/**
	 * Forwards the closing tag of a page that was let through, then
	 * clears the flag. Nothing is forwarded when no page is open.
	 * @param string $string
	 */
	function writeClosePage( $string ) {
		if( $this->sendingThisPage ) {
			$this->sink->writeClosePage( $string );
			$this->sendingThisPage = false;
		}
	}

	/**
	 * Forwards a revision only while a page that was let through is open.
	 * @param object $rev
	 * @param string $string
	 */
	function writeRevision( $rev, $string ) {
		if( $this->sendingThisPage ) {
			$this->sink->writeRevision( $rev, $string );
		}
	}

	/**
	 * Override for page-based filter types.
	 * @param object $page
	 * @return bool
	 */
	function pass( $page ) {
		return true;
	}
}
602
/**
 * Simple dump output filter to exclude all talk pages.
 */
class DumpNotalkFilter extends DumpFilter {
	/**
	 * Lets a page through unless its namespace is a talk namespace.
	 * @param object $page
	 * @return bool
	 */
	function pass( $page ) {
		// NOTE(review): "namespace" is a reserved word from PHP 5.3 on, so this
		// class reference may need renaming - confirm against the target PHP version.
		$isTalk = Namespace::isTalk( $page->page_namespace );
		return !$isTalk;
	}
}
611
/**
 * Dump output filter to include or exclude pages in a given set of namespaces.
 */
class DumpNamespaceFilter extends DumpFilter {
	var $invert = false;
	var $namespaces = array();

	/**
	 * @param mixed $sink object that receives whatever passes the filter
	 * @param string $param comma-separated list of namespace constant
	 *                      names (e.g. "NS_MAIN") and/or numeric ids;
	 *                      a leading "!" turns the list into an exclusion list
	 * @throws MWException when a list entry is neither a known constant
	 *                     name nor numeric
	 */
	function DumpNamespaceFilter( &$sink, $param ) {
		parent::DumpFilter( $sink );

		$constants = array(
			"NS_MAIN"           => NS_MAIN,
			"NS_TALK"           => NS_TALK,
			"NS_USER"           => NS_USER,
			"NS_USER_TALK"      => NS_USER_TALK,
			"NS_PROJECT"        => NS_PROJECT,
			"NS_PROJECT_TALK"   => NS_PROJECT_TALK,
			"NS_IMAGE"          => NS_IMAGE,
			"NS_IMAGE_TALK"     => NS_IMAGE_TALK,
			"NS_MEDIAWIKI"      => NS_MEDIAWIKI,
			"NS_MEDIAWIKI_TALK" => NS_MEDIAWIKI_TALK,
			"NS_TEMPLATE"       => NS_TEMPLATE,
			"NS_TEMPLATE_TALK"  => NS_TEMPLATE_TALK,
			"NS_HELP"           => NS_HELP,
			"NS_HELP_TALK"      => NS_HELP_TALK,
			"NS_CATEGORY"       => NS_CATEGORY,
			"NS_CATEGORY_TALK"  => NS_CATEGORY_TALK );

		// substr() instead of a string offset: it is safe on an empty
		// string and avoids the brace offset syntax.
		$param = strval( $param );
		if( substr( $param, 0, 1 ) == '!' ) {
			$this->invert = true;
			$param = substr( $param, 1 );
		}

		foreach( explode( ',', $param ) as $key ) {
			$key = trim( $key );
			if( isset( $constants[$key] ) ) {
				$ns = $constants[$key];
				$this->namespaces[$ns] = true;
			} elseif( is_numeric( $key ) ) {
				$ns = intval( $key );
				$this->namespaces[$ns] = true;
			} else {
				throw new MWException( "Unrecognized namespace key '$key'\n" );
			}
		}
	}

	/**
	 * A page passes when its namespace is in the configured set, or
	 * when it is not in the set and the filter is inverted.
	 * @param object $page
	 * @return bool
	 */
	function pass( $page ) {
		$match = isset( $this->namespaces[$page->page_namespace] );
		return $this->invert xor $match;
	}
}
664
665
/**
 * Dump output filter to include only the last revision in each page sequence.
 *
 * The page opening and the matching revision are held back until the
 * page is closed, then sent to the sink together.
 */
class DumpLatestFilter extends DumpFilter {
	var $page, $pageString, $rev, $revString;

	/**
	 * Holds back the page opening until the page is closed.
	 * @param object $page
	 * @param string $string
	 */
	function writeOpenPage( $page, $string ) {
		$this->pageString = $string;
		$this->page = $page;
	}

	/**
	 * Sends the held-back page opening, the remembered revision and
	 * the closing tag, provided a revision was remembered; the held
	 * state is cleared either way.
	 * @param string $string
	 */
	function writeClosePage( $string ) {
		$haveRevision = $this->rev;
		if( $haveRevision ) {
			$this->sink->writeOpenPage( $this->page, $this->pageString );
			$this->sink->writeRevision( $this->rev, $this->revString );
			$this->sink->writeClosePage( $string );
		}
		$this->rev = $this->revString = null;
		$this->page = $this->pageString = null;
	}

	/**
	 * Remembers the revision whose rev_id equals the page's page_latest;
	 * every other revision is dropped.
	 * @param object $rev
	 * @param string $string
	 */
	function writeRevision( $rev, $string ) {
		$latestId = $this->page->page_latest;
		if( $rev->rev_id != $latestId ) {
			return;
		}
		$this->rev = $rev;
		$this->revString = $string;
	}
}
696
/**
 * Output fan-out: forwards every dump event to each of several sinks,
 * in the order the sinks were given.
 */
class DumpMultiWriter {
	var $sinks; # array of DumpOutput/DumpFilter objects
	var $count; # number of sinks

	/**
	 * @param array $sinks objects to forward to; any array keys are
	 *                     accepted, not only 0..n-1
	 */
	function DumpMultiWriter( $sinks ) {
		$this->sinks = $sinks;
		$this->count = count( $sinks );
	}

	/**
	 * @param string $string
	 */
	function writeOpenStream( $string ) {
		foreach( $this->sinks as $sink ) {
			$sink->writeOpenStream( $string );
		}
	}

	/**
	 * @param string $string
	 */
	function writeCloseStream( $string ) {
		foreach( $this->sinks as $sink ) {
			$sink->writeCloseStream( $string );
		}
	}

	/**
	 * @param object $page
	 * @param string $string
	 */
	function writeOpenPage( $page, $string ) {
		foreach( $this->sinks as $sink ) {
			$sink->writeOpenPage( $page, $string );
		}
	}

	/**
	 * @param string $string
	 */
	function writeClosePage( $string ) {
		foreach( $this->sinks as $sink ) {
			$sink->writeClosePage( $string );
		}
	}

	/**
	 * @param object $rev
	 * @param string $string
	 */
	function writeRevision( $rev, $string ) {
		foreach( $this->sinks as $sink ) {
			$sink->writeRevision( $rev, $string );
		}
	}
}
736
/**
 * Makes a string safe to embed in the XML output.
 *
 * The page may contain old data which has not been properly normalized.
 * Invalid UTF-8 sequences or forbidden control characters will make our
 * XML output invalid, so the string is passed through UtfNormal::cleanUp()
 * before the markup characters are escaped.
 *
 * @param string $string
 * @return string
 */
function xmlsafe( $string ) {
	$fname = 'xmlsafe';
	wfProfileIn( $fname );

	$cleaned = UtfNormal::cleanUp( $string );
	$escaped = htmlspecialchars( $cleaned );

	wfProfileOut( $fname );
	return $escaped;
}
752
753 ?>