Make sure the TOC is there when toggleToc is called. Although the function calling...
[lhc/web/wiklou.git] / maintenance / importUseModWikipedia.php
1 <?php
2
3 /**
4 * A script to read a dump of the English Wikipedia from the UseModWiki period, and to
5 * generate an XML dump in MediaWiki format.
6 *
7 * Some relevant code was ported from UseModWiki 0.92.
8 *
9 */
10
11 require_once( dirname( __FILE__ ) . '/Maintenance.php' );
12 require_once( dirname( __FILE__ ) .'/../includes/normal/UtfNormalUtil.php' );
13
14
15 class ImportUseModWikipedia extends Maintenance {
/**
 * Byte => UTF-8 string map and its inverse; both are filled in by the
 * constructor and consumed by encode() and decode().
 */
var $encodeMap, $decodeMap;

/**
 * Old title => timestamp of page moves that writeMoveLog() marks as "deep",
 * i.e. moves for which doPendingOps() also rewrites links in cached page
 * text. A value of '?' makes writeMoveLog() use its guessed move time.
 */
var $deepRenames = array(
	'JimboWales' => 983862286,
	'TexaS' => 983918410,
	'HistoryOfUnitedStatesTalk' => 984795423,
	'MetallicA' => 985128533,
	'PythagoreanTheorem' => 985225545,
	'TheCanonofScripture' => 985368223,
	'TaoTehChing' => 985368222,
	//'TheMostRemarkableFormulaInTheWorld' => 985368221,
	'TheRecorder' => 985368220,
	'GladstoneOregon' => 985368219,
	'PacificBeach' => '?',
	'AaRiver' => '?',
);

/**
 * NOTE(review): not referenced by any method of this class.
 */
var $replacements = array();

/**
 * Timestamp => array( old link => new link ). doPendingOps() applies each
 * entry once, as soon as an edit at or after that timestamp is processed.
 */
var $renameTextLinksOps = array(
	983846265 => array(
		'TestIgnore' => 'IgnoreTest',
	),
	983848080 => array(
		'UnitedLocomotiveWorks' => 'Atlas Shrugged/United Locomotive Works'
	),
	983856376 => array(
		'WikiPedia' => 'Wikipedia',
	),
	983896152 => array(
		'John_F_Kennedy' => 'John_F._Kennedy',
	),
	983905871 => array(
		'LarrySanger' => 'Larry_Sanger'
	),
	984697068 => array(
		'UnitedStates' => 'United States',
	),
	984792748 => array(
		'LibertarianisM' => 'Libertarianism'
	),
	985327832 => array(
		'AnarchisM' => 'Anarchism',
	),
	985290063 => array(
		'HistoryOfUnitedStatesDiscussion' => 'History_Of_United_States_Discussion'
	),
	985290091 => array(
		'BritishEmpire' => 'British Empire'
	),
	/*
	985468958 => array(
		'ScienceFiction' => 'Science fiction',
	),*/
);

/**
 * Hack for observed substitution issues: renameTextLinks() leaves a page
 * untouched when it is listed here and is itself the target of the rename.
 */
var $skipSelfSubstitution = array(
	'Pythagorean_Theorem',
	'The_Most_Remarkable_Formula_In_The_World',
	'Wine',
);

/**
 * Timestamp => title. doPendingOps() strips "\r" from the cached text of the
 * page and saves that as an extra revision at the given timestamp.
 */
var $unixLineEndingsOps = array(
	987743732 => 'Wikipedia_FAQ'
);
84
/** NOTE(review): not referenced by any method of this class. */
var $replacementsDone = array();

/** New title => array( 'old', 'startTime', 'endTime', 'timestamp', 'deep' ), see writeMoveLog() */
var $moveLog = array();
/** rclog title => diff_log title, recorded by moveLogCallback() */
var $moveDests = array();
/** ID to assign to the next revision written by saveRevision() */
var $revId;

/** Timestamp => list of rclog entries, filled by readRclog() */
var $rc = array();
/** Title => most recently saved text of that page */
var $textCache = array();
/** Title => true for pages whose diffs could not be applied */
var $blacklist = array();

/** UseModWiki field separator and its numbered variants, see initLinkPatterns() */
var $FS, $FS1, $FS2, $FS3;
/** Regex fragments (without delimiters) built by initLinkPatterns() */
var $FreeLinkPattern, $UrlPattern, $LinkPattern, $InterLinkPattern;

/*
 * The members below used to be created on the fly by the methods that first
 * assign them. They are declared here so that the object's state is visible
 * in one place and no dynamic properties are created at run time.
 */

/** Scratch files used by patch(); names are chosen in execute() */
var $articleFileName, $patchFileName;
/** Value of the --datadir option and the handle of the --outfile file */
var $dataDir, $outFile;
/** Counters maintained by processDiffFile() and revisionCallback() */
var $numRevs, $numGoodRevs;
/** Text fragments stashed by storeRaw() and put back by restoreRaw() */
var $saveUrl = array();
/** Links collected by storeLink() during getLinkList() */
var $linkList = array();
/** Link being replaced and its replacement during substituteTextLinks() */
var $old, $new;
97
/**
 * Byte value => Unicode code point for the bytes 0x80-0x9f (Windows-1252
 * assignments). The constructor uses this table instead of the identity
 * mapping when it builds $encodeMap and $decodeMap.
 */
var $cp1252Table = array(
	0x80 => 0x20ac,
	0x81 => 0x0081,
	0x82 => 0x201a,
	0x83 => 0x0192,
	0x84 => 0x201e,
	0x85 => 0x2026,
	0x86 => 0x2020,
	0x87 => 0x2021,
	0x88 => 0x02c6,
	0x89 => 0x2030,
	0x8a => 0x0160,
	0x8b => 0x2039,
	0x8c => 0x0152,
	0x8d => 0x008d,
	0x8e => 0x017d,
	0x8f => 0x008f,
	0x90 => 0x0090,
	0x91 => 0x2018,
	0x92 => 0x2019,
	0x93 => 0x201c,
	0x94 => 0x201d,
	0x95 => 0x2022,
	0x96 => 0x2013,
	0x97 => 0x2014,
	0x98 => 0x02dc,
	0x99 => 0x2122,
	0x9a => 0x0161,
	0x9b => 0x203a,
	0x9c => 0x0153,
	0x9d => 0x009d,
	0x9e => 0x017e,
	0x9f => 0x0178);
131
/**
 * Register the command-line options, prepare the link regexes and build the
 * byte <-> UTF-8 translation tables used by encode() and decode().
 */
public function __construct() {
	parent::__construct();
	$this->addOption( 'datadir', 'the value of $DataDir from wiki.cgi', true, true );
	$this->addOption( 'outfile', 'the name of the output XML file', true, true );
	$this->initLinkPatterns();

	$this->encodeMap = array();
	$this->decodeMap = array();

	foreach ( range( 0, 0xff ) as $byte ) {
		// Bytes listed in the cp1252 table get that code point, all others map to themselves
		$codepoint = isset( $this->cp1252Table[$byte] ) ? $this->cp1252Table[$byte] : $byte;
		$raw = chr( $byte );
		$utf8 = codepointToUtf8( $codepoint );
		$this->encodeMap[$raw] = $utf8;
		$this->decodeMap[$utf8] = $raw;
	}
}
152
/**
 * Set up the field separators and the regex fragments that recognise the
 * various UseModWiki link syntaxes.
 *
 * Sets $FS, $FS1, $FS2, $FS3, $LinkPattern, $InterLinkPattern,
 * $FreeLinkPattern and $UrlPattern. The patterns carry no delimiters; callers
 * embed them in their own regexes.
 */
function initLinkPatterns() {
	# Field separators are used in the URL-style patterns below.
	$this->FS = "\xb3";             # The FS character is a superscript "3"
	$this->FS1 = $this->FS . "1";   # The FS values are used to separate fields
	$this->FS2 = $this->FS . "2";   # in stored hashtables and other data structures.
	$this->FS3 = $this->FS . "3";   # The FS character is not allowed in user data.

	$UpperLetter = "[A-Z]";
	$LowerLetter = "[a-z]";
	$AnyLetter = "[A-Za-z_0-9]";

	# Main link pattern: lowercase between uppercase, then anything
	$LpA = $UpperLetter . "+" . $LowerLetter . "+" . $UpperLetter . $AnyLetter . "*";
	# Optional subpage link pattern: uppercase, lowercase, then anything
	$LpB = $UpperLetter . "+" . $LowerLetter . "+" . $AnyLetter . "*";

	# Optional quote delimiter (not in output)
	$QDelim = '(?:"")?';

	# Loose pattern: If subpage is used, subpage may be simple name
	$this->LinkPattern = "((?:(?:$LpA)?\\/$LpB)|$LpA)" . $QDelim;

	# Inter-site convention: sites must start with uppercase letter
	# (Uppercase letter avoids confusion with URLs)
	$InterSitePattern = $UpperLetter . $AnyLetter . "+";
	$this->InterLinkPattern = "((?:$InterSitePattern:[^\\]\\s\"<>{$this->FS}]+)$QDelim)";

	# Free links allow a wider character set and an optional "main/" prefix
	$FreeLetter = "[-,. _0-9A-Za-z]";
	$this->FreeLinkPattern = "((?:(?:$FreeLetter+)?\\/)?$FreeLetter+)" . $QDelim;

	# Url-style links are delimited by one of:
	# 1. Whitespace (kept in output)
	# 2. Left or right angle-bracket (< or >) (kept in output)
	# 3. Right square-bracket (]) (kept in output)
	# 4. A single double-quote (") (kept in output)
	# 5. A $FS (field separator) character (kept in output)
	# 6. A double double-quote ("") (removed from output)
	$UrlProtocols = "http|https|ftp|afs|news|nntp|mid|cid|mailto|wais|"
		. "prospero|telnet|gopher"
		. '|file';
	$this->UrlPattern = "((?:(?:$UrlProtocols):[^\\]\\s\"<>{$this->FS}]+)$QDelim)";
}
203
/**
 * Entry point: read the UseModWiki data directory and write the XML dump.
 *
 * @return int 0 on success, 1 if the scratch or output files could not be set up
 */
function execute() {
	// tempnam() creates two distinct, unpredictable files, so the article
	// and patch scratch files can never end up with the same name.
	$tmpDir = sys_get_temp_dir();
	$this->articleFileName = tempnam( $tmpDir, 'importUseMod.' );
	$this->patchFileName = tempnam( $tmpDir, 'importUseMod.' );
	$scratchFiles = array( $this->articleFileName, $this->patchFileName );

	if ( $this->articleFileName === false || $this->patchFileName === false ) {
		echo "Unable to create temporary files\n";
		foreach ( $scratchFiles as $scratchFile ) {
			if ( $scratchFile !== false ) {
				unlink( $scratchFile );
			}
		}
		return 1;
	}

	$this->dataDir = $this->getOption( 'datadir' );
	$this->outFile = fopen( $this->getOption( 'outfile' ), 'w' );
	if ( !$this->outFile ) {
		echo "Unable to open output file\n";
		foreach ( $scratchFiles as $scratchFile ) {
			unlink( $scratchFile );
		}
		return 1;
	}

	$this->writeXmlHeader();
	$this->readRclog();
	$this->writeMoveLog();
	$this->writeRevisions();
	$this->reconcileCurrentRevs();
	$this->writeXmlFooter();

	// Flush and release the dump before removing the scratch files
	fclose( $this->outFile );
	foreach ( $scratchFiles as $scratchFile ) {
		unlink( $scratchFile );
	}
	return 0;
}
223
/**
 * Write the opening <mediawiki> tag and the <siteinfo> section to the
 * output file.
 */
function writeXmlHeader() {
	$header = <<<EOT
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.3/ http://www.mediawiki.org/xml/export-0.3.xsd" version="0.3" xml:lang="en">
<siteinfo>
<sitename>Wikipedia</sitename>
<base>http://www.wikipedia.com/</base>
<generator>MediaWiki 1.18alpha importUseModWikipedia.php</generator>
<case>case-sensitive</case>
<namespaces>
<namespace key="0" />
</namespaces>
</siteinfo>

EOT;
	fwrite( $this->outFile, $header );
}
240
/**
 * Write the closing </mediawiki> tag to the output file.
 */
function writeXmlFooter() {
	$closingTag = "</mediawiki>\n";
	fwrite( $this->outFile, $closingTag );
}
244
/**
 * Load the recent changes log from "<datadir>/rclog" into $this->rc.
 *
 * Each line holds seven fields separated by $FS3; the seventh is a list of
 * alternating keys and values separated by $FS2. Entries are stored as
 * $this->rc[timestamp][] so that several changes with the same timestamp
 * are all kept. Reading stops at the first malformed line.
 */
function readRclog() {
	$rcFile = fopen( "{$this->dataDir}/rclog", 'r' );
	if ( !$rcFile ) {
		echo "Error reading rclog\n";
		return;
	}

	// Compare against false explicitly so that a falsy line such as "0"
	// does not end the loop early.
	while ( ( $line = fgets( $rcFile ) ) !== false ) {
		// fgets() keeps the line terminator; without this it would end up
		// inside the last key/value pair of the "extra" field.
		$line = rtrim( $line, "\r\n" );

		$bits = explode( $this->FS3, $line );
		if ( count( $bits ) !== 7 ) {
			echo "Error reading rclog\n";
			break;
		}
		$params = array(
			'timestamp' => $bits[0],
			'rctitle' => $bits[1],
			'summary' => $bits[2],
			'minor' => $bits[3],
			'host' => $bits[4],
			'kind' => $bits[5],
			'extra' => array()
		);

		// Only complete key/value pairs are stored; a dangling key (or an
		// empty "extra" field) is ignored.
		$extraList = explode( $this->FS2, $bits[6] );
		$numExtra = count( $extraList );
		for ( $i = 0; $i + 1 < $numExtra; $i += 2 ) {
			$params['extra'][$extraList[$i]] = $extraList[$i + 1];
		}
		$this->rc[$params['timestamp']][] = $params;
	}

	fclose( $rcFile );
}
270
/**
 * Work out which pages were moved and when, then write one <logitem> per
 * move to the output file.
 *
 * The diff log is scanned with moveLogCallback(), which yields a time
 * interval for every move. An entry in $deepRenames supplies the exact time
 * if it lies inside that interval; otherwise the move is assumed to have
 * happened one second after the last edit under the old title.
 */
function writeMoveLog() {
	$this->moveLog = array();
	$pendingDeep = $this->deepRenames;
	echo "Calculating move log...\n";
	$this->processDiffFile( array( $this, 'moveLogCallback' ) );

	// We have the timestamp intervals, now make a guess at the actual timestamp
	foreach ( $this->moveLog as $newTitle => $params ) {
		$oldTitle = $params['old'];

		// Is there a time specified?
		$drTime = false;
		if ( isset( $pendingDeep[$oldTitle] ) ) {
			$drTime = $pendingDeep[$oldTitle];
			if ( $drTime !== '?' ) {
				// The entry is consumed whether or not its time is usable
				unset( $pendingDeep[$oldTitle] );
				if ( ( !isset( $params['endTime'] ) || $drTime < $params['endTime'] )
					&& $drTime > $params['startTime'] )
				{
					$this->moveLog[$newTitle]['timestamp'] = $drTime;
					$this->moveLog[$newTitle]['deep'] = true;
					echo "$oldTitle -> $newTitle at $drTime\n";
					continue;
				}
				echo "WARNING: deep rename time invalid: $oldTitle\n";
			}
		}

		// Guess that it is one second after the last edit to the page before it was moved
		$this->moveLog[$newTitle]['timestamp'] = $params['startTime'] + 1;
		if ( $drTime === '?' ) {
			$this->moveLog[$newTitle]['deep'] = true;
			unset( $pendingDeep[$oldTitle] );
		}

		if ( isset( $params['endTime'] ) ) {
			$when = "between {$params['startTime']} and {$params['endTime']}";
		} else {
			$when = "after {$params['startTime']}";
		}
		$this->printLatin1( "$oldTitle -> $newTitle $when\n" );
	}

	// Write the move log to the XML file
	$logId = 0;
	foreach ( $this->moveLog as $newTitle => $params ) {
		$logId++;
		$item = "<logitem>\n";
		$item .= $this->element( 'id', $logId );
		$item .= $this->element( 'timestamp', wfTimestamp( TS_ISO_8601, $params['timestamp'] ) );
		$item .= "<contributor>\n";
		$item .= $this->element( 'username', 'UseModWiki admin' );
		$item .= "</contributor>";
		$item .= $this->element( 'type', 'move' );
		$item .= $this->element( 'action', 'move' );
		$item .= $this->element( 'logtitle', $params['old'] );
		$item .= "<params xml:space=\"preserve\">";
		$item .= htmlspecialchars( $this->encode( "{$newTitle}\n1" ) );
		$item .= "</params>\n";
		$item .= "</logitem>\n";
		fwrite( $this->outFile, $item );
	}

	// Check for remaining deep rename entries
	if ( $pendingDeep ) {
		$leftovers = implode( "\n", array_keys( $pendingDeep ) );
		echo "WARNING: the following entries in \$this->deepRenames are " .
			"invalid, since no such move exists:\n" .
			$leftovers .
			"\n\n";
	}
}
343
/**
 * Build a single XML element followed by a newline.
 *
 * @param $name String: tag name, used verbatim
 * @param $value Mixed: content; passed through encode() and HTML-escaped
 * @return String
 */
function element( $name, $value ) {
	$escaped = htmlspecialchars( $this->encode( $value ) );
	return "<$name>" . $escaped . "</$name>\n";
}
347
/**
 * processDiffFile() callback that narrows down when each page was moved.
 *
 * A diff whose title differs from the rclog title was made before the move,
 * so it pushes the earliest possible move time forward. The first diff that
 * already carries the rclog title fixes the latest possible move time.
 *
 * @param $entry Array: rclog entry with 'title' and 'diff' added
 */
function moveLogCallback( $entry ) {
	$current = $entry['rctitle'];
	$atEditTime = $entry['title'];
	$this->moveDests[$current] = $atEditTime;

	if ( $current !== $atEditTime ) {
		if ( !isset( $this->moveLog[$current] ) ) {
			// Initialise the move log entry
			$this->moveLog[$current] = array( 'old' => $atEditTime );
		}
		// Update the earliest time the page could have been moved
		$this->moveLog[$current]['startTime'] = $entry['timestamp'];
		return;
	}

	$needsEndTime = isset( $this->moveLog[$current] )
		&& !isset( $this->moveLog[$current]['endTime'] );
	if ( $needsEndTime ) {
		// This is the latest time that the page could have been moved
		$this->moveLog[$current]['endTime'] = $entry['timestamp'];
	}
}
371
/**
 * Replay the diff log through revisionCallback() and report how many
 * revisions could be reconstructed.
 */
function writeRevisions() {
	$this->revId = 1;
	$this->numGoodRevs = 0;
	$callback = array( $this, 'revisionCallback' );
	$this->processDiffFile( $callback );
	echo "\n\nImported {$this->numGoodRevs} out of {$this->numRevs}\n";
}
378
/**
 * processDiffFile() callback that reconstructs and saves one revision.
 *
 * The diff is applied to the cached text of the page. If that fails, link
 * substitutions that would make the diff apply are looked for, saved as
 * separate revisions dated just before the edit, and the diff is tried once
 * more. Pages whose diffs cannot be applied are blacklisted and skipped from
 * then on.
 *
 * @param $params Array: rclog entry with 'title' and 'diff' added
 */
function revisionCallback( $params ) {
	$title = $params['rctitle'];
	$editTime = $params['timestamp'];

	if ( isset( $this->blacklist[$title] ) ) {
		return;
	}
	$this->doPendingOps( $editTime );

	$origText = $this->getText( $title );
	$text = $this->patch( $origText, $params['diff'] );
	if ( $text === false ) {
		echo "$editTime $title attempting resolution...\n";
		$linkSubstitutes = $this->resolveFailedDiff( $origText, $params['diff'] );
		if ( !$linkSubstitutes ) {
			$this->printLatin1( "$editTime $title DIFF FAILED\n" );
			$this->blacklist[$title] = true;
			return;
		}

		$this->printLatin1( "$editTime $title requires substitutions:\n" );
		// Each substitution gets its own timestamp, counting back from the edit
		$time = $editTime - 1;
		foreach ( $linkSubstitutes as $old => $new ) {
			$this->printLatin1( "SUBSTITUTE $old -> $new\n" );
			$this->renameTextLinks( $old, $new, $time-- );
		}

		// The substitutions may have changed the cached text; try again
		$origText = $this->getText( $title );
		$text = $this->patch( $origText, $params['diff'] );
		if ( $text === false ) {
			$this->printLatin1( "$editTime $title STILL FAILS!\n" );
			$this->blacklist[$title] = true;
			return;
		}

		echo "\n";
	}

	$params['text'] = $text;
	$this->saveRevision( $params );
	$this->numGoodRevs++;
}
421
/**
 * Carry out every scheduled operation that is due at the given edit time:
 * link fix-ups for deep page moves, the manual link renames and the line
 * ending fixes. Each operation is removed from its queue once done.
 *
 * @param $editTime Timestamp of the edit about to be processed
 */
function doPendingOps( $editTime ) {
	// Page moves: drop the entry, and rewrite links if the move is "deep"
	foreach ( $this->moveLog as $newTitle => $entry ) {
		if ( $entry['timestamp'] > $editTime ) {
			continue;
		}
		unset( $this->moveLog[$newTitle] );
		if ( isset( $entry['deep'] ) ) {
			$this->renameTextLinks( $entry['old'], $newTitle, $entry['timestamp'] );
		}
	}

	// Manually specified link renames
	foreach ( $this->renameTextLinksOps as $renameTime => $replacements ) {
		if ( $editTime < $renameTime ) {
			continue;
		}
		foreach ( $replacements as $old => $new ) {
			$this->printLatin1( "SUBSTITUTE $old -> $new\n" );
			$this->renameTextLinks( $old, $new, $renameTime );
		}
		unset( $this->renameTextLinksOps[$renameTime] );
	}

	// Line ending conversions
	foreach ( $this->unixLineEndingsOps as $fixTime => $title ) {
		if ( $editTime < $fixTime ) {
			continue;
		}
		$this->printLatin1( "$fixTime $title FIXING LINE ENDINGS\n" );
		$fixedText = str_replace( "\r", '', $this->getText( $title ) );
		$revision = array(
			'rctitle' => $title,
			'timestamp' => $fixTime,
			'extra' => array( 'name' => 'UseModWiki admin' ),
			'text' => $fixedText,
			'summary' => 'Fixing line endings',
		);
		$this->saveRevision( $revision );
		unset( $this->unixLineEndingsOps[$fixTime] );
	}
}
458
/**
 * Apply a normal-format diff to a text using the external "patch" program.
 *
 * The text and the diff are written to the two scratch files and patch
 * modifies the article file in place.
 *
 * @param $source String: text to patch
 * @param $diff String: diff to apply
 * @return String|false The patched text, or false if the scratch files
 *   could not be written or read, or if patch exited with a non-zero status
 */
function patch( $source, $diff ) {
	// Never run patch on stale scratch files left over from an earlier call
	if ( file_put_contents( $this->articleFileName, $source ) === false
		|| file_put_contents( $this->patchFileName, $diff ) === false )
	{
		return false;
	}

	$status = 0;
	// The output of patch is merged into stdout and deliberately discarded
	wfShellExec(
		wfEscapeShellArg(
			'patch',
			'-n',
			'-r', '-',
			'--no-backup-if-mismatch',
			'--binary',
			$this->articleFileName,
			$this->patchFileName
		) . ' 2>&1',
		$status
	);
	if ( $status ) {
		return false;
	}

	// file_get_contents() itself returns false on failure
	return file_get_contents( $this->articleFileName );
}
481
/**
 * Find link substitutions that would make a failed diff apply.
 *
 * The "<" lines of the diff show what the source lines looked like when the
 * edit was made. Every such line that differs from the corresponding line of
 * the current text is handed to resolveTextChange().
 *
 * @param $origText String: text the diff failed to apply to
 * @param $diff String: normal-format diff
 * @return Array: old link => new link; empty if nothing could be resolved
 */
function resolveFailedDiff( $origText, $diff ) {
	// Zero-based source line index => line content expected by the diff
	$context = array();
	$diffLines = explode( "\n", $diff );
	$numLines = count( $diffLines );
	for ( $i = 0; $i < $numLines; $i++ ) {
		if ( !preg_match( '/^(\d+)(?:,\d+)?[acd]\d+(?:,\d+)?$/', $diffLines[$i], $m ) ) {
			continue;
		}

		$sourceIndex = intval( $m[1] );
		$i++;
		while ( $i < $numLines && substr( $diffLines[$i], 0, 1 ) === '<' ) {
			// Older PHP versions return false instead of '' for an empty
			// remainder, which would never compare equal to an empty line.
			$context[$sourceIndex - 1] = (string)substr( $diffLines[$i], 2 );
			$sourceIndex++;
			$i++;
		}
		// Step back so that the for loop re-examines the line that ended the run
		$i--;
	}

	$changedLinks = array();
	$origLines = explode( "\n", $origText );
	foreach ( $context as $lineIndex => $contextLine ) {
		$origLine = isset( $origLines[$lineIndex] ) ? $origLines[$lineIndex] : '';
		if ( $contextLine === $origLine ) {
			continue;
		}
		$newChanges = $this->resolveTextChange( $origLine, $contextLine );
		if ( is_array( $newChanges ) ) {
			// Substitutions found on earlier lines take precedence
			$changedLinks += $newChanges;
		} else {
			echo "Resolution failure on line " . ( $lineIndex + 1 ) . "\n";
			$this->printLatin1( $newChanges );
		}
	}

	return $changedLinks;
}
520
/**
 * Try to explain the difference between two lines purely by renamed links.
 *
 * Every link that only occurs in $dest is paired with the closest (by edit
 * distance) link that only occurs in $source. The pairing is accepted if
 * applying it to $source yields exactly $dest.
 *
 * @param $source String: line as currently known
 * @param $dest String: line as it should be
 * @return Array|String Removed link => new link on success, otherwise a
 *   printable description of the failed attempt
 */
function resolveTextChange( $source, $dest ) {
	$sourceLinks = $this->getLinkList( $source );
	$destLinks = $this->getLinkList( $dest );
	$addedLinks = array_diff( $destLinks, $sourceLinks );
	$removedLinks = array_diff( $sourceLinks, $destLinks );

	// Match up the removed links with the new links
	$changedLinks = array();
	foreach ( $addedLinks as $newLink ) {
		$closest = false;
		$closestDistance = 100000000;
		foreach ( $removedLinks as $candidate ) {
			$distance = levenshtein( $newLink, $candidate );
			if ( $distance < $closestDistance ) {
				$closestDistance = $distance;
				$closest = $candidate;
			}
		}
		if ( $closest === false ) {
			continue;
		}
		$changedLinks[$closest] = $newLink;
		// A removed link can only be paired once
		$removedLinks = array_diff( $removedLinks, array( $closest ) );
	}

	$proposal = $source;
	foreach ( $changedLinks as $removedLink => $newLink ) {
		$proposal = $this->substituteTextLinks( $removedLink, $newLink, $proposal );
	}

	if ( $proposal === $dest ) {
		return $changedLinks;
	}

	// Resolution failed
	return "Source line: $source\n" .
		"Source links: " . implode( ', ', $sourceLinks ) . "\n" .
		"Context line: $dest\n" .
		"Context links: " . implode( ', ', $destLinks ) . "\n" .
		"Proposal: $proposal\n";
}
561
/**
 * Read "<datadir>/diff_log" and invoke a callback for every diff that can be
 * matched to an rclog entry.
 *
 * The file is a sequence of records separated by "------" lines. Each record
 * starts with a "title|timestamp" header, followed by the diff text. The
 * callback receives the matching rclog entry with 'diff', 'title' (the title
 * from the diff header) and 'diffStartLine' added. Diffs without an rclog
 * entry for their timestamp are reported and skipped.
 *
 * Also resets the article scratch file and sets $this->numRevs to the number
 * of complete records read.
 *
 * @param $callback Callback taking the parameter array described above
 * @return Boolean: false if the file cannot be opened or does not start with
 *   a delimiter, true otherwise (including when a bad header stops the scan)
 */
function processDiffFile( $callback ) {
	$delimiter = "------\n";
	file_put_contents( $this->articleFileName, "Describe the new page here.\n" );

	$diffFile = fopen( "{$this->dataDir}/diff_log", 'r' );
	if ( !$diffFile ) {
		echo "Unable to open diff file\n";
		return false;
	}

	$line = fgets( $diffFile );
	$lineNum = 1;
	if ( $line !== $delimiter ) {
		echo "Invalid diff file\n";
		fclose( $diffFile );
		return false;
	}
	$lastReportLine = 0;
	$this->numRevs = 0;

	while ( true ) {
		$line = fgets( $diffFile );
		$lineNum++;
		if ( $line === false ) {
			break;
		}
		// Progress indicator on stderr, roughly every 1000 lines
		if ( $lineNum > $lastReportLine + 1000 ) {
			$lastReportLine = $lineNum;
			fwrite( STDERR, "$lineNum \r" );
			fflush( STDERR );
		}
		$line = trim( $line );
		if ( !preg_match( '/^([^|]+)\|(\d+)$/', $line, $matches ) ) {
			echo "Invalid header on line $lineNum\n";
			fclose( $diffFile );
			return true;
		}
		list( , $title, $editTime ) = $matches;

		// Collect the diff text up to the next delimiter. A record that is
		// cut off by the end of the file is discarded.
		$diff = '';
		$diffStartLine = $lineNum;
		while ( true ) {
			$line = fgets( $diffFile );
			$lineNum++;
			if ( $line === $delimiter ) {
				break;
			}
			if ( $line === false ) {
				break 2;
			}
			$diff .= $line;
		}

		$this->numRevs++;

		if ( !isset( $this->rc[$editTime] ) ) {
			$this->printLatin1( "$editTime $title DELETED, skipping\n" );
			continue;
		}

		if ( count( $this->rc[$editTime] ) == 1 ) {
			$params = $this->rc[$editTime][0];
		} else {
			// Several changes share this timestamp; pick the one whose
			// title matches the diff header.
			$params = false;
			$candidates = array();
			foreach ( $this->rc[$editTime] as $rc ) {
				if ( $rc['rctitle'] === $title ) {
					$params = $rc;
					break;
				}
				$candidates[] = $rc['rctitle'];
			}
			if ( !$params ) {
				$candidateList = implode( ', ', $candidates );
				$this->printLatin1( "$editTime $title ERROR cannot resolve rclog\n" );
				$this->printLatin1( "$editTime $title CANDIDATES: $candidateList\n" );
				continue;
			}
		}
		$params['diff'] = $diff;
		$params['title'] = $title;
		$params['diffStartLine'] = $diffStartLine;
		call_user_func( $callback, $params );
	}
	echo "\n";

	if ( !feof( $diffFile ) ) {
		echo "Stopped at line $lineNum\n";
	}
	fclose( $diffFile );
	return true;
}
650
/**
 * Compare the reconstructed text of every page with the current text stored
 * in the UseModWiki page database and report any differences.
 *
 * Differences that can be explained by link renames are reported as missing
 * renames or substitutions; anything else is printed as a diff if the xdiff
 * extension is available.
 */
function reconcileCurrentRevs() {
	foreach ( $this->textCache as $title => $text ) {
		// Page files are grouped by initial capital letter, or "other"
		$fileName = "{$this->dataDir}/page/";
		if ( preg_match( '/^[A-Z]/', $title, $m ) ) {
			$fileName .= $m[0];
		} else {
			$fileName .= 'other';
		}
		$fileName .= "/$title.db";

		if ( !file_exists( $fileName ) ) {
			$this->printLatin1( "ERROR: Cannot find page file for {$title}\n" );
			continue;
		}

		$fileContents = file_get_contents( $fileName );
		if ( $fileContents === false ) {
			$this->printLatin1( "ERROR: Cannot read page file for {$title}\n" );
			continue;
		}

		// The page file is three nested key/value lists, one per separator
		$page = $this->unserializeUseMod( $fileContents, $this->FS1 );
		$section = $this->unserializeUseMod( $page['text_default'], $this->FS2 );
		$data = $this->unserializeUseMod( $section['data'], $this->FS3 );
		$pageText = $data['text'];
		if ( $text === $pageText ) {
			continue;
		}

		$substs = $this->resolveTextChange( $text, $pageText );
		if ( is_array( $substs ) ) {
			foreach ( $substs as $source => $dest ) {
				if ( isset( $this->moveLog[$dest] ) ) {
					$this->printLatin1( "ERROR: need deep rename: $source\n" );
				} else {
					$this->printLatin1( "ERROR: need substitute: $source -> $dest\n" );
				}
			}
			continue;
		}

		$this->printLatin1( "ERROR: unresolved diff in $title:\n" );
		// xdiff is an optional extension; do not die at the very end of a
		// long run just because the diagnostic diff cannot be produced.
		if ( function_exists( 'xdiff_string_diff' ) ) {
			wfSuppressWarnings();
			$diff = xdiff_string_diff( $text, $pageText ) . '';
			wfRestoreWarnings();
		} else {
			$diff = '(xdiff extension not available, diff omitted)';
		}
		$this->printLatin1( "$diff\n" );
	}
}
691
/**
 * Create a Title object from a title in the source encoding.
 *
 * @param $titleText String
 * @return Whatever Title::newFromText() returns for the encoded text
 */
function makeTitle( $titleText ) {
	$encoded = $this->encode( $titleText );
	return Title::newFromText( $encoded );
}
695
/**
 * Get the most recently saved text of a page.
 *
 * @param $titleText String
 * @return String: the cached text, or the new-page placeholder text if the
 *   page has not been saved yet
 */
function getText( $titleText ) {
	if ( isset( $this->textCache[$titleText] ) ) {
		return $this->textCache[$titleText];
	}
	return "Describe the new page here.\n";
}
703
/**
 * Remember the text as the current text of the page and append a <page>
 * element containing the single revision to the output file.
 *
 * @param $params Array with the keys 'rctitle', 'timestamp', 'text' and
 *   'summary', and optionally 'host' and 'extra' ('name' and/or 'id')
 */
function saveRevision( $params ) {
	$this->textCache[$params['rctitle']] = $params['text'];

	$xml = "<page>\n";
	$xml .= $this->element( 'title', $params['rctitle'] );
	$xml .= "<revision>\n";
	$xml .= $this->element( 'id', $this->revId ++ );
	$xml .= $this->element( 'timestamp', wfTimestamp( TS_ISO_8601, $params['timestamp'] ) );

	// Contributor: whichever of user name, user ID and host are known
	$xml .= "<contributor>\n";
	if ( isset( $params['extra']['name'] ) ) {
		$xml .= $this->element( 'username', $params['extra']['name'] );
	}
	if ( isset( $params['extra']['id'] ) ) {
		$xml .= $this->element( 'id', $params['extra']['id'] );
	}
	if ( isset( $params['host'] ) ) {
		$xml .= $this->element( 'ip', $params['host'] );
	}
	$xml .= "</contributor>\n";

	$xml .= $this->element( 'comment', $params['summary'] );
	$xml .= "<text xml:space=\"preserve\">";
	$xml .= htmlspecialchars( $this->encode( $params['text'] ) );
	$xml .= "</text>\n";
	$xml .= "</revision>\n";
	$xml .= "</page>\n";

	fwrite( $this->outFile, $xml );
}
732
/**
 * Replace links to $old with links to $new in every cached page, saving a
 * new revision for each page whose text changes.
 *
 * Underscores in both titles are treated as spaces. A page listed in
 * $skipSelfSubstitution is left alone when it is itself the rename target.
 *
 * @param $old String: link target to replace
 * @param $new String: replacement link target
 * @param $timestamp Timestamp for the generated revisions
 */
function renameTextLinks( $old, $new, $timestamp ) {
	$targetTitle = $new;
	$old = str_replace( '_', ' ', $old );
	$new = str_replace( '_', ' ', $new );

	foreach ( $this->textCache as $title => $before ) {
		// Hack to make Pythagorean_Theorem etc. work
		$skipSelf = $targetTitle === $title
			&& in_array( $title, $this->skipSelfSubstitution );
		if ( $skipSelf ) {
			continue;
		}

		$after = $this->substituteTextLinks( $old, $new, $before );
		if ( $before === $after ) {
			continue;
		}

		$revision = array(
			'rctitle' => $title,
			'timestamp' => $timestamp,
			'text' => $after,
			'extra' => array( 'name' => 'Page move link fixup script' ),
			'summary' => '',
			'minor' => true
		);
		$this->saveRevision( $revision );
	}
}
759
/**
 * Replace links to $old with links to $new in a piece of wikitext.
 *
 * The text is run through a fixed sequence of regexes. Each match is
 * replaced by a numbered placeholder so that later regexes cannot touch it
 * again; free links and wiki links are rewritten on the way. At the end all
 * placeholders are expanded back.
 *
 * @param $old String: link target to replace
 * @param $new String: replacement link target
 * @param $text String: wikitext
 * @return String
 */
function substituteTextLinks( $old, $new, $text ) {
	$this->saveUrl = array();
	$this->old = $old;
	$this->new = $new;

	# Remove separators (paranoia)
	$text = str_replace( $this->FS, '', $text );

	// Regex => handler method, applied strictly in this order
	$passes = array(
		array( '/(<pre>(.*?)<\/pre>)/is', 'storeRaw' ),
		array( '/(<code>(.*?)<\/code>)/is', 'storeRaw' ),
		array( '/(<nowiki>(.*?)<\/nowiki>)/s', 'storeRaw' ),
		array( "/\[\[{$this->FreeLinkPattern}\|([^\]]+)\]\]/", 'subFreeLink' ),
		array( "/\[\[{$this->FreeLinkPattern}\]\]/", 'subFreeLink' ),
		array( "/(\[{$this->UrlPattern}\s+([^\]]+?)\])/", 'storeRaw' ),
		array( "/(\[{$this->InterLinkPattern}\s+([^\]]+?)\])/", 'storeRaw' ),
		array( "/(\[?{$this->UrlPattern}\]?)/", 'storeRaw' ),
		array( "/(\[?{$this->InterLinkPattern}\]?)/", 'storeRaw' ),
		array( "/{$this->LinkPattern}/", 'subWikiLink' ),
		# Restore saved text
		array( "/{$this->FS}(\d+){$this->FS}/", 'restoreRaw' ),
	);
	foreach ( $passes as $pass ) {
		list( $regex, $method ) = $pass;
		$text = preg_replace_callback( $regex, array( $this, $method ), $text );
	}

	return $text;
}
792
/**
 * Extract the targets of all free links and wiki links from a piece of
 * wikitext, in the order in which the regex passes find them.
 *
 * Uses the same sequence of regexes as substituteTextLinks(), except that
 * links are recorded instead of rewritten and nothing is restored.
 *
 * @param $text String: wikitext
 * @return Array of link targets
 */
function getLinkList( $text ) {
	$this->saveUrl = array();
	$this->linkList = array();

	# Remove separators (paranoia)
	$text = str_replace( $this->FS, '', $text );

	// Regex => handler method, applied strictly in this order
	$passes = array(
		array( '/(<pre>(.*?)<\/pre>)/is', 'storeRaw' ),
		array( '/(<code>(.*?)<\/code>)/is', 'storeRaw' ),
		array( '/(<nowiki>(.*?)<\/nowiki>)/s', 'storeRaw' ),
		array( "/\[\[{$this->FreeLinkPattern}\|([^\]]+)\]\]/", 'storeLink' ),
		array( "/\[\[{$this->FreeLinkPattern}\]\]/", 'storeLink' ),
		array( "/(\[{$this->UrlPattern}\s+([^\]]+?)\])/", 'storeRaw' ),
		array( "/(\[{$this->InterLinkPattern}\s+([^\]]+?)\])/", 'storeRaw' ),
		array( "/(\[?{$this->UrlPattern}\]?)/", 'storeRaw' ),
		array( "/(\[?{$this->InterLinkPattern}\]?)/", 'storeRaw' ),
		array( "/{$this->LinkPattern}/", 'storeLink' ),
	);
	foreach ( $passes as $pass ) {
		list( $regex, $method ) = $pass;
		$text = preg_replace_callback( $regex, array( $this, $method ), $text );
	}

	return $this->linkList;
}
822
/**
 * Regex callback: stash the first capture group and return a placeholder of
 * the form FS + index + FS that restoreRaw() can expand again.
 *
 * @param $m Array: regex match
 * @return String
 */
function storeRaw( $m ) {
	$this->saveUrl[] = $m[1];
	$slot = count( $this->saveUrl ) - 1;
	return $this->FS . $slot . $this->FS;
}
827
/**
 * Regex callback for [[free links]]: replace the target if it is the link
 * being renamed, then stash the whole link as a placeholder.
 *
 * The target is compared as a string. A loose comparison would treat numeric
 * targets such as "1.0" and "1" as equal and rewrite the wrong links.
 *
 * @param $m Array: regex match; $m[1] is the target, $m[2] the optional label
 * @return String: placeholder from storeRaw()
 */
function subFreeLink( $m ) {
	$rawLink = $m[1];
	$name = isset( $m[2] ) ? $m[2] : '';

	$trimmed = preg_replace( '/^\s+/', '', $rawLink );
	$trimmed = preg_replace( '/\s+$/', '', $trimmed );

	// $this->old may arrive as an integer when it was used as an array key
	if ( $trimmed === (string)$this->old ) {
		$link = $this->new;
	} else {
		$link = $rawLink; # Preserve spaces if no match
	}

	$link = "[[$link";
	if ( $name !== "" ) {
		$link .= "|$name";
	}
	$link .= "]]";
	return $this->storeRaw( array( 1 => $link ) );
}
850
/**
 * Regex callback for CamelCase wiki links: replace the link if it is the
 * one being renamed, then stash the result as a placeholder.
 *
 * A replacement that is not itself a valid wiki link is wrapped in double
 * brackets so that it still works as a link.
 *
 * @param $m Array: regex match
 * @return String: placeholder from storeRaw()
 */
function subWikiLink( $m ) {
	$link = $m[1];
	if ( $link == $this->old ) {
		$isWikiLink = preg_match( "/^{$this->LinkPattern}$/", $this->new );
		$link = $isWikiLink ? $this->new : "[[{$this->new}]]";
	}
	return $this->storeRaw( array( 1 => $link ) );
}
861
/**
 * Regex callback: expand a placeholder created by storeRaw().
 *
 * @param $m Array: regex match; $m[1] is the placeholder index
 * @return String: the stashed text
 */
function restoreRaw( $m ) {
	$slot = $m[1];
	return $this->saveUrl[$slot];
}
865
/**
 * Regex callback: record the link target, then stash the match like
 * storeRaw() does.
 *
 * @param $m Array: regex match
 * @return String: placeholder from storeRaw()
 */
function storeLink( $m ) {
	$target = $m[1];
	$this->linkList[] = $target;
	return $this->storeRaw( $m );
}
870
/**
 * Convert a string from the source encoding to UTF-8 by translating it
 * through $encodeMap.
 *
 * @param $s String
 * @return String
 */
function encode( $s ) {
	$utf8 = strtr( $s, $this->encodeMap );
	return $utf8;
}
874
/**
 * Inverse of encode(): translate a string through $decodeMap.
 *
 * @param $s String
 * @return String
 */
function decode( $s ) {
	$raw = strtr( $s, $this->decodeMap );
	return $raw;
}
878
/**
 * Print a string that is in the source encoding, converting it with
 * encode() first.
 *
 * @param $s String
 */
function printLatin1( $s ) {
	$converted = $this->encode( $s );
	echo $converted;
}
882
/**
 * Decode a UseModWiki key/value list.
 *
 * The string is a flat sequence key, value, key, value, ... joined by the
 * separator. A trailing key without a value (which includes the empty
 * string) is mapped to null.
 *
 * @param $s String: serialized list
 * @param $sep String: separator
 * @return Array: key => value
 */
function unserializeUseMod( $s, $sep ) {
	$parts = explode( $sep, $s );
	$numParts = count( $parts );
	$result = array();
	for ( $i = 0; $i < $numParts; $i += 2 ) {
		// Do not read past the end of the list when a value is missing
		$result[$parts[$i]] = isset( $parts[$i + 1] ) ? $parts[$i + 1] : null;
	}
	return $result;
}
891 }
892
// Name the class to run, then hand control to the file that the
// DO_MAINTENANCE constant points to.
$maintClass = 'ImportUseModWikipedia';
require_once( DO_MAINTENANCE );