Long and ugly script to convert the recently-discovered August 2001 backup of Wikiped...
[lhc/web/wiklou.git] / maintenance / importUseModWikipedia.php
1 <?php
2
3 /**
4 * A script to read a dump of the English Wikipedia from the UseModWiki period, and to
5 * generate an XML dump in MediaWiki format.
6 *
7 * Some relevant code was ported from UseModWiki 0.92.
8 *
9 */
10
11 require_once( dirname( __FILE__ ) . '/Maintenance.php' );
12 require_once( dirname( __FILE__ ) .'/../includes/normal/UtfNormalUtil.php' );
13
14
15 class ImportUseModWikipedia extends Maintenance {
/**
 * Character maps built in __construct() from $cp1252Table:
 * $encodeMap maps a single source byte to its UTF-8 string, $decodeMap is the inverse.
 */
var $encodeMap, $decodeMap;

/**
 * Old page title => Unix timestamp of the move. writeMoveLog() uses the
 * timestamp (when it lies inside the interval inferred from the diff log)
 * and marks the move as "deep", which makes doPendingOps() rewrite links
 * in other pages' text.
 */
var $deepRenames = array(
    'JimboWales' => 983862286,
    'TexaS' => 983918410,
    'HistoryOfUnitedStatesTalk' => 984795423,
    'MetallicA' => 985128533,
    'PythagoreanTheorem' => 985225545,
    'TheCanonofScripture' => 985368223,
    'TaoTehChing' => 985368222,
    //'TheMostRemarkableFormulaInTheWorld' => 985368221,
    'TheRecorder' => 985368220,
    'GladstoneOregon' => 985368219,
    #'UnitedStatesConstitution/AmendmentTwo' =>
);

/** Not referenced by any method visible in this file. */
var $replacements = array();

/**
 * Unix timestamp => array( old link => new link ). doPendingOps() applies
 * each entry once, as soon as an edit at or after that timestamp is processed.
 */
var $renameTextLinksOps = array(
    983846265 => array(
        'TestIgnore' => 'IgnoreTest',
    ),
    983848080 => array(
        'UnitedLocomotiveWorks' => 'Atlas Shrugged/United Locomotive Works'
    ),
    983856376 => array(
        'WikiPedia' => 'Wikipedia',
    ),
    983896152 => array(
        'John_F_Kennedy' => 'John_F._Kennedy',
    ),
    983905871 => array(
        'LarrySanger' => 'Larry_Sanger'
    ),
    984697068 => array(
        'UnitedStates' => 'United States',
    ),
    984792748 => array(
        'LibertarianisM' => 'Libertarianism'
    ),
    985327832 => array(
        'AnarchisM' => 'Anarchism',
    ),
    985290063 => array(
        'HistoryOfUnitedStatesDiscussion' => 'History_Of_United_States_Discussion'
    ),
    985290091 => array(
        'BritishEmpire' => 'British Empire'
    ),
    /*
    985468958 => array(
        'ScienceFiction' => 'Science fiction',
    ),*/
);

/**
 * Hack for observed substitution issues: renameTextLinks() does not touch
 * a page listed here when that page is itself the rename target.
 */
var $skipSelfSubstitution = array(
    'Pythagorean_Theorem',
    'The_Most_Remarkable_Formula_In_The_World',
    'Wine',
);

/**
 * Unix timestamp => page title. doPendingOps() saves a revision with all
 * "\r" characters removed once an edit at or after the timestamp is processed.
 */
var $unixLineEndingsOps = array(
    987743732 => 'Wikipedia_FAQ'
);

/** Not referenced by any method visible in this file. */
var $replacementsDone = array();

/**
 * New title => array with keys 'old', 'startTime', optionally 'endTime',
 * and (after writeMoveLog()) 'timestamp' and optionally 'deep'.
 */
var $moveLog = array();
/** rclog title => diff_log title, recorded by moveLogCallback(). */
var $moveDests = array();
/** Next revision ID to be written by saveRevision(). */
var $revId;

/** Timestamp => list of parsed rclog entries, filled by readRclog(). */
var $rc = array();
/** Page title => latest reconstructed text. */
var $textCache = array();
/** Page title => true for pages whose diff could not be applied. */
var $blacklist = array();

/** Field separator byte and its numbered variants, set by initLinkPatterns(). */
var $FS, $FS1, $FS2, $FS3;
/** Regex fragments (without delimiters) set by initLinkPatterns(). */
var $FreeLinkPattern, $UrlPattern, $LinkPattern, $InterLinkPattern;

/*
 * The following properties are assigned while the script runs. They are
 * declared here so that they are not created dynamically (deprecated as of
 * PHP 8.2); each is assigned before it is read.
 */
/** Temporary file names used by patch(); set in execute(). */
var $articleFileName, $patchFileName;
/** Value of the 'datadir' option and the output file handle; set in execute(). */
var $dataDir, $outFile;
/** Revision counters maintained by processDiffFile() and revisionCallback(). */
var $numRevs, $numGoodRevs;
/** Scratch state shared by the link substitution / extraction callbacks. */
var $saveUrl, $old, $new, $linkList;
96
97 var $cp1252Table = <<<EOT
98 0x00 0x0000
99 0x01 0x0001
100 0x02 0x0002
101 0x03 0x0003
102 0x04 0x0004
103 0x05 0x0005
104 0x06 0x0006
105 0x07 0x0007
106 0x08 0x0008
107 0x09 0x0009
108 0x0a 0x000a
109 0x0b 0x000b
110 0x0c 0x000c
111 0x0d 0x000d
112 0x0e 0x000e
113 0x0f 0x000f
114 0x10 0x0010
115 0x11 0x0011
116 0x12 0x0012
117 0x13 0x0013
118 0x14 0x0014
119 0x15 0x0015
120 0x16 0x0016
121 0x17 0x0017
122 0x18 0x0018
123 0x19 0x0019
124 0x1a 0x001a
125 0x1b 0x001b
126 0x1c 0x001c
127 0x1d 0x001d
128 0x1e 0x001e
129 0x1f 0x001f
130 0x20 0x0020
131 0x21 0x0021
132 0x22 0x0022
133 0x23 0x0023
134 0x24 0x0024
135 0x25 0x0025
136 0x26 0x0026
137 0x27 0x0027
138 0x28 0x0028
139 0x29 0x0029
140 0x2a 0x002a
141 0x2b 0x002b
142 0x2c 0x002c
143 0x2d 0x002d
144 0x2e 0x002e
145 0x2f 0x002f
146 0x30 0x0030
147 0x31 0x0031
148 0x32 0x0032
149 0x33 0x0033
150 0x34 0x0034
151 0x35 0x0035
152 0x36 0x0036
153 0x37 0x0037
154 0x38 0x0038
155 0x39 0x0039
156 0x3a 0x003a
157 0x3b 0x003b
158 0x3c 0x003c
159 0x3d 0x003d
160 0x3e 0x003e
161 0x3f 0x003f
162 0x40 0x0040
163 0x41 0x0041
164 0x42 0x0042
165 0x43 0x0043
166 0x44 0x0044
167 0x45 0x0045
168 0x46 0x0046
169 0x47 0x0047
170 0x48 0x0048
171 0x49 0x0049
172 0x4a 0x004a
173 0x4b 0x004b
174 0x4c 0x004c
175 0x4d 0x004d
176 0x4e 0x004e
177 0x4f 0x004f
178 0x50 0x0050
179 0x51 0x0051
180 0x52 0x0052
181 0x53 0x0053
182 0x54 0x0054
183 0x55 0x0055
184 0x56 0x0056
185 0x57 0x0057
186 0x58 0x0058
187 0x59 0x0059
188 0x5a 0x005a
189 0x5b 0x005b
190 0x5c 0x005c
191 0x5d 0x005d
192 0x5e 0x005e
193 0x5f 0x005f
194 0x60 0x0060
195 0x61 0x0061
196 0x62 0x0062
197 0x63 0x0063
198 0x64 0x0064
199 0x65 0x0065
200 0x66 0x0066
201 0x67 0x0067
202 0x68 0x0068
203 0x69 0x0069
204 0x6a 0x006a
205 0x6b 0x006b
206 0x6c 0x006c
207 0x6d 0x006d
208 0x6e 0x006e
209 0x6f 0x006f
210 0x70 0x0070
211 0x71 0x0071
212 0x72 0x0072
213 0x73 0x0073
214 0x74 0x0074
215 0x75 0x0075
216 0x76 0x0076
217 0x77 0x0077
218 0x78 0x0078
219 0x79 0x0079
220 0x7a 0x007a
221 0x7b 0x007b
222 0x7c 0x007c
223 0x7d 0x007d
224 0x7e 0x007e
225 0x7f 0x007f
226 0x80 0x20ac
227 0x81 0x0081
228 0x82 0x201a
229 0x83 0x0192
230 0x84 0x201e
231 0x85 0x2026
232 0x86 0x2020
233 0x87 0x2021
234 0x88 0x02c6
235 0x89 0x2030
236 0x8a 0x0160
237 0x8b 0x2039
238 0x8c 0x0152
239 0x8d 0x008d
240 0x8e 0x017d
241 0x8f 0x008f
242 0x90 0x0090
243 0x91 0x2018
244 0x92 0x2019
245 0x93 0x201c
246 0x94 0x201d
247 0x95 0x2022
248 0x96 0x2013
249 0x97 0x2014
250 0x98 0x02dc
251 0x99 0x2122
252 0x9a 0x0161
253 0x9b 0x203a
254 0x9c 0x0153
255 0x9d 0x009d
256 0x9e 0x017e
257 0x9f 0x0178
258 0xa0 0x00a0
259 0xa1 0x00a1
260 0xa2 0x00a2
261 0xa3 0x00a3
262 0xa4 0x00a4
263 0xa5 0x00a5
264 0xa6 0x00a6
265 0xa7 0x00a7
266 0xa8 0x00a8
267 0xa9 0x00a9
268 0xaa 0x00aa
269 0xab 0x00ab
270 0xac 0x00ac
271 0xad 0x00ad
272 0xae 0x00ae
273 0xaf 0x00af
274 0xb0 0x00b0
275 0xb1 0x00b1
276 0xb2 0x00b2
277 0xb3 0x00b3
278 0xb4 0x00b4
279 0xb5 0x00b5
280 0xb6 0x00b6
281 0xb7 0x00b7
282 0xb8 0x00b8
283 0xb9 0x00b9
284 0xba 0x00ba
285 0xbb 0x00bb
286 0xbc 0x00bc
287 0xbd 0x00bd
288 0xbe 0x00be
289 0xbf 0x00bf
290 0xc0 0x00c0
291 0xc1 0x00c1
292 0xc2 0x00c2
293 0xc3 0x00c3
294 0xc4 0x00c4
295 0xc5 0x00c5
296 0xc6 0x00c6
297 0xc7 0x00c7
298 0xc8 0x00c8
299 0xc9 0x00c9
300 0xca 0x00ca
301 0xcb 0x00cb
302 0xcc 0x00cc
303 0xcd 0x00cd
304 0xce 0x00ce
305 0xcf 0x00cf
306 0xd0 0x00d0
307 0xd1 0x00d1
308 0xd2 0x00d2
309 0xd3 0x00d3
310 0xd4 0x00d4
311 0xd5 0x00d5
312 0xd6 0x00d6
313 0xd7 0x00d7
314 0xd8 0x00d8
315 0xd9 0x00d9
316 0xda 0x00da
317 0xdb 0x00db
318 0xdc 0x00dc
319 0xdd 0x00dd
320 0xde 0x00de
321 0xdf 0x00df
322 0xe0 0x00e0
323 0xe1 0x00e1
324 0xe2 0x00e2
325 0xe3 0x00e3
326 0xe4 0x00e4
327 0xe5 0x00e5
328 0xe6 0x00e6
329 0xe7 0x00e7
330 0xe8 0x00e8
331 0xe9 0x00e9
332 0xea 0x00ea
333 0xeb 0x00eb
334 0xec 0x00ec
335 0xed 0x00ed
336 0xee 0x00ee
337 0xef 0x00ef
338 0xf0 0x00f0
339 0xf1 0x00f1
340 0xf2 0x00f2
341 0xf3 0x00f3
342 0xf4 0x00f4
343 0xf5 0x00f5
344 0xf6 0x00f6
345 0xf7 0x00f7
346 0xf8 0x00f8
347 0xf9 0x00f9
348 0xfa 0x00fa
349 0xfb 0x00fb
350 0xfc 0x00fc
351 0xfd 0x00fd
352 0xfe 0x00fe
353 0xff 0x00ff
354 EOT;
/**
 * Registers the command line options, initialises the link patterns and
 * builds the byte <-> UTF-8 translation maps from $cp1252Table.
 *
 * Each table row holds two hexadecimal numbers ("0xNN" and "0xNNNN"). Rows
 * are split on any run of whitespace and rows that do not contain exactly
 * two fields are skipped, so a stray blank line, CRLF line endings or a
 * tab/space mix-up in the table cannot corrupt the maps.
 */
public function __construct() {
    parent::__construct();
    $this->addOption( 'datadir', 'the value of $DataDir from wiki.cgi', true, true );
    $this->addOption( 'outfile', 'the name of the output XML file', true, true );
    $this->initLinkPatterns();

    $this->encodeMap = array();
    $this->decodeMap = array();
    foreach ( preg_split( '/\r?\n/', $this->cp1252Table ) as $row ) {
        $fields = preg_split( '/\s+/', trim( $row ), -1, PREG_SPLIT_NO_EMPTY );
        if ( count( $fields ) !== 2 ) {
            continue;
        }
        // Strip the "0x" prefix before converting
        $sourceChar = chr( hexdec( substr( $fields[0], 2 ) ) );
        $destChar = codepointToUtf8( hexdec( substr( $fields[1], 2 ) ) );
        $this->encodeMap[$sourceChar] = $destChar;
        $this->decodeMap[$destChar] = $sourceChar;
    }
}
370
/**
 * Sets the field separator properties ($FS, $FS1..$FS3) and the regex
 * fragments $LinkPattern, $InterLinkPattern, $FreeLinkPattern and
 * $UrlPattern. The fragments carry no delimiters; callers wrap them in
 * "/.../" themselves.
 */
function initLinkPatterns() {
    # Field separators are used in the URL-style patterns below.
    $this->FS = "\xb3"; # The FS character is a superscript "3"
    $this->FS1 = $this->FS . "1"; # The FS values are used to separate fields
    $this->FS2 = $this->FS . "2"; # in stored hashtables and other data structures.
    $this->FS3 = $this->FS . "3"; # The FS character is not allowed in user data.

    $UpperLetter = "[A-Z]";
    $LowerLetter = "[a-z]";
    $AnyLetter = "[A-Za-z_0-9]";

    # Main link pattern: lowercase between uppercase, then anything
    $LpA = $UpperLetter . "+" . $LowerLetter . "+" . $UpperLetter
        . $AnyLetter . "*";
    # Optional subpage link pattern: uppercase, lowercase, then anything
    $LpB = $UpperLetter . "+" . $LowerLetter . "+" . $AnyLetter . "*";

    $QDelim = '(?:"")?'; # Optional quote delimiter (not in output)

    # Loose pattern: If subpage is used, subpage may be simple name
    $this->LinkPattern = "((?:(?:$LpA)?\\/$LpB)|$LpA)" . $QDelim;

    # Inter-site convention: sites must start with uppercase letter
    # (Uppercase letter avoids confusion with URLs)
    $InterSitePattern = $UpperLetter . $AnyLetter . "+";
    $this->InterLinkPattern = "((?:$InterSitePattern:[^\\]\\s\"<>{$this->FS}]+)$QDelim)";

    # Free links allow a wider character set, with an optional "parent/" part
    $AnyLetter = "[-,. _0-9A-Za-z]";
    $this->FreeLinkPattern = "((?:(?:$AnyLetter+)?\\/)?$AnyLetter+)" . $QDelim;

    # Url-style links are delimited by one of:
    # 1. Whitespace (kept in output)
    # 2. Left or right angle-bracket (< or >) (kept in output)
    # 3. Right square-bracket (]) (kept in output)
    # 4. A single double-quote (") (kept in output)
    # 5. A $FS (field separator) character (kept in output)
    # 6. A double double-quote ("") (removed from output)

    $UrlProtocols = "http|https|ftp|afs|news|nntp|mid|cid|mailto|wais|"
        . "prospero|telnet|gopher";
    $UrlProtocols .= '|file';
    $this->UrlPattern = "((?:(?:$UrlProtocols):[^\\]\\s\"<>{$this->FS}]+)$QDelim)";
}
421
/**
 * Entry point: reads the UseModWiki data directory and writes the XML dump.
 *
 * Temporary files for patch() are created with tempnam() so that their
 * names are unique and not guessable, and both the temporary files and the
 * output handle are released before returning.
 *
 * @return int 0 on success, 1 if the output or temporary files cannot be created
 */
function execute() {
    $this->dataDir = $this->getOption( 'datadir' );
    $this->outFile = fopen( $this->getOption( 'outfile' ), 'w' );
    if ( !$this->outFile ) {
        echo "Unable to open output file\n";
        return 1;
    }

    $tmpDir = sys_get_temp_dir();
    $this->articleFileName = tempnam( $tmpDir, 'importUseMod.' );
    $this->patchFileName = tempnam( $tmpDir, 'importUseMod.' );
    if ( $this->articleFileName === false || $this->patchFileName === false ) {
        echo "Unable to create temporary files\n";
        // Remove whichever temporary file was created
        if ( $this->articleFileName !== false ) {
            unlink( $this->articleFileName );
        }
        if ( $this->patchFileName !== false ) {
            unlink( $this->patchFileName );
        }
        fclose( $this->outFile );
        return 1;
    }

    $this->writeXmlHeader();
    $this->readRclog();
    $this->writeMoveLog();
    $this->writeRevisions();
    $this->reconcileCurrentRevs();
    $this->writeXmlFooter();

    fclose( $this->outFile );
    unlink( $this->articleFileName );
    unlink( $this->patchFileName );
    return 0;
}
441
/**
 * Writes the opening <mediawiki> tag and the <siteinfo> section to the
 * output file.
 */
function writeXmlHeader() {
    // The header contains no variables, so a nowdoc yields the same bytes
    $header = <<<'EOT'
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.3/ http://www.mediawiki.org/xml/export-0.3.xsd" version="0.3" xml:lang="en">
<siteinfo>
<sitename>Wikipedia</sitename>
<base>http://www.wikipedia.com/</base>
<generator>MediaWiki 1.18alpha importUseModWikipedia.php</generator>
<case>case-sensitive</case>
<namespaces>
<namespace key="0" />
</namespaces>
</siteinfo>

EOT;
    fwrite( $this->outFile, $header );
}
458
/**
 * Writes the closing </mediawiki> tag to the output file.
 */
function writeXmlFooter() {
    $footer = "</mediawiki>\n";
    fwrite( $this->outFile, $footer );
}
462
/**
 * Parses "<datadir>/rclog" into $this->rc, keyed by edit timestamp.
 *
 * Every line must consist of seven fields separated by $FS3; the seventh
 * field is a flat key/value list separated by $FS2. Reading stops with an
 * error message at the first malformed line.
 *
 * The line terminator is removed before splitting, so it does not end up
 * inside the last "extra" value, and a key without a value is ignored
 * instead of reading past the end of the list.
 */
function readRclog() {
    $rcFile = fopen( "{$this->dataDir}/rclog", 'r' );
    if ( !$rcFile ) {
        echo "Error reading rclog\n";
        return;
    }

    while ( ( $line = fgets( $rcFile ) ) !== false ) {
        $bits = explode( $this->FS3, rtrim( $line, "\r\n" ) );
        if ( count( $bits ) !== 7 ) {
            echo "Error reading rclog\n";
            break;
        }

        // Pair up the alternating keys and values of the extra field
        $extra = array();
        $extraList = explode( $this->FS2, $bits[6] );
        $numExtra = count( $extraList );
        for ( $i = 0; $i + 1 < $numExtra; $i += 2 ) {
            $extra[$extraList[$i]] = $extraList[$i + 1];
        }

        $params = array(
            'timestamp' => $bits[0],
            'rctitle' => $bits[1],
            'summary' => $bits[2],
            'minor' => $bits[3],
            'host' => $bits[4],
            'kind' => $bits[5],
            'extra' => $extra
        );
        // Several edits may share one timestamp, so keep a list per timestamp
        $this->rc[$params['timestamp']][] = $params;
    }

    fclose( $rcFile );
}
488
/**
 * Derives the page move log from the diff log, chooses a timestamp for
 * every move and writes the moves to the output file as <logitem> elements.
 *
 * The diff log only yields an interval in which a move must have happened.
 * A time listed in $deepRenames is used when it falls inside that interval;
 * otherwise the move is dated one second after the interval start.
 */
function writeMoveLog() {
    $this->moveLog = array();
    $pendingDeep = $this->deepRenames;
    echo "Calculating move log...\n";
    $this->processDiffFile( array( $this, 'moveLogCallback' ) );

    // We have the timestamp intervals, now make a guess at the actual timestamp
    foreach ( $this->moveLog as $newTitle => $params ) {
        $oldTitle = $params['old'];

        // Is there a time specified?
        $hasDeepTime = isset( $pendingDeep[$oldTitle] );
        $drTime = $hasDeepTime ? $pendingDeep[$oldTitle] : false;

        if ( $hasDeepTime && $drTime !== '?' ) {
            $inInterval = ( !isset( $params['endTime'] ) || $drTime < $params['endTime'] )
                && $drTime > $params['startTime'];
            // The entry is consumed whether or not its time is usable
            unset( $pendingDeep[$oldTitle] );
            if ( $inInterval ) {
                $this->moveLog[$newTitle]['timestamp'] = $drTime;
                $this->moveLog[$newTitle]['deep'] = true;
                echo "$oldTitle -> $newTitle at $drTime\n";
                continue;
            }
            echo "WARNING: deep rename time invalid: $oldTitle\n";
        }

        // Guess that it is one second after the last edit to the page before it was moved
        $this->moveLog[$newTitle]['timestamp'] = $params['startTime'] + 1;
        if ( $drTime === '?' ) {
            $this->moveLog[$newTitle]['deep'] = true;
            unset( $pendingDeep[$oldTitle] );
        }

        if ( isset( $params['endTime'] ) ) {
            $when = "between {$params['startTime']} and {$params['endTime']}";
        } else {
            $when = "after {$params['startTime']}";
        }
        $this->printLatin1( "$oldTitle -> $newTitle $when\n" );
    }

    // Write the move log to the XML file
    $logId = 1;
    foreach ( $this->moveLog as $newTitle => $params ) {
        $item = "<logitem>\n";
        $item .= $this->element( 'id', $logId++ );
        $item .= $this->element( 'timestamp', wfTimestamp( TS_ISO_8601, $params['timestamp'] ) );
        $item .= "<contributor>\n";
        $item .= $this->element( 'username', 'UseModWiki admin' );
        $item .= "</contributor>";
        $item .= $this->element( 'type', 'move' );
        $item .= $this->element( 'action', 'move' );
        $item .= $this->element( 'logtitle', $params['old'] );
        $item .= "<params xml:space=\"preserve\">";
        $item .= htmlspecialchars( $this->encode( "{$newTitle}\n1" ) );
        $item .= "</params>\n";
        $item .= "</logitem>\n";
        fwrite( $this->outFile, $item );
    }

    // Check for remaining deep rename entries
    if ( !empty( $pendingDeep ) ) {
        echo "WARNING: the following entries in \$this->deepRenames are " .
            "invalid, since no such move exists:\n" .
            implode( "\n", array_keys( $pendingDeep ) ) .
            "\n\n";
    }
}
561
/**
 * Builds a single XML element followed by a newline.
 *
 * @param $name string Tag name, emitted as is
 * @param $value string Content; passed through encode() and then HTML-escaped
 * @return string
 */
function element( $name, $value ) {
    $escaped = htmlspecialchars( $this->encode( $value ) );
    return '<' . $name . '>' . $escaped . '</' . $name . ">\n";
}
565
/**
 * processDiffFile() callback that narrows down when each page was moved.
 *
 * An entry whose rclog title differs from its diff log title was made
 * before the move, so it raises 'startTime'. The first later entry where
 * both titles agree fixes 'endTime'.
 *
 * @param $entry array Entry as passed by processDiffFile()
 */
function moveLogCallback( $entry ) {
    $newTitle = $entry['rctitle'];
    $oldTitle = $entry['title'];
    $this->moveDests[$newTitle] = $oldTitle;

    if ( $newTitle !== $oldTitle ) {
        if ( !isset( $this->moveLog[$newTitle] ) ) {
            // Initialise the move log entry
            $this->moveLog[$newTitle] = array( 'old' => $oldTitle );
        }
        // Update the earliest time the page could have been moved
        $this->moveLog[$newTitle]['startTime'] = $entry['timestamp'];
        return;
    }

    $known = isset( $this->moveLog[$newTitle] );
    if ( $known && !isset( $this->moveLog[$newTitle]['endTime'] ) ) {
        // This is the latest time that the page could have been moved
        $this->moveLog[$newTitle]['endTime'] = $entry['timestamp'];
    }
}
589
/**
 * Replays the diff log through revisionCallback(), numbering revisions
 * from 1, and prints how many revisions were imported.
 */
function writeRevisions() {
    $this->revId = 1;
    $this->numGoodRevs = 0;
    $callback = array( $this, 'revisionCallback' );
    $this->processDiffFile( $callback );
    echo "\n\nImported " . $this->numGoodRevs . " out of " . $this->numRevs . "\n";
}
596
/**
 * processDiffFile() callback that reconstructs one revision and writes it.
 *
 * The diff is applied to the cached text of the page. If that fails, the
 * diff's context lines are compared with the cached text to infer link
 * substitutions; these are applied to all cached pages and the diff is
 * tried once more. A page whose diff still fails is blacklisted, and all
 * of its later revisions are skipped.
 *
 * @param $params array Entry as passed by processDiffFile()
 */
function revisionCallback( $params ) {
    $title = $params['rctitle'];
    $editTime = $params['timestamp'];

    if ( isset( $this->blacklist[$title] ) ) {
        return;
    }
    $this->doPendingOps( $editTime );

    $origText = $this->getText( $title );
    $text = $this->patch( $origText, $params['diff'] );
    if ( $text === false ) {
        echo "$editTime $title attempting resolution...\n";
        $linkSubstitutes = $this->resolveFailedDiff( $origText, $params['diff'] );
        if ( !$linkSubstitutes ) {
            $this->printLatin1( "$editTime $title DIFF FAILED\n" );
            $this->blacklist[$title] = true;
            return;
        }
        $this->printLatin1( "$editTime $title requires substitutions:\n" );
        // Date each fixup just before this edit, one second apart
        $time = $editTime - 1;
        foreach ( $linkSubstitutes as $old => $new ) {
            $this->printLatin1( "SUBSTITUTE $old -> $new\n" );
            $this->renameTextLinks( $old, $new, $time-- );
        }
        // The substitutions may have changed this page's cached text
        $origText = $this->getText( $title );
        $text = $this->patch( $origText, $params['diff'] );
        if ( $text === false ) {
            $this->printLatin1( "$editTime $title STILL FAILS!\n" );
            $this->blacklist[$title] = true;
            return;
        }

        echo "\n";
    }

    $params['text'] = $text;
    $this->saveRevision( $params );
    $this->numGoodRevs++;
    #$this->printLatin1( "$editTime $title\n" );
}
639
/**
 * Runs every scheduled operation that is due at the given edit time:
 * page moves (with link rewriting for deep moves), the manual link renames
 * in $renameTextLinksOps and the line ending fixes in $unixLineEndingsOps.
 * Each operation is removed from its list once it has run.
 *
 * @param $editTime int|string Timestamp of the edit about to be processed
 */
function doPendingOps( $editTime ) {
    foreach ( $this->moveLog as $newTitle => $entry ) {
        if ( $entry['timestamp'] > $editTime ) {
            continue;
        }
        unset( $this->moveLog[$newTitle] );
        if ( isset( $entry['deep'] ) ) {
            $this->renameTextLinks( $entry['old'], $newTitle, $entry['timestamp'] );
        }
    }

    foreach ( $this->renameTextLinksOps as $renameTime => $replacements ) {
        if ( $editTime < $renameTime ) {
            continue;
        }
        foreach ( $replacements as $old => $new ) {
            $this->printLatin1( "SUBSTITUTE $old -> $new\n" );
            $this->renameTextLinks( $old, $new, $renameTime );
        }
        unset( $this->renameTextLinksOps[$renameTime] );
    }

    foreach ( $this->unixLineEndingsOps as $fixTime => $title ) {
        if ( $editTime < $fixTime ) {
            continue;
        }
        $this->printLatin1( "$fixTime $title FIXING LINE ENDINGS\n" );
        $fixedText = str_replace( "\r", '', $this->getText( $title ) );
        $revision = array(
            'rctitle' => $title,
            'timestamp' => $fixTime,
            'extra' => array( 'name' => 'UseModWiki admin' ),
            'text' => $fixedText,
            'summary' => 'Fixing line endings',
        );
        $this->saveRevision( $revision );
        unset( $this->unixLineEndingsOps[$fixTime] );
    }
}
676
/**
 * Applies a diff to a text by running the external "patch" program on the
 * two temporary files.
 *
 * @param $source string Text to be patched
 * @param $diff string Diff to apply
 * @return string|bool The patched text, or false if the temporary files
 *   cannot be written, patch exits with a non-zero status or the result
 *   cannot be read back
 */
function patch( $source, $diff ) {
    // Without this check a failed write would leave stale content from the
    // previous call in place, and patch would silently run against it
    if ( file_put_contents( $this->articleFileName, $source ) === false
        || file_put_contents( $this->patchFileName, $diff ) === false )
    {
        return false;
    }

    $command = wfEscapeShellArg(
        'patch',
        '-n',
        '-r', '-',
        '--no-backup-if-mismatch',
        '--binary',
        $this->articleFileName,
        $this->patchFileName
    ) . ' 2>&1';
    $status = 0;
    // The program's output is not needed; only the exit status is
    wfShellExec( $command, $status );
    if ( $status ) {
        return false;
    }

    // file_get_contents() itself returns false on failure
    return file_get_contents( $this->articleFileName );
}
699
/**
 * Works out which link substitutions would make a failed diff apply.
 *
 * The "<" lines of every hunk state what the diff expects the source to
 * contain. Each such line that differs from the corresponding line of
 * $origText is passed to resolveTextChange() to find link renames that
 * explain the difference.
 *
 * @param $origText string The text the diff failed to apply to
 * @param $diff string The diff
 * @return array Old link => new link; empty if nothing could be resolved
 */
function resolveFailedDiff( $origText, $diff ) {
    // Zero-based source line index => line content expected by the diff
    $context = array();
    $diffLines = explode( "\n", $diff );
    $numLines = count( $diffLines );
    for ( $i = 0; $i < $numLines; $i++ ) {
        if ( !preg_match( '/^(\d+)(?:,\d+)?[acd]\d+(?:,\d+)?$/', $diffLines[$i], $m ) ) {
            continue;
        }

        // Consume the "<" lines that directly follow the hunk header
        $sourceIndex = intval( $m[1] );
        while ( $i + 1 < $numLines && substr( $diffLines[$i + 1], 0, 1 ) === '<' ) {
            $i++;
            // Drop the leading "< " marker
            $context[$sourceIndex - 1] = substr( $diffLines[$i], 2 );
            $sourceIndex++;
        }
    }

    $changedLinks = array();
    $origLines = explode( "\n", $origText );
    foreach ( $context as $index => $contextLine ) {
        $origLine = isset( $origLines[$index] ) ? $origLines[$index] : '';
        if ( $contextLine === $origLine ) {
            continue;
        }
        $newChanges = $this->resolveTextChange( $origLine, $contextLine );
        if ( is_array( $newChanges ) ) {
            // Substitutions found on earlier lines take precedence
            $changedLinks += $newChanges;
        } else {
            echo "Resolution failure on line " . ( $index + 1 ) . "\n";
            $this->printLatin1( $newChanges );
        }
    }

    return $changedLinks;
}
738
/**
 * Tries to explain the difference between two lines purely by renamed links.
 *
 * Every link that occurs only in $dest is paired with the link occurring
 * only in $source that has the smallest Levenshtein distance to it. The
 * resulting renames are applied to $source, and the attempt succeeds only
 * if the result equals $dest exactly.
 *
 * @param $source string
 * @param $dest string
 * @return array|string Old link => new link on success, otherwise a
 *   diagnostic message
 */
function resolveTextChange( $source, $dest ) {
    $sourceLinks = $this->getLinkList( $source );
    $destLinks = $this->getLinkList( $dest );
    $addedLinks = array_diff( $destLinks, $sourceLinks );
    $unmatchedRemoved = array_diff( $sourceLinks, $destLinks );

    // Match up the removed links with the new links
    $changedLinks = array();
    foreach ( $addedLinks as $newLink ) {
        $closest = false;
        $closestDistance = 100000000;
        foreach ( $unmatchedRemoved as $candidate ) {
            $distance = levenshtein( $newLink, $candidate );
            if ( $distance < $closestDistance ) {
                $closestDistance = $distance;
                $closest = $candidate;
            }
        }
        if ( $closest === false ) {
            continue;
        }
        $changedLinks[$closest] = $newLink;
        // A removed link can only be paired once
        $unmatchedRemoved = array_diff( $unmatchedRemoved, array( $closest ) );
    }

    $proposal = $source;
    foreach ( $changedLinks as $removedLink => $newLink ) {
        $proposal = $this->substituteTextLinks( $removedLink, $newLink, $proposal );
    }

    if ( $proposal === $dest ) {
        return $changedLinks;
    }

    // Resolution failed
    return "Source line: $source\n" .
        "Source links: " . implode( ', ', $sourceLinks ) . "\n" .
        "Context line: $dest\n" .
        "Context links: " . implode( ', ', $destLinks ) . "\n" .
        "Proposal: $proposal\n";
}
779
/**
 * Reads "<datadir>/diff_log" and invokes a callback for every diff that
 * can be matched to an rclog entry.
 *
 * The file consists of records separated by "------" lines. Each record
 * starts with a "title|timestamp" header followed by the diff. The callback
 * receives the matching rclog entry extended with the keys 'diff', 'title'
 * and 'diffStartLine'.
 *
 * The file handle is closed on every exit path and $this->numRevs is reset
 * before anything can fail, so callers never read a missing counter.
 *
 * @param $callback callback
 * @return bool False if the file cannot be opened or does not start with
 *   a delimiter line, true otherwise (including when reading stops at an
 *   invalid header)
 */
function processDiffFile( $callback ) {
    $delimiter = "------\n";
    $this->numRevs = 0;

    $diffFile = fopen( "{$this->dataDir}/diff_log", 'r' );
    if ( !$diffFile ) {
        echo "Invalid diff file\n";
        return false;
    }
    file_put_contents( $this->articleFileName, "Describe the new page here.\n" );

    $line = fgets( $diffFile );
    $lineNum = 1;
    if ( $line !== $delimiter ) {
        fclose( $diffFile );
        echo "Invalid diff file\n";
        return false;
    }
    $lastReportLine = 0;

    while ( true ) {
        $line = fgets( $diffFile );
        $lineNum++;
        if ( $line === false ) {
            break;
        }
        // Progress indicator on stderr, roughly every 1000 lines
        if ( $lineNum > $lastReportLine + 1000 ) {
            $lastReportLine = $lineNum;
            fwrite( STDERR, "$lineNum \r" );
            fflush( STDERR );
        }
        $line = trim( $line );
        if ( !preg_match( '/^([^|]+)\|(\d+)$/', $line, $matches ) ) {
            echo "Invalid header on line $lineNum\n";
            fclose( $diffFile );
            return true;
        }
        list( , $title, $editTime ) = $matches;

        // Collect the diff body up to the next delimiter
        $diff = '';
        $diffStartLine = $lineNum;
        while ( true ) {
            $line = fgets( $diffFile );
            $lineNum++;
            if ( $line === $delimiter ) {
                break;
            }
            if ( $line === false ) {
                // End of file inside a record: the incomplete record is dropped
                break 2;
            }
            $diff .= $line;
        }

        $this->numRevs++;

        if ( !isset( $this->rc[$editTime] ) ) {
            $this->printLatin1( "$editTime $title DELETED, skipping\n" );
            continue;
        }

        if ( count( $this->rc[$editTime] ) == 1 ) {
            $params = $this->rc[$editTime][0];
        } else {
            // Several edits share this timestamp: pick the one with the same title
            $params = false;
            $candidates = '';
            foreach ( $this->rc[$editTime] as $rc ) {
                if ( $rc['rctitle'] === $title ) {
                    $params = $rc;
                    break;
                }
                if ( $candidates === '' ) {
                    $candidates = $rc['rctitle'];
                } else {
                    $candidates .= ', ' . $rc['rctitle'];
                }
            }
            if ( !$params ) {
                $this->printLatin1( "$editTime $title ERROR cannot resolve rclog\n" );
                $this->printLatin1( "$editTime $title CANDIDATES: $candidates\n" );
                continue;
            }
        }
        $params['diff'] = $diff;
        $params['title'] = $title;
        $params['diffStartLine'] = $diffStartLine;
        call_user_func( $callback, $params );
    }
    echo "\n";

    if ( !feof( $diffFile ) ) {
        echo "Stopped at line $lineNum\n";
    }
    fclose( $diffFile );
    return true;
}
868
/**
 * Compares the reconstructed text of every page with the text stored in
 * its page file under "<datadir>/page/" and reports differences.
 *
 * Differences that can be explained by link renames are reported as
 * missing deep renames or substitutions; for anything else a diff is printed.
 */
function reconcileCurrentRevs() {
    foreach ( $this->textCache as $title => $text ) {
        // Page files are grouped by leading capital letter, or under "other"
        $subDir = preg_match( '/^[A-Z]/', $title, $m ) ? $m[0] : 'other';
        $fileName = "{$this->dataDir}/page/" . $subDir . "/$title.db";

        if ( !file_exists( $fileName ) ) {
            $this->printLatin1( "ERROR: Cannot find page file for {$title}\n" );
            continue;
        }

        // The page file nests three levels of serialised hashes
        $page = $this->unserializeUseMod( file_get_contents( $fileName ), $this->FS1 );
        $section = $this->unserializeUseMod( $page['text_default'], $this->FS2 );
        $data = $this->unserializeUseMod( $section['data'], $this->FS3 );
        $pageText = $data['text'];
        if ( $text === $pageText ) {
            continue;
        }

        $substs = $this->resolveTextChange( $text, $pageText );
        if ( !is_array( $substs ) ) {
            $this->printLatin1( "ERROR: unresolved diff in $title:\n" );
            wfSuppressWarnings();
            $diff = xdiff_string_diff( $text, $pageText ) . '';
            wfRestoreWarnings();
            $this->printLatin1( "$diff\n" );
            continue;
        }

        foreach ( $substs as $source => $dest ) {
            if ( isset( $this->moveLog[$dest] ) ) {
                $this->printLatin1( "ERROR: need deep rename: $source\n" );
            } else {
                $this->printLatin1( "ERROR: need substitute: $source -> $dest\n" );
            }
        }
    }
}
909
/**
 * Creates a Title from a title in the source encoding.
 *
 * @param $titleText string
 * @return mixed Whatever Title::newFromText() returns for the encoded text
 */
function makeTitle( $titleText ) {
    $encoded = $this->encode( $titleText );
    return Title::newFromText( $encoded );
}
913
/**
 * Returns the latest reconstructed text of a page, or the placeholder text
 * "Describe the new page here." if no text has been cached for it yet.
 *
 * @param $titleText string
 * @return string
 */
function getText( $titleText ) {
    return isset( $this->textCache[$titleText] )
        ? $this->textCache[$titleText]
        : "Describe the new page here.\n";
}
921
/**
 * Records the text as the page's current text and writes a <page> element
 * with a single <revision> to the output file.
 *
 * @param $params array Uses the keys 'rctitle', 'text', 'timestamp' and
 *   'summary', and optionally 'extra' ('name', 'id') and 'host'
 */
function saveRevision( $params ) {
    $this->textCache[$params['rctitle']] = $params['text'];

    // Contributor details are all optional
    $contributor = '';
    if ( isset( $params['extra']['name'] ) ) {
        $contributor .= $this->element( 'username', $params['extra']['name'] );
    }
    if ( isset( $params['extra']['id'] ) ) {
        $contributor .= $this->element( 'id', $params['extra']['id'] );
    }
    if ( isset( $params['host'] ) ) {
        $contributor .= $this->element( 'ip', $params['host'] );
    }

    $xml = "<page>\n";
    $xml .= $this->element( 'title', $params['rctitle'] );
    $xml .= "<revision>\n";
    $xml .= $this->element( 'id', $this->revId++ );
    $xml .= $this->element( 'timestamp', wfTimestamp( TS_ISO_8601, $params['timestamp'] ) );
    $xml .= "<contributor>\n" . $contributor . "</contributor>\n";
    $xml .= $this->element( 'comment', $params['summary'] );
    $xml .= "<text xml:space=\"preserve\">";
    $xml .= htmlspecialchars( $this->encode( $params['text'] ) );
    $xml .= "</text>\n";
    $xml .= "</revision>\n";
    $xml .= "</page>\n";
    fwrite( $this->outFile, $xml );
}
950
/**
 * Replaces links to $old with links to $new in every cached page and saves
 * a fixup revision for each page whose text changes.
 *
 * @param $old string Old link target; underscores are treated as spaces
 * @param $new string New link target; underscores are treated as spaces
 * @param $timestamp int|string Timestamp given to the fixup revisions
 */
function renameTextLinks( $old, $new, $timestamp ) {
    $targetTitle = $new;
    $oldLink = str_replace( '_', ' ', $old );
    $newLink = str_replace( '_', ' ', $new );

    foreach ( $this->textCache as $title => $oldText ) {
        $isTarget = ( $targetTitle === $title );
        if ( $isTarget && in_array( $title, $this->skipSelfSubstitution ) ) {
            // Hack to make Pythagorean_Theorem etc. work
            continue;
        }

        $newText = $this->substituteTextLinks( $oldLink, $newLink, $oldText );
        if ( $oldText === $newText ) {
            continue;
        }

        $revision = array(
            'rctitle' => $title,
            'timestamp' => $timestamp,
            'text' => $newText,
            'extra' => array( 'name' => 'Page move link fixup script' ),
            'summary' => '',
            'minor' => true
        );
        $this->saveRevision( $revision );
    }
}
977
/**
 * Replaces links to $old with links to $new within a text.
 *
 * The text is processed in a fixed sequence of passes. Each pass replaces
 * its matches with a numbered placeholder delimited by $FS, so that later
 * passes cannot match inside text that has already been handled. The
 * placeholders are expanded again at the end.
 *
 * @param $old string Link target to replace
 * @param $new string Replacement link target
 * @param $text string
 * @return string
 */
function substituteTextLinks( $old, $new, $text ) {
    $this->saveUrl = array();
    $this->old = $old;
    $this->new = $new;

    // Pattern => callback method, in the order they must be applied
    $passes = array(
        array( '/(<pre>(.*?)<\/pre>)/is', 'storeRaw' ),
        array( '/(<code>(.*?)<\/code>)/is', 'storeRaw' ),
        array( '/(<nowiki>(.*?)<\/nowiki>)/s', 'storeRaw' ),
        array( "/\[\[{$this->FreeLinkPattern}\|([^\]]+)\]\]/", 'subFreeLink' ),
        array( "/\[\[{$this->FreeLinkPattern}\]\]/", 'subFreeLink' ),
        array( "/(\[{$this->UrlPattern}\s+([^\]]+?)\])/", 'storeRaw' ),
        array( "/(\[{$this->InterLinkPattern}\s+([^\]]+?)\])/", 'storeRaw' ),
        array( "/(\[?{$this->UrlPattern}\]?)/", 'storeRaw' ),
        array( "/(\[?{$this->InterLinkPattern}\]?)/", 'storeRaw' ),
        array( "/{$this->LinkPattern}/", 'subWikiLink' ),
        # Restore saved text
        array( "/{$this->FS}(\d+){$this->FS}/", 'restoreRaw' ),
    );

    $text = str_replace( $this->FS, '', $text ); # Remove separators (paranoia)
    foreach ( $passes as $pass ) {
        list( $pattern, $method ) = $pass;
        $text = preg_replace_callback( $pattern, array( $this, $method ), $text );
    }
    return $text;
}
1010
/**
 * Collects the targets of all free links and CamelCase links in a text.
 *
 * Runs the same sequence of passes as substituteTextLinks(), but records
 * link targets instead of replacing them. The modified text is discarded.
 *
 * @param $text string
 * @return array List of link targets in the order they were matched
 */
function getLinkList( $text ) {
    $this->saveUrl = array();
    $this->linkList = array();

    // Pattern => callback method, in the order they must be applied
    $passes = array(
        array( '/(<pre>(.*?)<\/pre>)/is', 'storeRaw' ),
        array( '/(<code>(.*?)<\/code>)/is', 'storeRaw' ),
        array( '/(<nowiki>(.*?)<\/nowiki>)/s', 'storeRaw' ),
        array( "/\[\[{$this->FreeLinkPattern}\|([^\]]+)\]\]/", 'storeLink' ),
        array( "/\[\[{$this->FreeLinkPattern}\]\]/", 'storeLink' ),
        array( "/(\[{$this->UrlPattern}\s+([^\]]+?)\])/", 'storeRaw' ),
        array( "/(\[{$this->InterLinkPattern}\s+([^\]]+?)\])/", 'storeRaw' ),
        array( "/(\[?{$this->UrlPattern}\]?)/", 'storeRaw' ),
        array( "/(\[?{$this->InterLinkPattern}\]?)/", 'storeRaw' ),
        array( "/{$this->LinkPattern}/", 'storeLink' ),
    );

    $text = str_replace( $this->FS, '', $text ); # Remove separators (paranoia)
    foreach ( $passes as $pass ) {
        list( $pattern, $method ) = $pass;
        $text = preg_replace_callback( $pattern, array( $this, $method ), $text );
    }

    return $this->linkList;
}
1040
/**
 * Stashes the first capture group in $this->saveUrl and returns the
 * placeholder ($FS, index, $FS) that stands in for it.
 *
 * @param $m array Regex match array
 * @return string
 */
function storeRaw( $m ) {
    // The index of the new element equals the element count before appending
    $index = count( $this->saveUrl );
    $this->saveUrl[] = $m[1];
    return $this->FS . $index . $this->FS;
}
1045
/**
 * Callback for [[free links]]: swaps the target for $this->new if it
 * equals $this->old once surrounding whitespace is ignored, then stashes
 * the rebuilt link via storeRaw().
 *
 * @param $m array Regex match array; $m[1] is the target, $m[2] the
 *   optional display text
 * @return string Placeholder returned by storeRaw()
 */
function subFreeLink( $m ) {
    $target = $m[1];
    $label = isset( $m[2] ) ? $m[2] : '';

    $trimmed = preg_replace( array( '/^\s+/', '/\s+$/' ), '', $target );
    if ( $trimmed == $this->old ) {
        $target = $this->new;
    }
    // Otherwise the target keeps its original spacing

    $markup = "[[$target";
    $markup .= ( $label !== "" ) ? "|$label" : '';
    $markup .= "]]";
    return $this->storeRaw( array( 1 => $markup ) );
}
1068
/**
 * Callback for CamelCase links: swaps a link equal to $this->old for
 * $this->new, wrapping the new target in [[...]] if it does not itself
 * match the CamelCase link pattern. The result is stashed via storeRaw().
 *
 * @param $m array Regex match array
 * @return string Placeholder returned by storeRaw()
 */
function subWikiLink( $m ) {
    $replacement = $m[1];
    if ( $replacement == $this->old ) {
        $isWikiWord = preg_match( "/^{$this->LinkPattern}$/", $this->new );
        $replacement = $isWikiWord ? $this->new : "[[{$this->new}]]";
    }
    return $this->storeRaw( array( 1 => $replacement ) );
}
1079
/**
 * Callback that expands a placeholder back to the text stashed by storeRaw().
 *
 * @param $m array Regex match array; $m[1] is the index into $this->saveUrl
 * @return string
 */
function restoreRaw( $m ) {
    $index = $m[1];
    return $this->saveUrl[$index];
}
1083
/**
 * Callback that appends the matched link target to $this->linkList and
 * then stashes the match like storeRaw().
 *
 * @param $m array Regex match array
 * @return string Placeholder returned by storeRaw()
 */
function storeLink( $m ) {
    $target = $m[1];
    $this->linkList[] = $target;
    return $this->storeRaw( $m );
}
1088
/**
 * Converts a string from the source byte encoding to UTF-8 using $encodeMap.
 *
 * @param $s string
 * @return string
 */
function encode( $s ) {
    $map = $this->encodeMap;
    return strtr( $s, $map );
}
1092
/**
 * Converts a UTF-8 string back to the source byte encoding using
 * $decodeMap, the inverse of encode().
 *
 * @param $s string
 * @return string
 */
function decode( $s ) {
    $map = $this->decodeMap;
    return strtr( $s, $map );
}
1096
/**
 * Prints a string given in the source byte encoding, after converting it
 * with encode().
 *
 * @param $s string
 */
function printLatin1( $s ) {
    $converted = $this->encode( $s );
    echo $converted;
}
1100
/**
 * Splits a serialised hash, a flat list of alternating keys and values,
 * into an associative array.
 *
 * A trailing key without a value is mapped to null rather than reading
 * past the end of the list.
 *
 * @param $s string Serialised data
 * @param $sep string Separator between the list items
 * @return array
 */
function unserializeUseMod( $s, $sep ) {
    $parts = explode( $sep, $s );
    $numParts = count( $parts );
    $result = array();
    for ( $i = 0; $i < $numParts; $i += 2 ) {
        $result[$parts[$i]] = ( $i + 1 < $numParts ) ? $parts[$i + 1] : null;
    }
    return $result;
}
1109 }
1110
1111 $maintClass = 'ImportUseModWikipedia';
1112 require_once( DO_MAINTENANCE );