4 * A script to read a dump of the English Wikipedia from the UseModWiki period, and to
5 * generate an XML dump in MediaWiki format.
7 * Some relevant code was ported from UseModWiki 0.92.
11 require_once( dirname( __FILE__
) . '/Maintenance.php' );
12 require_once( dirname( __FILE__
) .'/../includes/normal/UtfNormalUtil.php' );
15 class ImportUseModWikipedia
extends Maintenance
{
16 var $encodeMap, $decodeMap;
18 var $deepRenames = array(
19 'JimboWales' => 983862286,
21 'HistoryOfUnitedStatesTalk' => 984795423,
22 'MetallicA' => 985128533,
23 'PythagoreanTheorem' => 985225545,
24 'TheCanonofScripture' => 985368223,
25 'TaoTehChing' => 985368222,
26 //'TheMostRemarkableFormulaInTheWorld' => 985368221,
27 'TheRecorder' => 985368220,
28 'GladstoneOregon' => 985368219,
29 #'UnitedStatesConstitution/AmendmentTwo' =>
32 var $replacements = array();
34 var $renameTextLinksOps = array(
36 'TestIgnore' => 'IgnoreTest',
39 'UnitedLocomotiveWorks' => 'Atlas Shrugged/United Locomotive Works'
42 'WikiPedia' => 'Wikipedia',
45 'John_F_Kennedy' => 'John_F._Kennedy',
48 'LarrySanger' => 'Larry_Sanger'
51 'UnitedStates' => 'United States',
54 'LibertarianisM' => 'Libertarianism'
57 'AnarchisM' => 'Anarchism',
60 'HistoryOfUnitedStatesDiscussion' => 'History_Of_United_States_Discussion'
63 'BritishEmpire' => 'British Empire'
67 'ScienceFiction' => 'Science fiction',
72 * Hack for observed substitution issues
74 var $skipSelfSubstitution = array(
75 'Pythagorean_Theorem',
76 'The_Most_Remarkable_Formula_In_The_World',
80 var $unixLineEndingsOps = array(
81 987743732 => 'Wikipedia_FAQ'
84 var $replacementsDone = array();
86 var $moveLog = array();
87 var $moveDests = array();
91 var $textCache = array();
92 var $blacklist = array();
94 var $FS, $FS1, $FS2, $FS3;
95 var $FreeLinkPattern, $UrlPattern, $LinkPattern, $InterLinkPattern;
97 var $cp1252Table = <<<EOT
/**
 * Register the command-line options, set up the link patterns and build
 * the two character translation maps from $this->cp1252Table.
 *
 * Each table row is split on a tab into two fields. The first two
 * characters of each field are discarded and the remainder is read as a
 * hexadecimal number (presumably the fields look like "0x80" -- confirm
 * against the table). The first number becomes a single byte, the second
 * a UTF-8 sequence via codepointToUtf8().
 */
public function __construct() {
	parent::__construct();

	$this->addOption( 'datadir', 'the value of $DataDir from wiki.cgi', true, true );
	$this->addOption( 'outfile', 'the name of the output XML file', true, true );
	$this->initLinkPatterns();

	// byte => UTF-8 sequence, and the reverse direction
	$byteToUtf8 = array();
	$utf8ToByte = array();

	$rows = explode( "\n", $this->cp1252Table );
	foreach ( $rows as $row ) {
		list( $byteField, $codepointField ) = explode( "\t", $row );

		$byteValue = base_convert( substr( $byteField, 2 ), 16, 10 );
		$codepoint = base_convert( substr( $codepointField, 2 ), 16, 10 );

		$byteChar = chr( $byteValue );
		$utf8Char = codepointToUtf8( $codepoint );

		$byteToUtf8[$byteChar] = $utf8Char;
		$utf8ToByte[$utf8Char] = $byteChar;
	}

	$this->encodeMap = $byteToUtf8;
	$this->decodeMap = $utf8ToByte;
}
371 function initLinkPatterns() {
372 # Field separators are used in the URL-style patterns below.
373 $this->FS
= "\xb3"; # The FS character is a superscript "3"
374 $this->FS1
= $this->FS
. "1"; # The FS values are used to separate fields
375 $this->FS2
= $this->FS
. "2"; # in stored hashtables and other data structures.
376 $this->FS3
= $this->FS
. "3"; # The FS character is not allowed in user data.
378 $UpperLetter = "[A-Z";
379 $LowerLetter = "[a-z";
380 $AnyLetter = "[A-Za-z";
381 $AnyLetter .= "_0-9";
382 $UpperLetter .= "]"; $LowerLetter .= "]"; $AnyLetter .= "]";
384 # Main link pattern: lowercase between uppercase, then anything
385 $LpA = $UpperLetter . "+" . $LowerLetter . "+" . $UpperLetter
387 # Optional subpage link pattern: uppercase, lowercase, then anything
388 $LpB = $UpperLetter . "+" . $LowerLetter . "+" . $AnyLetter . "*";
390 # Loose pattern: If subpage is used, subpage may be simple name
391 $this->LinkPattern
= "((?:(?:$LpA)?\\/$LpB)|$LpA)";
392 $QDelim = '(?:"")?'; # Optional quote delimiter (not in output)
393 $this->LinkPattern
.= $QDelim;
395 # Inter-site convention: sites must start with uppercase letter
396 # (Uppercase letter avoids confusion with URLs)
397 $InterSitePattern = $UpperLetter . $AnyLetter . "+";
398 $this->InterLinkPattern
= "((?:$InterSitePattern:[^\\]\\s\"<>{$this->FS}]+)$QDelim)";
400 $AnyLetter = "[-,. _0-9A-Za-z]";
401 $this->FreeLinkPattern
= "($AnyLetter+)";
402 $this->FreeLinkPattern
= "((?:(?:$AnyLetter+)?\\/)?$AnyLetter+)";
403 $this->FreeLinkPattern
.= $QDelim;
405 # Url-style links are delimited by one of:
406 # 1. Whitespace (kept in output)
407 # 2. Left or right angle-bracket (< or >) (kept in output)
408 # 3. Right square-bracket (]) (kept in output)
409 # 4. A single double-quote (") (kept in output)
410 # 5. A $FS (field separator) character (kept in output)
411 # 6. A double double-quote ("") (removed from output)
413 $UrlProtocols = "http|https|ftp|afs|news|nntp|mid|cid|mailto|wais|"
414 . "prospero|telnet|gopher";
415 $UrlProtocols .= '|file';
416 $this->UrlPattern
= "((?:(?:$UrlProtocols):[^\\]\\s\"<>{$this->FS}]+)$QDelim)";
417 $ImageExtensions = "(gif|jpg|png|bmp|jpeg)";
418 $RFCPattern = "RFC\\s?(\\d+)";
419 $ISBNPattern = "ISBN:?([0-9- xX]{10,})";
423 $this->articleFileName
= '/tmp/importUseMod.' . mt_rand( 0, 0x7ffffff ) . '.tmp';
424 $this->patchFileName
= '/tmp/importUseMod.' . mt_rand( 0, 0x7ffffff ) . '.tmp';
425 $this->dataDir
= $this->getOption( 'datadir' );
426 $this->outFile
= fopen( $this->getOption( 'outfile' ), 'w' );
427 if ( !$this->outFile
) {
428 echo "Unable to open output file\n";
431 $this->writeXmlHeader();
433 $this->writeMoveLog();
434 $this->writeRevisions();
435 $this->reconcileCurrentRevs();
436 $this->writeXmlFooter();
437 unlink( $this->articleFileName
);
438 unlink( $this->patchFileName
);
442 function writeXmlHeader() {
443 fwrite( $this->outFile
, <<<EOT
444 <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.3/ http://www.mediawiki.org/xml/export-0.3.xsd" version="0.3" xml:lang="en">
446 <sitename>Wikipedia</sitename>
447 <base>http://www.wikipedia.com/</base>
448 <generator>MediaWiki 1.18alpha importUseModWikipedia.php</generator>
449 <case>case-sensitive</case>
451 <namespace key="0" />
/**
 * Write the closing </mediawiki> tag to the output file handle.
 */
function writeXmlFooter() {
	$closingTag = "</mediawiki>\n";
	fwrite( $this->outFile, $closingTag );
}
463 function readRclog() {
464 $rcFile = fopen( "{$this->dataDir}/rclog", 'r' );
465 while ( $line = fgets( $rcFile ) ) {
466 $bits = explode( $this->FS3
, $line );
467 if ( count( $bits ) !== 7 ) {
468 echo "Error reading rclog\n";
472 'timestamp' => $bits[0],
473 'rctitle' => $bits[1],
474 'summary' => $bits[2],
480 $extraList = explode( $this->FS2
, $bits[6] );
482 for ( $i = 0; $i < count( $extraList ); $i +
= 2 ) {
483 $params['extra'][$extraList[$i]] = $extraList[$i +
1];
485 $this->rc
[$params['timestamp']][] = $params;
489 function writeMoveLog() {
490 $this->moveLog
= array();
491 $deepRenames = $this->deepRenames
;
492 echo "Calculating move log...\n";
493 $this->processDiffFile( array( $this, 'moveLogCallback' ) );
495 // We have the timestamp intervals, now make a guess at the actual timestamp
496 foreach ( $this->moveLog
as $newTitle => $params ) {
497 // Is there a time specified?
499 if ( isset( $deepRenames[$params['old']] ) ) {
500 $drTime = $deepRenames[$params['old']];
501 if ( $drTime !== '?' ) {
502 if ( ( !isset( $params['endTime'] ) ||
$drTime < $params['endTime'] )
503 && $drTime > $params['startTime'] )
505 $this->moveLog
[$newTitle]['timestamp'] = $drTime;
506 $this->moveLog
[$newTitle]['deep'] = true;
508 echo "{$params['old']} -> $newTitle at $drTime\n";
509 unset( $deepRenames[$params['old']] );
512 echo "WARNING: deep rename time invalid: {$params['old']}\n";
513 unset( $deepRenames[$params['old']] );
518 // Guess that it is one second after the last edit to the page before it was moved
519 $this->moveLog
[$newTitle]['timestamp'] = $params['startTime'] +
1;
520 if ( $drTime === '?' ) {
521 $this->moveLog
[$newTitle]['deep'] = true;
522 unset( $deepRenames[$params['old']] );
524 if ( isset( $params['endTime'] ) ) {
525 $this->printLatin1( "{$params['old']} -> $newTitle between " .
526 "{$params['startTime']} and {$params['endTime']}\n" );
528 $this->printLatin1( "{$params['old']} -> $newTitle after " .
529 "{$params['startTime']}\n" );
533 // Write the move log to the XML file
535 foreach ( $this->moveLog
as $newTitle => $params ) {
536 $out = "<logitem>\n" .
537 $this->element( 'id', $id++
) .
538 $this->element( 'timestamp', wfTimestamp( TS_ISO_8601
, $params['timestamp'] ) ) .
540 $this->element( 'username', 'UseModWiki admin' ) .
542 $this->element( 'type', 'move' ) .
543 $this->element( 'action', 'move' ) .
544 $this->element( 'logtitle', $params['old'] ) .
545 "<params xml:space=\"preserve\">" .
546 htmlspecialchars( $this->encode( "{$newTitle}\n1" ) ) .
549 fwrite( $this->outFile
, $out );
552 // Check for remaining deep rename entries
553 if ( $deepRenames ) {
554 echo "WARNING: the following entries in \$this->deepRenames are " .
555 "invalid, since no such move exists:\n" .
556 implode( "\n", array_keys( $deepRenames ) ) .
/**
 * Build a single XML element followed by a newline.
 *
 * The value is passed through $this->encode() and then HTML-escaped
 * before being wrapped in the opening and closing tags.
 *
 * @param $name String: tag name, inserted verbatim
 * @param $value String: element content
 * @return String
 */
function element( $name, $value ) {
	$escaped = htmlspecialchars( $this->encode( $value ) );
	return "<{$name}>{$escaped}</{$name}>\n";
}
566 function moveLogCallback( $entry ) {
567 $rctitle = $entry['rctitle'];
568 $title = $entry['title'];
569 $this->moveDests
[$rctitle] = $title;
571 if ( $rctitle === $title ) {
572 if ( isset( $this->moveLog
[$rctitle] )
573 && !isset( $this->moveLog
[$rctitle]['endTime'] ) )
575 // This is the latest time that the page could have been moved
576 $this->moveLog
[$rctitle]['endTime'] = $entry['timestamp'];
579 if ( !isset( $this->moveLog
[$rctitle] ) ) {
580 // Initialise the move log entry
581 $this->moveLog
[$rctitle] = array(
585 // Update the earliest time the page could have been moved
586 $this->moveLog
[$rctitle]['startTime'] = $entry['timestamp'];
590 function writeRevisions() {
591 $this->numGoodRevs
= 0;
593 $this->processDiffFile( array( $this, 'revisionCallback' ) );
594 echo "\n\nImported {$this->numGoodRevs} out of {$this->numRevs}\n";
597 function revisionCallback( $params ) {
598 $origTitle = $params['title'];
599 $title = $params['rctitle'];
600 $editTime = $params['timestamp'];
602 if ( isset( $this->blacklist
[$title] ) ) {
605 $this->doPendingOps( $editTime );
607 $origText = $this->getText( $title );
608 $text = $this->patch( $origText, $params['diff'] );
609 if ( $text === false ) {
610 echo "$editTime $title attempting resolution...\n";
611 $linkSubstitutes = $this->resolveFailedDiff( $origText, $params['diff'] );
612 if ( !$linkSubstitutes ) {
613 $this->printLatin1( "$editTime $title DIFF FAILED\n" );
614 $this->blacklist
[$title] = true;
617 $this->printLatin1( "$editTime $title requires substitutions:\n" );
618 $time = $editTime - 1;
619 foreach ( $linkSubstitutes as $old => $new ) {
620 $this->printLatin1( "SUBSTITUTE $old -> $new\n" );
621 $this->renameTextLinks( $old, $new, $time-- );
623 $origText = $this->getText( $title );
624 $text = $this->patch( $origText, $params['diff'] );
625 if ( $text === false ) {
626 $this->printLatin1( "$editTime $title STILL FAILS!\n" );
627 $this->blacklist
[$title] = true;
634 $params['text'] = $text;
635 $this->saveRevision( $params );
636 $this->numGoodRevs++
;
637 #$this->printLatin1( "$editTime $title\n" );
640 function doPendingOps( $editTime ) {
641 foreach ( $this->moveLog
as $newTitle => $entry ) {
642 if ( $entry['timestamp'] <= $editTime ) {
643 unset( $this->moveLog
[$newTitle] );
644 if ( isset( $entry['deep'] ) ) {
645 $this->renameTextLinks( $entry['old'], $newTitle, $entry['timestamp'] );
650 foreach ( $this->renameTextLinksOps
as $renameTime => $replacements ) {
651 if ( $editTime >= $renameTime ) {
652 foreach ( $replacements as $old => $new ) {
653 $this->printLatin1( "SUBSTITUTE $old -> $new\n" );
654 $this->renameTextLinks( $old, $new, $renameTime );
656 unset( $this->renameTextLinksOps
[$renameTime] );
660 foreach ( $this->unixLineEndingsOps
as $fixTime => $title ) {
661 if ( $editTime >= $fixTime ) {
662 $this->printLatin1( "$fixTime $title FIXING LINE ENDINGS\n" );
663 $text = $this->getText( $title );
664 $text = str_replace( "\r", '', $text );
665 $this->saveRevision( array(
667 'timestamp' => $fixTime,
668 'extra' => array( 'name' => 'UseModWiki admin' ),
670 'summary' => 'Fixing line endings',
672 unset( $this->unixLineEndingsOps
[$fixTime] );
677 function patch( $source, $diff ) {
678 file_put_contents( $this->articleFileName
, $source );
679 file_put_contents( $this->patchFileName
, $diff );
680 $error = wfShellExec(
685 '--no-backup-if-mismatch',
687 $this->articleFileName
,
692 $text = file_get_contents( $this->articleFileName
);
693 if ( $status ||
$text === false ) {
700 function resolveFailedDiff( $origText, $diff ) {
702 $rxRange = '\d+(?:,(\d+))?';
703 $diffLines = explode( "\n", $diff );
704 for ( $i = 0; $i < count( $diffLines ); $i++
) {
705 $diffLine = $diffLines[$i];
706 if ( !preg_match( '/^(\d+)(?:,\d+)?[acd]\d+(?:,\d+)?$/', $diffLine, $m ) ) {
710 $sourceIndex = intval( $m[1] );
712 while ( $i < count( $diffLines ) && substr( $diffLines[$i], 0, 1 ) === '<' ) {
713 $context[$sourceIndex - 1] = substr( $diffLines[$i], 2 );
720 $changedLinks = array();
721 $origLines = explode( "\n", $origText );
722 foreach ( $context as $i => $contextLine ) {
723 $origLine = isset( $origLines[$i] ) ?
$origLines[$i] : '';
724 if ( $contextLine === $origLine ) {
727 $newChanges = $this->resolveTextChange( $origLine, $contextLine );
728 if ( is_array( $newChanges ) ) {
729 $changedLinks +
= $newChanges;
731 echo "Resolution failure on line " . ( $i +
1 ) . "\n";
732 $this->printLatin1( $newChanges );
736 return $changedLinks;
739 function resolveTextChange( $source, $dest ) {
740 $changedLinks = array();
741 $sourceLinks = $this->getLinkList( $source );
742 $destLinks = $this->getLinkList( $dest );
743 $newLinks = array_diff( $destLinks, $sourceLinks );
744 $removedLinks = array_diff( $sourceLinks, $destLinks );
746 // Match up the removed links with the new links
747 foreach ( $newLinks as $j => $newLink ) {
748 $minDistance = 100000000;
749 $bestRemovedLink = false;
750 foreach ( $removedLinks as $k => $removedLink ) {
751 $editDistance = levenshtein( $newLink, $removedLink );
752 if ( $editDistance < $minDistance ) {
753 $minDistance = $editDistance;
754 $bestRemovedLink = $removedLink;
757 if ( $bestRemovedLink !== false ) {
758 $changedLinks[$bestRemovedLink] = $newLink;
759 $newLinks = array_diff( $newLinks, array( $newLink ) );
760 $removedLinks = array_diff( $removedLinks, array( $bestRemovedLink ) );
765 foreach ( $changedLinks as $removedLink => $newLink ) {
766 $proposal = $this->substituteTextLinks( $removedLink, $newLink, $proposal );
768 if ( $proposal !== $dest ) {
770 $msg = "Source line: $source\n" .
771 "Source links: " . implode( ', ', $sourceLinks ) . "\n" .
772 "Context line: $dest\n" .
773 "Context links: " . implode( ', ', $destLinks ) . "\n" .
774 "Proposal: $proposal\n";
777 return $changedLinks;
780 function processDiffFile( $callback ) {
781 $diffFile = fopen( "{$this->dataDir}/diff_log", 'r' );
783 $delimiter = "------\n";
784 file_put_contents( $this->articleFileName
, "Describe the new page here.\n" );
786 $line = fgets( $diffFile );
788 if ( $line !== $delimiter ) {
789 echo "Invalid diff file\n";
796 $line = fgets( $diffFile );
798 if ( $line === false ) {
801 if ( $lineNum > $lastReportLine +
1000 ) {
802 $lastReportLine = $lineNum;
803 fwrite( STDERR
, "$lineNum \r" );
806 $line = trim( $line );
807 if ( !preg_match( '/^([^|]+)\|(\d+)$/', $line, $matches ) ) {
808 echo "Invalid header on line $lineNum\n";
811 list( , $title, $editTime ) = $matches;
814 $diffStartLine = $lineNum;
816 $line = fgets( $diffFile );
818 if ( $line === $delimiter ) {
821 if ( $line === false ) {
829 if ( !isset( $this->rc
[$editTime] ) ) {
830 $this->printLatin1( "$editTime $title DELETED, skipping\n" );
834 if ( count( $this->rc
[$editTime] ) == 1 ) {
835 $params = $this->rc
[$editTime][0];
839 foreach ( $this->rc
[$editTime] as $rc ) {
840 if ( $rc['rctitle'] === $title ) {
844 if ( $candidates === '' ) {
845 $candidates = $rc['rctitle'];
847 $candidates .= ', ' . $rc['rctitle'];
851 $this->printLatin1( "$editTime $title ERROR cannot resolve rclog\n" );
852 $this->printLatin1( "$editTime $title CANDIDATES: $candidates\n" );
856 $params['diff'] = $diff;
857 $params['title'] = $title;
858 $params['diffStartLine'] = $diffStartLine;
859 call_user_func( $callback, $params );
863 if ( !feof( $diffFile ) ) {
864 echo "Stopped at line $lineNum\n";
869 function reconcileCurrentRevs() {
870 foreach ( $this->textCache
as $title => $text ) {
871 $fileName = "{$this->dataDir}/page/";
872 if ( preg_match( '/^[A-Z]/', $title, $m ) ) {
875 $fileName .= 'other';
877 $fileName .= "/$title.db";
879 if ( !file_exists( $fileName ) ) {
880 $this->printLatin1( "ERROR: Cannot find page file for {$title}\n" );
884 $fileContents = file_get_contents( $fileName );
885 $page = $this->unserializeUseMod( $fileContents, $this->FS1
);
886 $section = $this->unserializeUseMod( $page['text_default'], $this->FS2
);
887 $data = $this->unserializeUseMod( $section['data'], $this->FS3
);
888 $pageText = $data['text'];
889 if ( $text !== $pageText ) {
890 $substs = $this->resolveTextChange( $text, $pageText );
891 if ( is_array( $substs ) ) {
892 foreach ( $substs as $source => $dest ) {
893 if ( isset( $this->moveLog
[$dest] ) ) {
894 $this->printLatin1( "ERROR: need deep rename: $source\n" );
896 $this->printLatin1( "ERROR: need substitute: $source -> $dest\n" );
900 $this->printLatin1( "ERROR: unresolved diff in $title:\n" );
901 wfSuppressWarnings();
902 $diff = xdiff_string_diff( $text, $pageText ) . '';
904 $this->printLatin1( "$diff\n" );
/**
 * Create a Title from a title string, after running the string through
 * $this->encode().
 *
 * @param $titleText String
 * @return whatever Title::newFromText() returns for the encoded text
 */
function makeTitle( $titleText ) {
	$encodedTitle = $this->encode( $titleText );
	return Title::newFromText( $encodedTitle );
}
/**
 * Fetch the cached text stored under a title.
 *
 * When nothing is cached under that title, the placeholder text
 * "Describe the new page here.\n" is returned instead.
 *
 * @param $titleText String: key into $this->textCache
 * @return String
 */
function getText( $titleText ) {
	return isset( $this->textCache[$titleText] )
		? $this->textCache[$titleText]
		: "Describe the new page here.\n";
}
922 function saveRevision( $params ) {
923 $this->textCache
[$params['rctitle']] = $params['text'];
926 $this->element( 'title', $params['rctitle'] ) .
928 $this->element( 'id', $this->revId ++
) .
929 $this->element( 'timestamp', wfTimestamp( TS_ISO_8601
, $params['timestamp'] ) ) .
931 if ( isset( $params['extra']['name'] ) ) {
932 $out .= $this->element( 'username', $params['extra']['name'] );
934 if ( isset( $params['extra']['id'] ) ) {
935 $out .= $this->element( 'id', $params['extra']['id'] );
937 if ( isset( $params['host'] ) ) {
938 $out .= $this->element( 'ip', $params['host'] );
942 $this->element( 'comment', $params['summary'] ) .
943 "<text xml:space=\"preserve\">" .
944 htmlspecialchars( $this->encode( $params['text'] ) ) .
948 fwrite( $this->outFile
, $out );
951 function renameTextLinks( $old, $new, $timestamp ) {
952 $newWithUnderscores = $new;
953 $old = str_replace( '_', ' ', $old );
954 $new = str_replace( '_', ' ', $new );
956 foreach ( $this->textCache
as $title => $oldText ) {
957 if ( $newWithUnderscores === $title
958 && in_array( $title, $this->skipSelfSubstitution
) )
960 // Hack to make Pythagorean_Theorem etc. work
964 $newText = $this->substituteTextLinks( $old, $new, $oldText );
965 if ( $oldText !== $newText ) {
966 $this->saveRevision( array(
968 'timestamp' => $timestamp,
970 'extra' => array( 'name' => 'Page move link fixup script' ),
978 function substituteTextLinks( $old, $new, $text ) {
979 $this->saveUrl
= array();
983 $text = str_replace( $this->FS
, '', $text ); # Remove separators (paranoia)
984 $text = preg_replace_callback( '/(<pre>(.*?)<\/pre>)/is',
985 array( $this, 'storeRaw' ), $text );
986 $text = preg_replace_callback( '/(<code>(.*?)<\/code>)/is',
987 array( $this, 'storeRaw' ), $text );
988 $text = preg_replace_callback( '/(<nowiki>(.*?)<\/nowiki>)/s',
989 array( $this, 'storeRaw' ), $text );
991 $text = preg_replace_callback( "/\[\[{$this->FreeLinkPattern}\|([^\]]+)\]\]/",
992 array( $this, 'subFreeLink' ), $text );
993 $text = preg_replace_callback( "/\[\[{$this->FreeLinkPattern}\]\]/",
994 array( $this, 'subFreeLink' ), $text );
995 $text = preg_replace_callback( "/(\[{$this->UrlPattern}\s+([^\]]+?)\])/",
996 array( $this, 'storeRaw' ), $text );
997 $text = preg_replace_callback( "/(\[{$this->InterLinkPattern}\s+([^\]]+?)\])/",
998 array( $this, 'storeRaw' ), $text );
999 $text = preg_replace_callback( "/(\[?{$this->UrlPattern}\]?)/",
1000 array( $this, 'storeRaw' ), $text );
1001 $text = preg_replace_callback( "/(\[?{$this->InterLinkPattern}\]?)/",
1002 array( $this, 'storeRaw' ), $text );
1003 $text = preg_replace_callback( "/{$this->LinkPattern}/",
1004 array( $this, 'subWikiLink' ), $text );
1006 $text = preg_replace_callback( "/{$this->FS}(\d+){$this->FS}/",
1007 array( $this, 'restoreRaw' ), $text ); # Restore saved text
1011 function getLinkList( $text ) {
1012 $this->saveUrl
= array();
1013 $this->linkList
= array();
1015 $text = str_replace( $this->FS
, '', $text ); # Remove separators (paranoia)
1016 $text = preg_replace_callback( '/(<pre>(.*?)<\/pre>)/is',
1017 array( $this, 'storeRaw' ), $text );
1018 $text = preg_replace_callback( '/(<code>(.*?)<\/code>)/is',
1019 array( $this, 'storeRaw' ), $text );
1020 $text = preg_replace_callback( '/(<nowiki>(.*?)<\/nowiki>)/s',
1021 array( $this, 'storeRaw' ), $text );
1023 $text = preg_replace_callback( "/\[\[{$this->FreeLinkPattern}\|([^\]]+)\]\]/",
1024 array( $this, 'storeLink' ), $text );
1025 $text = preg_replace_callback( "/\[\[{$this->FreeLinkPattern}\]\]/",
1026 array( $this, 'storeLink' ), $text );
1027 $text = preg_replace_callback( "/(\[{$this->UrlPattern}\s+([^\]]+?)\])/",
1028 array( $this, 'storeRaw' ), $text );
1029 $text = preg_replace_callback( "/(\[{$this->InterLinkPattern}\s+([^\]]+?)\])/",
1030 array( $this, 'storeRaw' ), $text );
1031 $text = preg_replace_callback( "/(\[?{$this->UrlPattern}\]?)/",
1032 array( $this, 'storeRaw' ), $text );
1033 $text = preg_replace_callback( "/(\[?{$this->InterLinkPattern}\]?)/",
1034 array( $this, 'storeRaw' ), $text );
1035 $text = preg_replace_callback( "/{$this->LinkPattern}/",
1036 array( $this, 'storeLink' ), $text );
1038 return $this->linkList
;
/**
 * preg_replace_callback() handler: stash the first capture group in
 * $this->saveUrl and return a placeholder in its place.
 *
 * The placeholder is the stash index wrapped in the FS separator on both
 * sides, which is the form restoreRaw() is matched against.
 *
 * @param $m Array: regex match; only $m[1] is used
 * @return String
 */
function storeRaw( $m ) {
	$this->saveUrl[] = $m[1];
	$index = count( $this->saveUrl ) - 1;
	return "{$this->FS}{$index}{$this->FS}";
}
1046 function subFreeLink( $m ) {
1048 if ( isset( $m[2] ) ) {
1054 $link = preg_replace( '/^\s+/', '', $link );
1055 $link = preg_replace( '/\s+$/', '', $link );
1056 if ( $link == $this->old
) {
1059 $link = $oldlink; # Preserve spaces if no match
1062 if ( $name !== "" ) {
1066 return $this->storeRaw( array( 1 => $link ) );
1069 function subWikiLink( $m ) {
1071 if ( $link == $this->old
) {
1073 if ( !preg_match( "/^{$this->LinkPattern}$/", $this->new ) ) {
1074 $link = "[[$link]]";
1077 return $this->storeRaw( array( 1 => $link ) );
/**
 * preg_replace_callback() handler: look up the text stashed in
 * $this->saveUrl under the index captured in $m[1].
 *
 * @param $m Array: regex match; $m[1] is the stash index
 * @return String
 */
function restoreRaw( $m ) {
	$index = $m[1];
	return $this->saveUrl[$index];
}
/**
 * preg_replace_callback() handler: append the first capture group to
 * $this->linkList, then hand the match to storeRaw() and return its
 * placeholder.
 *
 * @param $m Array: regex match
 * @return String
 */
function storeLink( $m ) {
	$target = $m[1];
	$this->linkList[] = $target;
	return $this->storeRaw( $m );
}
/**
 * Translate a string with strtr() using $this->encodeMap, the
 * byte => UTF-8 map built in the constructor.
 *
 * @param $s String
 * @return String
 */
function encode( $s ) {
	$map = $this->encodeMap;
	return strtr( $s, $map );
}
/**
 * Translate a string with strtr() using $this->decodeMap, the
 * UTF-8 => byte map built in the constructor (the inverse of encode()).
 *
 * @param $s String
 * @return String
 */
function decode( $s ) {
	$map = $this->decodeMap;
	return strtr( $s, $map );
}
/**
 * Echo a string after passing it through $this->encode().
 *
 * @param $s String
 */
function printLatin1( $s ) {
	$converted = $this->encode( $s );
	echo $converted;
}
/**
 * Split a separator-delimited string of alternating keys and values into
 * an associative array.
 *
 * Fields at even positions are used as keys and the field that follows
 * each one as its value. If the string holds an odd number of fields
 * (which includes the empty string), the last key is kept with a null
 * value instead of reading past the end of the field list.
 *
 * @param $s String: serialized data
 * @param $sep String: field separator
 * @return Array: key => value
 */
function unserializeUseMod( $s, $sep ) {
	$parts = explode( $sep, $s );
	$numParts = count( $parts );
	$result = array();
	for ( $i = 0; $i < $numParts; $i += 2 ) {
		// A trailing key without a value maps to null
		$result[$parts[$i]] = isset( $parts[$i + 1] ) ? $parts[$i + 1] : null;
	}
	return $result;
}
1111 $maintClass = 'ImportUseModWikipedia';
1112 require_once( DO_MAINTENANCE
);