maintenance/importUseModWiki.php

   1 <?php
   2
   3 /**
   4  * Import data from a UseModWiki into a MediaWiki wiki
   5  * 2003-02-09 Brion VIBBER <brion@pobox.com>
   6  * Based loosely on Magnus's code from 2001-2002
   7  *
   8  * Updated limited version to get something working temporarily
   9  * 2003-10-09
  10  * Be sure to run the link & index rebuilding scripts!
  11  *
  12  * Some more munging for charsets etc
  13  * 2003-11-28
  14  *
  15  * Partial fix for pages starting with lowercase letters (??)
  16  * and CamelCase and /Subpage link conversion
  17  * 2004-11-17
  18  *
  19  * Rewrite output to create Special:Export format for import
  20  * instead of raw SQL. Should be 'future-proof' against future
  21  * schema changes.
  22  * 2005-03-14
  23  *
  24  * @todo document
  25  * @addtogroup Maintenance
  26  */
  27
  28 if( php_sapi_name() != 'cli' ) {
  29         echo "Please customize the settings and run me from the command line.";
  30         die( -1 );
  31 }
  32
  33 /** Set these correctly! */
  34 $wgImportEncoding = "CP1252"; /* We convert all to UTF-8 */
  35 $wgRootDirectory = "/kalman/Projects/wiki2002/wiki/lib-http/db/wiki";
  36
  37 /* On a large wiki, you might run out of memory */
  38 @ini_set( 'memory_limit', '40M' );
  39
  40 /* globals */
  41 $wgFieldSeparator = "\xb3"; # Some wikis may use different char
  42         $FS = $wgFieldSeparator ;
  43         $FS1 = $FS."1" ;
  44         $FS2 = $FS."2" ;
  45         $FS3 = $FS."3" ;
  46
  47 # Unicode sanitization tools
  48 require_once( '../includes/normal/UtfNormal.php' );
  49
  50 $usercache = array();
  51
  52 importPages();
  53
  54 # ------------------------------------------------------------------------------
  55
  56 function importPages()
  57 {
  58         global $wgRootDirectory;
  59
  60         $gt = '>';
  61         echo <<<END
  62 <?xml version="1.0" encoding="UTF-8" ?$gt
  63 <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.1/"
  64            xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  65            xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.1/
  66                                http://www.mediawiki.org/xml/export-0.1.xsd"
  67            version="0.1"
  68            xml:lang="en">
  69 <!-- generated by importUseModWiki.php -->
  70
  71 END;
  72         $letters = array(
  73                 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
  74                 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
  75                 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'other' );
  76         foreach( $letters as $letter ) {
  77                 $dir = "$wgRootDirectory/page/$letter";
  78                 if( is_dir( $dir ) )
  79                         importPageDirectory( $dir );
  80         }
  81         echo <<<END
  82 </mediawiki>
  83
  84 END;
  85 }
  86
  87 function importPageDirectory( $dir, $prefix = "" )
  88 {
  89         echo "\n<!-- Checking page directory " . xmlCommentSafe( $dir ) . " -->\n";
  90         $mydir = opendir( $dir );
  91         while( $entry = readdir( $mydir ) ) {
  92                 $m = array();
  93                 if( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
  94                         echo importPage( $prefix . $m[1] );
  95                 } else {
  96                         if( is_dir( "$dir/$entry" ) ) {
  97                                 if( $entry != '.' && $entry != '..' ) {
  98                                         importPageDirectory( "$dir/$entry", "$entry/" );
  99                                 }
 100                         } else {
 101                                 echo "<!-- File '" . xmlCommentSafe( $entry ) . "' doesn't seem to contain an article. Skipping. -->\n";
 102                         }
 103                 }
 104         }
 105 }
 106
 107
 108 # ------------------------------------------------------------------------------
 109
 110 /* fetch_ functions
 111         Grab a given item from the database
 112         */
 113
 114 function useModFilename( $title ) {
 115         $c = substr( $title, 0, 1 );
 116         if(preg_match( '/[A-Z]/i', $c ) ) {
 117                 return strtoupper( $c ) . "/$title";
 118         }
 119         return "other/$title";
 120 }
 121
 122 function fetchPage( $title )
 123 {
 124         global $FS1,$FS2,$FS3, $wgRootDirectory;
 125
 126         $fname = $wgRootDirectory . "/page/" . useModFilename( $title ) . ".db";
 127         if( !file_exists( $fname ) ) {
 128                 echo "Couldn't open file '$fname' for page '$title'.\n";
 129                 die( -1 );
 130         }
 131
 132         $page = splitHash( $FS1, file_get_contents( $fname ) );
 133         $section = splitHash( $FS2, $page["text_default"] );
 134         $text = splitHash( $FS3, $section["data"] );
 135
 136         return array2object( array( "text" => $text["text"] , "summary" => $text["summary"] ,
 137                 "minor" => $text["minor"] , "ts" => $section["ts"] ,
 138                 "username" => $section["username"] , "host" => $section["host"] ) );
 139 }
 140
 141 function fetchKeptPages( $title )
 142 {
 143         global $FS1,$FS2,$FS3, $wgRootDirectory;
 144
 145         $fname = $wgRootDirectory . "/keep/" . useModFilename( $title ) . ".kp";
 146         if( !file_exists( $fname ) ) return array();
 147
 148         $keptlist = explode( $FS1, file_get_contents( $fname ) );
 149         array_shift( $keptlist ); # Drop the junk at beginning of file
 150
 151         $revisions = array();
 152         foreach( $keptlist as $rev ) {
 153                 $section = splitHash( $FS2, $rev );
 154                 $text = splitHash( $FS3, $section["data"] );
 155                 if ( $text["text"] && $text["minor"] != "" && ( $section["ts"]*1 > 0 ) ) {
 156                         array_push( $revisions, array2object( array ( "text" => $text["text"] , "summary" => $text["summary"] ,
 157                                 "minor" => $text["minor"] , "ts" => $section["ts"] ,
 158                                 "username" => $section["username"] , "host" => $section["host"] ) ) );
 159                 } else {
 160                         echo "<!-- skipped a bad old revision -->\n";
 161                 }
 162         }
 163         return $revisions;
 164 }
 165
 166 function splitHash ( $sep , $str ) {
 167         $temp = explode ( $sep , $str ) ;
 168         $ret = array () ;
 169         for ( $i = 0; $i+1 < count ( $temp ) ; $i++ ) {
 170                 $ret[$temp[$i]] = $temp[++$i] ;
 171                 }
 172         return $ret ;
 173         }
 174
 175
/* import_ functions
        Take a fetched item and produce Special:Export XML
        */
 179
 180 function checkUserCache( $name, $host )
 181 {
 182         global $usercache;
 183
 184         if( $name ) {
 185                 if( in_array( $name, $usercache ) ) {
 186                         $userid = $usercache[$name];
 187                 } else {
 188                         # If we haven't imported user accounts
 189                         $userid = 0;
 190                 }
 191                 $username = str_replace( '_', ' ', $name );
 192         } else {
 193                 $userid = 0;
 194                 $username = $host;
 195         }
 196         return array( $userid, $username );
 197 }
 198
 199 function importPage( $title )
 200 {
 201         global $usercache;
 202
 203         echo "\n<!-- Importing page " . xmlCommentSafe( $title ) . " -->\n";
 204         $page = fetchPage( $title );
 205
 206         $newtitle = xmlsafe( str_replace( '_', ' ', recodeText( $title ) ) );
 207
 208         $munged = mungeFormat( $page->text );
 209         if( $munged != $page->text ) {
 210                 /**
 211                  * Save a *new* revision with the conversion, and put the
 212                  * previous last version into the history.
 213                  */
 214                 $next = array2object( array(
 215                         'text'     => $munged,
 216                         'minor'    => 1,
 217                         'username' => 'Conversion script',
 218                         'host'     => '127.0.0.1',
 219                         'ts'       => time(),
 220                         'summary'  => 'link fix',
 221                         ) );
 222                 $revisions = array( $page, $next );
 223         } else {
 224                 /**
 225                  * Current revision:
 226                  */
 227                 $revisions = array( $page );
 228         }
 229         $xml = <<<END
 230         <page>
 231                 <title>$newtitle</title>
 232
 233 END;
 234
 235         # History
 236         $revisions = array_merge( $revisions, fetchKeptPages( $title ) );
 237         if(count( $revisions ) == 0 ) {
 238                 return NULL; // Was "$sql", which does not appear to be defined.
 239         }
 240
 241         foreach( $revisions as $rev ) {
 242                 $text      = xmlsafe( recodeText( $rev->text ) );
 243                 $minor     = ($rev->minor ? '<minor/>' : '');
 244                 list( /* $userid */ , $username ) = checkUserCache( $rev->username, $rev->host );
 245                 $username  = xmlsafe( recodeText( $username ) );
 246                 $timestamp = xmlsafe( timestamp2ISO8601( $rev->ts ) );
 247                 $comment   = xmlsafe( recodeText( $rev->summary ) );
 248
 249                 $xml .= <<<END
 250                 <revision>
 251                         <timestamp>$timestamp</timestamp>
 252                         <contributor><username>$username</username></contributor>
 253                         $minor
 254                         <comment>$comment</comment>
 255                         <text>$text</text>
 256                 </revision>
 257
 258 END;
 259         }
 260         $xml .= "</page>\n\n";
 261         return $xml;
 262 }
 263
 264 # Whee!
 265 function recodeText( $string ) {
 266         global $wgImportEncoding;
 267         # For currently latin-1 wikis
 268         $string = str_replace( "\r\n", "\n", $string );
 269         $string = @iconv( $wgImportEncoding, "UTF-8", $string );
 270         $string = wfMungeToUtf8( $string ); # Any old &#1234; stuff
 271         return $string;
 272 }
 273
 274 function wfUtf8Sequence($codepoint) {
 275         if($codepoint <     0x80) return chr($codepoint);
 276         if($codepoint <    0x800) return chr($codepoint >>  6 & 0x3f | 0xc0) .
 277                                      chr($codepoint       & 0x3f | 0x80);
 278     if($codepoint <  0x10000) return chr($codepoint >> 12 & 0x0f | 0xe0) .
 279                                      chr($codepoint >>  6 & 0x3f | 0x80) .
 280                                      chr($codepoint       & 0x3f | 0x80);
 281         if($codepoint < 0x100000) return chr($codepoint >> 18 & 0x07 | 0xf0) . # Double-check this
 282                                          chr($codepoint >> 12 & 0x3f | 0x80) .
 283                                      chr($codepoint >>  6 & 0x3f | 0x80) .
 284                                      chr($codepoint       & 0x3f | 0x80);
 285         # Doesn't yet handle outside the BMP
 286         return "&#$codepoint;";
 287 }
 288
 289 function wfMungeToUtf8($string) {
 290         $string = preg_replace ( '/&#([0-9]+);/e', 'wfUtf8Sequence($1)', $string );
 291         $string = preg_replace ( '/&#x([0-9a-f]+);/ie', 'wfUtf8Sequence(0x$1)', $string );
 292         # Should also do named entities here
 293         return $string;
 294 }
 295
 296 function timestamp2ISO8601( $ts ) {
 297         #2003-08-05T18:30:02Z
 298         return gmdate( 'Y-m-d', $ts ) . 'T' . gmdate( 'H:i:s', $ts ) . 'Z';
 299 }
 300
 301 function xmlsafe( $string ) {
 302         /**
 303          * The page may contain old data which has not been properly normalized.
 304          * Invalid UTF-8 sequences or forbidden control characters will make our
 305          * XML output invalid, so be sure to strip them out.
 306          */
 307         $string = UtfNormal::cleanUp( $string );
 308
 309         $string = htmlspecialchars( $string );
 310         return $string;
 311 }
 312
 313 function xmlCommentSafe( $text ) {
 314         return str_replace( '--', '\\-\\-', xmlsafe( recodeText( $text ) ) );
 315 }
 316
 317
 318 function array2object( $arr ) {
 319         $o = (object)0;
 320         foreach( $arr as $x => $y ) {
 321                 $o->$x = $y;
 322         }
 323         return $o;
 324 }
 325
 326
 327 /**
 328  * Make CamelCase and /Talk links work
 329  */
 330 function mungeFormat( $text ) {
 331         global $nowiki;
 332         $nowiki = array();
 333         $staged = preg_replace_callback(
 334                 '/(<nowiki>.*?<\\/nowiki>|(?:http|https|ftp):\\S+|\[\[[^]\\n]+]])/s',
 335                 'nowikiPlaceholder', $text );
 336
 337         # This is probably not  100% correct, I'm just
 338         # glancing at the UseModWiki code.
 339         $upper   = "[A-Z]";
 340         $lower   = "[a-z_0-9]";
 341         $any     = "[A-Za-z_0-9]";
 342         $camel   = "(?:$upper+$lower+$upper+$any*)";
 343         $subpage = "(?:\\/$any+)";
 344         $substart = "(?:\\/$upper$any*)";
 345
 346         $munged = preg_replace( "/(?!\\[\\[)($camel$subpage*|$substart$subpage*)\\b(?!\\]\\]|>)/",
 347                 '[[$1]]', $staged );
 348
 349         $final = preg_replace( '/' . preg_quote( placeholder() ) . '/es',
 350                 'array_shift( $nowiki )', $munged );
 351         return $final;
 352 }
 353
 354
 355 function placeholder( $x = null ) {
 356         return '\xffplaceholder\xff';
 357 }
 358
 359 function nowikiPlaceholder( $matches ) {
 360         global $nowiki;
 361         $nowiki[] = $matches[1];
 362         return placeholder();
 363 }
 364
 365 ?>