Revert r81542 for now, too lazy to fix this properly
[lhc/web/wiklou.git] / maintenance / importUseModWiki.php
1 <?php
2 /**
3 * Import data from a UseModWiki into a MediaWiki wiki
4 * 2003-02-09 Brion VIBBER <brion@pobox.com>
5 * Based loosely on Magnus's code from 2001-2002
6 *
7 * Updated limited version to get something working temporarily
8 * 2003-10-09
9 * Be sure to run the link & index rebuilding scripts!
10 *
11 * Some more munging for charsets etc
12 * 2003-11-28
13 *
14 * Partial fix for pages starting with lowercase letters (??)
15 * and CamelCase and /Subpage link conversion
16 * 2004-11-17
17 *
18 * Rewrite output to create Special:Export format for import
19 * instead of raw SQL. Should be 'future-proof' against future
20 * schema changes.
21 * 2005-03-14
22 *
23 * This program is free software; you can redistribute it and/or modify
24 * it under the terms of the GNU General Public License as published by
25 * the Free Software Foundation; either version 2 of the License, or
26 * (at your option) any later version.
27 *
28 * This program is distributed in the hope that it will be useful,
29 * but WITHOUT ANY WARRANTY; without even the implied warranty of
30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31 * GNU General Public License for more details.
32 *
33 * You should have received a copy of the GNU General Public License along
34 * with this program; if not, write to the Free Software Foundation, Inc.,
35 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
36 * http://www.gnu.org/copyleft/gpl.html
37 *
38 * @todo document
39 * @file
40 * @ingroup Maintenance
41 */
42
// Refuse to run under a web server: this script prints export XML to stdout
// and reads local filesystem paths configured below.
if ( php_sapi_name() != 'cli' ) {
	echo "Please customize the settings and run me from the command line.";
	die( -1 );
}

/** Set these correctly! */
$wgImportEncoding = "CP1252"; /* We convert all to UTF-8 */
$wgRootDirectory = "/kalman/Projects/wiki2002/wiki/lib-http/db/wiki";

/* On a large wiki, you might run out of memory */
@ini_set( 'memory_limit', '40M' );

/* globals */
$wgFieldSeparator = "\xb3"; # Some wikis may use different char
// UseModWiki's flat-file database nests hashes inside hashes; each nesting
// level uses the field separator followed by a digit (1 outermost … 3 innermost).
$FS = $wgFieldSeparator ;
$FS1 = $FS . "1" ;
$FS2 = $FS . "2" ;
$FS3 = $FS . "3" ;

# Unicode sanitization tools
require_once( dirname( dirname( __FILE__ ) ) . '/includes/normal/UtfNormal.php' );

// name => user ID map; stays empty in this limited version since user
// accounts are not imported (see checkUserCache()).
$usercache = array();

importPages();
68
69 # ------------------------------------------------------------------------------
70
/**
 * Top-level driver: emit the Special:Export XML wrapper and walk every
 * UseModWiki page-database bucket directory, importing each page found.
 *
 * Writes the XML document to stdout; uses global $wgRootDirectory.
 * (Removed the unused leftover local $gt, and build the bucket list with
 * range() instead of spelling out the alphabet by hand.)
 */
function importPages()
{
	global $wgRootDirectory;

	echo <<<XML
<?xml version="1.0" encoding="UTF-8" ?>
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.1/"
           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
           xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.1/
                               http://www.mediawiki.org/xml/export-0.1.xsd"
           version="0.1"
           xml:lang="en">
<!-- generated by importUseModWiki.php -->

XML;
	// UseModWiki shards pages by uppercased first letter, with a catch-all
	// "other" bucket for titles that don't start with A-Z.
	$letters = array_merge( range( 'A', 'Z' ), array( 'other' ) );
	foreach ( $letters as $letter ) {
		$dir = "$wgRootDirectory/page/$letter";
		if ( is_dir( $dir ) ) {
			importPageDirectory( $dir );
		}
	}
	echo <<<XML
</mediawiki>

XML;
}
101
/**
 * Recursively import every UseModWiki page file ("*.db") under $dir.
 *
 * @param $dir    Directory to scan.
 * @param $prefix Subpage prefix ("Foo/") applied when recursing into a
 *                subpage directory, so "Foo/Bar" gets its full title.
 *
 * Echoes XML comments as progress markers plus the importPage() output.
 */
function importPageDirectory( $dir, $prefix = "" )
{
	echo "\n<!-- Checking page directory " . xmlCommentSafe( $dir ) . " -->\n";
	$mydir = opendir( $dir );
	if ( $mydir === false ) {
		// Unreadable directory: report and keep going rather than spewing
		// readdir() warnings in the loop below.
		echo "<!-- Could not open directory " . xmlCommentSafe( $dir ) . " -->\n";
		return;
	}
	// Compare explicitly against false: a directory entry named "0" is
	// falsy, and the old bare-assignment test stopped the loop on it.
	while ( ( $entry = readdir( $mydir ) ) !== false ) {
		$m = array();
		if ( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			echo importPage( $prefix . $m[1] );
		} elseif ( is_dir( "$dir/$entry" ) ) {
			if ( $entry != '.' && $entry != '..' ) {
				importPageDirectory( "$dir/$entry", "$entry/" );
			}
		} else {
			echo "<!-- File '" . xmlCommentSafe( $entry ) . "' doesn't seem to contain an article. Skipping. -->\n";
		}
	}
	closedir( $mydir ); // was leaked before
}
121
122
123 # ------------------------------------------------------------------------------
124
125 /* fetch_ functions
126 Grab a given item from the database
127 */
128
/**
 * Map a page title to its UseModWiki database path fragment.
 * Pages are sharded by uppercased first letter ("F/Foo"); titles not
 * starting with an ASCII letter land in the "other" bucket.
 */
function useModFilename( $title ) {
	$first = substr( $title, 0, 1 );
	return preg_match( '/[A-Z]/i', $first )
		? strtoupper( $first ) . "/$title"
		: "other/$title";
}
136
/**
 * Load the current revision of a page from its UseModWiki ".db" file.
 *
 * The file is a nest of field-separated hashes: whole file -> page record
 * (FS1), "text_default" section (FS2), and the innermost text record (FS3).
 * Returns an object with text, summary, minor, ts, username and host.
 * Dies if the page file cannot be found.
 */
function fetchPage( $title )
{
	global $FS1, $FS2, $FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/page/" . useModFilename( $title ) . ".db";
	if ( !file_exists( $fname ) ) {
		echo "Couldn't open file '$fname' for page '$title'.\n";
		die( -1 );
	}

	// Peel the three nesting levels apart.
	$page = splitHash( $FS1, file_get_contents( $fname ) );
	$section = splitHash( $FS2, $page["text_default"] );
	$text = splitHash( $FS3, $section["data"] );

	$fields = array(
		"text" => $text["text"],
		"summary" => $text["summary"],
		"minor" => $text["minor"],
		"ts" => $section["ts"],
		"username" => $section["username"],
		"host" => $section["host"],
	);
	return array2object( $fields );
}
155
/**
 * Load the archived old revisions of a page from its ".kp" keep file.
 *
 * Returns an array of revision objects (same shape as fetchPage()'s
 * result), oldest junk stripped; an empty array if no keep file exists.
 * Records lacking text, a minor flag, or a positive numeric timestamp are
 * skipped with an XML-comment note.
 */
function fetchKeptPages( $title )
{
	global $FS1, $FS2, $FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/keep/" . useModFilename( $title ) . ".kp";
	if ( !file_exists( $fname ) ) {
		return array();
	}

	$keptlist = explode( $FS1, file_get_contents( $fname ) );
	array_shift( $keptlist ); # Drop the junk at beginning of file

	$revisions = array();
	foreach ( $keptlist as $rev ) {
		$section = splitHash( $FS2, $rev );
		$text = splitHash( $FS3, $section["data"] );
		if ( $text["text"] && $text["minor"] != "" && ( $section["ts"] * 1 > 0 ) ) {
			$revisions[] = array2object( array(
				"text" => $text["text"],
				"summary" => $text["summary"],
				"minor" => $text["minor"],
				"ts" => $section["ts"],
				"username" => $section["username"],
				"host" => $section["host"],
			) );
		} else {
			echo "<!-- skipped a bad old revision -->\n";
		}
	}
	return $revisions;
}
180
/**
 * Decode one level of UseModWiki's serialized hash format.
 *
 * Elements alternate key, value, key, value … along the separator; a
 * trailing unpaired key (odd element count) is silently dropped.
 */
function splitHash( $sep, $str ) {
	$fields = explode( $sep, $str );
	$ret = array();
	$n = count( $fields );
	for ( $i = 0; $i + 1 < $n; $i += 2 ) {
		$ret[$fields[$i]] = $fields[$i + 1];
	}
	return $ret;
}
189
190
191 /* import_ functions
192 Take a fetched item and produce SQL
193 */
194
/**
 * Resolve a UseModWiki revision author to a (user ID, display name) pair.
 *
 * @param $name Stored username (underscored), or empty for anonymous edits.
 * @param $host Client hostname/IP, used as the display name for anons.
 * @return array( int $userid, string $username )
 *
 * Fix: the cache maps name => user ID, so membership must be tested on the
 * keys; the old in_array() call searched the *values* and never matched.
 */
function checkUserCache( $name, $host )
{
	global $usercache;

	if ( $name ) {
		if ( isset( $usercache[$name] ) ) {
			$userid = $usercache[$name];
		} else {
			# If we haven't imported user accounts
			$userid = 0;
		}
		// Stored titles/usernames use underscores; display form uses spaces.
		$username = str_replace( '_', ' ', $name );
	} else {
		$userid = 0;
		$username = $host;
	}
	return array( $userid, $username );
}
213
/**
 * Convert one UseModWiki page (current revision plus kept history) into a
 * <page> element in Special:Export XML format.
 *
 * @param $title UseModWiki page title (underscored form).
 * @return string XML fragment for the page, or NULL if no revisions exist.
 *
 * Also echoes a progress marker as an XML comment. Dies (via fetchPage)
 * if the page's database file is missing.
 */
function importPage( $title )
{
	echo "\n<!-- Importing page " . xmlCommentSafe( $title ) . " -->\n";
	$page = fetchPage( $title );

	$newtitle = xmlsafe( str_replace( '_', ' ', recodeText( $title ) ) );

	// Run the CamelCase/subpage link conversion; if it changed anything,
	// record the conversion as an extra top revision.
	$munged = mungeFormat( $page->text );
	if ( $munged != $page->text ) {
		/**
		 * Save a *new* revision with the conversion, and put the
		 * previous last version into the history.
		 */
		$next = array2object( array(
			'text' => $munged,
			'minor' => 1,
			'username' => 'Conversion script',
			'host' => '127.0.0.1',
			'ts' => time(),
			'summary' => 'link fix',
		) );
		$revisions = array( $page, $next );
	} else {
		/**
		 * Current revision:
		 */
		$revisions = array( $page );
	}
	$xml = <<<XML
  <page>
    <title>$newtitle</title>

XML;

	# History
	$revisions = array_merge( $revisions, fetchKeptPages( $title ) );
	if ( count( $revisions ) == 0 ) {
		return NULL; // Was "$sql", which does not appear to be defined.
	}

	// Emit one <revision> element per revision, recoding and XML-escaping
	// every field on the way out.
	foreach ( $revisions as $rev ) {
		$text = xmlsafe( recodeText( $rev->text ) );
		$minor = ( $rev->minor ? '<minor/>' : '' );
		list( /* $userid */ , $username ) = checkUserCache( $rev->username, $rev->host );
		$username = xmlsafe( recodeText( $username ) );
		$timestamp = xmlsafe( timestamp2ISO8601( $rev->ts ) );
		$comment = xmlsafe( recodeText( $rev->summary ) );

		$xml .= <<<XML
    <revision>
      <timestamp>$timestamp</timestamp>
      <contributor><username>$username</username></contributor>
      $minor
      <comment>$comment</comment>
      <text>$text</text>
    </revision>

XML;
	}
	$xml .= "</page>\n\n";
	return $xml;
}
276
277 # Whee!
/**
 * Transcode imported text into clean UTF-8.
 *
 * Normalizes CRLF to LF, converts from the wiki's source charset
 * ($wgImportEncoding, latin-1 era) to UTF-8, then expands any legacy
 * &#1234; numeric entities.
 */
function recodeText( $string ) {
	global $wgImportEncoding;
	$normalized = str_replace( "\r\n", "\n", $string );
	// @: iconv notices on untranslatable bytes are deliberately ignored.
	$utf8 = @iconv( $wgImportEncoding, "UTF-8", $normalized );
	return wfMungeToUtf8( $utf8 ); # Any old &#1234; stuff
}
286
/**
 * Encode a Unicode code point as a UTF-8 byte sequence.
 *
 * @param $codepoint Integer code point.
 * @return string 1-4 byte UTF-8 sequence, or the "&#N;" entity unchanged
 *                for values outside the Unicode range.
 *
 * Fix: the 4-byte branch tested < 0x100000, wrongly excluding the valid
 * plane-16 code points U+100000..U+10FFFF; UTF-8 covers up to U+10FFFF
 * (RFC 3629), so the bound is now 0x110000.
 */
function wfUtf8Sequence( $codepoint ) {
	if ( $codepoint < 0x80 ) {
		return chr( $codepoint );
	}
	if ( $codepoint < 0x800 ) {
		return chr( $codepoint >> 6 & 0x3f | 0xc0 ) .
			chr( $codepoint & 0x3f | 0x80 );
	}
	if ( $codepoint < 0x10000 ) {
		return chr( $codepoint >> 12 & 0x0f | 0xe0 ) .
			chr( $codepoint >> 6 & 0x3f | 0x80 ) .
			chr( $codepoint & 0x3f | 0x80 );
	}
	if ( $codepoint < 0x110000 ) {
		return chr( $codepoint >> 18 & 0x07 | 0xf0 ) .
			chr( $codepoint >> 12 & 0x3f | 0x80 ) .
			chr( $codepoint >> 6 & 0x3f | 0x80 ) .
			chr( $codepoint & 0x3f | 0x80 );
	}
	# Beyond U+10FFFF is not valid Unicode; pass the entity through.
	return "&#$codepoint;";
}
301
/**
 * Expand decimal (&#1234;) and hex (&#xABC;) numeric character references
 * in $string into literal UTF-8 sequences.
 *
 * Fix: the old code used the /e (eval) preg modifier, deprecated in
 * PHP 5.5 and removed in PHP 7; rewritten with preg_replace_callback().
 */
function wfMungeToUtf8( $string ) {
	$string = preg_replace_callback( '/&#([0-9]+);/',
		function ( $matches ) {
			return wfUtf8Sequence( (int)$matches[1] );
		},
		$string );
	$string = preg_replace_callback( '/&#x([0-9a-f]+);/i',
		function ( $matches ) {
			return wfUtf8Sequence( hexdec( $matches[1] ) );
		},
		$string );
	# Should also do named entities here
	return $string;
}
308
/**
 * Format a Unix timestamp as an ISO 8601 UTC string,
 * e.g. 2003-08-05T18:30:02Z.
 */
function timestamp2ISO8601( $ts ) {
	// Single gmdate() call; T and Z are escaped so they come out literally.
	return gmdate( 'Y-m-d\TH:i:s\Z', $ts );
}
313
/**
 * Make a string safe for inclusion in XML element content.
 *
 * The page may contain old data which has not been properly normalized.
 * Invalid UTF-8 sequences or forbidden control characters would make the
 * XML output invalid, so they are stripped before entity-escaping.
 */
function xmlsafe( $string ) {
	$clean = UtfNormal::cleanUp( $string );
	return htmlspecialchars( $clean );
}
325
/**
 * Make a string safe for inclusion inside an XML comment: recode and
 * entity-escape it, then neutralize "--", which is illegal in comments.
 */
function xmlCommentSafe( $text ) {
	$escaped = xmlsafe( recodeText( $text ) );
	return str_replace( '--', '\\-\\-', $escaped );
}
329
330
/**
 * Convert an associative array into a stdClass object, one property per key.
 *
 * Fix: the old implementation started from (object)0, which seeds the
 * object with a stray "scalar" property set to 0; a direct array cast
 * produces exactly the intended properties.
 */
function array2object( $arr ) {
	return (object)$arr;
}
338
339
340 /**
341 * Make CamelCase and /Talk links work
342 */
/**
 * Make CamelCase and /Talk links work: wrap bare CamelCase words and
 * /Subpage references in [[...]], while leaving <nowiki> sections, bare
 * URLs and existing [[links]] untouched.
 *
 * Fix: the final substitution used preg_replace() with the /e (eval)
 * modifier, removed in PHP 7; the placeholders are now spliced back with
 * an explode/shift pass, which is equivalent for a fixed marker string.
 */
function mungeFormat( $text ) {
	global $nowiki;
	$nowiki = array();
	// Stage 1: pull protected chunks out of the text, replacing each with
	// a placeholder so the link pass below cannot touch them.
	$staged = preg_replace_callback(
		'/(<nowiki>.*?<\\/nowiki>|(?:http|https|ftp):\\S+|\[\[[^]\\n]+]])/s',
		'nowikiPlaceholder', $text );

	# This is probably not 100% correct, I'm just
	# glancing at the UseModWiki code.
	$upper = "[A-Z]";
	$lower = "[a-z_0-9]";
	$any = "[A-Za-z_0-9]";
	$camel = "(?:$upper+$lower+$upper+$any*)";
	$subpage = "(?:\\/$any+)";
	$substart = "(?:\\/$upper$any*)";

	// Stage 2: wrap CamelCase words and /Subpage refs in [[...]].
	$munged = preg_replace( "/(?!\\[\\[)($camel$subpage*|$substart$subpage*)\\b(?!\\]\\]|>)/",
		'[[$1]]', $staged );

	// Stage 3: re-insert the protected chunks, in order of extraction.
	$pieces = explode( placeholder(), $munged );
	$final = array_shift( $pieces );
	foreach ( $pieces as $piece ) {
		$final .= array_shift( $nowiki ) . $piece;
	}
	return $final;
}
366
367
/**
 * Marker string used by mungeFormat() to stand in for protected chunks.
 *
 * NOTE(review): single quotes mean this is the nine-character literal
 * text "\xff...", not 0xFF bytes; harmless as long as insertion and
 * lookup both use this same function. The unused $x parameter is kept
 * for signature compatibility.
 */
function placeholder( $x = null ) {
	return '\xff' . 'placeholder' . '\xff';
}
371
/**
 * preg_replace_callback() hook for mungeFormat(): stash the matched
 * protected chunk in the global $nowiki queue and return the marker
 * that mungeFormat() will later swap back, in order.
 */
function nowikiPlaceholder( $matches ) {
	global $nowiki;
	array_push( $nowiki, $matches[1] );
	return placeholder();
}
377
378