tableName calls moved inside fieldInfoMulti and removed call that existed only for...

[lhc/web/wiklou.git] / maintenance / importUseModWiki.php
diff --git a/maintenance/importUseModWiki.php b/maintenance/importUseModWiki.php

index c0a4b24..0d01414 100644 (file)
--- a/maintenance/importUseModWiki.php
+++ b/maintenance/importUseModWiki.php
@@ -1,7 +1,7 @@
  <?php
  
  /**
- * Import data from a UseModWiki into a PediaWiki wiki
+ * Import data from a UseModWiki into a MediaWiki wiki
   * 2003-02-09 Brion VIBBER <brion@pobox.com>
   * Based loosely on Magnus's code from 2001-2002
   *
@@ -12,14 +12,31 @@
   * Some more munging for charsets etc
   * 2003-11-28
   *
+ * Partial fix for pages starting with lowercase letters (??)
+ * and CamelCase and /Subpage link conversion
+ * 2004-11-17
+ *
+ * Rewrite output to create Special:Export format for import
+ * instead of raw SQL. Should be 'future-proof' against future
+ * schema changes.
+ * 2005-03-14
+ *
   * @todo document
- * @package MediaWiki
- * @subpackage Maintenance
+ * @file
+ * @ingroup Maintenance
   */
  
+if( php_sapi_name() != 'cli' ) {
+       echo "Please customize the settings and run me from the command line.";
+       die( -1 );
+}
+
  /** Set these correctly! */
  $wgImportEncoding = "CP1252"; /* We convert all to UTF-8 */
-$wgRootDirectory = "/home/usemod/wiki-ia/lib-http/db/wiki";
+$wgRootDirectory = "/kalman/Projects/wiki2002/wiki/lib-http/db/wiki";
+
+/* On a large wiki, you might run out of memory */
+@ini_set( 'memory_limit', '40M' );
  
  /* globals */
  $wgFieldSeparator = "\xb3"; # Some wikis may use different char
@@ -28,10 +45,11 @@ $wgFieldSeparator = "\xb3"; # Some wikis may use different char
         $FS2 = $FS."2" ;
         $FS3 = $FS."3" ;
  
-$conversiontime = wfTimestampNow(); # Conversions will be marked with this timestamp
+# Unicode sanitization tools
+require_once( dirname( dirname( __FILE__ ) ) . '/includes/normal/UtfNormal.php' );
+
  $usercache = array();
  
-wfSeedRandom();
  importPages();
  
  # ------------------------------------------------------------------------------
@@ -39,7 +57,19 @@ importPages();
  function importPages()
  {
         global $wgRootDirectory;
-       
+
+       $gt = '>';
+       echo <<<XML
+<?xml version="1.0" encoding="UTF-8" ?$gt
+<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.1/"
+           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+           xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.1/
+                               http://www.mediawiki.org/xml/export-0.1.xsd"
+           version="0.1"
+           xml:lang="en">
+<!-- generated by importUseModWiki.php -->
+
+XML;
         $letters = array(
                 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
                 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
@@ -49,13 +79,18 @@ function importPages()
                 if( is_dir( $dir ) )
                         importPageDirectory( $dir );
         }
+       echo <<<XML
+</mediawiki>
+
+XML;
  }
  
  function importPageDirectory( $dir, $prefix = "" )
  {
-       echo "\n-- Checking page directory $dir\n";
+       echo "\n<!-- Checking page directory " . xmlCommentSafe( $dir ) . " -->\n";
         $mydir = opendir( $dir );
         while( $entry = readdir( $mydir ) ) {
+               $m = array();
                 if( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
                         echo importPage( $prefix . $m[1] );
                 } else {
@@ -64,7 +99,7 @@ function importPageDirectory( $dir, $prefix = "" )
                                         importPageDirectory( "$dir/$entry", "$entry/" );
                                 }
                         } else {
-                               echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
+                               echo "<!-- File '" . xmlCommentSafe( $entry ) . "' doesn't seem to contain an article. Skipping. -->\n";
                         }
                 }
         }
@@ -76,42 +111,29 @@ function importPageDirectory( $dir, $prefix = "" )
  /* fetch_ functions
         Grab a given item from the database
         */
-function fetchUser( $uid )
-{
-       die ("fetchUser not implemented" );
-       
-       global $FS,$FS2,$FS3, $wgRootDirectory;
-       
-       $fname = $wgRootDirectory . "/page/" . $title;
-       if( !file_exists( $fname ) ) return false;
-       
-       $data = splitHash( implode( "", file( $fname ) ) );
-       # enough?
-       
-       return $data;
-}
  
  function useModFilename( $title ) {
         $c = substr( $title, 0, 1 );
-       if(preg_match( '/[A-Z]/', $c ) ) {
-               return "$c/$title";
+       if(preg_match( '/[A-Z]/i', $c ) ) {
+               return strtoupper( $c ) . "/$title";
         }
         return "other/$title";
  }
  
  function fetchPage( $title )
  {
-       global $FS,$FS1,$FS2,$FS3, $wgRootDirectory;
-       
+       global $FS1,$FS2,$FS3, $wgRootDirectory;
+
         $fname = $wgRootDirectory . "/page/" . useModFilename( $title ) . ".db";
         if( !file_exists( $fname ) ) {
-               die( "Couldn't open file '$fname' for page '$title'.\n" );
+               echo "Couldn't open file '$fname' for page '$title'.\n";
+               die( -1 );
         }
-       
+
         $page = splitHash( $FS1, file_get_contents( $fname ) );
         $section = splitHash( $FS2, $page["text_default"] );
         $text = splitHash( $FS3, $section["data"] );
-       
+
         return array2object( array( "text" => $text["text"] , "summary" => $text["summary"] ,
                 "minor" => $text["minor"] , "ts" => $section["ts"] ,
                 "username" => $section["username"] , "host" => $section["host"] ) );
@@ -119,14 +141,14 @@ function fetchPage( $title )
  
  function fetchKeptPages( $title )
  {
-       global $FS,$FS1,$FS2,$FS3, $wgRootDirectory, $wgTimezoneCorrection;
-       
+       global $FS1,$FS2,$FS3, $wgRootDirectory;
+
         $fname = $wgRootDirectory . "/keep/" . useModFilename( $title ) . ".kp";
         if( !file_exists( $fname ) ) return array();
-       
+
         $keptlist = explode( $FS1, file_get_contents( $fname ) );
         array_shift( $keptlist ); # Drop the junk at beginning of file
-       
+
         $revisions = array();
         foreach( $keptlist as $rev ) {
                 $section = splitHash( $FS2, $rev );
@@ -136,7 +158,7 @@ function fetchKeptPages( $title )
                                 "minor" => $text["minor"] , "ts" => $section["ts"] ,
                                 "username" => $section["username"] , "host" => $section["host"] ) ) );
                 } else {
-                       echo "-- skipped a bad old revision\n";
+                       echo "<!-- skipped a bad old revision -->\n";
                 }
         }
         return $revisions;
@@ -156,38 +178,6 @@ function splitHash ( $sep , $str ) {
         Take a fetched item and produce SQL
         */
  
-/* importUser
-       $uid is the UseMod user id number.
-       The new ones will be assigned arbitrarily and are for internal use only.
-       
-       THIS IS DELAYED SINCE PUBLIC DUMPS DONT INCLUDE USER DIR
-       */
-function importUser( $uid )
-{
-       global $last_uid, $user_list, $wgTimestampCorrection;
-       die("importUser NYI");
-       return "";
-
-       $stuff = fetchUser( $uid );
-       $last_uid++;
-
-       $name = wfStrencode( $stuff->username );
-       $hash = md5hash( $stuff->password ); # Doable?
-       $tzoffset = $stuff['tzoffset'] - ($wgTimestampCorrection / 3600); # -8 to 0; +9 to +1
-       $hideminor = ($stuff['rcall'] ? 0 : 1);
-       $options = "cols={$stuff['editcols']}
-rows={$stuff['editrows']}
-rcdays={$stuff['rcdays']}
-timecorrection={$tzoffset}
-hideminor={$hideminor}
-       ";
-       
-       $sql = "INSERT
-               INTO user (user_id,user_name,user_password,user_options)
-               VALUES ({$last_uid},'{$name}','{$hash}','{$options}');\n";
-       return $sql;
-}
-
  function checkUserCache( $name, $host )
  {
         global $usercache;
@@ -199,10 +189,10 @@ function checkUserCache( $name, $host )
                         # If we haven't imported user accounts
                         $userid = 0;
                 }
-               $username = wfStrencode( $name );
+               $username = str_replace( '_', ' ', $name );
         } else {
                 $userid = 0;
-               $username = wfStrencode( $host );
+               $username = $host;
         }
         return array( $userid, $username );
  }
@@ -210,53 +200,66 @@ function checkUserCache( $name, $host )
  function importPage( $title )
  {
         global $usercache;
-       global $conversiontime;
-       
-       echo "\n-- Importing page $title\n";
+
+       echo "\n<!-- Importing page " . xmlCommentSafe( $title ) . " -->\n";
         $page = fetchPage( $title );
  
-       $newtitle = wfStrencode( recodeText( $title ) );
-       $namespace = 0;
-       
-       # Current revision:
-       $text = wfStrencode( recodeText( $page->text ) );
-       $comment = wfStrencode( recodeText( $page->summary ) );
-       $minor = ($page->minor ? 1 : 0);
-       list( $userid, $username ) = checkUserCache( $page->username, $page->host );
-       $username = wfStrencode( recodeText( $username ) );
-       $timestamp = wfUnix2Timestamp( $page->ts );
-       $redirect = ( preg_match( '/^#REDIRECT/', $page->text ) ? 1 : 0 );
-       $random = mt_rand() / mt_getrandmax();
-       $inverse = wfInvertTimestamp( $timestamp );
-       $sql = "
-INSERT
-       INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,inverse_timestamp,cur_touched,cur_minor_edit,cur_is_redirect,cur_random) VALUES
-       ($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse','$conversiontime',$minor,$redirect,$random);\n";
+       $newtitle = xmlsafe( str_replace( '_', ' ', recodeText( $title ) ) );
+
+       $munged = mungeFormat( $page->text );
+       if( $munged != $page->text ) {
+               /**
+                * Save a *new* revision with the conversion, and put the
+                * previous last version into the history.
+                */
+               $next = array2object( array(
+                       'text'     => $munged,
+                       'minor'    => 1,
+                       'username' => 'Conversion script',
+                       'host'     => '127.0.0.1',
+                       'ts'       => time(),
+                       'summary'  => 'link fix',
+                       ) );
+               $revisions = array( $page, $next );
+       } else {
+               /**
+                * Current revision:
+                */
+               $revisions = array( $page );
+       }
+       $xml = <<<XML
+       <page>
+               <title>$newtitle</title>
+
+XML;
  
         # History
-       $revisions = fetchKeptPages( $title );
+       $revisions = array_merge( $revisions, fetchKeptPages( $title ) );
         if(count( $revisions ) == 0 ) {
-               return $sql;
+               return NULL; // Was "$sql", which does not appear to be defined.
         }
-       
-       $any = false;
-       $sql .= "INSERT
-       INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,inverse_timestamp,old_minor_edit) VALUES\n";
+
         foreach( $revisions as $rev ) {
-               $text = wfStrencode( recodeText( $rev->text ) );
-               $minor = ($rev->minor ? 1 : 0);
-               list( $userid, $username ) = checkUserCache( $rev->username, $rev->host );
-               $username = wfStrencode( recodeText( $username ) );
-               $timestamp = wfUnix2Timestamp( $rev->ts );
-               $inverse = wfInvertTimestamp( $timestamp );
-               $comment = wfStrencode( recodeText( $rev->summary ) );
-               
-               if($any) $sql .= ",";
-               $sql .= "\n\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse',$minor)";
-               $any = true;
+               $text      = xmlsafe( recodeText( $rev->text ) );
+               $minor     = ($rev->minor ? '<minor/>' : '');
+               list( /* $userid */ , $username ) = checkUserCache( $rev->username, $rev->host );
+               $username  = xmlsafe( recodeText( $username ) );
+               $timestamp = xmlsafe( timestamp2ISO8601( $rev->ts ) );
+               $comment   = xmlsafe( recodeText( $rev->summary ) );
+
+               $xml .= <<<XML
+               <revision>
+                       <timestamp>$timestamp</timestamp>
+                       <contributor><username>$username</username></contributor>
+                       $minor
+                       <comment>$comment</comment>
+                       <text>$text</text>
+               </revision>
+
+XML;
         }
-       $sql .= ";\n\n";
-       return $sql;
+       $xml .= "</page>\n\n";
+       return $xml;
  }
  
  # Whee!
@@ -264,7 +267,7 @@ function recodeText( $string ) {
         global $wgImportEncoding;
         # For currently latin-1 wikis
         $string = str_replace( "\r\n", "\n", $string );
-       $string = iconv( $wgImportEncoding, "UTF-8", $string );
+       $string = @iconv( $wgImportEncoding, "UTF-8", $string );
         $string = wfMungeToUtf8( $string ); # Any old &#1234; stuff
         return $string;
  }
@@ -291,42 +294,27 @@ function wfMungeToUtf8($string) {
         return $string;
  }
  
-function wfStrencode( $string ) {
-       return mysql_escape_string( $string );
+function timestamp2ISO8601( $ts ) {
+       #2003-08-05T18:30:02Z
+       return gmdate( 'Y-m-d', $ts ) . 'T' . gmdate( 'H:i:s', $ts ) . 'Z';
  }
  
-function wfUnix2Timestamp( $unixtime ) {
-        return gmdate( "YmdHis", $unixtime );
-}
+function xmlsafe( $string ) {
+       /**
+        * The page may contain old data which has not been properly normalized.
+        * Invalid UTF-8 sequences or forbidden control characters will make our
+        * XML output invalid, so be sure to strip them out.
+        */
+       $string = UtfNormal::cleanUp( $string );
  
-function wfTimestamp2Unix( $ts )
-{
-        return gmmktime( ( (int)substr( $ts, 8, 2) ),
-                  (int)substr( $ts, 10, 2 ), (int)substr( $ts, 12, 2 ),
-                  (int)substr( $ts, 4, 2 ), (int)substr( $ts, 6, 2 ),
-                  (int)substr( $ts, 0, 4 ) );
-}
-
-function wfTimestampNow() {
-       # return NOW
-       return gmdate( "YmdHis" );
+       $string = htmlspecialchars( $string );
+       return $string;
  }
  
-# Sorting hack for MySQL 3, which doesn't use index sorts for DESC
-function wfInvertTimestamp( $ts ) {
-       return strtr(
-               $ts,
-               "0123456789",
-               "9876543210"
-       );
+function xmlCommentSafe( $text ) {
+       return str_replace( '--', '\\-\\-', xmlsafe( recodeText( $text ) ) );
  }
  
-function wfSeedRandom()
-{
-       $seed = hexdec(substr(md5(microtime()),-8)) & 0x7fffffff;
-       mt_srand( $seed );
-       $wgRandomSeeded = true;
-}
  
  function array2object( $arr ) {
         $o = (object)0;
@@ -336,4 +324,43 @@ function array2object( $arr ) {
         return $o;
  }
  
-?>
+
+/**
+ * Make CamelCase and /Talk links work
+ */
+function mungeFormat( $text ) {
+       global $nowiki;
+       $nowiki = array();
+       $staged = preg_replace_callback(
+               '/(<nowiki>.*?<\\/nowiki>|(?:http|https|ftp):\\S+|\[\[[^]\\n]+]])/s',
+               'nowikiPlaceholder', $text );
+
+       # This is probably not  100% correct, I'm just
+       # glancing at the UseModWiki code.
+       $upper   = "[A-Z]";
+       $lower   = "[a-z_0-9]";
+       $any     = "[A-Za-z_0-9]";
+       $camel   = "(?:$upper+$lower+$upper+$any*)";
+       $subpage = "(?:\\/$any+)";
+       $substart = "(?:\\/$upper$any*)";
+
+       $munged = preg_replace( "/(?!\\[\\[)($camel$subpage*|$substart$subpage*)\\b(?!\\]\\]|>)/",
+               '[[$1]]', $staged );
+
+       $final = preg_replace( '/' . preg_quote( placeholder() ) . '/es',
+               'array_shift( $nowiki )', $munged );
+       return $final;
+}
+
+
+function placeholder( $x = null ) {
+       return '\xffplaceholder\xff';
+}
+
+function nowikiPlaceholder( $matches ) {
+       global $nowiki;
+       $nowiki[] = $matches[1];
+       return placeholder();
+}
+
+