X-Git-Url: https://git.heureux-cyclage.org/?a=blobdiff_plain;ds=sidebyside;f=maintenance%2FimportUseModWiki.php;h=e8463e2e5e1179423cc1e84d48551060b50bff61;hb=ddc2be18c729738a4913c61bd2d82e2fb8467a6b;hp=318483e712425d5c9efaf96b4119c15f6440e05f;hpb=856c95474e3201e430bde51fa4881e6148a323dd;p=lhc%2Fweb%2Fwiklou.git

diff --git a/maintenance/importUseModWiki.php b/maintenance/importUseModWiki.php
index 318483e712..e8463e2e5e 100644
--- a/maintenance/importUseModWiki.php
+++ b/maintenance/importUseModWiki.php
@@ -1,7 +1,7 @@
  * Based loosely on Magnus's code from 2001-2002
  *
@@ -16,11 +16,20 @@
  * and CamelCase and /Subpage link conversion
  * 2004-11-17
  *
+ * Rewrite output to create Special:Export format for import
+ * instead of raw SQL. Should be 'future-proof' against future
+ * schema changes.
+ * 2005-03-14
+ *
  * @todo document
- * @package MediaWiki
- * @subpackage Maintenance
+ * @file
+ * @ingroup Maintenance
  */
+if ( php_sapi_name() != 'cli' ) {
+	echo "Please customize the settings and run me from the command line.";
+	die( -1 );
+}
 
 /** Set these correctly! */
 $wgImportEncoding = "CP1252"; /* We convert all to UTF-8 */
@@ -32,14 +41,15 @@ $wgRootDirectory = "/kalman/Projects/wiki2002/wiki/lib-http/db/wiki";
 
 /* globals */
 $wgFieldSeparator = "\xb3"; # Some wikis may use different char
 	$FS = $wgFieldSeparator ;
-	$FS1 = $FS."1" ;
-	$FS2 = $FS."2" ;
-	$FS3 = $FS."3" ;
+	$FS1 = $FS . "1" ;
+	$FS2 = $FS . "2" ;
+	$FS3 = $FS . "3" ;
+
+# Unicode sanitization tools
+require_once( dirname( dirname( __FILE__ ) ) . '/includes/normal/UtfNormal.php' );
 
-$conversiontime = wfTimestampNow(); # Conversions will be marked with this timestamp
 $usercache = array();
 
-wfSeedRandom();
 importPages();
 
 # ------------------------------------------------------------------------------
 
@@ -47,32 +57,49 @@ importPages();
 
 function importPages()
 {
 	global $wgRootDirectory;
-
+
+	$gt = '>';
+	echo <<
+
+
+XML;
 	$letters = array(
 		'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 		'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'other' );
-	foreach( $letters as $letter ) {
+	foreach ( $letters as $letter ) {
 		$dir = "$wgRootDirectory/page/$letter";
-		if( is_dir( $dir ) )
+		if ( is_dir( $dir ) )
 			importPageDirectory( $dir );
 	}
+	echo <<
+
+XML;
 }
 
 function importPageDirectory( $dir, $prefix = "" )
 {
-	echo "\n-- Checking page directory $dir\n";
+	echo "\n\n";
 	$mydir = opendir( $dir );
-	while( $entry = readdir( $mydir ) ) {
-		if( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
+	while ( $entry = readdir( $mydir ) ) {
+		$m = array();
+		if ( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
 			echo importPage( $prefix . $m[1] );
 		} else {
-			if( is_dir( "$dir/$entry" ) ) {
-				if( $entry != '.' && $entry != '..' ) {
+			if ( is_dir( "$dir/$entry" ) ) {
+				if ( $entry != '.' && $entry != '..' ) {
 					importPageDirectory( "$dir/$entry", "$entry/" );
 				}
 			} else {
-				echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
+				echo "\n";
 			}
 		}
 	}
@@ -84,24 +111,10 @@ function importPageDirectory( $dir, $prefix = "" )
 
 /* fetch_ functions
 	Grab a given item from the database
 	*/
-function fetchUser( $uid )
-{
-	die ("fetchUser not implemented" );
-
-	global $FS,$FS2,$FS3, $wgRootDirectory;
-
-	$fname = $wgRootDirectory . "/page/" . $title;
-	if( !file_exists( $fname ) ) return false;
-
-	$data = splitHash( implode( "", file( $fname ) ) );
-	# enough?
-
-	return $data;
-}
 
 function useModFilename( $title ) {
 	$c = substr( $title, 0, 1 );
-	if(preg_match( '/[A-Z]/i', $c ) ) {
+	if ( preg_match( '/[A-Z]/i', $c ) ) {
 		return strtoupper( $c ) . "/$title";
 	}
 	return "other/$title";
@@ -109,17 +122,18 @@ function useModFilename( $title ) {
 
 function fetchPage( $title )
 {
-	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory;
-
+	global $FS1, $FS2, $FS3, $wgRootDirectory;
+
 	$fname = $wgRootDirectory . "/page/" . useModFilename( $title ) . ".db";
-	if( !file_exists( $fname ) ) {
-		die( "Couldn't open file '$fname' for page '$title'.\n" );
+	if ( !file_exists( $fname ) ) {
+		echo "Couldn't open file '$fname' for page '$title'.\n";
+		die( -1 );
 	}
-
+
 	$page = splitHash( $FS1, file_get_contents( $fname ) );
 	$section = splitHash( $FS2, $page["text_default"] );
 	$text = splitHash( $FS3, $section["data"] );
-
+
 	return array2object( array( "text" => $text["text"] , "summary" => $text["summary"] , "minor" => $text["minor"] ,
 		"ts" => $section["ts"] , "username" => $section["username"] , "host" => $section["host"] ) );
@@ -127,24 +141,24 @@ function fetchPage( $title )
 
 function fetchKeptPages( $title )
 {
-	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory, $wgTimezoneCorrection;
-
+	global $FS1, $FS2, $FS3, $wgRootDirectory;
+
 	$fname = $wgRootDirectory . "/keep/" . useModFilename( $title ) . ".kp";
-	if( !file_exists( $fname ) ) return array();
-
+	if ( !file_exists( $fname ) ) return array();
+
 	$keptlist = explode( $FS1, file_get_contents( $fname ) );
 	array_shift( $keptlist ); # Drop the junk at beginning of file
-
+
 	$revisions = array();
-	foreach( $keptlist as $rev ) {
+	foreach ( $keptlist as $rev ) {
 		$section = splitHash( $FS2, $rev );
 		$text = splitHash( $FS3, $section["data"] );
-		if ( $text["text"] && $text["minor"] != "" && ( $section["ts"]*1 > 0 ) ) {
+		if ( $text["text"] && $text["minor"] != "" && ( $section["ts"] * 1 > 0 ) ) {
 			array_push( $revisions, array2object( array ( "text" => $text["text"] , "summary" => $text["summary"] , "minor" => $text["minor"] ,
 				"ts" => $section["ts"] , "username" => $section["username"] , "host" => $section["host"] ) ) );
 		} else {
-			echo "-- skipped a bad old revision\n";
+			echo "\n";
 		}
 	}
 	return $revisions;
@@ -153,7 +167,7 @@ function fetchKeptPages( $title )
 
 function splitHash ( $sep , $str ) {
 	$temp = explode ( $sep , $str ) ;
 	$ret = array () ;
-	for ( $i = 0; $i+1 < count ( $temp ) ; $i++ ) {
+	for ( $i = 0; $i + 1 < count ( $temp ) ; $i++ ) {
 		$ret[$temp[$i]] = $temp[++$i] ;
 	}
 	return $ret ;
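# --------------------------------------------------------------------------
# Illustration (not part of the patch): the fetch_ functions above parse
# UseModWiki's flat-file "hash" records, in which keys and values are simply
# joined by the $FS1/$FS2/$FS3 separators.  The synthetic record below is an
# assumption for demonstration: it carries only a handful of the fields the
# importer reads (text_default, ts, username, data, text, summary); a real
# page.db file holds more.

$FS  = "\xb3";
$FS1 = $FS . "1";   # page-level fields
$FS2 = $FS . "2";   # section-level fields
$FS3 = $FS . "3";   # text-record fields

function splitHashDemo( $sep, $str ) {   # same algorithm as splitHash() above
	$temp = explode( $sep, $str );
	$ret = array();
	for ( $i = 0; $i + 1 < count( $temp ); $i++ ) {
		$ret[$temp[$i]] = $temp[++$i];
	}
	return $ret;
}

$record = "text_default" . $FS1
	. "ts" . $FS2 . "1100000000" . $FS2
	. "username" . $FS2 . "SomeUser" . $FS2
	. "data" . $FS2
	. "text" . $FS3 . "Sample wiki text" . $FS3 . "summary" . $FS3 . "test edit";

$page    = splitHashDemo( $FS1, $record );                # page-level hash
$section = splitHashDemo( $FS2, $page["text_default"] );  # current revision
$text    = splitHashDemo( $FS3, $section["data"] );       # text subrecord
echo $text["text"] . "\n";                                # prints "Sample wiki text"
# --------------------------------------------------------------------------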
@@ -164,53 +178,21 @@ function splitHash ( $sep , $str ) {
 
 /* import functions
 	Take a fetched item and produce SQL
 	*/
 
-/* importUser
-	$uid is the UseMod user id number.
-	The new ones will be assigned arbitrarily and are for internal use only.
-
-	THIS IS DELAYED SINCE PUBLIC DUMPS DONT INCLUDE USER DIR
-	*/
-function importUser( $uid )
-{
-	global $last_uid, $user_list, $wgTimestampCorrection;
-	die("importUser NYI");
-	return "";
-
-	$stuff = fetchUser( $uid );
-	$last_uid++;
-
-	$name = wfStrencode( $stuff->username );
-	$hash = md5hash( $stuff->password ); # Doable?
-	$tzoffset = $stuff['tzoffset'] - ($wgTimestampCorrection / 3600); # -8 to 0; +9 to +1
-	$hideminor = ($stuff['rcall'] ? 0 : 1);
-	$options = "cols={$stuff['editcols']}
-rows={$stuff['editrows']}
-rcdays={$stuff['rcdays']}
-timecorrection={$tzoffset}
-hideminor={$hideminor}
-	";
-
-	$sql = "INSERT
-	INTO user (user_id,user_name,user_password,user_options)
-	VALUES ({$last_uid},'{$name}','{$hash}','{$options}');\n";
-	return $sql;
-}
-
 function checkUserCache( $name, $host )
 {
 	global $usercache;
-	if( $name ) {
-		if( in_array( $name, $usercache ) ) {
+	if ( $name ) {
+		if ( in_array( $name, $usercache ) ) {
 			$userid = $usercache[$name];
 		} else {
 			# If we haven't imported user accounts
 			$userid = 0;
 		}
-		$username = wfStrencode( $name );
+		$username = str_replace( '_', ' ', $name );
 	} else {
 		$userid = 0;
-		$username = wfStrencode( $host );
+		$username = $host;
 	}
 	return array( $userid, $username );
 }
@@ -218,76 +200,66 @@ function checkUserCache( $name, $host )
 
 function importPage( $title )
 {
 	global $usercache;
-	global $conversiontime;
-
-	echo "\n-- Importing page $title\n";
+
+	echo "\n\n";
 
 	$page = fetchPage( $title );
 
-	$newtitle = wfStrencode( recodeText( $title ) );
-	$namespace = 0;
-
+	$newtitle = xmlsafe( str_replace( '_', ' ', recodeText( $title ) ) );
+
 	$munged = mungeFormat( $page->text );
-	if( $munged != $page->text ) {
+	if ( $munged != $page->text ) {
 		/**
 		 * Save a *new* revision with the conversion, and put the
 		 * previous last version into the history.
 		 */
-		$text = wfStrencode( recodeText( $munged ) );
-		$comment = "link fix";
-		$minor = 1;
-		$userid = 0;
-		$username = "Conversion script";
-		$timestamp = wfUnix2Timestamp( time() );
-		$redirect = ( preg_match( '/^#REDIRECT/', $page->text ) ? 1 : 0 );
-		$random = mt_rand() / mt_getrandmax();
-		$inverse = wfInvertTimestamp( $timestamp );
-
-		$revisions = array( $page );
+		$next = array2object( array(
+			'text' => $munged,
+			'minor' => 1,
+			'username' => 'Conversion script',
+			'host' => '127.0.0.1',
+			'ts' => time(),
+			'summary' => 'link fix',
+		) );
+		$revisions = array( $page, $next );
 	} else {
 		/**
 		 * Current revision:
 		 */
-		$text = wfStrencode( recodeText( $page->text ) );
-		$comment = wfStrencode( recodeText( $page->summary ) );
-		$minor = ($page->minor ? 1 : 0);
-		list( $userid, $username ) = checkUserCache( $page->username, $page->host );
-		$username = wfStrencode( recodeText( $username ) );
-		$timestamp = wfUnix2Timestamp( $page->ts );
-		$redirect = ( preg_match( '/^#REDIRECT/', $page->text ) ? 1 : 0 );
-		$random = mt_rand() / mt_getrandmax();
-		$inverse = wfInvertTimestamp( $timestamp );
-
-		$revisions = array();
+		$revisions = array( $page );
 	}
-	$sql = "
-INSERT
-	INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,inverse_timestamp,cur_touched,cur_minor_edit,cur_is_redirect,cur_random) VALUES
-	($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse','$conversiontime',$minor,$redirect,$random);\n";
+	$xml = <<
+		$newtitle
+
+XML;
 
 	# History
 	$revisions = array_merge( $revisions, fetchKeptPages( $title ) );
-	if(count( $revisions ) == 0 ) {
-		return $sql;
+	if ( count( $revisions ) == 0 ) {
+		return NULL; // Was "$sql", which does not appear to be defined.
 	}
-
-	$any = false;
-	$sql .= "INSERT
-	INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,inverse_timestamp,old_minor_edit) VALUES\n";
-	foreach( $revisions as $rev ) {
-		$text = wfStrencode( recodeText( $rev->text ) );
-		$minor = ($rev->minor ? 1 : 0);
-		list( $userid, $username ) = checkUserCache( $rev->username, $rev->host );
-		$username = wfStrencode( recodeText( $username ) );
-		$timestamp = wfUnix2Timestamp( $rev->ts );
-		$inverse = wfInvertTimestamp( $timestamp );
-		$comment = wfStrencode( recodeText( $rev->summary ) );
-
-		if($any) $sql .= ",";
-		$sql .= "\n\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse',$minor)";
-		$any = true;
+
+	foreach ( $revisions as $rev ) {
+		$text = xmlsafe( recodeText( $rev->text ) );
+		$minor = ( $rev->minor ? '' : '' );
+		list( /* $userid */ , $username ) = checkUserCache( $rev->username, $rev->host );
+		$username = xmlsafe( recodeText( $username ) );
+		$timestamp = xmlsafe( timestamp2ISO8601( $rev->ts ) );
+		$comment = xmlsafe( recodeText( $rev->summary ) );
+
+		$xml .= <<
+			$timestamp
+			$username
+			$minor
+			$comment
+			$text
+
+
+XML;
 	}
-	$sql .= ";\n\n";
-	return $sql;
+	$xml .= "\n\n";
+	return $xml;
 }
 
 # Whee!
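# --------------------------------------------------------------------------
# Illustration (not part of the patch): importPage() above now emits each page
# in the Special:Export XML format instead of SQL.  The plain-text rendering of
# this diff swallowed the literal markup inside the heredocs, so the element
# names below come from the standard export-0.1 schema rather than verbatim
# from this file, and the sample values are made up.

$newtitle  = 'Sample page';
$timestamp = '2005-03-14T12:00:00Z';
$username  = 'Conversion script';
$minor     = '<minor/>';           # use '' for a non-minor edit
$comment   = 'link fix';
$text      = 'Sample wiki text';

echo <<<XML
  <page>
    <title>$newtitle</title>
    <revision>
      <timestamp>$timestamp</timestamp>
      <contributor><username>$username</username></contributor>
      $minor
      <comment>$comment</comment>
      <text>$text</text>
    </revision>
  </page>
XML;

# A full dump wraps these <page> elements in a single <mediawiki> root element,
# opened and closed by the heredocs in importPages(); the result is the format
# that Special:Import and maintenance/importDump.php consume.
# --------------------------------------------------------------------------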
+ */ + $string = UtfNormal::cleanUp( $string ); -function wfTimestamp2Unix( $ts ) -{ - return gmmktime( ( (int)substr( $ts, 8, 2) ), - (int)substr( $ts, 10, 2 ), (int)substr( $ts, 12, 2 ), - (int)substr( $ts, 4, 2 ), (int)substr( $ts, 6, 2 ), - (int)substr( $ts, 0, 4 ) ); -} - -function wfTimestampNow() { - # return NOW - return gmdate( "YmdHis" ); + $string = htmlspecialchars( $string ); + return $string; } -# Sorting hack for MySQL 3, which doesn't use index sorts for DESC -function wfInvertTimestamp( $ts ) { - return strtr( - $ts, - "0123456789", - "9876543210" - ); +function xmlCommentSafe( $text ) { + return str_replace( '--', '\\-\\-', xmlsafe( recodeText( $text ) ) ); } -function wfSeedRandom() -{ - $seed = hexdec(substr(md5(microtime()),-8)) & 0x7fffffff; - mt_srand( $seed ); - $wgRandomSeeded = true; -} function array2object( $arr ) { $o = (object)0; - foreach( $arr as $x => $y ) { + foreach ( $arr as $x => $y ) { $o->$x = $y; } return $o; @@ -377,7 +334,7 @@ function mungeFormat( $text ) { $staged = preg_replace_callback( '/(.*?<\\/nowiki>|(?:http|https|ftp):\\S+|\[\[[^]\\n]+]])/s', 'nowikiPlaceholder', $text ); - + # This is probably not 100% correct, I'm just # glancing at the UseModWiki code. $upper = "[A-Z]"; @@ -386,10 +343,10 @@ function mungeFormat( $text ) { $camel = "(?:$upper+$lower+$upper+$any*)"; $subpage = "(?:\\/$any+)"; $substart = "(?:\\/$upper$any*)"; - + $munged = preg_replace( "/(?!\\[\\[)($camel$subpage*|$substart$subpage*)\\b(?!\\]\\]|>)/", '[[$1]]', $staged ); - + $final = preg_replace( '/' . preg_quote( placeholder() ) . '/es', 'array_shift( $nowiki )', $munged ); return $final; @@ -406,4 +363,4 @@ function nowikiPlaceholder( $matches ) { return placeholder(); } -?> +