# Based loosely on Magnus's code from 2001-2002
#
# Updated limited version to get something working temporarily
# 2003-10-09
# Be sure to run the link & index rebuilding scripts!
#
# Some more munging for charsets etc
# 2003-11-28 */

/* Set these correctly! */
$wgImportEncoding = "CP1252"; /* We convert all to UTF-8 */
$wgRootDirectory = "/home/usemod/wiki-fi/lib-http/db/wiki";

/* globals */
$wgFieldSeparator = "\xb3"; # Some wikis may use different char
$FS = $wgFieldSeparator ;
$FS1 = $FS."1" ;
$FS2 = $FS."2" ;
$FS3 = $FS."3" ;

$conversiontime = wfTimestampNow(); # Conversions will be marked with this timestamp
$usercache = array();

wfSeedRandom();
importPages();

# ------------------------------------------------------------------------------

/**
 * Entry point: walk the per-letter page directories under
 * "$wgRootDirectory/page" (A-Z plus "other") and import each one that exists.
 */
function importPages() {
	global $wgRootDirectory;

	$letters = array(
		'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
		'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'other' );
	foreach( $letters as $letter ) {
		$dir = "$wgRootDirectory/page/$letter";
		if( is_dir( $dir ) )
			importPageDirectory( $dir );
	}
}

/**
 * Import every "*.db" file in a directory and recurse into subdirectories.
 *
 * The SQL returned by importPage() is echoed; progress and diagnostics are
 * echoed as "-- " lines so they read as SQL comments in the output stream.
 *
 * @param string $dir    Directory to scan.
 * @param string $prefix Prepended to each page title found in $dir; when
 *                       recursing it is set to "<subdirectory name>/".
 */
function importPageDirectory( $dir, $prefix = "" ) {
	echo "\n-- Checking page directory $dir\n";
	$mydir = opendir( $dir );
	if( $mydir === false ) {
		echo "-- Couldn't open directory '$dir'. Skipping.\n";
		return;
	}

	# Compare against false explicitly: an entry named "0" is falsy and would
	# otherwise end the scan early.
	while( false !== ( $entry = readdir( $mydir ) ) ) {
		if( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			echo importPage( $prefix . $m[1] );
		} elseif( is_dir( "$dir/$entry" ) ) {
			if( $entry != '.' && $entry != '..' ) {
				importPageDirectory( "$dir/$entry", "$entry/" );
			}
		} else {
			echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
		}
	}
	closedir( $mydir );
}

# ------------------------------------------------------------------------------

/* fetch_ functions
	Grab a given item from the database
	*/

/**
 * Placeholder for loading a UseMod user record; always aborts the script.
 *
 * An earlier draft that followed the die() was unreachable and could not have
 * run (it used an undefined $title and called splitHash() with one argument),
 * so it has been removed.
 *
 * @param mixed $uid UseMod user id (currently unused).
 */
function fetchUser( $uid ) {
	die ("fetchUser not implemented" );
}

/**
 * Map a page title to its path relative to the page/ or keep/ directory.
 *
 * @param string $title Page title.
 * @return string "<first char>/<title>" when the first character is A-Z,
 *                otherwise "other/<title>".
 */
function useModFilename( $title ) {
	$c = substr( $title, 0, 1 );
	if( preg_match( '/[A-Z]/', $c ) ) {
		return "$c/$title";
	}
	return "other/$title";
}

/**
 * Load the current revision of a page from its ".db" file.
 *
 * The file is split on $FS1; its "text_default" entry is split on $FS2 and
 * that section's "data" entry on $FS3. Dies if the file does not exist.
 *
 * @param string $title Page title.
 * @return object With properties text, summary, minor, ts, username, host.
 */
function fetchPage( $title ) {
	global $FS1,$FS2,$FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/page/" . useModFilename( $title ) . ".db";
	if( !file_exists( $fname ) ) {
		die( "Couldn't open file '$fname' for page '$title'.\n" );
	}

	$page = splitHash( $FS1, file_get_contents( $fname ) );
	$section = splitHash( $FS2, $page["text_default"] );
	$text = splitHash( $FS3, $section["data"] );

	return array2object( array(
		"text" => $text["text"] ,
		"summary" => $text["summary"] ,
		"minor" => $text["minor"] ,
		"ts" => $section["ts"] ,
		"username" => $section["username"] ,
		"host" => $section["host"] ) );
}

/**
 * Load the kept (old) revisions of a page from its ".kp" file.
 *
 * The file is split on $FS1 and the first piece discarded. A revision is kept
 * only if it has text, a non-empty "minor" field and a positive timestamp;
 * anything else is reported and skipped.
 *
 * @param string $title Page title.
 * @return array List of objects shaped like fetchPage()'s result; empty when
 *               the keep file does not exist.
 */
function fetchKeptPages( $title ) {
	global $FS1,$FS2,$FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/keep/" . useModFilename( $title ) . ".kp";
	if( !file_exists( $fname ) ) return array();

	$keptlist = explode( $FS1, file_get_contents( $fname ) );
	array_shift( $keptlist ); # Drop the junk at beginning of file

	$revisions = array();
	foreach( $keptlist as $rev ) {
		$section = splitHash( $FS2, $rev );
		$text = splitHash( $FS3, $section["data"] );
		if ( $text["text"] && $text["minor"] != "" && ( $section["ts"]*1 > 0 ) ) {
			array_push( $revisions, array2object( array (
				"text" => $text["text"] ,
				"summary" => $text["summary"] ,
				"minor" => $text["minor"] ,
				"ts" => $section["ts"] ,
				"username" => $section["username"] ,
				"host" => $section["host"] ) ) );
		} else {
			echo "-- skipped a bad old revision\n";
		}
	}
	return $revisions;
}

/**
 * Turn "key SEP value SEP key SEP value ..." into an associative array.
 *
 * @param string $sep Separator.
 * @param string $str Serialized data.
 * @return array key => value; a trailing key with no value is dropped.
 */
function splitHash ( $sep , $str ) {
	$temp = explode ( $sep , $str ) ;
	$count = count ( $temp ) ;
	$ret = array () ;
	# Step in explicit pairs rather than relying on when ++$i is evaluated.
	for ( $i = 0; $i + 1 < $count ; $i += 2 ) {
		$ret[$temp[$i]] = $temp[$i + 1] ;
	}
	return $ret ;
}


/* import_ functions
	Take a fetched item and produce SQL
	*/

/* importUser
	$uid is the UseMod user id number.
	The new ones will be assigned arbitrarily and are for internal use only.

	THIS IS DELAYED SINCE PUBLIC DUMPS DONT INCLUDE USER DIR

	Always aborts the script. The unreachable draft that followed the die()
	was removed: it allocated ids from $last_uid and packed editcols,
	editrows, rcdays, tzoffset and rcall into a user_options string, but
	called an undefined md5hash() and mixed object and array access.
	*/
function importUser( $uid ) {
	die("importUser NYI");
}

/**
 * Resolve the author of a revision to a (user id, SQL-escaped name) pair.
 *
 * @param string $name Registered user name, or empty for anonymous edits.
 * @param string $host Host recorded for the edit; used as the name when
 *                     $name is empty.
 * @return array array( $userid, $username ). $userid is the value stored in
 *               $usercache under the key $name, or 0 when there is none;
 *               $username has been passed through wfStrencode().
 */
function checkUserCache( $name, $host ) {
	global $usercache;

	if( $name ) {
		# The cache is keyed by name, so test the key (in_array() would
		# search the stored ids instead).
		if( isset( $usercache[$name] ) ) {
			$userid = $usercache[$name];
		} else {
			# If we haven't imported user accounts
			$userid = 0;
		}
		$username = wfStrencode( $name );
	} else {
		$userid = 0;
		$username = wfStrencode( $host );
	}
	return array( $userid, $username );
}

/**
 * Author columns for one revision: id from the user cache, name recoded to
 * UTF-8 and then escaped exactly once.
 *
 * @param object $rev Revision object with username and host properties.
 * @return array array( $userid, $username ) ready to interpolate into SQL.
 */
function importRevisionAuthor( $rev ) {
	$cached = checkUserCache( $rev->username, $rev->host );
	# Same choice checkUserCache() makes: the name if there is one, else the host.
	$rawname = ( $rev->username ? $rev->username : $rev->host );
	return array( $cached[0], wfStrencode( recodeText( $rawname ) ) );
}

/**
 * Build the SQL that imports one page: an INSERT into cur for the current
 * revision and, if kept revisions exist, one multi-row INSERT into old.
 *
 * @param string $title UseMod page title (subpages as "Parent/Child").
 * @return string SQL statements.
 */
function importPage( $title ) {
	global $conversiontime;

	echo "\n-- Importing page $title\n";
	$page = fetchPage( $title );

	$newtitle = wfStrencode( recodeText( $title ) );
	$namespace = 0;

	# Current revision:
	$text = wfStrencode( recodeText( $page->text ) );
	$comment = wfStrencode( recodeText( $page->summary ) );
	$minor = ($page->minor ? 1 : 0);
	list( $userid, $username ) = importRevisionAuthor( $page );
	$timestamp = wfUnix2Timestamp( $page->ts );
	$redirect = ( preg_match( '/^#REDIRECT/', $page->text ) ? 1 : 0 );
	$random = mt_rand() / mt_getrandmax();
	$inverse = wfInvertTimestamp( $timestamp );
	$sql = " INSERT INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,inverse_timestamp,cur_touched,cur_minor_edit,cur_is_redirect,cur_random) VALUES ($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse','$conversiontime',$minor,$redirect,$random);\n";

	# History
	$revisions = fetchKeptPages( $title );
	if( count( $revisions ) == 0 ) {
		return $sql;
	}

	$rows = array();
	foreach( $revisions as $rev ) {
		$text = wfStrencode( recodeText( $rev->text ) );
		$minor = ($rev->minor ? 1 : 0);
		list( $userid, $username ) = importRevisionAuthor( $rev );
		$timestamp = wfUnix2Timestamp( $rev->ts );
		$inverse = wfInvertTimestamp( $timestamp );
		$comment = wfStrencode( recodeText( $rev->summary ) );

		$rows[] = "\n\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse',$minor)";
	}
	$sql .= "INSERT INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,inverse_timestamp,old_minor_edit) VALUES\n";
	$sql .= implode( ",", $rows );
	$sql .= ";\n\n";
	return $sql;
}

# Whee!
/**
 * Convert imported text to UTF-8: normalise CRLF to LF, transcode from
 * $wgImportEncoding, then expand numeric character references.
 *
 * If iconv() cannot convert the input, a "-- WARNING" line is echoed and the
 * result is built from an empty string.
 *
 * @param string $string Text in $wgImportEncoding.
 * @return string UTF-8 text.
 */
function recodeText( $string ) {
	global $wgImportEncoding;
	# For currently latin-1 wikis
	$string = str_replace( "\r\n", "\n", (string)$string );
	$converted = iconv( $wgImportEncoding, "UTF-8", $string );
	if( $converted === false ) {
		echo "-- WARNING: iconv could not convert text from $wgImportEncoding; text dropped\n";
		$converted = "";
	}
	return wfMungeToUtf8( $converted ); # Any old &#1234; stuff
}

/**
 * Encode one Unicode code point as a UTF-8 byte sequence.
 *
 * @param int $codepoint Code point.
 * @return string 1-4 byte UTF-8 sequence, or the decimal character reference
 *                "&#N;" when the value is 0x110000 or above.
 */
function wfUtf8Sequence($codepoint) {
	if($codepoint < 0x80) return chr($codepoint);
	if($codepoint < 0x800) return
		chr(($codepoint >> 6 & 0x3f) | 0xc0) .
		chr(($codepoint & 0x3f) | 0x80);
	if($codepoint < 0x10000) return
		chr(($codepoint >> 12 & 0x0f) | 0xe0) .
		chr(($codepoint >> 6 & 0x3f) | 0x80) .
		chr(($codepoint & 0x3f) | 0x80);
	# Four-byte form covers everything up to U+10FFFF, the last code point.
	if($codepoint < 0x110000) return
		chr(($codepoint >> 18 & 0x07) | 0xf0) .
		chr(($codepoint >> 12 & 0x3f) | 0x80) .
		chr(($codepoint >> 6 & 0x3f) | 0x80) .
		chr(($codepoint & 0x3f) | 0x80);

	# Not a Unicode code point; hand back a character reference instead
	return "&#$codepoint;";
}

/**
 * preg_replace_callback() helper for decimal references (&#NNN;).
 *
 * @param array $matches $matches[0] is the whole reference, $matches[1] the digits.
 * @return string UTF-8 bytes, or the reference unchanged when it has too many
 *                digits to be a code point.
 */
function wfUtf8EntityDecimal( $matches ) {
	# Strip leading zeros so "0065" is read as decimal 65.
	$digits = ltrim( $matches[1], '0' );
	if( strlen( $digits ) > 7 ) return $matches[0]; # 0x10FFFF is 1114111: 7 digits
	return wfUtf8Sequence( (int)$digits );
}

/**
 * preg_replace_callback() helper for hexadecimal references (&#xHHH;).
 *
 * @param array $matches $matches[0] is the whole reference, $matches[1] the hex digits.
 * @return string UTF-8 bytes, or the reference unchanged when it has too many
 *                digits to be a code point.
 */
function wfUtf8EntityHex( $matches ) {
	$digits = ltrim( $matches[1], '0' );
	if( strlen( $digits ) > 6 ) return $matches[0]; # 0x10FFFF: 6 hex digits
	return wfUtf8Sequence( hexdec( $digits ) );
}

/**
 * Replace numeric character references (decimal and hex) with UTF-8 bytes.
 *
 * Uses callbacks rather than the /e modifier, which evaluated "&#0065;" as an
 * octal literal and no longer exists in PHP 7.
 *
 * @param string $string Text possibly containing &#NNN; / &#xHHH; references.
 * @return string Text with the references expanded.
 */
function wfMungeToUtf8($string) {
	$string = preg_replace_callback ( '/&#([0-9]+);/', 'wfUtf8EntityDecimal', $string );
	$string = preg_replace_callback ( '/&#x([0-9a-f]+);/i', 'wfUtf8EntityHex', $string );
	# Should also do named entities here
	return $string;
}

/**
 * Escape a string for use inside a quoted MySQL string literal.
 *
 * Uses mysql_escape_string() where the old mysql extension provides it, and
 * otherwise an equivalent table-driven replacement so the script still runs
 * on PHP 7 and later.
 *
 * @param string $string Raw text.
 * @return string Escaped text.
 */
function wfStrencode( $string ) {
	if( function_exists( 'mysql_escape_string' ) ) {
		return mysql_escape_string( $string );
	}
	$escapes = array(
		"\\" => "\\\\",
		"\0" => "\\0",
		"\n" => "\\n",
		"\r" => "\\r",
		"'" => "\\'",
		"\"" => "\\\"",
		"\x1a" => "\\Z" );
	return strtr( (string)$string, $escapes );
}

/**
 * Format a Unix timestamp as a 14-digit "YYYYMMDDHHMMSS" string in GMT.
 *
 * @param int|string $unixtime Seconds since the epoch; UseMod stores it as a
 *                             string, so it is cast to int first.
 * @return string
 */
function wfUnix2Timestamp( $unixtime ) {
	return gmdate( "YmdHis", (int)$unixtime );
}

/**
 * Inverse of wfUnix2Timestamp(): parse "YYYYMMDDHHMMSS" (GMT) to Unix time.
 *
 * @param string $ts 14-digit timestamp.
 * @return int
 */
function wfTimestamp2Unix( $ts )
{
	return gmmktime(
		(int)substr( $ts, 8, 2 ),
		(int)substr( $ts, 10, 2 ),
		(int)substr( $ts, 12, 2 ),
		(int)substr( $ts, 4, 2 ),
		(int)substr( $ts, 6, 2 ),
		(int)substr( $ts, 0, 4 ) );
}

/**
 * Current time as a 14-digit GMT timestamp.
 *
 * @return string
 */
function wfTimestampNow() {
	# return NOW
	return gmdate( "YmdHis" );
}

# Sorting hack for MySQL 3, which doesn't use index sorts for DESC
/**
 * Replace every digit d with 9-d so ascending order of the result is
 * descending order of the original timestamp.
 *
 * @param string $ts Timestamp digits.
 * @return string
 */
function wfInvertTimestamp( $ts ) {
	return strtr(
		$ts,
		"0123456789",
		"9876543210"
	);
}

/**
 * Seed mt_rand() from the low 31 bits of an MD5 of microtime() and record
 * that fact in the global $wgRandomSeeded.
 */
function wfSeedRandom() {
	global $wgRandomSeeded;

	$seed = hexdec(substr(md5(microtime()),-8)) & 0x7fffffff;
	mt_srand( $seed );
	$wgRandomSeeded = true;
}

/**
 * Copy an associative array into a plain object, one property per key.
 *
 * @param array $arr key => value pairs.
 * @return object stdClass instance.
 */
function array2object( $arr ) {
	# Start from an empty object; casting a scalar would add a "scalar" property.
	$o = new stdClass;
	foreach( $arr as $x => $y ) {
		$o->$x = $y;
	}
	return $o;
}

?>