X-Git-Url: https://git.heureux-cyclage.org/?a=blobdiff_plain;ds=sidebyside;f=maintenance%2FimportUseModWiki.php;h=e8463e2e5e1179423cc1e84d48551060b50bff61;hb=ddc2be18c729738a4913c61bd2d82e2fb8467a6b;hp=318483e712425d5c9efaf96b4119c15f6440e05f;hpb=856c95474e3201e430bde51fa4881e6148a323dd;p=lhc%2Fweb%2Fwiklou.git

diff --git a/maintenance/importUseModWiki.php b/maintenance/importUseModWiki.php
index 318483e712..e8463e2e5e 100644
--- a/maintenance/importUseModWiki.php
+++ b/maintenance/importUseModWiki.php
@@ -1,7 +1,7 @@
  * Based loosely on Magnus's code from 2001-2002
  *
@@ -16,11 +16,20 @@
  * and CamelCase and /Subpage link conversion
  * 2004-11-17
  *
+ * Rewrite output to create Special:Export format for import
+ * instead of raw SQL. Should be 'future-proof' against future
+ * schema changes.
+ * 2005-03-14
+ *
  * @todo document
- * @package MediaWiki
- * @subpackage Maintenance
+ * @file
+ * @ingroup Maintenance
  */
+if ( php_sapi_name() != 'cli' ) {
+	echo "Please customize the settings and run me from the command line.";
+	die( -1 );
+}
 
 /** Set these correctly! */
 $wgImportEncoding = "CP1252"; /* We convert all to UTF-8 */
@@ -32,14 +41,15 @@ $wgRootDirectory = "/kalman/Projects/wiki2002/wiki/lib-http/db/wiki";
 
 /* globals */
 $wgFieldSeparator = "\xb3"; # Some wikis may use different char
 	$FS = $wgFieldSeparator ;
-	$FS1 = $FS."1" ;
-	$FS2 = $FS."2" ;
-	$FS3 = $FS."3" ;
+	$FS1 = $FS . "1" ;
+	$FS2 = $FS . "2" ;
+	$FS3 = $FS . "3" ;
+
+# Unicode sanitization tools
+require_once( dirname( dirname( __FILE__ ) ) . '/includes/normal/UtfNormal.php' );
 
-$conversiontime = wfTimestampNow(); # Conversions will be marked with this timestamp
 $usercache = array();
 
-wfSeedRandom();
 importPages();
 
 # ------------------------------------------------------------------------------
 
@@ -47,32 +57,49 @@ importPages();
 
 function importPages()
 {
 	global $wgRootDirectory;
-
+
+	$gt = '>';
+	echo <<
+
+
+XML;
 	$letters = array(
 		'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
 		'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'other' );
-	foreach( $letters as $letter ) {
+	foreach ( $letters as $letter ) {
 		$dir = "$wgRootDirectory/page/$letter";
-		if( is_dir( $dir ) )
+		if ( is_dir( $dir ) )
 			importPageDirectory( $dir );
 	}
+	echo <<
+
+XML;
 }
 
 function importPageDirectory( $dir, $prefix = "" )
 {
-	echo "\n-- Checking page directory $dir\n";
+	echo "\n\n";
 	$mydir = opendir( $dir );
-	while( $entry = readdir( $mydir ) ) {
-		if( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
+	while ( $entry = readdir( $mydir ) ) {
+		$m = array();
+		if ( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
 			echo importPage( $prefix . $m[1] );
 		} else {
-			if( is_dir( "$dir/$entry" ) ) {
-				if( $entry != '.' && $entry != '..' ) {
+			if ( is_dir( "$dir/$entry" ) ) {
+				if ( $entry != '.' && $entry != '..' ) {
 					importPageDirectory( "$dir/$entry", "$entry/" );
 				}
 			} else {
-				echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
+				echo "\n";
 			}
 		}
 	}
@@ -84,24 +111,10 @@ function importPageDirectory( $dir, $prefix = "" )
 
 /* fetch_ functions
 	Grab a given item from the database
 	*/
-function fetchUser( $uid )
-{
-	die ("fetchUser not implemented" );
-
-	global $FS,$FS2,$FS3, $wgRootDirectory;
-
-	$fname = $wgRootDirectory . "/page/" . $title;
-	if( !file_exists( $fname ) ) return false;
-
-	$data = splitHash( implode( "", file( $fname ) ) );
-	# enough?
-
-	return $data;
-}
 
 function useModFilename( $title ) {
 	$c = substr( $title, 0, 1 );
-	if(preg_match( '/[A-Z]/i', $c ) ) {
+	if ( preg_match( '/[A-Z]/i', $c ) ) {
 		return strtoupper( $c ) . "/$title";
 	}
 	return "other/$title";
@@ -109,17 +122,18 @@ function useModFilename( $title ) {
 
 function fetchPage( $title )
 {
-	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory;
-
+	global $FS1, $FS2, $FS3, $wgRootDirectory;
+
 	$fname = $wgRootDirectory . "/page/" . useModFilename( $title ) . ".db";
-	if( !file_exists( $fname ) ) {
-		die( "Couldn't open file '$fname' for page '$title'.\n" );
+	if ( !file_exists( $fname ) ) {
+		echo "Couldn't open file '$fname' for page '$title'.\n";
+		die( -1 );
 	}
-
+
 	$page = splitHash( $FS1, file_get_contents( $fname ) );
 	$section = splitHash( $FS2, $page["text_default"] );
 	$text = splitHash( $FS3, $section["data"] );
-
+
 	return array2object( array( "text" => $text["text"] , "summary" => $text["summary"] , "minor" => $text["minor"] ,
 		"ts" => $section["ts"] , "username" => $section["username"] , "host" => $section["host"] ) );
@@ -127,24 +141,24 @@ function fetchPage( $title )
 
 function fetchKeptPages( $title )
 {
-	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory, $wgTimezoneCorrection;
-
+	global $FS1, $FS2, $FS3, $wgRootDirectory;
+
 	$fname = $wgRootDirectory . "/keep/" . useModFilename( $title ) . ".kp";
-	if( !file_exists( $fname ) ) return array();
-
+	if ( !file_exists( $fname ) ) return array();
+
 	$keptlist = explode( $FS1, file_get_contents( $fname ) );
 	array_shift( $keptlist ); # Drop the junk at beginning of file
-
+
 	$revisions = array();
-	foreach( $keptlist as $rev ) {
+	foreach ( $keptlist as $rev ) {
 		$section = splitHash( $FS2, $rev );
 		$text = splitHash( $FS3, $section["data"] );
-		if ( $text["text"] && $text["minor"] != "" && ( $section["ts"]*1 > 0 ) ) {
+		if ( $text["text"] && $text["minor"] != "" && ( $section["ts"] * 1 > 0 ) ) {
 			array_push( $revisions, array2object( array ( "text" => $text["text"] , "summary" => $text["summary"] , "minor" => $text["minor"] ,
 				"ts" => $section["ts"] , "username" => $section["username"] , "host" => $section["host"] ) ) );
 		} else {
-			echo "-- skipped a bad old revision\n";
+			echo "\n";
 		}
 	}
 	return $revisions;
@@ -153,7 +167,7 @@ function fetchKeptPages( $title )
 
 function splitHash ( $sep , $str ) {
 	$temp = explode ( $sep , $str ) ;
 	$ret = array () ;
-	for ( $i = 0; $i+1 < count ( $temp ) ; $i++ ) {
+	for ( $i = 0; $i + 1 < count ( $temp ) ; $i++ ) {
 		$ret[$temp[$i]] = $temp[++$i] ;
 	}
 	return $ret ;
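# --------------------------------------------------------------------------
# Illustration (not part of the patch): the fetch_ functions above parse
# UseModWiki's flat-file "hash" records, in which keys and values are simply
# joined by the $FS1/$FS2/$FS3 separators.  The synthetic record below is an
# assumption for demonstration: it carries only a handful of the fields the
# importer reads (text_default, ts, username, data, text, summary); a real
# page.db file holds more.

$FS  = "\xb3";
$FS1 = $FS . "1";   # page-level fields
$FS2 = $FS . "2";   # section-level fields
$FS3 = $FS . "3";   # text-record fields

function splitHashDemo( $sep, $str ) {   # same algorithm as splitHash() above
	$temp = explode( $sep, $str );
	$ret = array();
	for ( $i = 0; $i + 1 < count( $temp ); $i++ ) {
		$ret[$temp[$i]] = $temp[++$i];
	}
	return $ret;
}

$record = "text_default" . $FS1
	. "ts" . $FS2 . "1100000000" . $FS2
	. "username" . $FS2 . "SomeUser" . $FS2
	. "data" . $FS2
	. "text" . $FS3 . "Sample wiki text" . $FS3 . "summary" . $FS3 . "test edit";

$page    = splitHashDemo( $FS1, $record );                # page-level hash
$section = splitHashDemo( $FS2, $page["text_default"] );  # current revision
$text    = splitHashDemo( $FS3, $section["data"] );       # text subrecord
echo $text["text"] . "\n";                                # prints "Sample wiki text"
# --------------------------------------------------------------------------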
@@ -164,53 +178,21 @@ function splitHash ( $sep , $str ) {
 
 /* import functions
 	Take a fetched item and produce SQL
 	*/
 
-/* importUser
-	$uid is the UseMod user id number.
-	The new ones will be assigned arbitrarily and are for internal use only.
-
-	THIS IS DELAYED SINCE PUBLIC DUMPS DONT INCLUDE USER DIR
-	*/
-function importUser( $uid )
-{
-	global $last_uid, $user_list, $wgTimestampCorrection;
-	die("importUser NYI");
-	return "";
-
-	$stuff = fetchUser( $uid );
-	$last_uid++;
-
-	$name = wfStrencode( $stuff->username );
-	$hash = md5hash( $stuff->password ); # Doable?
-	$tzoffset = $stuff['tzoffset'] - ($wgTimestampCorrection / 3600); # -8 to 0; +9 to +1
-	$hideminor = ($stuff['rcall'] ? 0 : 1);
-	$options = "cols={$stuff['editcols']}
-rows={$stuff['editrows']}
-rcdays={$stuff['rcdays']}
-timecorrection={$tzoffset}
-hideminor={$hideminor}
-	";
-
-	$sql = "INSERT
-	INTO user (user_id,user_name,user_password,user_options)
-	VALUES ({$last_uid},'{$name}','{$hash}','{$options}');\n";
-	return $sql;
-}
-
 function checkUserCache( $name, $host )
 {
 	global $usercache;
-	if( $name ) {
-		if( in_array( $name, $usercache ) ) {
+	if ( $name ) {
+		if ( in_array( $name, $usercache ) ) {
 			$userid = $usercache[$name];
 		} else {
 			# If we haven't imported user accounts
 			$userid = 0;
 		}
-		$username = wfStrencode( $name );
+		$username = str_replace( '_', ' ', $name );
 	} else {
 		$userid = 0;
-		$username = wfStrencode( $host );
+		$username = $host;
 	}
 	return array( $userid, $username );
 }
@@ -218,76 +200,66 @@ function checkUserCache( $name, $host )
 
 function importPage( $title )
 {
 	global $usercache;
-	global $conversiontime;
-
-	echo "\n-- Importing page $title\n";
+
+	echo "\n\n";
 
 	$page = fetchPage( $title );
 
-	$newtitle = wfStrencode( recodeText( $title ) );
-	$namespace = 0;
-
+	$newtitle = xmlsafe( str_replace( '_', ' ', recodeText( $title ) ) );
+
 	$munged = mungeFormat( $page->text );
-	if( $munged != $page->text ) {
+	if ( $munged != $page->text ) {
 		/**
 		 * Save a *new* revision with the conversion, and put the
 		 * previous last version into the history.
 		 */
-		$text = wfStrencode( recodeText( $munged ) );
-		$comment = "link fix";
-		$minor = 1;
-		$userid = 0;
-		$username = "Conversion script";
-		$timestamp = wfUnix2Timestamp( time() );
-		$redirect = ( preg_match( '/^#REDIRECT/', $page->text ) ? 1 : 0 );
-		$random = mt_rand() / mt_getrandmax();
-		$inverse = wfInvertTimestamp( $timestamp );
-
-		$revisions = array( $page );
+		$next = array2object( array(
+			'text' => $munged,
+			'minor' => 1,
+			'username' => 'Conversion script',
+			'host' => '127.0.0.1',
+			'ts' => time(),
+			'summary' => 'link fix',
+		) );
+		$revisions = array( $page, $next );
 	} else {
 		/**
 		 * Current revision:
 		 */
-		$text = wfStrencode( recodeText( $page->text ) );
-		$comment = wfStrencode( recodeText( $page->summary ) );
-		$minor = ($page->minor ? 1 : 0);
-		list( $userid, $username ) = checkUserCache( $page->username, $page->host );
-		$username = wfStrencode( recodeText( $username ) );
-		$timestamp = wfUnix2Timestamp( $page->ts );
-		$redirect = ( preg_match( '/^#REDIRECT/', $page->text ) ? 1 : 0 );
-		$random = mt_rand() / mt_getrandmax();
-		$inverse = wfInvertTimestamp( $timestamp );
-
-		$revisions = array();
+		$revisions = array( $page );
 	}
-	$sql = "
-INSERT
-	INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,inverse_timestamp,cur_touched,cur_minor_edit,cur_is_redirect,cur_random) VALUES
-	($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse','$conversiontime',$minor,$redirect,$random);\n";
+	$xml = <<
+		$newtitle
+
+XML;
 
 	# History
 	$revisions = array_merge( $revisions, fetchKeptPages( $title ) );
-	if(count( $revisions ) == 0 ) {
-		return $sql;
+	if ( count( $revisions ) == 0 ) {
+		return NULL; // Was "$sql", which does not appear to be defined.
 	}
-
-	$any = false;
-	$sql .= "INSERT
-	INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,inverse_timestamp,old_minor_edit) VALUES\n";
-	foreach( $revisions as $rev ) {
-		$text = wfStrencode( recodeText( $rev->text ) );
-		$minor = ($rev->minor ? 1 : 0);
-		list( $userid, $username ) = checkUserCache( $rev->username, $rev->host );
-		$username = wfStrencode( recodeText( $username ) );
-		$timestamp = wfUnix2Timestamp( $rev->ts );
-		$inverse = wfInvertTimestamp( $timestamp );
-		$comment = wfStrencode( recodeText( $rev->summary ) );
-
-		if($any) $sql .= ",";
-		$sql .= "\n\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse',$minor)";
-		$any = true;
+
+	foreach ( $revisions as $rev ) {
+		$text = xmlsafe( recodeText( $rev->text ) );
+		$minor = ( $rev->minor ? '' : '' );
+		list( /* $userid */ , $username ) = checkUserCache( $rev->username, $rev->host );
+		$username = xmlsafe( recodeText( $username ) );
+		$timestamp = xmlsafe( timestamp2ISO8601( $rev->ts ) );
+		$comment = xmlsafe( recodeText( $rev->summary ) );
+
+		$xml .= <<
+			$timestamp
+			$username
+			$minor
+			$comment
+			$text
+
+
+XML;
 	}
-	$sql .= ";\n\n";
-	return $sql;
+	$xml .= "\n\n";
+	return $xml;
 }
 
 # Whee!
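# --------------------------------------------------------------------------
# Illustration (not part of the patch): importPage() above now emits each page
# in the Special:Export XML format instead of SQL.  The plain-text rendering of
# this diff swallowed the literal markup inside the heredocs, so the element
# names below come from the standard export-0.1 schema rather than verbatim
# from this file, and the sample values are made up.

$newtitle  = 'Sample page';
$timestamp = '2005-03-14T12:00:00Z';
$username  = 'Conversion script';
$minor     = '<minor/>';           # use '' for a non-minor edit
$comment   = 'link fix';
$text      = 'Sample wiki text';

echo <<<XML
  <page>
    <title>$newtitle</title>
    <revision>
      <timestamp>$timestamp</timestamp>
      <contributor><username>$username</username></contributor>
      $minor
      <comment>$comment</comment>
      <text>$text</text>
    </revision>
  </page>
XML;

# A full dump wraps these <page> elements in a single <mediawiki> root element,
# opened and closed by the heredocs in importPages(); the result is the format
# that Special:Import and maintenance/importDump.php consume.
# --------------------------------------------------------------------------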
+ */ + $string = UtfNormal::cleanUp( $string ); -function wfTimestamp2Unix( $ts ) -{ - return gmmktime( ( (int)substr( $ts, 8, 2) ), - (int)substr( $ts, 10, 2 ), (int)substr( $ts, 12, 2 ), - (int)substr( $ts, 4, 2 ), (int)substr( $ts, 6, 2 ), - (int)substr( $ts, 0, 4 ) ); -} - -function wfTimestampNow() { - # return NOW - return gmdate( "YmdHis" ); + $string = htmlspecialchars( $string ); + return $string; } -# Sorting hack for MySQL 3, which doesn't use index sorts for DESC -function wfInvertTimestamp( $ts ) { - return strtr( - $ts, - "0123456789", - "9876543210" - ); +function xmlCommentSafe( $text ) { + return str_replace( '--', '\\-\\-', xmlsafe( recodeText( $text ) ) ); } -function wfSeedRandom() -{ - $seed = hexdec(substr(md5(microtime()),-8)) & 0x7fffffff; - mt_srand( $seed ); - $wgRandomSeeded = true; -} function array2object( $arr ) { $o = (object)0; - foreach( $arr as $x => $y ) { + foreach ( $arr as $x => $y ) { $o->$x = $y; } return $o; @@ -377,7 +334,7 @@ function mungeFormat( $text ) { $staged = preg_replace_callback( '/(.*?<\\/nowiki>|(?:http|https|ftp):\\S+|\[\[[^]\\n]+]])/s', 'nowikiPlaceholder', $text ); - + # This is probably not 100% correct, I'm just # glancing at the UseModWiki code. $upper = "[A-Z]"; @@ -386,10 +343,10 @@ function mungeFormat( $text ) { $camel = "(?:$upper+$lower+$upper+$any*)"; $subpage = "(?:\\/$any+)"; $substart = "(?:\\/$upper$any*)"; - + $munged = preg_replace( "/(?!\\[\\[)($camel$subpage*|$substart$subpage*)\\b(?!\\]\\]|>)/", '[[$1]]', $staged ); - + $final = preg_replace( '/' . preg_quote( placeholder() ) . '/es', 'array_shift( $nowiki )', $munged ); return $final; @@ -406,4 +363,4 @@ function nowikiPlaceholder( $matches ) { return placeholder(); } -?> +