CLI-protect the usemod importer, remove the old version
[lhc/web/wiklou.git] / maintenance / importUseModWiki.php
1 <?php
2
3 /**
4 * Import data from a UseModWiki into a PediaWiki wiki
5 * 2003-02-09 Brion VIBBER <brion@pobox.com>
6 * Based loosely on Magnus's code from 2001-2002
7 *
8 * Updated limited version to get something working temporarily
9 * 2003-10-09
10 * Be sure to run the link & index rebuilding scripts!
11 *
12 * Some more munging for charsets etc
13 * 2003-11-28
14 *
15 * Partial fix for pages starting with lowercase letters (??)
16 * and CamelCase and /Subpage link conversion
17 * 2004-11-17
18 *
19 * @todo document
20 * @package MediaWiki
21 * @subpackage Maintenance
22 */
23
# Refuse to run under anything but the command-line SAPI.
if( 'cli' != php_sapi_name() ) {
	die( "Please customize the settings and run me from the command line." );
}

/** Site-specific settings: adjust these before running. */
$wgImportEncoding = "CP1252"; /* Source charset; all text is converted to UTF-8 */
$wgRootDirectory = "/kalman/Projects/wiki2002/wiki/lib-http/db/wiki"; /* Parent of the page/ and keep/ directories */

/* Raise the memory ceiling; a large wiki may otherwise run out of memory. */
@ini_set( 'memory_limit', '40M' );

/* Field separators: the base separator byte followed by the digit 1, 2 or 3. */
$wgFieldSeparator = "\xb3"; # Some wikis may use a different character
$FS = $wgFieldSeparator;
$FS1 = "{$FS}1";
$FS2 = "{$FS}2";
$FS3 = "{$FS}3";

$conversiontime = wfTimestampNow(); # Written into cur_touched for every imported page
$usercache = array();

wfSeedRandom();
importPages();
47
48 # ------------------------------------------------------------------------------
49
/**
 * Import every per-letter page directory ("A" through "Z", then "other")
 * found under $wgRootDirectory/page. Directories that do not exist are
 * silently skipped.
 */
function importPages()
{
	global $wgRootDirectory;

	$buckets = range( 'A', 'Z' );
	$buckets[] = 'other';

	foreach( $buckets as $bucket ) {
		$path = "$wgRootDirectory/page/$bucket";
		if( !is_dir( $path ) ) {
			continue;
		}
		importPageDirectory( $path );
	}
}
64
/**
 * Import every "*.db" page file in a directory and recurse into its
 * subdirectories.
 *
 * Progress messages and the SQL returned by importPage() are echoed to
 * standard output; messages are prefixed with "--".
 *
 * @param string $dir    Directory to scan
 * @param string $prefix Title prefix prepended to page names found in $dir
 */
function importPageDirectory( $dir, $prefix = "" )
{
	echo "\n-- Checking page directory $dir\n";
	$handle = opendir( $dir );
	if( $handle === false ) {
		# Report and carry on rather than feeding a non-handle to readdir().
		echo "-- Couldn't open directory '$dir'. Skipping.\n";
		return;
	}

	# Compare against false explicitly: an entry named "0" is falsy and
	# would otherwise end the scan early.
	while( false !== ( $entry = readdir( $handle ) ) ) {
		if( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			echo importPage( $prefix . $m[1] );
		} elseif( is_dir( "$dir/$entry" ) ) {
			if( $entry != '.' && $entry != '..' ) {
				importPageDirectory( "$dir/$entry", "$entry/" );
			}
		} else {
			echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
		}
	}

	# Release the handle; recursion would otherwise keep one open per level.
	closedir( $handle );
}
83
84
85 # ------------------------------------------------------------------------------
86
87 /* fetch_ functions
88 Grab a given item from the database
89 */
/**
 * Fetch a UseModWiki user record.
 *
 * Not implemented: calling this always terminates the script. The draft
 * code that used to follow the die() was unreachable and could not have
 * run (it read an undefined $title and called splitHash() with a single
 * argument), so it has been removed.
 *
 * @param mixed $uid UseModWiki user id (currently unused)
 */
function fetchUser( $uid )
{
	die ("fetchUser not implemented" );
}
104
/**
 * Build the path of a page relative to the page/ or keep/ directory:
 * titles starting with an ASCII letter go under that letter in upper
 * case, everything else goes under "other".
 *
 * @param string $title Page title
 * @return string "<bucket>/<title>"
 */
function useModFilename( $title ) {
	$initial = substr( $title, 0, 1 );
	$bucket = preg_match( '/[A-Z]/i', $initial ) ? strtoupper( $initial ) : "other";
	return "$bucket/$title";
}
112
/**
 * Load the current revision of a page from its ".db" file.
 *
 * The file is unpacked in three nested levels using the $FS1, $FS2 and
 * $FS3 separators. The script dies if the file does not exist.
 *
 * @param string $title Page title
 * @return object Object with text, summary, minor, ts, username and host
 */
function fetchPage( $title )
{
	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/page/" . useModFilename( $title ) . ".db";
	if( !file_exists( $fname ) ) {
		die( "Couldn't open file '$fname' for page '$title'.\n" );
	}

	# Page record -> default text section -> revision data
	$page    = splitHash( $FS1, file_get_contents( $fname ) );
	$section = splitHash( $FS2, $page["text_default"] );
	$text    = splitHash( $FS3, $section["data"] );

	$fields = array(
		"text"     => $text["text"],
		"summary"  => $text["summary"],
		"minor"    => $text["minor"],
		"ts"       => $section["ts"],
		"username" => $section["username"],
		"host"     => $section["host"] );
	return array2object( $fields );
}
130
/**
 * Load the old revisions of a page from its ".kp" file in the keep/
 * directory.
 *
 * Revisions with no text, no minor flag or a non-positive timestamp are
 * skipped with a "--" message. Missing keys are tested with isset()
 * before use, so a truncated record does not emit PHP warnings into the
 * SQL being written to standard output.
 *
 * @param string $title Page title
 * @return array List of revision objects (text, summary, minor, ts,
 *               username, host); empty if the page has no keep file
 */
function fetchKeptPages( $title )
{
	global $FS1, $FS2, $FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/keep/" . useModFilename( $title ) . ".kp";
	if( !file_exists( $fname ) ) return array();

	$keptlist = explode( $FS1, file_get_contents( $fname ) );
	array_shift( $keptlist ); # Drop the junk at beginning of file

	$revisions = array();
	foreach( $keptlist as $rev ) {
		$section = splitHash( $FS2, $rev );
		$text = isset( $section["data"] ) ? splitHash( $FS3, $section["data"] ) : array();

		# is_numeric() guards the arithmetic: PHP 8 throws on a non-numeric string.
		$usable = isset( $text["text"], $text["minor"], $section["ts"] )
			&& $text["text"]
			&& $text["minor"] != ""
			&& is_numeric( $section["ts"] )
			&& ( $section["ts"]*1 > 0 );

		if ( $usable ) {
			array_push( $revisions, array2object( array (
				"text"     => $text["text"],
				"summary"  => isset( $text["summary"] ) ? $text["summary"] : "",
				"minor"    => $text["minor"],
				"ts"       => $section["ts"],
				"username" => isset( $section["username"] ) ? $section["username"] : "",
				"host"     => isset( $section["host"] ) ? $section["host"] : "" ) ) );
		} else {
			echo "-- skipped a bad old revision\n";
		}
	}
	return $revisions;
}
155
/**
 * Split a flat "key SEP value SEP key SEP value ..." string into an
 * associative array. A trailing key with no value is dropped; a repeated
 * key keeps its last value.
 *
 * @param string $sep Separator
 * @param string $str Packed string
 * @return array key => value
 */
function splitHash ( $sep , $str ) {
	$pieces = explode ( $sep , $str ) ;
	$total = count ( $pieces ) ;
	$hash = array () ;
	for ( $k = 0; $k + 1 < $total; $k += 2 ) {
		$hash[$pieces[$k]] = $pieces[$k + 1] ;
	}
	return $hash ;
}
164
165
166 /* import_ functions
167 Take a fetched item and produce SQL
168 */
169
/**
 * importUser
 * $uid is the UseMod user id number.
 * The new ones will be assigned arbitrarily and are for internal use only.
 *
 * THIS IS DELAYED SINCE PUBLIC DUMPS DONT INCLUDE USER DIR
 *
 * Not implemented: calling this always terminates the script. The draft
 * code that used to follow the die() was unreachable and could not have
 * run (it called an undefined md5hash() and read the fetched record both
 * as an object and as an array), so it has been removed.
 *
 * @param mixed $uid UseModWiki user id (currently unused)
 */
function importUser( $uid )
{
	die("importUser NYI");
}
201
/**
 * Resolve the user id and SQL-escaped user text for a revision author.
 *
 * @param string $name User name recorded on the revision (may be empty)
 * @param string $host Host recorded on the revision, used when $name is empty
 * @return array array( user id, escaped user text ); the id is 0 when the
 *               name is empty or is not in $usercache
 */
function checkUserCache( $name, $host )
{
	global $usercache;

	if( $name ) {
		# The cache is indexed by user name, so test the key. in_array()
		# searched the values and never matched the name being looked up.
		$userid = isset( $usercache[$name] ) ? $usercache[$name] : 0;
		$username = wfStrencode( $name );
	} else {
		$userid = 0;
		$username = wfStrencode( $host );
	}
	return array( $userid, $username );
}
220
/**
 * Build the SQL that imports one page: an INSERT into "cur" for the
 * current revision and, when there is history, one multi-row INSERT into
 * "old".
 *
 * If mungeFormat() changes the page text, the converted text becomes a
 * new current revision attributed to "Conversion script" and the
 * original current revision is pushed onto the history.
 *
 * @param string $title Page title
 * @return string SQL statements
 */
function importPage( $title )
{
	global $usercache;
	global $conversiontime;

	echo "\n-- Importing page $title\n";
	$page = fetchPage( $title );

	$newtitle = wfStrencode( recodeText( $title ) );
	$namespace = 0;

	$munged = mungeFormat( $page->text );
	$converted = ( $munged != $page->text );

	if( $converted ) {
		# The conversion is saved as a *new* revision; the previous
		# latest version goes into the history.
		$text = wfStrencode( recodeText( $munged ) );
		$comment = "link fix";
		$minor = 1;
		$userid = 0;
		$username = "Conversion script";
		$timestamp = wfUnix2Timestamp( time() );
		$revisions = array( $page );
	} else {
		# The stored revision is imported unchanged as the current one.
		$text = wfStrencode( recodeText( $page->text ) );
		$comment = wfStrencode( recodeText( $page->summary ) );
		$minor = ($page->minor ? 1 : 0);
		list( $userid, $username ) = checkUserCache( $page->username, $page->host );
		$username = wfStrencode( recodeText( $username ) );
		$timestamp = wfUnix2Timestamp( $page->ts );
		$revisions = array();
	}

	# Computed the same way for both cases
	$redirect = ( preg_match( '/^#REDIRECT/', $page->text ) ? 1 : 0 );
	$random = mt_rand() / mt_getrandmax();
	$inverse = wfInvertTimestamp( $timestamp );

	$sql = "\nINSERT\n" .
		"INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,inverse_timestamp,cur_touched,cur_minor_edit,cur_is_redirect,cur_random) VALUES\n" .
		"($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse','$conversiontime',$minor,$redirect,$random);\n";

	# History
	$revisions = array_merge( $revisions, fetchKeptPages( $title ) );
	if( count( $revisions ) == 0 ) {
		return $sql;
	}

	$rows = array();
	foreach( $revisions as $rev ) {
		$oldText = wfStrencode( recodeText( $rev->text ) );
		$oldMinor = ($rev->minor ? 1 : 0);
		list( $oldUserId, $oldUserName ) = checkUserCache( $rev->username, $rev->host );
		$oldUserName = wfStrencode( recodeText( $oldUserName ) );
		$oldTimestamp = wfUnix2Timestamp( $rev->ts );
		$oldInverse = wfInvertTimestamp( $oldTimestamp );
		$oldComment = wfStrencode( recodeText( $rev->summary ) );

		$rows[] = "\n\t($namespace,'$newtitle','$oldText','$oldComment',$oldUserId,'$oldUserName','$oldTimestamp','$oldInverse',$oldMinor)";
	}

	$sql .= "INSERT\n" .
		"INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,inverse_timestamp,old_minor_edit) VALUES\n";
	$sql .= implode( ",", $rows );
	$sql .= ";\n\n";
	return $sql;
}
295
/**
 * Convert imported text to UTF-8: CRLF line endings become LF, the text
 * is transcoded from $wgImportEncoding, and numeric character references
 * (&#1234; style) are expanded by wfMungeToUtf8().
 *
 * @param string $string Text in the source encoding
 * @return string Converted text
 */
function recodeText( $string ) {
	global $wgImportEncoding;

	$unixNewlines = str_replace( "\r\n", "\n", $string );
	# iconv warnings are suppressed here
	$utf8 = @iconv( $wgImportEncoding, "UTF-8", $unixNewlines );
	return wfMungeToUtf8( $utf8 );
}
305
/**
 * Encode a Unicode code point as a UTF-8 byte sequence.
 *
 * Code points up to U+10FFFF are encoded in one to four bytes. The
 * four-byte bound used to be 0x100000, which left U+100000..U+10FFFF
 * unencoded.
 *
 * @param int $codepoint Code point
 * @return string UTF-8 bytes, or a decimal "&#N;" reference for values
 *                above U+10FFFF
 */
function wfUtf8Sequence($codepoint) {
	if($codepoint < 0x80) return chr($codepoint);
	if($codepoint < 0x800) return chr(($codepoint >> 6 & 0x3f) | 0xc0) .
		chr(($codepoint & 0x3f) | 0x80);
	if($codepoint < 0x10000) return chr(($codepoint >> 12 & 0x0f) | 0xe0) .
		chr(($codepoint >> 6 & 0x3f) | 0x80) .
		chr(($codepoint & 0x3f) | 0x80);
	if($codepoint < 0x110000) return chr(($codepoint >> 18 & 0x07) | 0xf0) .
		chr(($codepoint >> 12 & 0x3f) | 0x80) .
		chr(($codepoint >> 6 & 0x3f) | 0x80) .
		chr(($codepoint & 0x3f) | 0x80);
	# Not a Unicode code point; hand it back as a numeric reference
	return "&#$codepoint;";
}
320
/**
 * Replace decimal (&#1234;) and hexadecimal (&#x4d2;) character
 * references with the UTF-8 encoding of the code point.
 *
 * Uses preg_replace_callback() because the /e modifier was removed in
 * PHP 7. The old /e form also evaluated the digits as PHP source, so a
 * zero-padded reference such as &#0065; was read as an octal literal.
 *
 * @param string $string Text that may contain numeric references
 * @return string Text with references expanded
 */
function wfMungeToUtf8($string) {
	$string = preg_replace_callback ( '/&#([0-9]+);/', 'wfMungeDecimalEntity', $string );
	$string = preg_replace_callback ( '/&#x([0-9a-f]+);/i', 'wfMungeHexEntity', $string );
	# Should also do named entities here
	return $string;
}

/** preg_replace_callback() helper: expand one decimal reference. */
function wfMungeDecimalEntity( $matches ) {
	$digits = ltrim( $matches[1], '0' );
	# More than 7 significant digits cannot be a code point; leave it alone
	# rather than overflow the integer conversion.
	if( strlen( $digits ) > 7 ) {
		return $matches[0];
	}
	return wfUtf8Sequence( intval( $digits, 10 ) );
}

/** preg_replace_callback() helper: expand one hexadecimal reference. */
function wfMungeHexEntity( $matches ) {
	$digits = ltrim( $matches[1], '0' );
	# More than 6 significant hex digits cannot be a code point.
	if( strlen( $digits ) > 6 ) {
		return $matches[0];
	}
	return wfUtf8Sequence( intval( $digits, 16 ) );
}
327
/**
 * Escape a string for use inside a single-quoted SQL string literal.
 *
 * mysql_escape_string() was removed in PHP 7. It is still used when
 * present; otherwise the same characters are escaped by hand (backslash,
 * NUL, newline, carriage return, both quote characters and Ctrl-Z).
 *
 * @param string $string Raw text
 * @return string Escaped text
 */
function wfStrencode( $string ) {
	if( function_exists( 'mysql_escape_string' ) ) {
		return mysql_escape_string( $string );
	}
	$escapes = array(
		"\\"   => "\\\\",
		"\0"   => "\\0",
		"\n"   => "\\n",
		"\r"   => "\\r",
		"'"    => "\\'",
		'"'    => '\\"',
		"\x1a" => "\\Z" );
	return strtr( (string)$string, $escapes );
}
331
/**
 * Format a Unix timestamp as a 14-digit "YYYYMMDDHHMMSS" string in GMT.
 *
 * @param int $unixtime Seconds since the Unix epoch
 * @return string
 */
function wfUnix2Timestamp( $unixtime ) {
	$stamp = gmdate( "YmdHis", $unixtime );
	return $stamp;
}
335
/**
 * Convert a 14-digit "YYYYMMDDHHMMSS" GMT timestamp to a Unix timestamp.
 *
 * @param string $ts Timestamp string
 * @return int Seconds since the Unix epoch
 */
function wfTimestamp2Unix( $ts )
{
	$year   = (int)substr( $ts, 0, 4 );
	$month  = (int)substr( $ts, 4, 2 );
	$day    = (int)substr( $ts, 6, 2 );
	$hour   = (int)substr( $ts, 8, 2 );
	$minute = (int)substr( $ts, 10, 2 );
	$second = (int)substr( $ts, 12, 2 );

	return gmmktime( $hour, $minute, $second, $month, $day, $year );
}
343
/**
 * Current time as a 14-digit "YYYYMMDDHHMMSS" string in GMT.
 *
 * @return string
 */
function wfTimestampNow() {
	$now = time();
	return gmdate( "YmdHis", $now );
}
348
/**
 * Sorting hack for MySQL 3, which doesn't use index sorts for DESC:
 * replace every digit d of the timestamp with 9 - d, so that ascending
 * order on the result is descending order on the original.
 *
 * @param string $ts Timestamp string
 * @return string Digit-inverted timestamp
 */
function wfInvertTimestamp( $ts ) {
	$digits = "0123456789";
	$inverted = "9876543210";
	return strtr( $ts, $digits, $inverted );
}
357
/**
 * Seed the Mersenne Twister generator from the current microtime and
 * record that this was done in the global $wgRandomSeeded.
 *
 * The flag used to be assigned without a "global" declaration, which
 * only set a local variable that was discarded on return.
 */
function wfSeedRandom()
{
	global $wgRandomSeeded;

	# Last 8 hex digits of the hash, masked to a non-negative 31-bit value
	$seed = hexdec(substr(md5(microtime()),-8)) & 0x7fffffff;
	mt_srand( $seed );
	$wgRandomSeeded = true;
}
364
/**
 * Copy the entries of an associative array onto a new object as
 * properties.
 *
 * The object starts out as an empty stdClass. The previous "(object)0"
 * cast produced an object that already carried a "scalar" property set
 * to 0, which then leaked into every result.
 *
 * @param array $arr Property name => value
 * @return object
 */
function array2object( $arr ) {
	$o = new stdClass();
	foreach( $arr as $x => $y ) {
		$o->$x = $y;
	}
	return $o;
}
372
373
/**
 * Make CamelCase and /Talk links work
 *
 * <nowiki> sections, http/https/ftp URLs and existing [[...]] links are
 * first swapped for a placeholder so that they are not touched. The
 * remaining CamelCase words and /Subpage references are wrapped in
 * [[...]], and the protected spans are then put back in their original
 * order.
 *
 * The restore step splits on the placeholder and re-joins with the saved
 * spans. It used to be a preg_replace() with the /e modifier, which was
 * removed in PHP 7.
 *
 * @param string $text UseModWiki page text
 * @return string Text with links converted
 */
function mungeFormat( $text ) {
	global $nowiki;
	$nowiki = array();
	$staged = preg_replace_callback(
		'/(<nowiki>.*?<\\/nowiki>|(?:http|https|ftp):\\S+|\[\[[^]\\n]+]])/s',
		'nowikiPlaceholder', $text );

	# This is probably not 100% correct, I'm just
	# glancing at the UseModWiki code.
	$upper = "[A-Z]";
	$lower = "[a-z_0-9]";
	$any = "[A-Za-z_0-9]";
	$camel = "(?:$upper+$lower+$upper+$any*)";
	$subpage = "(?:\\/$any+)";
	$substart = "(?:\\/$upper$any*)";

	$munged = preg_replace( "/(?!\\[\\[)($camel$subpage*|$substart$subpage*)\\b(?!\\]\\]|>)/",
		'[[$1]]', $staged );

	# Put the protected spans back, left to right, one per placeholder.
	$pieces = explode( placeholder(), (string)$munged );
	$final = array_shift( $pieces );
	foreach( $pieces as $piece ) {
		$final .= array_shift( $nowiki ) . $piece;
	}
	return $final;
}
400
401
/**
 * Marker text that stands in for a protected span while mungeFormat()
 * rewrites links.
 *
 * The string is single-quoted, so the \xff sequences are four literal
 * characters each and not the byte 0xFF.
 *
 * @param mixed $x Unused
 * @return string
 */
function placeholder( $x = null ) {
	$marker = '\xffplaceholder\xff';
	return $marker;
}
405
/**
 * preg_replace_callback() handler: append the captured span to the
 * global $nowiki list and return the placeholder that replaces it.
 *
 * @param array $matches Regex match; index 1 is the captured span
 * @return string Placeholder text
 */
function nowikiPlaceholder( $matches ) {
	global $nowiki;

	$protected = $matches[1];
	$nowiki[] = $protected;

	return placeholder();
}
411
412 ?>