various bug fixes
[lhc/web/wiklou.git] / maintenance / importUseModWiki.php
1 <?php
2
3 /*
4 Import data from a UseModWiki into a PediaWiki wiki
5 2003-02-09 Brion VIBBER <brion@pobox.com>
6 Based loosely on Magnus's code from 2001-2002
7
8 Updated limited version to get something working temporarily
9 2003-10-09
10 Be sure to run the link & index rebuilding scripts!
11
12 Some more munging for charsets etc
13 2003-11-28
14
15 */
16
/* Site-specific settings -- adjust both before running! */
$wgImportEncoding = "CP1252"; # Charset of the source wiki; all text is converted to UTF-8
$wgRootDirectory = "/home/usemod/wiki-ia/lib-http/db/wiki";

/* Shared globals */
$wgFieldSeparator = "\xb3"; # Separator byte; some wikis may use a different char
$FS = $wgFieldSeparator;
# Level-1/2/3 separators: the base separator followed by the level digit
$FS1 = "{$FS}1";
$FS2 = "{$FS}2";
$FS3 = "{$FS}3";

# Every converted row is stamped with this single timestamp
$conversiontime = wfTimestampNow();
# User name => user id map consulted by checkUserCache()
$usercache = array();

# Entry point: seed the RNG, then emit SQL for all pages
wfSeedRandom();
importPages();
33
34 # ------------------------------------------------------------------------------
35
/**
 * Import every page directory found under "$wgRootDirectory/page".
 *
 * The subdirectories 'A'..'Z' and 'other' are tried in that order;
 * any that do not exist are silently skipped.
 */
function importPages()
{
	global $wgRootDirectory;

	$buckets = range( 'A', 'Z' );
	$buckets[] = 'other';

	foreach( $buckets as $bucket ) {
		$path = "$wgRootDirectory/page/$bucket";
		if( !is_dir( $path ) ) {
			continue;
		}
		importPageDirectory( $path );
	}
}
50
/**
 * Recursively import every "*.db" page file below a directory.
 *
 * For each "<name>.db" entry the output of importPage() is echoed.
 * Subdirectories are descended into, and any other file is reported
 * and skipped.
 *
 * @param string $dir    Directory to scan
 * @param string $prefix Title prefix (e.g. "Parent/") for pages in $dir
 */
function importPageDirectory( $dir, $prefix = "" )
{
	echo "\n-- Checking page directory $dir\n";
	$mydir = opendir( $dir );
	if( $mydir === false ) {
		echo "-- Couldn't open directory '$dir'. Skipping.\n";
		return;
	}

	# Compare against false explicitly: an entry named "0" is falsy and
	# would otherwise end the scan early.
	while( ( $entry = readdir( $mydir ) ) !== false ) {
		if( $entry == '.' || $entry == '..' ) {
			continue;
		}
		if( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			echo importPage( $prefix . $m[1] );
		} elseif( is_dir( "$dir/$entry" ) ) {
			# Carry the accumulated prefix so titles nested more than one
			# level deep still map back to the right file.
			importPageDirectory( "$dir/$entry", "$prefix$entry/" );
		} else {
			echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
		}
	}
	closedir( $mydir );
}
69
70
71 # ------------------------------------------------------------------------------
72
73 /* fetch_ functions
74 Grab a given item from the database
75 */
/**
 * Fetch the stored data for a UseMod user. NOT IMPLEMENTED.
 *
 * Always terminates the script. The earlier draft body that followed the
 * die() could never run; it also read an undefined $title and called
 * splitHash() without a separator, so it has been removed.
 *
 * @param int $uid UseMod user id number
 */
function fetchUser( $uid )
{
	die ("fetchUser not implemented" );
}
90
/**
 * Map a page title to its relative path inside the page/keep trees.
 *
 * Titles whose first character is an ASCII capital letter go into the
 * directory named after that letter; everything else goes into "other".
 *
 * @param string $title Page title
 * @return string "<bucket>/<title>" with no file extension
 */
function useModFilename( $title ) {
	$initial = substr( $title, 0, 1 );
	$bucket = preg_match( '/[A-Z]/', $initial ) ? $initial : "other";
	return $bucket . "/" . $title;
}
98
/**
 * Load the current revision of a page from its ".db" file.
 *
 * The file is split in three nested levels: the page hash ($FS1), its
 * "text_default" section ($FS2) and that section's "data" ($FS3).
 * Terminates the script if the file does not exist.
 *
 * @param string $title Page title
 * @return object Properties: text, summary, minor, ts, username, host
 */
function fetchPage( $title )
{
	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/page/" . useModFilename( $title ) . ".db";
	if( file_exists( $fname ) == false ) {
		die( "Couldn't open file '$fname' for page '$title'.\n" );
	}

	$raw = file_get_contents( $fname );
	$page = splitHash( $FS1, $raw );
	$section = splitHash( $FS2, $page["text_default"] );
	$text = splitHash( $FS3, $section["data"] );

	$fields = array(
		"text" => $text["text"],
		"summary" => $text["summary"],
		"minor" => $text["minor"],
		"ts" => $section["ts"],
		"username" => $section["username"],
		"host" => $section["host"],
	);
	return array2object( $fields );
}
116
/**
 * Load the kept (old) revisions of a page from its ".kp" file.
 *
 * Revisions are separated by $FS1; each one is a $FS2 hash whose "data"
 * entry is itself a $FS3 hash. A revision is kept only when it has
 * non-empty text, a non-empty "minor" field and a positive timestamp;
 * anything else is reported and dropped.
 *
 * @param string $title Page title
 * @return array List of objects with text, summary, minor, ts, username
 *               and host properties; empty if there is no ".kp" file
 */
function fetchKeptPages( $title )
{
	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory, $wgTimezoneCorrection;

	$fname = $wgRootDirectory . "/keep/" . useModFilename( $title ) . ".kp";
	if( !file_exists( $fname ) ) {
		return array();
	}

	$chunks = explode( $FS1, file_get_contents( $fname ) );
	array_shift( $chunks ); # Whatever precedes the first separator is junk

	$revisions = array();
	foreach( $chunks as $chunk ) {
		$section = splitHash( $FS2, $chunk );
		$text = splitHash( $FS3, $section["data"] );

		if ( !$text["text"] || $text["minor"] == "" || !( $section["ts"]*1 > 0 ) ) {
			echo "-- skipped a bad old revision\n";
			continue;
		}

		$revisions[] = array2object( array(
			"text" => $text["text"],
			"summary" => $text["summary"],
			"minor" => $text["minor"],
			"ts" => $section["ts"],
			"username" => $section["username"],
			"host" => $section["host"],
		) );
	}
	return $revisions;
}
141
/**
 * Turn a flat "key SEP value SEP key SEP value ..." string into an
 * associative array.
 *
 * Pieces are consumed in pairs. A trailing key with no value is dropped,
 * and a repeated key keeps its last value.
 *
 * @param string $sep Separator between pieces
 * @param string $str Serialized hash
 * @return array Key => value map
 */
function splitHash ( $sep , $str ) {
	$pieces = explode( $sep, $str );
	$total = count( $pieces );
	$hash = array();
	for ( $k = 0; $k + 1 < $total; $k += 2 ) {
		$hash[$pieces[$k]] = $pieces[$k + 1];
	}
	return $hash;
}
150
151
152 /* import_ functions
153 Take a fetched item and produce SQL
154 */
155
/**
 * importUser -- NOT YET IMPLEMENTED.
 *
 * $uid is the UseMod user id number.
 * The new ones will be assigned arbitrarily and are for internal use only.
 *
 * This is delayed since public dumps don't include the user directory.
 * The function always terminates the script. The draft body that used to
 * follow the die() was unreachable and could not have worked: it mixed
 * object and array access on the same record and called an undefined
 * md5hash().
 *
 * @param int $uid UseMod user id number
 */
function importUser( $uid )
{
	die("importUser NYI");
}
187
/**
 * Resolve the author of a revision to a (user id, SQL-escaped name) pair.
 *
 * $usercache maps user name => user id, so the lookup has to test the
 * array KEY. The previous in_array() call searched the values instead,
 * which never matches a name and could read a missing key.
 *
 * @param string $name User name recorded for the revision; may be empty
 * @param string $host Host recorded for the revision, used when there is
 *                     no user name
 * @return array array( user id or 0, name/host passed through wfStrencode )
 */
function checkUserCache( $name, $host )
{
	global $usercache;

	if( $name ) {
		# Unknown names (e.g. user accounts were not imported) get id 0
		$userid = isset( $usercache[$name] ) ? $usercache[$name] : 0;
		$username = wfStrencode( $name );
	} else {
		$userid = 0;
		$username = wfStrencode( $host );
	}
	return array( $userid, $username );
}
206
/**
 * Produce the SQL that imports one page and its history.
 *
 * Emits one INSERT into `cur` for the current revision and, if the page
 * has kept revisions, one multi-row INSERT into `old`. Every text value
 * is recoded to UTF-8 and then escaped exactly once.
 *
 * The author name is taken from the raw revision fields rather than from
 * checkUserCache()'s second return value: that value is already escaped,
 * so recoding and escaping it again doubled every backslash and quote.
 *
 * @param string $title Page title as stored by UseMod
 * @return string SQL statements
 */
function importPage( $title )
{
	global $conversiontime;

	echo "\n-- Importing page $title\n";
	$page = fetchPage( $title );

	$newtitle = wfStrencode( recodeText( $title ) );
	$namespace = 0;

	# Current revision:
	$text = wfStrencode( recodeText( $page->text ) );
	$comment = wfStrencode( recodeText( $page->summary ) );
	$minor = ($page->minor ? 1 : 0);
	list( $userid ) = checkUserCache( $page->username, $page->host );
	# Same name-or-host choice as checkUserCache(), but on the raw value
	$username = wfStrencode( recodeText( $page->username ? $page->username : $page->host ) );
	$timestamp = wfUnix2Timestamp( $page->ts );
	$redirect = ( preg_match( '/^#REDIRECT/', $page->text ) ? 1 : 0 );
	$random = mt_rand() / mt_getrandmax();
	$inverse = wfInvertTimestamp( $timestamp );
	$sql = "
INSERT
INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,inverse_timestamp,cur_touched,cur_minor_edit,cur_is_redirect,cur_random) VALUES
($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse','$conversiontime',$minor,$redirect,$random);\n";

	# History
	$revisions = fetchKeptPages( $title );
	if(count( $revisions ) == 0 ) {
		return $sql;
	}

	$any = false;
	$sql .= "INSERT
INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,inverse_timestamp,old_minor_edit) VALUES\n";
	foreach( $revisions as $rev ) {
		$text = wfStrencode( recodeText( $rev->text ) );
		$minor = ($rev->minor ? 1 : 0);
		list( $userid ) = checkUserCache( $rev->username, $rev->host );
		$username = wfStrencode( recodeText( $rev->username ? $rev->username : $rev->host ) );
		$timestamp = wfUnix2Timestamp( $rev->ts );
		$inverse = wfInvertTimestamp( $timestamp );
		$comment = wfStrencode( recodeText( $rev->summary ) );

		# Comma-separate the value tuples of the multi-row INSERT
		if($any) $sql .= ",";
		$sql .= "\n\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse',$minor)";
		$any = true;
	}
	$sql .= ";\n\n";
	return $sql;
}
258
259 # Whee!
/**
 * Convert text from the source wiki into what gets stored.
 *
 * CRLF line endings become LF, the text is converted from
 * $wgImportEncoding to UTF-8, and numeric character references are
 * expanded by wfMungeToUtf8().
 *
 * NOTE(review): iconv()'s result is not checked here -- confirm how it
 * behaves on bytes that are invalid in the source encoding.
 *
 * @param string $string Text in the source wiki's encoding
 * @return string Recoded text
 */
function recodeText( $string ) {
	global $wgImportEncoding;
	$unixEol = str_replace( "\r\n", "\n", $string );
	$utf8 = iconv( $wgImportEncoding, "UTF-8", $unixEol );
	return wfMungeToUtf8( $utf8 ); # Any old &#1234; stuff
}
268
/**
 * Encode a Unicode code point as a UTF-8 byte sequence.
 *
 * One- to four-byte sequences are produced for code points up to
 * U+10FFFF, the last valid one. The four-byte branch used to stop at
 * 0x100000, so U+100000..U+10FFFF were wrongly left as entities.
 *
 * @param int $codepoint Code point number
 * @return string UTF-8 bytes, or the decimal reference "&#N;" unchanged
 *                when the value is above U+10FFFF
 */
function wfUtf8Sequence($codepoint) {
	if($codepoint < 0x80) return chr($codepoint);
	if($codepoint < 0x800) return chr($codepoint >> 6 & 0x3f | 0xc0) .
	                              chr($codepoint & 0x3f | 0x80);
	if($codepoint < 0x10000) return chr($codepoint >> 12 & 0x0f | 0xe0) .
	                                chr($codepoint >> 6 & 0x3f | 0x80) .
	                                chr($codepoint & 0x3f | 0x80);
	if($codepoint < 0x110000) return chr($codepoint >> 18 & 0x07 | 0xf0) .
	                                 chr($codepoint >> 12 & 0x3f | 0x80) .
	                                 chr($codepoint >> 6 & 0x3f | 0x80) .
	                                 chr($codepoint & 0x3f | 0x80);
	# Not a Unicode code point; leave it as a numeric reference
	return "&#$codepoint;";
}
283
/**
 * Replace decimal (&#1234;) and hexadecimal (&#x4d2;) character
 * references with the UTF-8 sequence from wfUtf8Sequence().
 *
 * Callbacks are used instead of the /e (eval) modifier, which no longer
 * exists in PHP 7+ and made preg_replace() fail. Under /e the digits were
 * also evaluated as PHP source, so a leading zero ("&#0160;") was read as
 * octal. Here decimal is always decimal.
 *
 * References with too many significant digits to be a Unicode code point
 * are left untouched.
 *
 * @param string $string Text possibly containing numeric references
 * @return string Text with the references expanded
 */
function wfMungeToUtf8($string) {
	$string = preg_replace_callback( '/&#([0-9]+);/', function ( $m ) {
		$digits = ltrim( $m[1], '0' );
		if( strlen( $digits ) > 7 ) {
			return $m[0]; # Far beyond U+10FFFF (1114111)
		}
		return wfUtf8Sequence( (int)$digits );
	}, $string );
	$string = preg_replace_callback( '/&#x([0-9a-f]+);/i', function ( $m ) {
		$digits = ltrim( $m[1], '0' );
		if( strlen( $digits ) > 6 ) {
			return $m[0]; # Far beyond U+10FFFF
		}
		return wfUtf8Sequence( hexdec( $digits ) );
	}, $string );
	# Should also do named entities here
	return $string;
}
290
/**
 * Escape a string for use inside a quoted MySQL string literal.
 *
 * mysql_escape_string() was removed in PHP 7, so the same escapes are
 * applied directly: backslash, NUL, newline, carriage return, single
 * quote, double quote and Ctrl-Z. strtr() makes a single pass, so the
 * backslashes it inserts are never escaped again.
 *
 * @param string $string Raw text
 * @return string Escaped text (without surrounding quotes)
 */
function wfStrencode( $string ) {
	static $escapes = array(
		"\\"   => "\\\\",
		"\0"   => "\\0",
		"\n"   => "\\n",
		"\r"   => "\\r",
		"'"    => "\\'",
		'"'    => '\\"',
		"\x1a" => "\\Z",
	);
	return strtr( (string)$string, $escapes );
}
294
/**
 * Format a Unix time as a 14-digit "YYYYMMDDHHMMSS" timestamp in GMT.
 *
 * @param int $unixtime Seconds since the Unix epoch
 * @return string Timestamp string
 */
function wfUnix2Timestamp( $unixtime ) {
	$formatted = gmdate( "YmdHis", $unixtime );
	return $formatted;
}
298
/**
 * Convert a 14-digit "YYYYMMDDHHMMSS" GMT timestamp to a Unix time.
 *
 * @param string $ts Timestamp string
 * @return int Seconds since the Unix epoch
 */
function wfTimestamp2Unix( $ts )
{
	$year   = (int)substr( $ts, 0, 4 );
	$month  = (int)substr( $ts, 4, 2 );
	$day    = (int)substr( $ts, 6, 2 );
	$hour   = (int)substr( $ts, 8, 2 );
	$minute = (int)substr( $ts, 10, 2 );
	$second = (int)substr( $ts, 12, 2 );

	return gmmktime( $hour, $minute, $second, $month, $day, $year );
}
306
/**
 * Current time as a 14-digit "YYYYMMDDHHMMSS" timestamp in GMT.
 *
 * @return string Timestamp string
 */
function wfTimestampNow() {
	$now = gmdate( "YmdHis" );
	return $now;
}
311
/**
 * Replace every digit d of a timestamp with 9-d, so that ascending order
 * on the result is descending order on the original.
 *
 * Sorting hack for MySQL 3, which doesn't use index sorts for DESC.
 *
 * @param string $ts Timestamp string
 * @return string Inverted timestamp
 */
function wfInvertTimestamp( $ts ) {
	$digits = "0123456789";
	$mirrored = "9876543210";
	return strtr( $ts, $digits, $mirrored );
}
320
/**
 * Seed the Mersenne Twister generator once per run.
 *
 * The seed is the low 31 bits of the last 8 hex digits of
 * md5(microtime()). $wgRandomSeeded is now declared global: it used to be
 * assigned as a local and was discarded on return, so the flag never
 * recorded anything. With the flag working, repeated calls do not reseed.
 */
function wfSeedRandom()
{
	global $wgRandomSeeded;

	if( $wgRandomSeeded ) {
		return;
	}
	$seed = hexdec(substr(md5(microtime()),-8)) & 0x7fffffff;
	mt_srand( $seed );
	$wgRandomSeeded = true;
}
327
/**
 * Copy an associative array into a plain object, one property per key.
 *
 * The object starts out as an empty stdClass. The previous "(object)0"
 * cast produced an object that already carried a stray "scalar" property
 * holding 0.
 *
 * @param array $arr Key => value map
 * @return object stdClass with one property per array key
 */
function array2object( $arr ) {
	$o = new stdClass;
	foreach( $arr as $x => $y ) {
		$o->$x = $y;
	}
	return $o;
}
335
336 ?>