Some bug fixes, munge more charset stuff
[lhc/web/wiklou.git] / maintenance / importUseModWiki.php
1 <?php
2
3 /*
4 Import data from a UseModWiki into a PediaWiki wiki
5 2003-02-09 Brion VIBBER <brion@pobox.com>
6 Based loosely on Magnus's code from 2001-2002
7
8 Updated limited version to get something working temporarily
9 2003-10-09
10 Be sure to run the link & index rebuilding scripts!
11
12 Some more munging for charsets etc
13 2003-11-28
14
15 */
16
/* Site-specific settings -- set these correctly before running! */
$wgImportEncoding = 'CP1252'; # Source charset; everything is converted to UTF-8
$wgRootDirectory = '/home/usemod/wiki-fi/lib-http/db/wiki';

/* Shared globals */
$wgFieldSeparator = "\xb3"; # Some wikis may use a different character
$FS = $wgFieldSeparator;
$FS1 = "{$FS}1";
$FS2 = "{$FS}2";
$FS3 = "{$FS}3";

$usercache = array();
$conversiontime = wfTimestampNow(); # Every converted page is marked with this timestamp

# Entry point: seed the random number generator, then emit SQL for all pages.
wfSeedRandom();
importPages();
33
34 # ------------------------------------------------------------------------------
35
/**
 * Import every per-letter page directory that exists under
 * "$wgRootDirectory/page" (A through Z, plus "other").
 */
function importPages()
{
	global $wgRootDirectory;

	$buckets = array_merge( range( 'A', 'Z' ), array( 'other' ) );
	foreach( $buckets as $bucket ) {
		$path = "$wgRootDirectory/page/$bucket";
		if( !is_dir( $path ) ) {
			continue;
		}
		importPageDirectory( $path );
	}
}
50
/**
 * Import every "*.db" page file found in $dir, recursing into any
 * subdirectories.
 *
 * Whatever importPage() returns for each page is echoed; progress and skip
 * notices are echoed as "--" lines.
 *
 * @param string $dir    Directory to scan
 * @param string $prefix Title prefix for the pages in $dir (e.g. "Parent/")
 */
function importPageDirectory( $dir, $prefix = "" )
{
	echo "\n-- Checking page directory $dir\n";
	$mydir = opendir( $dir );
	if( $mydir === false ) {
		echo "-- Couldn't open directory '$dir'. Skipping.\n";
		return;
	}

	# Compare against false explicitly: an entry named "0" is falsy and would
	# otherwise end the scan early.
	while( ( $entry = readdir( $mydir ) ) !== false ) {
		if( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			echo importPage( $prefix . $m[1] );
		} elseif( is_dir( "$dir/$entry" ) ) {
			if( $entry != '.' && $entry != '..' ) {
				# Keep the inherited prefix so nested directories still map
				# to the full title path.
				importPageDirectory( "$dir/$entry", "$prefix$entry/" );
			}
		} else {
			echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
		}
	}
	closedir( $mydir );
}
69
70
71 # ------------------------------------------------------------------------------
72
73 /* fetch_ functions
74 Grab a given item from the database
75 */
/**
 * Fetch a UseMod user record.
 *
 * Not implemented: calling this always aborts the script.
 *
 * @param int $uid UseMod user id number
 */
function fetchUser( $uid )
{
	die ("fetchUser not implemented" );
}
90
/**
 * Map a page title to its path relative to the page/keep directories:
 * "<first letter>/<title>" when the title starts with A-Z, otherwise
 * "other/<title>".
 *
 * @param string $title Page title
 * @return string Relative path, without file extension
 */
function useModFilename( $title ) {
	$initial = substr( $title, 0, 1 );
	$bucket = preg_match( '/[A-Z]/', $initial ) ? $initial : "other";
	return $bucket . "/" . $title;
}
98
/**
 * Load the current revision of a page from its ".db" file.
 *
 * The file is split into nested key/value hashes with the $FS1, $FS2 and
 * $FS3 separators: page -> "text_default" section -> "data" text record.
 * Aborts the script if the page file does not exist.
 *
 * @param string $title Page title
 * @return object With properties text, summary, minor, ts, username, host
 */
function fetchPage( $title )
{
	global $FS1, $FS2, $FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/page/" . useModFilename( $title ) . ".db";
	if( !file_exists( $fname ) ) {
		die( "Couldn't open file '$fname' for page '$title'.\n" );
	}

	$contents = file_get_contents( $fname );
	$pageHash = splitHash( $FS1, $contents );
	$sectionHash = splitHash( $FS2, $pageHash["text_default"] );
	$textHash = splitHash( $FS3, $sectionHash["data"] );

	$fields = array(
		"text" => $textHash["text"],
		"summary" => $textHash["summary"],
		"minor" => $textHash["minor"],
		"ts" => $sectionHash["ts"],
		"username" => $sectionHash["username"],
		"host" => $sectionHash["host"]
	);
	return array2object( $fields );
}
116
/**
 * Load the old ("kept") revisions of a page from its ".kp" file.
 *
 * A revision is skipped, with a "--" notice, when it has no text, no
 * "minor" flag, or no positive timestamp. Malformed records with missing
 * fields are skipped the same way instead of raising warnings or errors.
 *
 * @param string $title Page title
 * @return array List of objects with properties text, summary, minor, ts,
 *               username, host; empty if there is no kept file
 */
function fetchKeptPages( $title )
{
	global $FS1, $FS2, $FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/keep/" . useModFilename( $title ) . ".kp";
	if( !file_exists( $fname ) ) return array();

	$raw = file_get_contents( $fname );
	if( $raw === false ) return array();

	$keptlist = explode( $FS1, $raw );
	array_shift( $keptlist ); # Drop the junk at beginning of file

	$revisions = array();
	foreach( $keptlist as $rev ) {
		$section = splitHash( $FS2, $rev );
		$text = isset( $section["data"] ) ? splitHash( $FS3, $section["data"] ) : array();

		# The float cast yields 0 for a non-numeric timestamp, whereas
		# multiplying such a string raises a TypeError on PHP 8.
		$usable = isset( $text["text"], $text["minor"], $section["ts"] )
			&& $text["text"]
			&& $text["minor"] != ""
			&& (float)$section["ts"] > 0;

		if ( $usable ) {
			array_push( $revisions, array2object( array (
				"text" => $text["text"],
				"summary" => isset( $text["summary"] ) ? $text["summary"] : null,
				"minor" => $text["minor"],
				"ts" => $section["ts"],
				"username" => isset( $section["username"] ) ? $section["username"] : null,
				"host" => isset( $section["host"] ) ? $section["host"] : null
			) ) );
		} else {
			echo "-- skipped a bad old revision\n";
		}
	}
	return $revisions;
}
141
/**
 * Split a separator-delimited string of alternating keys and values into an
 * associative array. A trailing key without a value is dropped; a repeated
 * key keeps its last value.
 *
 * @param string $sep Separator
 * @param string $str String of the form key SEP value SEP key SEP value ...
 * @return array
 */
function splitHash ( $sep , $str ) {
	$pieces = explode( $sep, $str );
	$total = count( $pieces );
	$hash = array();
	# Walk the pieces two at a time: $k - 1 is the key, $k its value.
	for( $k = 1; $k < $total; $k += 2 ) {
		$hash[$pieces[$k - 1]] = $pieces[$k];
	}
	return $hash;
}
150
151
152 /* import_ functions
153 Take a fetched item and produce SQL
154 */
155
/**
 * Produce SQL for one UseMod user account.
 *
 * Not yet implemented: calling this always aborts the script. User import
 * is delayed since public dumps don't include the user directory.
 *
 * @param int $uid UseMod user id number. The new ids would be assigned
 *                 arbitrarily and are for internal use only.
 */
function importUser( $uid )
{
	die("importUser NYI");
}
187
/**
 * Resolve the user id and SQL-escaped user text for an edit.
 *
 * @param string $name User name recorded on the edit; may be empty
 * @param string $host Host recorded on the edit, used when $name is empty
 * @return array array( user id, escaped user text ); the id is 0 when the
 *               name is not in $usercache or the edit has no name
 */
function checkUserCache( $name, $host )
{
	global $usercache;

	if( $name ) {
		# $usercache is indexed by user name, so test the key; in_array()
		# would search the stored values instead.
		# Id stays 0 if we haven't imported user accounts.
		$userid = isset( $usercache[$name] ) ? $usercache[$name] : 0;
		$username = wfStrencode( $name );
	} else {
		$userid = 0;
		$username = wfStrencode( $host );
	}
	return array( $userid, $username );
}
206
/**
 * Build the SQL that imports one page: an INSERT into `cur` for the current
 * revision, followed by a multi-row INSERT into `old` when the page has
 * kept revisions.
 *
 * All text is recoded to UTF-8 with recodeText() and then escaped with
 * wfStrencode(). A progress notice is echoed.
 *
 * @param string $title UseMod page title
 * @return string SQL statements
 */
function importPage( $title )
{
	global $conversiontime;

	echo "\n-- Importing page $title\n";
	$page = fetchPage( $title );

	$newtitle = wfStrencode( recodeText( $title ) );
	$namespace = 0;

	# Current revision:
	$text = wfStrencode( recodeText( $page->text ) );
	$comment = wfStrencode( recodeText( $page->summary ) );
	$minor = ($page->minor ? 1 : 0);
	# Recode name and host before checkUserCache() escapes them, so the user
	# text is converted to UTF-8 and escaped exactly once.
	list( $userid, $username ) = checkUserCache(
		recodeText( $page->username ), recodeText( $page->host ) );
	$timestamp = wfUnix2Timestamp( $page->ts );
	$redirect = ( preg_match( '/^#REDIRECT/', $page->text ) ? 1 : 0 );
	$random = mt_rand() / mt_getrandmax();
	$inverse = wfInvertTimestamp( $timestamp );
	$sql = "
INSERT
INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,inverse_timestamp,cur_touched,cur_minor_edit,cur_is_redirect,cur_random) VALUES
($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse','$conversiontime',$minor,$redirect,$random);\n";

	# History
	$revisions = fetchKeptPages( $title );
	if(count( $revisions ) == 0 ) {
		return $sql;
	}

	$any = false;
	$sql .= "INSERT
INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,inverse_timestamp,old_minor_edit) VALUES\n";
	foreach( $revisions as $rev ) {
		$text = wfStrencode( recodeText( $rev->text ) );
		$minor = ($rev->minor ? 1 : 0);
		# Same order as for the current revision: recode first, escape once.
		list( $userid, $username ) = checkUserCache(
			recodeText( $rev->username ), recodeText( $rev->host ) );
		$timestamp = wfUnix2Timestamp( $rev->ts );
		$inverse = wfInvertTimestamp( $timestamp );
		$comment = wfStrencode( recodeText( $rev->summary ) );

		if($any) $sql .= ",";
		$sql .= "\n\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse',$minor)";
		$any = true;
	}
	$sql .= ";\n\n";
	return $sql;
}
257
/**
 * Convert imported text to UTF-8: normalise CRLF line endings to LF, convert
 * from $wgImportEncoding with iconv(), then expand numeric character
 * references with wfMungeToUtf8().
 *
 * NOTE(review): iconv() returns false for input it cannot convert, and that
 * result is passed straight on -- confirm whether this needs handling.
 *
 * @param string $string Text in the source wiki's encoding
 * @return string
 */
function recodeText( $string ) {
	global $wgImportEncoding;

	$unixEol = str_replace( "\r\n", "\n", $string );
	# Any old &#1234; stuff is expanded after the charset conversion
	return wfMungeToUtf8( iconv( $wgImportEncoding, "UTF-8", $unixEol ) );
}
267
/**
 * Encode a Unicode code point as a UTF-8 byte sequence.
 *
 * Code points that have no valid UTF-8 form (surrogates U+D800..U+DFFF and
 * anything above U+10FFFF) are returned as a decimal numeric character
 * reference, "&#N;", instead.
 *
 * @param int $codepoint
 * @return string
 */
function wfUtf8Sequence($codepoint) {
	if($codepoint > 0x10ffff || ($codepoint >= 0xd800 && $codepoint <= 0xdfff)) {
		return "&#$codepoint;";
	}
	if($codepoint < 0x80) {
		return chr($codepoint);
	}
	if($codepoint < 0x800) {
		return chr(($codepoint >> 6) | 0xc0) .
			chr(($codepoint & 0x3f) | 0x80);
	}
	if($codepoint < 0x10000) {
		return chr(($codepoint >> 12) | 0xe0) .
			chr((($codepoint >> 6) & 0x3f) | 0x80) .
			chr(($codepoint & 0x3f) | 0x80);
	}
	# Supplementary planes, U+10000..U+10FFFF: four bytes
	return chr(($codepoint >> 18) | 0xf0) .
		chr((($codepoint >> 12) & 0x3f) | 0x80) .
		chr((($codepoint >> 6) & 0x3f) | 0x80) .
		chr(($codepoint & 0x3f) | 0x80);
}
282
/**
 * Replace decimal (&#1234;) and hexadecimal (&#x4d2;) numeric character
 * references in a string with the UTF-8 sequence from wfUtf8Sequence().
 *
 * Uses callbacks rather than the /e pattern modifier, which evaluated the
 * matched text as PHP code (so "&#065;" was read as an octal literal) and
 * is no longer available in PHP 7 and later.
 *
 * @param string $string
 * @return string
 */
function wfMungeToUtf8($string) {
	$string = preg_replace_callback( '/&#([0-9]+);/', 'wfMungeDecimalEntity', $string );
	$string = preg_replace_callback( '/&#x([0-9a-f]+);/i', 'wfMungeHexEntity', $string );
	# Should also do named entities here
	return $string;
}

/* Callback for wfMungeToUtf8(): expand one decimal reference; $m[1] holds the digits. */
function wfMungeDecimalEntity( $m ) {
	# More than 7 significant digits cannot be a code point (the maximum is
	# 1114111) and could overflow intval(); leave the text untouched.
	if( strlen( ltrim( $m[1], '0' ) ) > 7 ) return $m[0];
	return wfUtf8Sequence( intval( $m[1], 10 ) );
}

/* Callback for wfMungeToUtf8(): expand one hexadecimal reference; $m[1] holds the digits. */
function wfMungeHexEntity( $m ) {
	# More than 6 significant hex digits is past 0x10FFFF; leave untouched.
	if( strlen( ltrim( $m[1], '0' ) ) > 6 ) return $m[0];
	return wfUtf8Sequence( intval( $m[1], 16 ) );
}
289
/**
 * Escape a string for use inside a quoted SQL string literal.
 *
 * Applies the same escapes as the old mysql_escape_string(), which is not
 * available in PHP 7 and later: NUL, newline, carriage return, backslash,
 * single quote, double quote and Ctrl-Z are backslash-escaped. No database
 * connection is needed.
 *
 * @param string $string
 * @return string
 */
function wfStrencode( $string ) {
	# strtr() with an array makes a single pass over the input, so the
	# backslashes it inserts are never escaped a second time.
	return strtr( $string, array(
		"\x00" => '\\0',
		"\n"   => '\\n',
		"\r"   => '\\r',
		"\\"   => "\\\\",
		"'"    => "\\'",
		'"'    => '\\"',
		"\x1a" => '\\Z'
	) );
}
293
/**
 * Format a Unix timestamp as a 14-digit "YYYYMMDDHHMMSS" string in GMT.
 *
 * @param int $unixtime Seconds since the Unix epoch
 * @return string
 */
function wfUnix2Timestamp( $unixtime ) {
	$stamp = gmdate( "YmdHis", $unixtime );
	return $stamp;
}
297
/**
 * Convert a 14-digit "YYYYMMDDHHMMSS" GMT timestamp to a Unix timestamp.
 *
 * @param string $ts
 * @return int
 */
function wfTimestamp2Unix( $ts )
{
	$year   = (int)substr( $ts, 0, 4 );
	$month  = (int)substr( $ts, 4, 2 );
	$day    = (int)substr( $ts, 6, 2 );
	$hour   = (int)substr( $ts, 8, 2 );
	$minute = (int)substr( $ts, 10, 2 );
	$second = (int)substr( $ts, 12, 2 );

	return gmmktime( $hour, $minute, $second, $month, $day, $year );
}
305
/**
 * Current time as a 14-digit "YYYYMMDDHHMMSS" string in GMT.
 *
 * @return string
 */
function wfTimestampNow() {
	return gmdate( "YmdHis", time() );
}
310
/**
 * Replace every digit d of a timestamp string with 9 - d, so that ascending
 * order on the result is descending order on the original.
 *
 * Sorting hack for MySQL 3, which doesn't use index sorts for DESC.
 *
 * @param string $ts
 * @return string
 */
function wfInvertTimestamp( $ts ) {
	$digits = "0123456789";
	return strtr( $ts, $digits, strrev( $digits ) );
}
319
/**
 * Seed the Mersenne Twister generator from the current microtime and record
 * that in the global $wgRandomSeeded flag.
 */
function wfSeedRandom()
{
	# Without this declaration the flag below would only be a local variable.
	global $wgRandomSeeded;

	# Low 31 bits of the last 8 hex digits of md5(microtime())
	$seed = hexdec(substr(md5(microtime()),-8)) & 0x7fffffff;
	mt_srand( $seed );
	$wgRandomSeeded = true;
}
326
/**
 * Build a plain object whose properties are the key/value pairs of $arr.
 *
 * @param array $arr Property name => value
 * @return object stdClass holding exactly the given properties
 */
function array2object( $arr ) {
	# Start from an empty object: casting a scalar, as in (object)0, would add
	# a stray "scalar" property.
	$o = new stdClass;
	foreach( $arr as $x => $y ) {
		$o->$x = $y;
	}
	return $o;
}
334
335 ?>