Phpdoc comments and place holder. Part of the subpackage "maintenance", archives...
[lhc/web/wiklou.git] / maintenance / importUseModWiki.php
1 <?php
2
3 /**
4 * Import data from a UseModWiki into a MediaWiki wiki
5 * 2003-02-09 Brion VIBBER <brion@pobox.com>
6 * Based loosely on Magnus's code from 2001-2002
7 *
8 * Updated limited version to get something working temporarily
9 * 2003-10-09
10 * Be sure to run the link & index rebuilding scripts!
11 *
12 * Some more munging for charsets etc
13 * 2003-11-28
14 *
15 * @todo document
16 * @package MediaWiki
17 * @subpackage Maintenance
18 */
19
/**
 * Site-specific settings -- set these correctly before running!
 */
$wgImportEncoding = "CP1252"; # Charset of the source wiki; everything is converted to UTF-8
$wgRootDirectory = "/home/usemod/wiki-ia/lib-http/db/wiki";

/*
 * Globals shared by the fetch and import functions below.
 */
$wgFieldSeparator = "\xb3"; # Some wikis may use a different char
$FS = $wgFieldSeparator;
# Three levels of separator, each one the base separator plus a digit.
$FS1 = "{$FS}1";
$FS2 = "{$FS}2";
$FS3 = "{$FS}3";

$usercache = array();
$conversiontime = wfTimestampNow(); # Written to cur_touched for every imported page

# Run the import: the SQL and "--" progress comments are echoed as it goes.
wfSeedRandom();
importPages();
36
37 # ------------------------------------------------------------------------------
38
/**
 * Visit the page directories under $wgRootDirectory/page -- one per
 * capital letter 'A'..'Z' followed by 'other' -- and import each one
 * that actually exists on disk.
 */
function importPages()
{
	global $wgRootDirectory;

	$buckets = array_merge( range( 'A', 'Z' ), array( 'other' ) );
	foreach( $buckets as $bucket ) {
		$path = "$wgRootDirectory/page/$bucket";
		if( !is_dir( $path ) ) {
			continue;
		}
		importPageDirectory( $path );
	}
}
53
/**
 * Import every "*.db" page file found in a page directory, descending
 * into subdirectories.
 *
 * Whatever importPage() returns for each page is echoed, interleaved with
 * "--" progress lines. Entries that are neither "*.db" names nor
 * directories are reported and skipped.
 *
 * @param string $dir    Directory to scan.
 * @param string $prefix Title prefix for pages found in $dir. It grows by
 *                       "<subdirectory>/" on each level of recursion.
 */
function importPageDirectory( $dir, $prefix = "" )
{
	echo "\n-- Checking page directory $dir\n";

	$handle = opendir( $dir );
	if( $handle === false ) {
		echo "-- Couldn't open directory '$dir'. Skipping.\n";
		return;
	}

	# Compare against false explicitly: an entry literally named "0" is
	# falsy and would otherwise end the loop before the directory is done.
	while( ( $entry = readdir( $handle ) ) !== false ) {
		if( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			echo importPage( $prefix . $m[1] );
		} elseif( is_dir( "$dir/$entry" ) ) {
			if( $entry != '.' && $entry != '..' ) {
				# Keep the prefix accumulated so far so that pages nested more
				# than one level deep get their full title.
				importPageDirectory( "$dir/$entry", "$prefix$entry/" );
			}
		} else {
			echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
		}
	}

	closedir( $handle );
}
72
73
74 # ------------------------------------------------------------------------------
75
76 /* fetch_ functions
77 Grab a given item from the database
78 */
/**
 * Fetch a UseMod user record.
 *
 * Not implemented: calling this always terminates the script. The
 * unreachable draft that used to follow the die() referred to an
 * undefined $title and called splitHash() without a separator, so it
 * has been dropped rather than left as misleading dead code.
 *
 * @param mixed $uid UseMod user id (currently unused).
 */
function fetchUser( $uid )
{
	die ("fetchUser not implemented" );
}
93
/**
 * Map a page title to its relative path inside the page/keep trees.
 *
 * Titles whose first byte is a capital ASCII letter live in a directory
 * named after that letter; everything else lives under "other".
 *
 * @param string $title Page title.
 * @return string "<letter>/<title>" or "other/<title>".
 */
function useModFilename( $title ) {
	$bucket = preg_match( '/^[A-Z]/', $title ) ? $title[0] : 'other';
	return $bucket . '/' . $title;
}
101
/**
 * Load the current revision of a page from its ".db" file.
 *
 * The file is decoded in three nested levels: the whole file is split
 * on $FS1, its "text_default" entry on $FS2, and that section's "data"
 * entry on $FS3. The script dies if the file does not exist.
 *
 * @param string $title Page title.
 * @return object Object with text, summary, minor, ts, username and host.
 */
function fetchPage( $title )
{
	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory;

	$fname = "$wgRootDirectory/page/" . useModFilename( $title ) . ".db";
	if( file_exists( $fname ) ) {
		$record = splitHash( $FS1, file_get_contents( $fname ) );
		$section = splitHash( $FS2, $record["text_default"] );
		$fields = splitHash( $FS3, $section["data"] );

		$current = array(
			"text" => $fields["text"],
			"summary" => $fields["summary"],
			"minor" => $fields["minor"],
			"ts" => $section["ts"],
			"username" => $section["username"],
			"host" => $section["host"]
		);
		return array2object( $current );
	}

	die( "Couldn't open file '$fname' for page '$title'.\n" );
}
119
/**
 * Load the old revisions of a page from its ".kp" file.
 *
 * The file is cut into records on $FS1 (the chunk before the first
 * separator is discarded); each record is split on $FS2, and its "data"
 * entry on $FS3. A record is kept only if it has non-empty text, a
 * non-empty "minor" field and a positive timestamp; anything else is
 * reported and skipped.
 *
 * @param string $title Page title.
 * @return array List of revision objects (text, summary, minor, ts,
 *               username, host); empty if the file does not exist.
 */
function fetchKeptPages( $title )
{
	global $FS,$FS1,$FS2,$FS3, $wgRootDirectory, $wgTimezoneCorrection;

	$fname = "$wgRootDirectory/keep/" . useModFilename( $title ) . ".kp";
	if( !file_exists( $fname ) ) {
		return array();
	}

	$records = explode( $FS1, file_get_contents( $fname ) );
	array_shift( $records ); # Drop the junk at beginning of file

	$revisions = array();
	foreach( $records as $record ) {
		$section = splitHash( $FS2, $record );
		$fields = splitHash( $FS3, $section["data"] );

		$usable = $fields["text"] && $fields["minor"] != "" && ( $section["ts"]*1 > 0 );
		if( !$usable ) {
			echo "-- skipped a bad old revision\n";
			continue;
		}

		$revisions[] = array2object( array(
			"text" => $fields["text"],
			"summary" => $fields["summary"],
			"minor" => $fields["minor"],
			"ts" => $section["ts"],
			"username" => $section["username"],
			"host" => $section["host"]
		) );
	}
	return $revisions;
}
144
/**
 * Decode a flat "key SEP value SEP key SEP value ..." string into an
 * associative array.
 *
 * Pieces are consumed two at a time; a trailing piece without a partner
 * is ignored.
 *
 * @param string $sep Separator between pieces.
 * @param string $str Encoded string.
 * @return array Map of key => value.
 */
function splitHash ( $sep , $str ) {
	$pieces = explode( $sep, $str );
	$total = count( $pieces );
	$map = array();
	for( $k = 0; $k + 1 < $total; $k += 2 ) {
		$map[$pieces[$k]] = $pieces[$k + 1];
	}
	return $map;
}
153
154
155 /* import_ functions
156 Take a fetched item and produce SQL
157 */
158
/**
 * Produce SQL for one UseMod user account.
 *
 * Not implemented: calling this always terminates the script. User import
 * is delayed because public dumps don't include the user directory.
 *
 * The intended shape, for whoever picks this up: fetch the record with
 * fetchUser(), assign the next internal user id (new ids are arbitrary
 * and for internal use only), and emit an INSERT INTO user carrying the
 * name, a password hash and an options blob (cols, rows, rcdays,
 * timecorrection, hideminor). The earlier unreachable draft of that was
 * removed: it mixed object and array access on the fetched record and
 * called a hashing helper that does not exist in this file.
 *
 * @param mixed $uid The UseMod user id number (currently unused).
 */
function importUser( $uid )
{
	die("importUser NYI");
}
190
/**
 * Resolve the author of a revision to a (user id, user text) pair.
 *
 * Named authors are looked up in the global $usercache, which is keyed by
 * user name; names that are not in the cache (e.g. because user accounts
 * were not imported) get id 0. Revisions without a user name are
 * attributed to the host, also with id 0.
 *
 * @param string $name UseMod user name, may be empty.
 * @param string $host Host recorded for the revision.
 * @return array array( user id, user text passed through wfStrencode() ).
 */
function checkUserCache( $name, $host )
{
	global $usercache;

	if( $name ) {
		# The cache is indexed by name, so test the key. in_array() would
		# search the stored ids instead and never find the name.
		if( isset( $usercache[$name] ) ) {
			$userid = $usercache[$name];
		} else {
			# If we haven't imported user accounts
			$userid = 0;
		}
		$username = wfStrencode( $name );
	} else {
		$userid = 0;
		$username = wfStrencode( $host );
	}
	return array( $userid, $username );
}
209
/**
 * Build the SQL that imports one page: an INSERT into `cur` for the
 * current revision and, when kept revisions exist, one multi-row INSERT
 * into `old` for the history.
 *
 * A "-- Importing page" progress line is echoed as a side effect.
 *
 * @param string $title UseMod page title.
 * @return string SQL statements for the page.
 */
function importPage( $title )
{
	global $usercache;
	global $conversiontime;

	echo "\n-- Importing page $title\n";
	$page = fetchPage( $title );

	$newtitle = wfStrencode( recodeText( $title ) );
	$namespace = 0;

	# Current revision:
	$text = wfStrencode( recodeText( $page->text ) );
	$comment = wfStrencode( recodeText( $page->summary ) );
	$minor = ($page->minor ? 1 : 0);
	list( $userid, $username ) = importPageUser( $page->username, $page->host );
	$timestamp = wfUnix2Timestamp( $page->ts );
	$redirect = ( preg_match( '/^#REDIRECT/', $page->text ) ? 1 : 0 );
	$random = mt_rand() / mt_getrandmax();
	$inverse = wfInvertTimestamp( $timestamp );
	$sql = "
INSERT
INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,inverse_timestamp,cur_touched,cur_minor_edit,cur_is_redirect,cur_random) VALUES
($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse','$conversiontime',$minor,$redirect,$random);\n";

	# History
	$revisions = fetchKeptPages( $title );
	if(count( $revisions ) == 0 ) {
		return $sql;
	}

	$any = false;
	$sql .= "INSERT
INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,inverse_timestamp,old_minor_edit) VALUES\n";
	foreach( $revisions as $rev ) {
		$text = wfStrencode( recodeText( $rev->text ) );
		$minor = ($rev->minor ? 1 : 0);
		list( $userid, $username ) = importPageUser( $rev->username, $rev->host );
		$timestamp = wfUnix2Timestamp( $rev->ts );
		$inverse = wfInvertTimestamp( $timestamp );
		$comment = wfStrencode( recodeText( $rev->summary ) );

		if($any) $sql .= ",";
		$sql .= "\n\t($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp','$inverse',$minor)";
		$any = true;
	}
	$sql .= ";\n\n";
	return $sql;
}

/**
 * Resolve a revision's author to a user id and an SQL-ready user text.
 *
 * The id comes from checkUserCache(). The text is rebuilt here from the
 * raw name (or the host when there is no name) so that it is recoded
 * first and escaped exactly once. checkUserCache() returns an already
 * escaped name; recoding and escaping that again would double the
 * backslashes, and the entity decoding in recodeText() could bring back
 * characters after they had been escaped.
 *
 * @param string $name UseMod user name, may be empty.
 * @param string $host Host recorded for the revision.
 * @return array array( user id, recoded and escaped user text ).
 */
function importPageUser( $name, $host )
{
	list( $userid ) = checkUserCache( $name, $host );
	$rawname = $name ? $name : $host;
	return array( $userid, wfStrencode( recodeText( $rawname ) ) );
}
261
262 # Whee!
/**
 * Convert text from the source wiki into what gets imported: CRLF line
 * ends become LF, the bytes are converted from $wgImportEncoding to
 * UTF-8, and numeric character references (old &#1234; stuff) are
 * expanded by wfMungeToUtf8().
 *
 * @param string $string Text in the source wiki's encoding.
 * @return string Recoded text.
 */
function recodeText( $string ) {
	global $wgImportEncoding;

	# For currently latin-1 wikis
	$unixEnds = str_replace( "\r\n", "\n", $string );
	$utf8 = iconv( $wgImportEncoding, "UTF-8", $unixEnds );
	return wfMungeToUtf8( $utf8 );
}
271
/**
 * Encode a Unicode code point as a UTF-8 byte sequence.
 *
 * Code points that cannot be represented in UTF-8 -- the surrogate range
 * U+D800..U+DFFF and anything above U+10FFFF -- are returned as a decimal
 * numeric character reference instead.
 *
 * @param int $codepoint Unicode code point.
 * @return string UTF-8 bytes, or "&#N;" when the code point is not encodable.
 */
function wfUtf8Sequence($codepoint) {
	if($codepoint < 0x80) return chr($codepoint);
	if($codepoint < 0x800) return chr($codepoint >> 6 & 0x3f | 0xc0) .
		chr($codepoint & 0x3f | 0x80);
	if($codepoint < 0x10000) {
		# Surrogates are not characters; encoding them yields invalid UTF-8.
		if($codepoint >= 0xd800 && $codepoint <= 0xdfff) return "&#$codepoint;";
		return chr($codepoint >> 12 & 0x0f | 0xe0) .
			chr($codepoint >> 6 & 0x3f | 0x80) .
			chr($codepoint & 0x3f | 0x80);
	}
	# Four-byte form covers every supplementary plane up to U+10FFFF.
	if($codepoint <= 0x10ffff) return chr($codepoint >> 18 & 0x07 | 0xf0) .
		chr($codepoint >> 12 & 0x3f | 0x80) .
		chr($codepoint >> 6 & 0x3f | 0x80) .
		chr($codepoint & 0x3f | 0x80);
	# Beyond the Unicode range: leave it as an entity.
	return "&#$codepoint;";
}
286
/**
 * Replace decimal (&#1234;) and hexadecimal (&#x4d2;) numeric character
 * references with the output of wfUtf8Sequence() for that code point.
 *
 * Uses preg_replace_callback() rather than the /e modifier: /e evaluated
 * the captured digits as PHP source (so a leading zero meant octal and a
 * huge number became a float), and it no longer exists in PHP 7+.
 *
 * @param string $string Text possibly containing numeric references.
 * @return string Text with the references expanded.
 */
function wfMungeToUtf8($string) {
	$string = preg_replace_callback( '/&#([0-9]+);/', 'wfMungeDecimalEntity', $string );
	$string = preg_replace_callback( '/&#x([0-9a-f]+);/i', 'wfMungeHexEntity', $string );
	# Should also do named entities here
	return $string;
}

/**
 * preg_replace_callback() helper: expand one decimal numeric reference.
 *
 * @param array $m Match; $m[0] is the whole reference, $m[1] the digits.
 * @return string Replacement text.
 */
function wfMungeDecimalEntity( $m ) {
	$digits = ltrim( $m[1], '0' );
	# U+10FFFF is 1114111: more than 7 significant digits cannot be a
	# code point, so leave the reference alone instead of overflowing.
	if( strlen( $digits ) > 7 ) return $m[0];
	return wfUtf8Sequence( (int)$digits );
}

/**
 * preg_replace_callback() helper: expand one hexadecimal numeric reference.
 *
 * @param array $m Match; $m[0] is the whole reference, $m[1] the hex digits.
 * @return string Replacement text.
 */
function wfMungeHexEntity( $m ) {
	$digits = ltrim( $m[1], '0' );
	# U+10FFFF needs at most 6 hex digits; anything longer is left untouched.
	if( strlen( $digits ) > 6 ) return $m[0];
	return wfUtf8Sequence( hexdec( $digits ) );
}
293
/**
 * Escape a string for use inside a quoted SQL string literal.
 *
 * The script only prints SQL and holds no database connection, so the
 * connection-free mysql_escape_string() is used where the old mysql
 * extension provides it. That function is gone from PHP 7 onwards, so
 * otherwise the same characters are escaped by hand: NUL, newline,
 * carriage return, backslash, single quote, double quote and Ctrl-Z.
 *
 * @param string $string Raw text.
 * @return string Escaped text.
 */
function wfStrencode( $string ) {
	if( function_exists( 'mysql_escape_string' ) ) {
		return mysql_escape_string( $string );
	}
	# strtr() with a map works in a single pass, so the backslashes it
	# inserts are never themselves re-escaped.
	return strtr( $string, array(
		"\x00" => "\\0",
		"\n" => "\\n",
		"\r" => "\\r",
		"\\" => "\\\\",
		"'" => "\\'",
		"\"" => "\\\"",
		"\x1a" => "\\Z"
	) );
}
297
/**
 * Format a Unix timestamp as a 14-digit "YYYYMMDDHHMMSS" string in GMT.
 *
 * @param int $unixtime Seconds since the Unix epoch.
 * @return string Timestamp string.
 */
function wfUnix2Timestamp( $unixtime ) {
	$format = "YmdHis";
	$stamp = gmdate( $format, $unixtime );
	return $stamp;
}
301
/**
 * Convert a 14-digit "YYYYMMDDHHMMSS" GMT timestamp string back into a
 * Unix timestamp.
 *
 * @param string $ts Timestamp string.
 * @return int Seconds since the Unix epoch.
 */
function wfTimestamp2Unix( $ts )
{
	$year = (int)substr( $ts, 0, 4 );
	$month = (int)substr( $ts, 4, 2 );
	$day = (int)substr( $ts, 6, 2 );
	$hour = (int)substr( $ts, 8, 2 );
	$minute = (int)substr( $ts, 10, 2 );
	$second = (int)substr( $ts, 12, 2 );

	return gmmktime( $hour, $minute, $second, $month, $day, $year );
}
309
/**
 * Current time as a 14-digit "YYYYMMDDHHMMSS" string in GMT.
 *
 * @return string Timestamp string for now.
 */
function wfTimestampNow() {
	$now = gmdate( "YmdHis" );
	return $now;
}
314
/**
 * Sorting hack for MySQL 3, which doesn't use index sorts for DESC:
 * replace every digit d of the timestamp with 9 - d, so that ascending
 * order of the result is descending order of the original.
 *
 * @param string $ts Timestamp string.
 * @return string Timestamp with each digit inverted; other characters
 *                are left as they are.
 */
function wfInvertTimestamp( $ts ) {
	$digits = "0123456789";
	$mirrored = "9876543210";
	return strtr( $ts, $digits, $mirrored );
}
323
/**
 * Seed the Mersenne Twister generator from the current microtime and
 * record that it has been done in the global $wgRandomSeeded.
 */
function wfSeedRandom()
{
	# Without this declaration the flag below would only be a local
	# variable that vanishes when the function returns.
	global $wgRandomSeeded;

	# Low 31 bits of an md5 of the current microtime.
	$seed = hexdec(substr(md5(microtime()),-8)) & 0x7fffffff;
	mt_srand( $seed );
	$wgRandomSeeded = true;
}
330
/**
 * Turn an associative array into an object with one property per key.
 *
 * Starts from an empty stdClass: casting a scalar with (object)0 would
 * leave an extra "scalar" property on every result.
 *
 * @param array $arr Map of property name => value.
 * @return object Object carrying exactly the array's entries as properties.
 */
function array2object( $arr ) {
	$o = new stdClass;
	foreach( $arr as $x => $y ) {
		$o->$x = $y;
	}
	return $o;
}
338
339 ?>