Use Doxygen @addtogroup instead of phpdoc @package && @subpackage
[lhc/web/wiklou.git] / maintenance / importUseModWiki.php
1 <?php
2
3 /**
4 * Import data from a UseModWiki into a MediaWiki wiki
5 * 2003-02-09 Brion VIBBER <brion@pobox.com>
6 * Based loosely on Magnus's code from 2001-2002
7 *
8 * Updated limited version to get something working temporarily
9 * 2003-10-09
10 * Be sure to run the link & index rebuilding scripts!
11 *
12 * Some more munging for charsets etc
13 * 2003-11-28
14 *
15 * Partial fix for pages starting with lowercase letters (??)
16 * and CamelCase and /Subpage link conversion
17 * 2004-11-17
18 *
19 * Rewrite output to create Special:Export format for import
20 * instead of raw SQL. Should be 'future-proof' against future
21 * schema changes.
22 * 2005-03-14
23 *
24 * @todo document
25 * @addtogroup Maintenance
26 */
27
/* This is a command-line maintenance script: refuse to run under any other SAPI. */
if( php_sapi_name() != 'cli' ) {
	echo "Please customize the settings and run me from the command line.";
	die( -1 );
}

/** Set these correctly! */
$wgImportEncoding = "CP1252"; /* Charset of the source data; we convert all to UTF-8 */
$wgRootDirectory = "/kalman/Projects/wiki2002/wiki/lib-http/db/wiki"; /* Must contain the page/ and keep/ subdirectories */

/* On a large wiki, you might run out of memory */
@ini_set( 'memory_limit', '40M' );

/* globals */
$wgFieldSeparator = "\xb3"; # Some wikis may use different char
$FS = $wgFieldSeparator ;
# Separators for the three nesting levels decoded by fetchPage() and fetchKeptPages()
$FS1 = $FS."1" ;
$FS2 = $FS."2" ;
$FS3 = $FS."3" ;

# Unicode sanitization tools
# Resolved relative to this script (maintenance/) instead of the current
# working directory, so the import can be launched from anywhere.
require_once( dirname( __FILE__ ) . '/../includes/normal/UtfNormal.php' );

# User name => user id map consulted by checkUserCache(); nothing in the
# visible code fills it, so every contributor is exported with id 0.
$usercache = array();

importPages();
53
54 # ------------------------------------------------------------------------------
55
/**
 * Write the whole export document to standard output: the XML prologue and
 * opening <mediawiki> element, the pages found in every existing bucket
 * directory under $wgRootDirectory/page (A..Z plus "other"), and finally the
 * closing </mediawiki> tag.
 */
function importPages()
{
	global $wgRootDirectory;

	# The bracket that closes the XML declaration is interpolated rather than
	# written literally (presumably to keep the PHP close-tag sequence out of
	# this file).
	$gt = '>';
	echo <<<END
<?xml version="1.0" encoding="UTF-8" ?$gt
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.1/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.1/
http://www.mediawiki.org/xml/export-0.1.xsd"
version="0.1"
xml:lang="en">
<!-- generated by importUseModWiki.php -->

END;

	# One bucket per capital letter, plus the catch-all bucket
	$buckets = array_merge( range( 'A', 'Z' ), array( 'other' ) );
	foreach( $buckets as $bucket ) {
		$path = "$wgRootDirectory/page/$bucket";
		if( !is_dir( $path ) ) {
			continue;
		}
		importPageDirectory( $path );
	}

	echo "</mediawiki>\n";
}
86
/**
 * Export every page stored in one page directory, descending into
 * subdirectories.
 *
 * Each "*.db" entry is exported through importPage() and the resulting XML is
 * echoed. Subdirectories are scanned recursively with the subdirectory name
 * plus "/" as title prefix. Any other entry is reported in an XML comment.
 *
 * @param string $dir    Directory to scan
 * @param string $prefix Text prepended to the base name of each .db file to
 *                       form the page title
 */
function importPageDirectory( $dir, $prefix = "" )
{
	echo "\n<!-- Checking page directory " . xmlCommentSafe( $dir ) . " -->\n";

	# Suppress the PHP warning (it would land in the XML stream) and report
	# the failure ourselves inside an XML comment.
	$mydir = @opendir( $dir );
	if( $mydir === false ) {
		echo "<!-- Couldn't open page directory " . xmlCommentSafe( $dir ) . ". Skipping. -->\n";
		return;
	}

	# Compare with false explicitly: an entry literally named "0" is falsy
	# and would otherwise end the scan early.
	while( ( $entry = readdir( $mydir ) ) !== false ) {
		if( $entry == '.' || $entry == '..' ) {
			continue;
		}
		$m = array();
		if( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			echo importPage( $prefix . $m[1] );
		} elseif( is_dir( "$dir/$entry" ) ) {
			importPageDirectory( "$dir/$entry", "$entry/" );
		} else {
			echo "<!-- File '" . xmlCommentSafe( $entry ) . "' doesn't seem to contain an article. Skipping. -->\n";
		}
	}

	# Release the handle; recursion can otherwise keep many open at once
	closedir( $mydir );
}
106
107
108 # ------------------------------------------------------------------------------
109
110 /* fetch_ functions
111 Grab a given item from the database
112 */
113
/**
 * Map a page title to its path below the page/ or keep/ directory, without
 * the file extension.
 *
 * Titles starting with an ASCII letter live in the bucket named after that
 * letter in upper case; everything else lives in "other".
 *
 * @param string $title Page title as stored on disk
 * @return string "<bucket>/<title>"
 */
function useModFilename( $title ) {
	$bucket = 'other';
	if( preg_match( '/^[A-Za-z]/', $title ) ) {
		$bucket = strtoupper( substr( $title, 0, 1 ) );
	}
	return $bucket . '/' . $title;
}
121
/**
 * Load the current revision of a page from its .db file.
 *
 * The file is decoded as three nested key/value lists: the page record
 * (split on $FS1), its "text_default" section (split on $FS2) and that
 * section's "data" entry (split on $FS3).
 *
 * Prints a message and terminates the script if the file does not exist.
 *
 * @param string $title Page title as stored on disk
 * @return object With properties text, summary, minor, ts, username, host
 */
function fetchPage( $title )
{
	global $FS1,$FS2,$FS3, $wgRootDirectory;

	$path = "$wgRootDirectory/page/" . useModFilename( $title ) . ".db";
	if( !file_exists( $path ) ) {
		echo "Couldn't open file '$path' for page '$title'.\n";
		die( -1 );
	}

	$record = splitHash( $FS1, file_get_contents( $path ) );
	$section = splitHash( $FS2, $record["text_default"] );
	$data = splitHash( $FS3, $section["data"] );

	$fields = array(
		"text" => $data["text"],
		"summary" => $data["summary"],
		"minor" => $data["minor"],
		"ts" => $section["ts"],
		"username" => $section["username"],
		"host" => $section["host"] );
	return array2object( $fields );
}
140
/**
 * Load the old ("kept") revisions of a page from its .kp file.
 *
 * The file is split on $FS1 and the first piece is discarded. Each remaining
 * piece is decoded like a page section (split on $FS2, with its "data" entry
 * split on $FS3). A revision is kept only if it has text, a non-empty "minor"
 * field and a positive timestamp; anything else is reported in an XML comment
 * and skipped.
 *
 * @param string $title Page title as stored on disk
 * @return array List of revision objects; empty if there is no .kp file
 */
function fetchKeptPages( $title )
{
	global $FS1,$FS2,$FS3, $wgRootDirectory;

	$path = "$wgRootDirectory/keep/" . useModFilename( $title ) . ".kp";
	if( !file_exists( $path ) ) {
		return array();
	}

	$chunks = explode( $FS1, file_get_contents( $path ) );
	array_shift( $chunks ); # Drop the junk at beginning of file

	$revisions = array();
	foreach( $chunks as $chunk ) {
		$section = splitHash( $FS2, $chunk );
		$text = splitHash( $FS3, $section["data"] );

		$usable = $text["text"] && $text["minor"] != "" && ( $section["ts"]*1 > 0 );
		if ( !$usable ) {
			echo "<!-- skipped a bad old revision -->\n";
			continue;
		}

		$revisions[] = array2object( array(
			"text" => $text["text"],
			"summary" => $text["summary"],
			"minor" => $text["minor"],
			"ts" => $section["ts"],
			"username" => $section["username"],
			"host" => $section["host"] ) );
	}
	return $revisions;
}
165
/**
 * Decode a flat "key SEP value SEP key SEP value ..." string into an
 * associative array.
 *
 * The pieces are consumed in pairs; a trailing piece without a partner is
 * ignored, and a later duplicate key overwrites an earlier one.
 *
 * @param string $sep Separator between the pieces
 * @param string $str Encoded string
 * @return array key => value
 */
function splitHash ( $sep , $str ) {
	$pieces = explode( $sep, $str );
	$total = count( $pieces );
	$hash = array();
	for ( $k = 0; $k + 1 < $total; $k += 2 ) {
		$hash[$pieces[$k]] = $pieces[$k + 1];
	}
	return $hash;
}
174
175
/* import_ functions
Take a fetched item and produce Special:Export XML
*/
179
/**
 * Work out the contributor id and display name for a revision.
 *
 * A named contributor is looked up in the global $usercache, which is keyed
 * by the stored user name; the id is 0 when the name is not cached.
 * Underscores in the name are turned into spaces. When no name is given the
 * host is used as the contributor name, with id 0.
 *
 * @param string $name Stored user name; empty for anonymous edits
 * @param string $host Host recorded for the edit
 * @return array array( user id, display name )
 */
function checkUserCache( $name, $host )
{
	global $usercache;

	if( !$name ) {
		return array( 0, $host );
	}

	# The cache is indexed by name, so test the key. in_array() would search
	# the stored ids instead and never find a name.
	$userid = isset( $usercache[$name] ) ? $usercache[$name] : 0;

	return array( $userid, str_replace( '_', ' ', $name ) );
}
198
/**
 * Build the <page> element for one page.
 *
 * The revision list starts with the current revision from fetchPage(). If
 * mungeFormat() changes the current text, an extra minor revision by
 * 'Conversion script', stamped with the current time, carries the converted
 * text. The old revisions from fetchKeptPages() follow.
 *
 * A progress comment is echoed immediately; the page XML itself is returned,
 * not echoed.
 *
 * @param string $title Page title as stored on disk
 * @return string XML for the page and all of its revisions
 */
function importPage( $title )
{
	echo "\n<!-- Importing page " . xmlCommentSafe( $title ) . " -->\n";
	$page = fetchPage( $title );

	$newtitle = xmlsafe( str_replace( '_', ' ', recodeText( $title ) ) );

	# The current revision is always present, so the list is never empty
	$revisions = array( $page );

	$munged = mungeFormat( $page->text );
	if( $munged != $page->text ) {
		/**
		 * Save a *new* revision with the conversion, and keep the
		 * previous last version in the history.
		 */
		$revisions[] = array2object( array(
			'text' => $munged,
			'minor' => 1,
			'username' => 'Conversion script',
			'host' => '127.0.0.1',
			'ts' => time(),
			'summary' => 'link fix',
			) );
	}

	# History
	$revisions = array_merge( $revisions, fetchKeptPages( $title ) );

	$xml = <<<END
<page>
<title>$newtitle</title>

END;

	foreach( $revisions as $rev ) {
		$text = xmlsafe( recodeText( $rev->text ) );
		$minor = ($rev->minor ? '<minor/>' : '');
		# Only the display name is exported; the user id is not used
		$contributor = checkUserCache( $rev->username, $rev->host );
		$username = xmlsafe( recodeText( $contributor[1] ) );
		$timestamp = xmlsafe( timestamp2ISO8601( $rev->ts ) );
		$comment = xmlsafe( recodeText( $rev->summary ) );

		$xml .= <<<END
<revision>
<timestamp>$timestamp</timestamp>
<contributor><username>$username</username></contributor>
$minor
<comment>$comment</comment>
<text>$text</text>
</revision>

END;
	}
	$xml .= "</page>\n\n";
	return $xml;
}
263
264 # Whee!
/**
 * Convert text from the source wiki into UTF-8.
 *
 * CRLF line ends become LF, the text is converted from $wgImportEncoding to
 * UTF-8, and numeric character references are expanded by wfMungeToUtf8().
 *
 * If iconv() cannot convert the text it returns false. In that case the
 * unconverted text is kept instead of being thrown away; every caller in
 * this file passes the result through xmlsafe(), which cleans up invalid
 * UTF-8.
 *
 * @param string $string Text in the source encoding
 * @return string
 */
function recodeText( $string ) {
	global $wgImportEncoding;
	# For currently latin-1 wikis
	$string = str_replace( "\r\n", "\n", $string );

	# Errors are silenced so the diagnostic does not end up in the XML output
	$converted = @iconv( $wgImportEncoding, "UTF-8", $string );
	if( $converted !== false ) {
		$string = $converted;
	}

	$string = wfMungeToUtf8( $string ); # Any old &#1234; stuff
	return $string;
}
273
/**
 * Encode a Unicode code point as a UTF-8 byte sequence.
 *
 * Code points up to U+10FFFF are encoded in one to four bytes. Larger values
 * are not valid Unicode and are returned as a decimal character reference.
 *
 * @param int $codepoint
 * @return string UTF-8 bytes, or "&#N;" for values above U+10FFFF
 */
function wfUtf8Sequence($codepoint) {
	# >> binds tighter than &, which binds tighter than |, so each
	# expression reads ((cp >> n) & mask) | marker.
	if($codepoint < 0x80) return chr($codepoint);
	if($codepoint < 0x800) return chr($codepoint >> 6 & 0x3f | 0xc0) .
		chr($codepoint & 0x3f | 0x80);
	if($codepoint < 0x10000) return chr($codepoint >> 12 & 0x0f | 0xe0) .
		chr($codepoint >> 6 & 0x3f | 0x80) .
		chr($codepoint & 0x3f | 0x80);
	# The bound is 0x110000, not 0x100000: the last plane
	# (U+100000..U+10FFFF) is valid and needs four bytes too.
	if($codepoint < 0x110000) return chr($codepoint >> 18 & 0x07 | 0xf0) .
		chr($codepoint >> 12 & 0x3f | 0x80) .
		chr($codepoint >> 6 & 0x3f | 0x80) .
		chr($codepoint & 0x3f | 0x80);
	# Beyond the Unicode range: leave it as a character reference
	return "&#$codepoint;";
}
288
/**
 * Replace decimal (&#1234;) and hexadecimal (&#x4d2;) character references
 * with the UTF-8 sequence produced by wfUtf8Sequence().
 *
 * A callback is used instead of the /e pattern modifier, which PHP 7
 * removed. /e also pasted the digits into PHP source, so a zero-padded
 * reference such as &#0160; was read as an octal literal. Both forms are
 * handled in a single pass, so the output of one replacement is never
 * reinterpreted as another reference.
 *
 * @param string $string
 * @return string
 */
function wfMungeToUtf8($string) {
	$string = preg_replace_callback( '/&#(?:x([0-9a-f]+)|([0-9]+));/i',
		'wfMungeEntityCallback', $string );
	# Should also do named entities here
	return $string;
}

/**
 * preg_replace_callback() helper for wfMungeToUtf8(): decode one numeric
 * character reference.
 *
 * @param array $matches [1] holds hex digits, or [2] holds decimal digits
 * @return string
 */
function wfMungeEntityCallback( $matches ) {
	# Group 2 is only present when the decimal alternative matched
	if( isset( $matches[2] ) ) {
		$digits = $matches[2];
		$base = 10;
	} else {
		$digits = $matches[1];
		$base = 16;
	}

	$digits = ltrim( $digits, '0' );
	if( strlen( $digits ) > 7 ) {
		# More significant digits than U+10FFFF has in either base:
		# leave the reference untouched instead of overflowing.
		return $matches[0];
	}
	return wfUtf8Sequence( intval( $digits, $base ) );
}
295
/**
 * Format a Unix timestamp as an ISO 8601 UTC time, e.g. 2003-08-05T18:30:02Z.
 *
 * @param int $ts Seconds since the Unix epoch
 * @return string
 */
function timestamp2ISO8601( $ts ) {
	# The backslashes make T and Z literal characters in the date format
	return gmdate( 'Y-m-d\TH:i:s\Z', $ts );
}
300
/**
 * Prepare text for inclusion in the XML output.
 *
 * Old page data may not be properly normalized. Invalid UTF-8 sequences or
 * forbidden control characters would make the XML invalid, so the text goes
 * through UtfNormal::cleanUp() before the markup characters are escaped.
 *
 * @param string $string
 * @return string
 */
function xmlsafe( $string ) {
	$clean = UtfNormal::cleanUp( $string );
	return htmlspecialchars( $clean );
}
312
/**
 * Prepare text for use inside an XML comment: recode it to UTF-8, escape it
 * with xmlsafe(), then break up every "--" pair by putting a backslash in
 * front of each hyphen.
 *
 * @param string $text Text in the source encoding
 * @return string
 */
function xmlCommentSafe( $text ) {
	$escaped = xmlsafe( recodeText( $text ) );
	return str_replace( '--', '\\-\\-', $escaped );
}
316
317
/**
 * Build a plain object whose properties mirror the entries of an array.
 *
 * The object starts out empty. Casting a scalar with (object)0 would add a
 * stray "scalar" property to every result.
 *
 * @param array $arr property name => value
 * @return object stdClass instance
 */
function array2object( $arr ) {
	$o = new stdClass();
	foreach( $arr as $x => $y ) {
		$o->$x = $y;
	}
	return $o;
}
325
326
/**
 * Make CamelCase and /Talk links work
 *
 * Spans that must stay untouched (<nowiki> sections, http/https/ftp URLs and
 * existing [[...]] links) are first swapped for a placeholder and queued in
 * the global $nowiki. CamelCase words and /Subpage references in the
 * remaining text are wrapped in [[ ]], then the placeholders are replaced by
 * the queued spans in their original order.
 *
 * The restore step uses a callback instead of the /e pattern modifier, which
 * PHP 7 removed.
 *
 * @param string $text Page text in UseModWiki markup
 * @return string Text with explicit [[links]]
 */
function mungeFormat( $text ) {
	global $nowiki;
	$nowiki = array();
	$staged = preg_replace_callback(
		'/(<nowiki>.*?<\\/nowiki>|(?:http|https|ftp):\\S+|\[\[[^]\\n]+]])/s',
		'nowikiPlaceholder', $text );

	# This is probably not 100% correct, I'm just
	# glancing at the UseModWiki code.
	$upper = "[A-Z]";
	$lower = "[a-z_0-9]";
	$any = "[A-Za-z_0-9]";
	$camel = "(?:$upper+$lower+$upper+$any*)";
	$subpage = "(?:\\/$any+)";
	$substart = "(?:\\/$upper$any*)";

	$munged = preg_replace( "/(?!\\[\\[)($camel$subpage*|$substart$subpage*)\\b(?!\\]\\]|>)/",
		'[[$1]]', $staged );

	# Put the protected spans back, first in, first out
	$final = preg_replace_callback( '/' . preg_quote( placeholder(), '/' ) . '/s',
		'nowikiRestore', $munged );
	return $final;
}

/**
 * preg_replace_callback() helper for mungeFormat(): hand back the oldest
 * span still queued in the global $nowiki.
 *
 * @param array $matches Unused; every match is the same placeholder
 * @return string
 */
function nowikiRestore( $matches ) {
	global $nowiki;
	return array_shift( $nowiki );
}
353
354
/**
 * Marker that stands in for a protected span while mungeFormat() rewrites
 * links.
 *
 * The string is single-quoted, so the marker is the literal text
 * backslash-x-f-f, not the byte 0xFF.
 *
 * @param mixed $x Ignored
 * @return string
 */
function placeholder( $x = null ) {
	$marker = '\xffplaceholder\xff';
	return $marker;
}
358
/**
 * preg_replace_callback() helper for mungeFormat(): append the matched span
 * to the global $nowiki and put the placeholder in its place.
 *
 * @param array $matches Match data; [1] is the span to protect
 * @return string The placeholder marker
 */
function nowikiPlaceholder( $matches ) {
	global $nowiki;

	$protected = $matches[1];
	$nowiki[] = $protected;

	return placeholder();
}
364
365 ?>