Revert r81542 for now, too lazy to fix this properly
[lhc/web/wiklou.git] / maintenance / importUseModWiki.php
1 <?php
2 /**
3 * Import data from a UseModWiki into a MediaWiki wiki
4 * 2003-02-09 Brion VIBBER <brion@pobox.com>
5 * Based loosely on Magnus's code from 2001-2002
6 *
7 * Updated limited version to get something working temporarily
8 * 2003-10-09
9 * Be sure to run the link & index rebuilding scripts!
10 *
11 * Some more munging for charsets etc
12 * 2003-11-28
13 *
14 * Partial fix for pages starting with lowercase letters (??)
15 * and CamelCase and /Subpage link conversion
16 * 2004-11-17
17 *
18 * Rewrite output to create Special:Export format for import
19 * instead of raw SQL. Should be 'future-proof' against future
20 * schema changes.
21 * 2005-03-14
22 *
23 * This program is free software; you can redistribute it and/or modify
24 * it under the terms of the GNU General Public License as published by
25 * the Free Software Foundation; either version 2 of the License, or
26 * (at your option) any later version.
27 *
28 * This program is distributed in the hope that it will be useful,
29 * but WITHOUT ANY WARRANTY; without even the implied warranty of
30 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
31 * GNU General Public License for more details.
32 *
33 * You should have received a copy of the GNU General Public License along
34 * with this program; if not, write to the Free Software Foundation, Inc.,
35 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
36 * http://www.gnu.org/copyleft/gpl.html
37 *
38 * @todo document
39 * @file
40 * @ingroup Maintenance
41 */
42
// Refuse to run under a web server: this script prints export XML to stdout
// and reads local filesystem paths configured below.
if ( php_sapi_name() != 'cli' ) {
	echo "Please customize the settings and run me from the command line.";
	die( -1 );
}

/** Set these correctly! */
$wgImportEncoding = "CP1252"; /* We convert all to UTF-8 */
$wgRootDirectory = "/kalman/Projects/wiki2002/wiki/lib-http/db/wiki";

/* On a large wiki, you might run out of memory */
@ini_set( 'memory_limit', '40M' );

/* globals */
$wgFieldSeparator = "\xb3"; # Some wikis may use different char
// UseModWiki's flat-file database nests hashes inside hashes; each nesting
// level uses the field separator followed by a digit (1 outermost … 3 innermost).
$FS = $wgFieldSeparator ;
$FS1 = $FS . "1" ;
$FS2 = $FS . "2" ;
$FS3 = $FS . "3" ;

# Unicode sanitization tools
require_once( dirname( dirname( __FILE__ ) ) . '/includes/normal/UtfNormal.php' );

// name => user ID map; stays empty in this limited version since user
// accounts are not imported (see checkUserCache()).
$usercache = array();

importPages();
68
69 # ------------------------------------------------------------------------------
70
/**
 * Top-level driver: emit the Special:Export XML wrapper and walk every
 * UseModWiki page-database bucket directory, importing each page found.
 *
 * Writes the XML document to stdout; uses global $wgRootDirectory.
 * (Removed the unused leftover local $gt, and build the bucket list with
 * range() instead of spelling out the alphabet by hand.)
 */
function importPages()
{
	global $wgRootDirectory;

	echo <<<XML
<?xml version="1.0" encoding="UTF-8" ?>
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.1/"
           xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
           xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.1/
                               http://www.mediawiki.org/xml/export-0.1.xsd"
           version="0.1"
           xml:lang="en">
<!-- generated by importUseModWiki.php -->

XML;
	// UseModWiki shards pages by uppercased first letter, with a catch-all
	// "other" bucket for titles that don't start with A-Z.
	$letters = array_merge( range( 'A', 'Z' ), array( 'other' ) );
	foreach ( $letters as $letter ) {
		$dir = "$wgRootDirectory/page/$letter";
		if ( is_dir( $dir ) ) {
			importPageDirectory( $dir );
		}
	}
	echo <<<XML
</mediawiki>

XML;
}
101
/**
 * Recursively import every UseModWiki page file ("*.db") under $dir.
 *
 * @param $dir    Directory to scan.
 * @param $prefix Subpage prefix ("Foo/") applied when recursing into a
 *                subpage directory, so "Foo/Bar" gets its full title.
 *
 * Echoes XML comments as progress markers plus the importPage() output.
 */
function importPageDirectory( $dir, $prefix = "" )
{
	echo "\n<!-- Checking page directory " . xmlCommentSafe( $dir ) . " -->\n";
	$mydir = opendir( $dir );
	if ( $mydir === false ) {
		// Unreadable directory: report and keep going rather than spewing
		// readdir() warnings in the loop below.
		echo "<!-- Could not open directory " . xmlCommentSafe( $dir ) . " -->\n";
		return;
	}
	// Compare explicitly against false: a directory entry named "0" is
	// falsy, and the old bare-assignment test stopped the loop on it.
	while ( ( $entry = readdir( $mydir ) ) !== false ) {
		$m = array();
		if ( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			echo importPage( $prefix . $m[1] );
		} elseif ( is_dir( "$dir/$entry" ) ) {
			if ( $entry != '.' && $entry != '..' ) {
				importPageDirectory( "$dir/$entry", "$entry/" );
			}
		} else {
			echo "<!-- File '" . xmlCommentSafe( $entry ) . "' doesn't seem to contain an article. Skipping. -->\n";
		}
	}
	closedir( $mydir ); // was leaked before
}
121
122
123 # ------------------------------------------------------------------------------
124
125 /* fetch_ functions
126 Grab a given item from the database
127 */
128
/**
 * Map a page title to its UseModWiki database path fragment.
 * Pages are sharded by uppercased first letter ("F/Foo"); titles not
 * starting with an ASCII letter land in the "other" bucket.
 */
function useModFilename( $title ) {
	$first = substr( $title, 0, 1 );
	return preg_match( '/[A-Z]/i', $first )
		? strtoupper( $first ) . "/$title"
		: "other/$title";
}
136
/**
 * Load the current revision of a page from its UseModWiki ".db" file.
 *
 * The file is a nest of field-separated hashes: whole file -> page record
 * (FS1), "text_default" section (FS2), and the innermost text record (FS3).
 * Returns an object with text, summary, minor, ts, username and host.
 * Dies if the page file cannot be found.
 */
function fetchPage( $title )
{
	global $FS1, $FS2, $FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/page/" . useModFilename( $title ) . ".db";
	if ( !file_exists( $fname ) ) {
		echo "Couldn't open file '$fname' for page '$title'.\n";
		die( -1 );
	}

	// Peel the three nesting levels apart.
	$page = splitHash( $FS1, file_get_contents( $fname ) );
	$section = splitHash( $FS2, $page["text_default"] );
	$text = splitHash( $FS3, $section["data"] );

	$fields = array(
		"text" => $text["text"],
		"summary" => $text["summary"],
		"minor" => $text["minor"],
		"ts" => $section["ts"],
		"username" => $section["username"],
		"host" => $section["host"],
	);
	return array2object( $fields );
}
155
/**
 * Load the archived old revisions of a page from its ".kp" keep file.
 *
 * Returns an array of revision objects (same shape as fetchPage()'s
 * result), oldest junk stripped; an empty array if no keep file exists.
 * Records lacking text, a minor flag, or a positive numeric timestamp are
 * skipped with an XML-comment note.
 */
function fetchKeptPages( $title )
{
	global $FS1, $FS2, $FS3, $wgRootDirectory;

	$fname = $wgRootDirectory . "/keep/" . useModFilename( $title ) . ".kp";
	if ( !file_exists( $fname ) ) {
		return array();
	}

	$keptlist = explode( $FS1, file_get_contents( $fname ) );
	array_shift( $keptlist ); # Drop the junk at beginning of file

	$revisions = array();
	foreach ( $keptlist as $rev ) {
		$section = splitHash( $FS2, $rev );
		$text = splitHash( $FS3, $section["data"] );
		if ( $text["text"] && $text["minor"] != "" && ( $section["ts"] * 1 > 0 ) ) {
			$revisions[] = array2object( array(
				"text" => $text["text"],
				"summary" => $text["summary"],
				"minor" => $text["minor"],
				"ts" => $section["ts"],
				"username" => $section["username"],
				"host" => $section["host"],
			) );
		} else {
			echo "<!-- skipped a bad old revision -->\n";
		}
	}
	return $revisions;
}
180
/**
 * Decode one level of UseModWiki's serialized hash format.
 *
 * Elements alternate key, value, key, value … along the separator; a
 * trailing unpaired key (odd element count) is silently dropped.
 */
function splitHash( $sep, $str ) {
	$fields = explode( $sep, $str );
	$ret = array();
	$n = count( $fields );
	for ( $i = 0; $i + 1 < $n; $i += 2 ) {
		$ret[$fields[$i]] = $fields[$i + 1];
	}
	return $ret;
}
189
190
191 /* import_ functions
192 Take a fetched item and produce SQL
193 */
194
/**
 * Resolve a UseModWiki revision author to a (user ID, display name) pair.
 *
 * @param $name Stored username (underscored), or empty for anonymous edits.
 * @param $host Client hostname/IP, used as the display name for anons.
 * @return array( int $userid, string $username )
 *
 * Fix: the cache maps name => user ID, so membership must be tested on the
 * keys; the old in_array() call searched the *values* and never matched.
 */
function checkUserCache( $name, $host )
{
	global $usercache;

	if ( $name ) {
		if ( isset( $usercache[$name] ) ) {
			$userid = $usercache[$name];
		} else {
			# If we haven't imported user accounts
			$userid = 0;
		}
		// Stored titles/usernames use underscores; display form uses spaces.
		$username = str_replace( '_', ' ', $name );
	} else {
		$userid = 0;
		$username = $host;
	}
	return array( $userid, $username );
}
213
/**
 * Convert one UseModWiki page (current revision plus kept history) into a
 * <page> element in Special:Export XML format.
 *
 * @param $title UseModWiki page title (underscored form).
 * @return string XML fragment for the page, or NULL if no revisions exist.
 *
 * Also echoes a progress marker as an XML comment. Dies (via fetchPage)
 * if the page's database file is missing.
 */
function importPage( $title )
{
	echo "\n<!-- Importing page " . xmlCommentSafe( $title ) . " -->\n";
	$page = fetchPage( $title );

	$newtitle = xmlsafe( str_replace( '_', ' ', recodeText( $title ) ) );

	// Run the CamelCase/subpage link conversion; if it changed anything,
	// record the conversion as an extra top revision.
	$munged = mungeFormat( $page->text );
	if ( $munged != $page->text ) {
		/**
		 * Save a *new* revision with the conversion, and put the
		 * previous last version into the history.
		 */
		$next = array2object( array(
			'text' => $munged,
			'minor' => 1,
			'username' => 'Conversion script',
			'host' => '127.0.0.1',
			'ts' => time(),
			'summary' => 'link fix',
		) );
		$revisions = array( $page, $next );
	} else {
		/**
		 * Current revision:
		 */
		$revisions = array( $page );
	}
	$xml = <<<XML
  <page>
    <title>$newtitle</title>

XML;

	# History
	$revisions = array_merge( $revisions, fetchKeptPages( $title ) );
	if ( count( $revisions ) == 0 ) {
		return NULL; // Was "$sql", which does not appear to be defined.
	}

	// Emit one <revision> element per revision, recoding and XML-escaping
	// every field on the way out.
	foreach ( $revisions as $rev ) {
		$text = xmlsafe( recodeText( $rev->text ) );
		$minor = ( $rev->minor ? '<minor/>' : '' );
		list( /* $userid */ , $username ) = checkUserCache( $rev->username, $rev->host );
		$username = xmlsafe( recodeText( $username ) );
		$timestamp = xmlsafe( timestamp2ISO8601( $rev->ts ) );
		$comment = xmlsafe( recodeText( $rev->summary ) );

		$xml .= <<<XML
    <revision>
      <timestamp>$timestamp</timestamp>
      <contributor><username>$username</username></contributor>
      $minor
      <comment>$comment</comment>
      <text>$text</text>
    </revision>

XML;
	}
	$xml .= "</page>\n\n";
	return $xml;
}
276
277 # Whee!
/**
 * Transcode imported text into clean UTF-8.
 *
 * Normalizes CRLF to LF, converts from the wiki's source charset
 * ($wgImportEncoding, latin-1 era) to UTF-8, then expands any legacy
 * &#1234; numeric entities.
 */
function recodeText( $string ) {
	global $wgImportEncoding;
	$normalized = str_replace( "\r\n", "\n", $string );
	// @: iconv notices on untranslatable bytes are deliberately ignored.
	$utf8 = @iconv( $wgImportEncoding, "UTF-8", $normalized );
	return wfMungeToUtf8( $utf8 ); # Any old &#1234; stuff
}
286
/**
 * Encode a Unicode code point as a UTF-8 byte sequence.
 *
 * @param $codepoint Integer code point.
 * @return string 1-4 byte UTF-8 sequence, or the "&#N;" entity unchanged
 *                for values outside the Unicode range.
 *
 * Fix: the 4-byte branch tested < 0x100000, wrongly excluding the valid
 * plane-16 code points U+100000..U+10FFFF; UTF-8 covers up to U+10FFFF
 * (RFC 3629), so the bound is now 0x110000.
 */
function wfUtf8Sequence( $codepoint ) {
	if ( $codepoint < 0x80 ) {
		return chr( $codepoint );
	}
	if ( $codepoint < 0x800 ) {
		return chr( $codepoint >> 6 & 0x3f | 0xc0 ) .
			chr( $codepoint & 0x3f | 0x80 );
	}
	if ( $codepoint < 0x10000 ) {
		return chr( $codepoint >> 12 & 0x0f | 0xe0 ) .
			chr( $codepoint >> 6 & 0x3f | 0x80 ) .
			chr( $codepoint & 0x3f | 0x80 );
	}
	if ( $codepoint < 0x110000 ) {
		return chr( $codepoint >> 18 & 0x07 | 0xf0 ) .
			chr( $codepoint >> 12 & 0x3f | 0x80 ) .
			chr( $codepoint >> 6 & 0x3f | 0x80 ) .
			chr( $codepoint & 0x3f | 0x80 );
	}
	# Beyond U+10FFFF is not valid Unicode; pass the entity through.
	return "&#$codepoint;";
}
301
/**
 * Expand decimal (&#1234;) and hex (&#xABC;) numeric character references
 * in $string into literal UTF-8 sequences.
 *
 * Fix: the old code used the /e (eval) preg modifier, deprecated in
 * PHP 5.5 and removed in PHP 7; rewritten with preg_replace_callback().
 */
function wfMungeToUtf8( $string ) {
	$string = preg_replace_callback( '/&#([0-9]+);/',
		function ( $matches ) {
			return wfUtf8Sequence( (int)$matches[1] );
		},
		$string );
	$string = preg_replace_callback( '/&#x([0-9a-f]+);/i',
		function ( $matches ) {
			return wfUtf8Sequence( hexdec( $matches[1] ) );
		},
		$string );
	# Should also do named entities here
	return $string;
}
308
/**
 * Format a Unix timestamp as an ISO 8601 UTC string,
 * e.g. 2003-08-05T18:30:02Z.
 */
function timestamp2ISO8601( $ts ) {
	// Single gmdate() call; T and Z are escaped so they come out literally.
	return gmdate( 'Y-m-d\TH:i:s\Z', $ts );
}
313
/**
 * Make a string safe for inclusion in XML element content.
 *
 * The page may contain old data which has not been properly normalized.
 * Invalid UTF-8 sequences or forbidden control characters would make the
 * XML output invalid, so they are stripped before entity-escaping.
 */
function xmlsafe( $string ) {
	$clean = UtfNormal::cleanUp( $string );
	return htmlspecialchars( $clean );
}
325
/**
 * Make a string safe for inclusion inside an XML comment: recode and
 * entity-escape it, then neutralize "--", which is illegal in comments.
 */
function xmlCommentSafe( $text ) {
	$escaped = xmlsafe( recodeText( $text ) );
	return str_replace( '--', '\\-\\-', $escaped );
}
329
330
/**
 * Convert an associative array into a stdClass object, one property per key.
 *
 * Fix: the old implementation started from (object)0, which seeds the
 * object with a stray "scalar" property set to 0; a direct array cast
 * produces exactly the intended properties.
 */
function array2object( $arr ) {
	return (object)$arr;
}
338
339
340 /**
341 * Make CamelCase and /Talk links work
342 */
/**
 * Make CamelCase and /Talk links work: wrap bare CamelCase words and
 * /Subpage references in [[...]], while leaving <nowiki> sections, bare
 * URLs and existing [[links]] untouched.
 *
 * Fix: the final substitution used preg_replace() with the /e (eval)
 * modifier, removed in PHP 7; the placeholders are now spliced back with
 * an explode/shift pass, which is equivalent for a fixed marker string.
 */
function mungeFormat( $text ) {
	global $nowiki;
	$nowiki = array();
	// Stage 1: pull protected chunks out of the text, replacing each with
	// a placeholder so the link pass below cannot touch them.
	$staged = preg_replace_callback(
		'/(<nowiki>.*?<\\/nowiki>|(?:http|https|ftp):\\S+|\[\[[^]\\n]+]])/s',
		'nowikiPlaceholder', $text );

	# This is probably not 100% correct, I'm just
	# glancing at the UseModWiki code.
	$upper = "[A-Z]";
	$lower = "[a-z_0-9]";
	$any = "[A-Za-z_0-9]";
	$camel = "(?:$upper+$lower+$upper+$any*)";
	$subpage = "(?:\\/$any+)";
	$substart = "(?:\\/$upper$any*)";

	// Stage 2: wrap CamelCase words and /Subpage refs in [[...]].
	$munged = preg_replace( "/(?!\\[\\[)($camel$subpage*|$substart$subpage*)\\b(?!\\]\\]|>)/",
		'[[$1]]', $staged );

	// Stage 3: re-insert the protected chunks, in order of extraction.
	$pieces = explode( placeholder(), $munged );
	$final = array_shift( $pieces );
	foreach ( $pieces as $piece ) {
		$final .= array_shift( $nowiki ) . $piece;
	}
	return $final;
}
366
367
/**
 * Marker string used by mungeFormat() to stand in for protected chunks.
 *
 * NOTE(review): single quotes mean this is the nine-character literal
 * text "\xff...", not 0xFF bytes; harmless as long as insertion and
 * lookup both use this same function. The unused $x parameter is kept
 * for signature compatibility.
 */
function placeholder( $x = null ) {
	return '\xff' . 'placeholder' . '\xff';
}
371
/**
 * preg_replace_callback() hook for mungeFormat(): stash the matched
 * protected chunk in the global $nowiki queue and return the marker
 * that mungeFormat() will later swap back, in order.
 */
function nowikiPlaceholder( $matches ) {
	global $nowiki;
	array_push( $nowiki, $matches[1] );
	return placeholder();
}
377
378