* Optimization
[lhc/web/wiklou.git] / maintenance / generateSitemap.php
1 <?php
// Pseudo-namespace indexes used only as keys of GenerateSitemap::$priorities:
// they hold the fallback priorities for namespaces that have no entry of
// their own in that table.

// Fallback key for talk namespaces
define( 'GS_TALK', -1 );
// Fallback key for non-talk ("main") namespaces
define( 'GS_MAIN', -2 );
4 /**
5 * Creates a Google sitemap for the site
6 *
7 * @package MediaWiki
8 * @subpackage Maintenance
9 *
10 * @copyright Copyright © 2005, Ævar Arnfjörð Bjarmason
11 * @copyright Copyright © 2005, Jens Frank <jeluf@gmx.de>
12 * @copyright Copyright © 2005, Brion Vibber <brion@pobox.com>
13 *
14 * @link http://www.google.com/webmasters/sitemaps/docs/en/about.html
15 * @link http://www.google.com/schemas/sitemap/0.84/sitemap.xsd
16 *
17 * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
18 */
19
class GenerateSitemap {
	/**
	 * The filesystem path to prepend to every generated filename.
	 * It is concatenated as-is, so it has to end with a directory
	 * separator (e.g. /tmp/sitemap/).
	 *
	 * @var string
	 */
	var $fspath;

	/**
	 * The URL path appended to the server name when building the
	 * locations of the sitemap files listed in the index
	 *
	 * @var string
	 */
	var $path;

	/**
	 * Whether or not to gzip the sitemap files (the index file is
	 * always written uncompressed)
	 *
	 * @var bool
	 */
	var $compress;

	/**
	 * The number of entries to save in each sitemap file; recomputed
	 * for every namespace by generateLimit()
	 *
	 * @var int
	 */
	var $limit;

	/**
	 * The maximum number of URLs a single sitemap file may list
	 *
	 * @var int
	 */
	var $urlLimit = 50000;

	/**
	 * The maximum size of a single sitemap file in bytes,
	 * 10 * 2^20 (10MB)
	 *
	 * @var int
	 */
	var $sizeLimit = 10485760;

	/**
	 * Key => value entries of namespaces and their priorities
	 *
	 * @var array
	 */
	var $priorities = array(
		// Custom main namespaces
		GS_MAIN => '0.5',
		// Custom talk namespaces
		GS_TALK => '0.1',
		// MediaWiki standard namespaces
		NS_MAIN => '1.0',
		NS_TALK => '0.1',
		NS_USER => '0.5',
		NS_USER_TALK => '0.1',
		NS_PROJECT => '0.5',
		NS_PROJECT_TALK => '0.1',
		NS_IMAGE => '0.5',
		NS_IMAGE_TALK => '0.1',
		NS_MEDIAWIKI => '0.0',
		NS_MEDIAWIKI_TALK => '0.1',
		NS_TEMPLATE => '0.0',
		NS_TEMPLATE_TALK => '0.1',
		NS_HELP => '0.5',
		NS_HELP_TALK => '0.1',
		NS_CATEGORY => '0.5',
		NS_CATEGORY_TALK => '0.1',
	);

	/**
	 * A one-dimensional array of the namespaces that contain at least
	 * one page
	 *
	 * @var array
	 */
	var $namespaces = array();

	/**
	 * When this sitemap batch was generated (ISO 8601)
	 *
	 * @var string
	 */
	var $timestamp;

	/**
	 * A database slave object
	 *
	 * @var object
	 */
	var $dbr;

	/**
	 * A resource pointing to the sitemap index file
	 *
	 * @var resource
	 */
	var $findex;

	/**
	 * A resource pointing to the sitemap file currently being written,
	 * or false when none is open
	 *
	 * @var resource
	 */
	var $file;

	/**
	 * A resource pointing to php://stderr
	 *
	 * @var resource
	 */
	var $stderr;

	/**
	 * Constructor
	 *
	 * Opens stderr and the index file, and collects the namespaces and
	 * the batch timestamp.
	 *
	 * @param string $fspath The path to prepend to the filenames, used to
	 *                       save them somewhere else than in the root directory
	 * @param string $path The path to append to the domain name, defaults
	 *                     to $wgScriptPath when not set
	 * @param bool $compress Whether to compress the sitemap files
	 */
	function GenerateSitemap( $fspath, $path, $compress ) {
		global $wgDBname, $wgScriptPath;

		$this->fspath = isset( $fspath ) ? $fspath : '';
		$this->path = isset( $path ) ? $path : $wgScriptPath;
		$this->compress = $compress;

		$this->stderr = fopen( 'php://stderr', 'wt' );
		$this->dbr =& wfGetDB( DB_SLAVE );
		$this->generateNamespaces();
		$this->generateTimestamp();
		$this->findex = fopen( "{$this->fspath}sitemap-index-$wgDBname.xml", 'wb' );
	}

	/**
	 * Generate a one-dimensional array of existing namespaces
	 */
	function generateNamespaces() {
		$fname = 'GenerateSitemap::generateNamespaces';

		$res = $this->dbr->select( 'page',
			array( 'page_namespace' ),
			array(),
			$fname,
			array(
				'GROUP BY' => 'page_namespace',
				'ORDER BY' => 'page_namespace',
			)
		);

		while ( $row = $this->dbr->fetchObject( $res ) ) {
			$this->namespaces[] = $row->page_namespace;
		}
	}

	/**
	 * Get the priority of a given namespace
	 *
	 * @param int $namespace The namespace to get the priority for
	 *
	 * @return string
	 */
	function priority( $namespace ) {
		return isset( $this->priorities[$namespace] )
			? $this->priorities[$namespace]
			: $this->guessPriority( $namespace );
	}

	/**
	 * If the namespace isn't listed on the priority list return the
	 * default priority for the namespace, varies depending on whether it's
	 * a talkpage or not.
	 *
	 * @param int $namespace The namespace to get the priority for
	 *
	 * @return string
	 */
	function guessPriority( $namespace ) {
		// Called through call_user_func() on purpose: `namespace` is a
		// reserved word from PHP 5.3 on, where a literal Namespace::isMain()
		// is a parse error that keeps this whole file from loading.
		$isMain = call_user_func( array( 'Namespace', 'isMain' ), $namespace );

		return $isMain ? $this->priorities[GS_MAIN] : $this->priorities[GS_TALK];
	}

	/**
	 * Return a database resolution of all the pages in a given namespace
	 *
	 * @param int $namespace Limit the query to this namespace
	 *
	 * @return resource
	 */
	function getPageRes( $namespace ) {
		$fname = 'GenerateSitemap::getPageRes';

		return $this->dbr->select( 'page',
			array(
				'page_namespace',
				'page_title',
				'page_is_redirect',
				'page_touched',
			),
			array( 'page_namespace' => $namespace ),
			$fname
		);
	}

	/**
	 * Main loop
	 *
	 * Writes the index file and, per namespace, as many sitemap files
	 * as are needed to keep each of them within $this->limit entries.
	 *
	 * @access public
	 */
	function main() {
		if ( !$this->findex ) {
			$this->fatal( 'Could not open the sitemap index file for writing' );
		}

		fwrite( $this->findex, $this->openIndex() );

		foreach ( $this->namespaces as $namespace ) {
			$res = $this->getPageRes( $namespace );
			$this->file = false;
			$i = $smcount = 0;
			$this->generateLimit( $namespace );
			// The priority only depends on the namespace, look it up once
			$priority = $this->priority( $namespace );

			$this->debug( $namespace );
			while ( $row = $this->dbr->fetchObject( $res ) ) {
				if ( $i++ % $this->limit === 0 ) {
					// Current file is full (or none is open yet): roll over
					$this->finishFile();
					$filename = $this->sitemapFilename( $namespace, $smcount++ );
					$this->file = $this->open( $this->fspath . $filename, 'wb' );
					if ( !$this->file ) {
						$this->fatal( "Could not open {$this->fspath}$filename for writing" );
					}
					$this->write( $this->file, $this->openFile() );
					fwrite( $this->findex, $this->indexEntry( $filename ) );
					$this->debug( "\t$filename" );
				}
				$title = Title::makeTitle( $row->page_namespace, $row->page_title );
				$date = wfTimestamp( TS_ISO_8601, $row->page_touched );
				$this->write( $this->file, $this->fileEntry( $title->getFullURL(), $date, $priority ) );
			}
			$this->finishFile();
		}

		fwrite( $this->findex, $this->closeIndex() );
		fclose( $this->findex );
	}

	/**
	 * Terminate and close the sitemap file currently being written,
	 * does nothing when no file is open
	 */
	function finishFile() {
		if ( $this->file ) {
			$this->write( $this->file, $this->closeFile() );
			$this->close( $this->file );
			$this->file = false;
		}
	}

	/**
	 * gzopen() / fopen() wrapper
	 *
	 * @param string $file The file to open
	 * @param string $flags The mode to open it in
	 *
	 * @return resource
	 */
	function open( $file, $flags ) {
		return $this->compress ? gzopen( $file, $flags ) : fopen( $file, $flags );
	}

	/**
	 * gzwrite() / fwrite() wrapper
	 *
	 * @param resource $handle A handle returned by open()
	 * @param string $str The string to write
	 */
	function write( &$handle, $str ) {
		if ( $this->compress ) {
			gzwrite( $handle, $str );
		} else {
			fwrite( $handle, $str );
		}
	}

	/**
	 * gzclose() / fclose() wrapper
	 *
	 * @param resource $handle A handle returned by open()
	 */
	function close( &$handle ) {
		if ( $this->compress ) {
			gzclose( $handle );
		} else {
			fclose( $handle );
		}
	}

	/**
	 * Get a sitemap filename
	 *
	 * @param int $namespace The namespace
	 * @param int $count The count
	 *
	 * @return string
	 */
	function sitemapFilename( $namespace, $count ) {
		global $wgDBname;

		$ext = $this->compress ? '.gz' : '';

		return "sitemap-$wgDBname-NS_$namespace-$count.xml$ext";
	}

	/**
	 * Return the XML required to open an XML file
	 *
	 * @static
	 *
	 * @return string
	 */
	function xmlHead() {
		return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
	}

	/**
	 * Return the XML schema being used
	 *
	 * @static
	 *
	 * @return string
	 */
	function xmlSchema() {
		return 'http://www.google.com/schemas/sitemap/0.84';
	}

	/**
	 * Return the XML required to open a sitemap index file
	 *
	 * @return string
	 */
	function openIndex() {
		return $this->xmlHead() . '<sitemapindex xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the XML for a single sitemap indexfile entry
	 *
	 * @param string $filename The filename of the sitemap file
	 *
	 * @return string
	 */
	function indexEntry( $filename ) {
		global $wgServer;

		// Entity-escape the location so it cannot break the XML
		$loc = htmlspecialchars( "$wgServer{$this->path}/$filename" );

		return
			"\t<sitemap>\n" .
			"\t\t<loc>$loc</loc>\n" .
			"\t\t<lastmod>{$this->timestamp}</lastmod>\n" .
			"\t</sitemap>\n";
	}

	/**
	 * Return the XML required to close a sitemap index file
	 *
	 * @static
	 *
	 * @return string
	 */
	function closeIndex() {
		return "</sitemapindex>\n";
	}

	/**
	 * Return the XML required to open a sitemap file
	 *
	 * @return string
	 */
	function openFile() {
		return $this->xmlHead() . '<urlset xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the XML for a single sitemap entry
	 *
	 * @static
	 *
	 * @param string $url An RFC 2396 compliant URL, it is entity-escaped here
	 * @param string $date A ISO 8601 date
	 * @param string $priority A priority indicator, 0.0 - 1.0 inclusive with a 0.1 stepsize
	 *
	 * @return string
	 */
	function fileEntry( $url, $date, $priority ) {
		return
			"\t<url>\n" .
			// The URL may contain characters such as ampersands
			"\t\t<loc>" . htmlspecialchars( $url ) . "</loc>\n" .
			"\t\t<lastmod>$date</lastmod>\n" .
			"\t\t<priority>$priority</priority>\n" .
			"\t</url>\n";
	}

	/**
	 * Return the XML required to close sitemap file
	 *
	 * @static
	 * @return string
	 */
	function closeFile() {
		return "</urlset>\n";
	}

	/**
	 * Write a string to stderr followed by a UNIX newline
	 *
	 * @param string $str The message to write
	 */
	function debug( $str ) {
		fwrite( $this->stderr, "$str\n" );
	}

	/**
	 * Write a message to stderr and abort the script with exit status 1
	 *
	 * @param string $str The message to write
	 */
	function fatal( $str ) {
		$this->debug( $str );
		exit( 1 );
	}

	/**
	 * According to the sitemap specification each sitemap must contain no
	 * more than 50,000 urls and no more than 10 * 2^20 bytes (10MB). This
	 * function estimates how many urls fit in each file of the given
	 * namespace and stores the result in $this->limit.
	 *
	 * The estimate is built from a sample entry whose title has the
	 * average title length of the namespace, so it is not a worst-case
	 * bound: a file made of longer than average titles can come out
	 * larger than estimated.
	 *
	 * @param int $namespace The namespace to compute the limit for
	 */
	function generateLimit( $namespace ) {
		$count = (int)$this->getAveragePageLength( $namespace );
		$title = Title::makeTitle( $namespace, str_repeat( 'a', $count ) );

		$overhead = strlen( $this->openFile() ) + strlen( $this->closeFile() );
		$elen = strlen( $this->fileEntry( $title->getFullUrl(), wfTimestamp( TS_ISO_8601, wfTimestamp() ), '1.0' ) );

		$this->limit = $this->entriesPerFile( $overhead, $elen );
	}

	/**
	 * Compute how many entries of a given length fit in one sitemap file
	 * without exceeding either the size limit or the url limit
	 *
	 * @param int $overhead The length in bytes of the opening and closing XML
	 * @param int $entryLength The length in bytes of a single entry
	 *
	 * @return int At least 1, at most $this->urlLimit
	 */
	function entriesPerFile( $overhead, $entryLength ) {
		if ( $entryLength > 0 ) {
			$bySize = (int)floor( ( $this->sizeLimit - $overhead ) / $entryLength );
		} else {
			$bySize = $this->urlLimit;
		}

		// Always allow one entry per file so the main loop makes progress
		return max( 1, min( $this->urlLimit, $bySize ) );
	}

	/**
	 * Get the average length of the page titles in a namespace, rounded
	 * up, as computed by the database
	 *
	 * @param int $namespace The namespace to get the average for
	 *
	 * @return mixed The value of CEIL(AVG(LENGTH(page_title)))
	 */
	function getAveragePageLength( $namespace ) {
		$fname = 'GenerateSitemap::getAveragePageLength';

		return $this->dbr->selectField( 'page',
			'CEIL(AVG(LENGTH(page_title)))',
			array( 'page_namespace' => $namespace ),
			$fname
		);
	}

	/**
	 * Update $this->timestamp to the current time
	 */
	function generateTimestamp() {
		$this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() );
	}
}
445
// Command line entry point: print the usage summary and stop when --help
// appears anywhere among the arguments.
if ( in_array( '--help', $argv ) ) {
	$gsUsage =
		"Usage: php generateSitemap.php [host] [options]\n" .
		"\thost = hostname\n" .
		"\toptions:\n" .
		"\t\t--help\tshow this message\n" .
		"\t\t--fspath\tThe file system path to save to, e.g /tmp/sitemap/\n" .
		"\t\t--path\tThe http path to use, e.g. /wiki\n" .
		"\t\t--compress=[yes|no]\tcompress the sitemap files, default yes\n";
	die( $gsUsage );
}

// A first argument that is not an option is taken as the hostname. It has to
// be in place before commandLine.inc is loaded below.
if ( isset( $argv[1] ) && strncmp( $argv[1], '--', 2 ) !== 0 ) {
	$_SERVER['SERVER_NAME'] = $argv[1];
}

// Options that take a value; read by commandLine.inc, which fills $options
$optionsWithArgs = array( 'fspath', 'path', 'compress' );
require_once 'commandLine.inc';

// Options that were not given are passed on as null, the constructor then
// falls back to its defaults. Compression stays on unless it is explicitly
// turned off with --compress=no.
$gsFspath = isset( $options['fspath'] ) ? $options['fspath'] : null;
$gsPath = isset( $options['path'] ) ? $options['path'] : null;
$gsCompress = !( isset( $options['compress'] ) && $options['compress'] === 'no' );

$gs = new GenerateSitemap( $gsFspath, $gsPath, $gsCompress );
$gs->main();
465 ?>