Improve RELEASE-NOTES wording from r52082
[lhc/web/wiklou.git] / maintenance / generateSitemap.php
1 <?php
// Pseudo-namespace keys used in GenerateSitemap::$priorities. They hold the
// fallback priorities that GenerateSitemap::guessPriority() returns for
// namespaces without an explicit entry: GS_TALK for custom talk namespaces,
// GS_MAIN for custom main namespaces.
define( 'GS_TALK', -1 );
define( 'GS_MAIN', -2 );
4 /**
5 * Creates a sitemap for the site
6 *
7 * @ingroup Maintenance
8 *
9 * @copyright Copyright © 2005, Ævar Arnfjörð Bjarmason
10 * @copyright Copyright © 2005, Jens Frank <jeluf@gmx.de>
11 * @copyright Copyright © 2005, Brion Vibber <brion@pobox.com>
12 *
13 * @see http://www.sitemaps.org/
14 * @see http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd
15 *
16 * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
17 */
18
class GenerateSitemap {
	/**
	 * The maximum amount of urls in a sitemap file
	 *
	 * @link http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd
	 *
	 * @var int
	 */
	public $url_limit;

	/**
	 * The maximum size of a sitemap file, in bytes
	 *
	 * @link http://www.sitemaps.org/faq.php#faq_sitemap_size
	 *
	 * @var int
	 */
	public $size_limit;

	/**
	 * The path to prepend to the filename
	 *
	 * @var string
	 */
	public $fspath;

	/**
	 * The path to append to the domain name
	 *
	 * Not assigned anywhere in this class; kept so existing readers of the
	 * property keep working.
	 *
	 * @var string
	 */
	public $path;

	/**
	 * Whether or not to use compression
	 *
	 * @var bool
	 */
	public $compress;

	/**
	 * Byte lengths used to estimate the size of the current sitemap file,
	 * populated by generateLimit():
	 *  [0] the opening XML, [1] one worst-case <url> entry, [2] the closing XML
	 *
	 * @var array
	 */
	public $limit = array();

	/**
	 * Key => value entries of namespaces and their priorities
	 *
	 * @var array
	 */
	public $priorities = array(
		// Custom main namespaces
		GS_MAIN => '0.5',
		// Custom talk namespaces
		GS_TALK => '0.1',
		// MediaWiki standard namespaces
		NS_MAIN => '1.0',
		NS_TALK => '0.1',
		NS_USER => '0.5',
		NS_USER_TALK => '0.1',
		NS_PROJECT => '0.5',
		NS_PROJECT_TALK => '0.1',
		NS_FILE => '0.5',
		NS_FILE_TALK => '0.1',
		NS_MEDIAWIKI => '0.0',
		NS_MEDIAWIKI_TALK => '0.1',
		NS_TEMPLATE => '0.0',
		NS_TEMPLATE_TALK => '0.1',
		NS_HELP => '0.5',
		NS_HELP_TALK => '0.1',
		NS_CATEGORY => '0.5',
		NS_CATEGORY_TALK => '0.1',
	);

	/**
	 * A one-dimensional array of namespaces in the wiki
	 *
	 * @var array
	 */
	public $namespaces = array();

	/**
	 * When this sitemap batch was generated
	 *
	 * @var string
	 */
	public $timestamp;

	/**
	 * A database slave object
	 *
	 * @var object
	 */
	public $dbr;

	/**
	 * A resource pointing to the sitemap index file
	 *
	 * @var resource
	 */
	public $findex;

	/**
	 * A resource pointing to a sitemap file, or false when none is open
	 *
	 * @var resource
	 */
	public $file;

	/**
	 * A resource pointing to php://stderr
	 *
	 * @var resource
	 */
	public $stderr;

	/**
	 * Constructor
	 *
	 * Uses __construct rather than a method named after the class, because
	 * class-named methods are no longer treated as constructors by PHP 8.
	 *
	 * @param string $fspath The path to prepend to the filenames, used to
	 *     save them somewhere else than in the root directory
	 * @param bool $compress Whether to compress the sitemap files
	 */
	function __construct( $fspath, $compress ) {
		global $wgScriptPath;

		$this->url_limit = 50000;
		$this->size_limit = pow( 2, 20 ) * 10;
		$this->fspath = self::init_path( $fspath );

		$this->compress = $compress;

		$this->stderr = fopen( 'php://stderr', 'wt' );
		$this->dbr = wfGetDB( DB_SLAVE );
		$this->generateNamespaces();
		$this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() );

		$indexPath = "{$this->fspath}sitemap-index-" . wfWikiID() . ".xml";
		$this->findex = fopen( $indexPath, 'wb' );
		if ( $this->findex === false ) {
			// Without an index handle every later fwrite() would fail.
			die( "Can not open sitemap index file $indexPath for writing.\n" );
		}
	}

	/**
	 * Create directory if it does not exist and return pathname with a trailing slash
	 *
	 * @param string|null $fspath Directory to save to, or null for none
	 *
	 * @return string|null Resolved path ending in a directory separator, or
	 *     null when no path was given
	 */
	private static function init_path( $fspath ) {
		if ( $fspath === null ) {
			return null;
		}
		# Create directory (and any missing parents) if needed
		if ( $fspath && !is_dir( $fspath ) ) {
			mkdir( $fspath, 0755, true ) or die( "Can not create directory $fspath.\n" );
		}

		return realpath( $fspath ) . DIRECTORY_SEPARATOR;
	}

	/**
	 * Generate a one-dimensional array of existing namespaces
	 *
	 * Uses $wgSitemapNamespaces verbatim when it is an array; otherwise
	 * collects the distinct page_namespace values from the page table.
	 */
	function generateNamespaces() {
		// Only generate for specific namespaces if $wgSitemapNamespaces is an array.
		global $wgSitemapNamespaces;
		if ( is_array( $wgSitemapNamespaces ) ) {
			$this->namespaces = $wgSitemapNamespaces;
			return;
		}

		$res = $this->dbr->select( 'page',
			array( 'page_namespace' ),
			array(),
			__METHOD__,
			array(
				'GROUP BY' => 'page_namespace',
				'ORDER BY' => 'page_namespace',
			)
		);

		while ( $row = $this->dbr->fetchObject( $res ) ) {
			$this->namespaces[] = $row->page_namespace;
		}
	}

	/**
	 * Get the priority of a given namespace
	 *
	 * @param int $namespace The namespace to get the priority for
	 *
	 * @return string
	 */
	function priority( $namespace ) {
		return isset( $this->priorities[$namespace] )
			? $this->priorities[$namespace]
			: $this->guessPriority( $namespace );
	}

	/**
	 * If the namespace isn't listed on the priority list return the
	 * default priority for the namespace, varies depending on whether it's
	 * a talkpage or not.
	 *
	 * @param int $namespace The namespace to get the priority for
	 *
	 * @return string
	 */
	function guessPriority( $namespace ) {
		return MWNamespace::isMain( $namespace )
			? $this->priorities[GS_MAIN]
			: $this->priorities[GS_TALK];
	}

	/**
	 * Return a database result of all the pages in a given namespace
	 *
	 * @param int $namespace Limit the query to this namespace
	 *
	 * @return resource
	 */
	function getPageRes( $namespace ) {
		return $this->dbr->select( 'page',
			array(
				'page_namespace',
				'page_title',
				'page_touched',
			),
			array( 'page_namespace' => $namespace ),
			__METHOD__
		);
	}

	/**
	 * Main loop
	 *
	 * Writes the sitemap index and, per namespace, one or more sitemap files.
	 * A new sitemap file is started whenever adding the next page (including
	 * its language-variant URLs) would exceed the URL count or size limit.
	 *
	 * @access public
	 */
	function main() {
		global $wgContLang;

		fwrite( $this->findex, $this->openIndex() );

		// Every page gets one URL plus one per non-default language variant.
		// Collect the variant codes once so the limits below can account for them.
		$variantCodes = array();
		if ( $wgContLang->hasVariants() ) {
			$defaultCode = $wgContLang->getCode();
			foreach ( $wgContLang->getVariants() as $vCode ) {
				if ( $vCode != $defaultCode ) {
					$variantCodes[] = $vCode;
				}
			}
		}
		$urlsPerPage = 1 + count( $variantCodes );

		foreach ( $this->namespaces as $namespace ) {
			$res = $this->getPageRes( $namespace );
			$this->file = false;
			$this->generateLimit( $namespace );
			$length = $this->limit[0];
			$urlCount = $smcount = 0;

			$fns = $wgContLang->getFormattedNsText( $namespace );
			$this->debug( "$namespace ($fns)" );
			while ( $row = $this->dbr->fetchObject( $res ) ) {
				// Start a new file for the first row, or when this page's URLs
				// would push the current file over the URL or byte limit.
				// NOTE(review): limit[1] is measured on a non-variant URL, so it
				// is only an estimate for variant entries.
				$needNewFile = $this->file === false
					|| $urlCount + $urlsPerPage > $this->url_limit
					|| $length + $urlsPerPage * $this->limit[1] + $this->limit[2] > $this->size_limit;
				if ( $needNewFile ) {
					if ( $this->file !== false ) {
						$this->write( $this->file, $this->closeFile() );
						$this->close( $this->file );
					}
					$filename = $this->sitemapFilename( $namespace, $smcount++ );
					$this->file = $this->open( $this->fspath . $filename, 'wb' );
					if ( $this->file === false ) {
						die( "Can not open sitemap file {$this->fspath}$filename for writing.\n" );
					}
					$this->write( $this->file, $this->openFile() );
					fwrite( $this->findex, $this->indexEntry( $filename ) );
					$this->debug( "\t$this->fspath$filename" );
					$length = $this->limit[0];
					$urlCount = 0;
				}
				$title = Title::makeTitle( $row->page_namespace, $row->page_title );
				$date = wfTimestamp( TS_ISO_8601, $row->page_touched );
				$entry = $this->fileEntry( $title->getFullURL(), $date, $this->priority( $namespace ) );
				$length += strlen( $entry );
				$this->write( $this->file, $entry );
				// generate pages for language variants
				foreach ( $variantCodes as $vCode ) {
					$entry = $this->fileEntry( $title->getFullURL( '', $vCode ), $date, $this->priority( $namespace ) );
					$length += strlen( $entry );
					$this->write( $this->file, $entry );
				}
				$urlCount += $urlsPerPage;
			}
			if ( $this->file ) {
				$this->write( $this->file, $this->closeFile() );
				$this->close( $this->file );
			}
		}
		fwrite( $this->findex, $this->closeIndex() );
		fclose( $this->findex );
	}

	/**
	 * gzopen() / fopen() wrapper
	 *
	 * @param string $file Path of the file to open
	 * @param string $flags Mode string passed to gzopen() / fopen()
	 *
	 * @return resource
	 */
	function open( $file, $flags ) {
		return $this->compress ? gzopen( $file, $flags ) : fopen( $file, $flags );
	}

	/**
	 * gzwrite() / fwrite() wrapper
	 *
	 * @param resource $handle Handle returned by open()
	 * @param string $str Data to write
	 */
	function write( &$handle, $str ) {
		if ( $this->compress ) {
			gzwrite( $handle, $str );
		} else {
			fwrite( $handle, $str );
		}
	}

	/**
	 * gzclose() / fclose() wrapper
	 *
	 * @param resource $handle Handle returned by open()
	 */
	function close( &$handle ) {
		if ( $this->compress ) {
			gzclose( $handle );
		} else {
			fclose( $handle );
		}
	}

	/**
	 * Get a sitemap filename
	 *
	 * @param int $namespace The namespace
	 * @param int $count The count
	 *
	 * @return string
	 */
	function sitemapFilename( $namespace, $count ) {
		$ext = $this->compress ? '.gz' : '';
		return "sitemap-" . wfWikiID() . "-NS_$namespace-$count.xml$ext";
	}

	/**
	 * Return the XML required to open an XML file
	 *
	 * @return string
	 */
	function xmlHead() {
		return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
	}

	/**
	 * Return the XML schema being used
	 *
	 * @return string
	 */
	function xmlSchema() {
		return 'http://www.sitemaps.org/schemas/sitemap/0.9';
	}

	/**
	 * Return the XML required to open a sitemap index file
	 *
	 * @return string
	 */
	function openIndex() {
		return $this->xmlHead() . '<sitemapindex xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the XML for a single sitemap indexfile entry
	 *
	 * @param string $filename The filename of the sitemap file
	 *
	 * @return string
	 */
	function indexEntry( $filename ) {
		return
			"\t<sitemap>\n" .
			// Escape XML special characters so the index stays well-formed.
			"\t\t<loc>" . htmlspecialchars( $filename ) . "</loc>\n" .
			"\t\t<lastmod>{$this->timestamp}</lastmod>\n" .
			"\t</sitemap>\n";
	}

	/**
	 * Return the XML required to close a sitemap index file
	 *
	 * @return string
	 */
	function closeIndex() {
		return "</sitemapindex>\n";
	}

	/**
	 * Return the XML required to open a sitemap file
	 *
	 * @return string
	 */
	function openFile() {
		return $this->xmlHead() . '<urlset xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the XML for a single sitemap entry
	 *
	 * @param string $url An RFC 2396 compliant URL
	 * @param string $date A ISO 8601 date
	 * @param string $priority A priority indicator, 0.0 - 1.0 inclusive with a 0.1 stepsize
	 *
	 * @return string
	 */
	function fileEntry( $url, $date, $priority ) {
		return
			"\t<url>\n" .
			// $url may contain characters such as ampersands (query strings),
			// which must be entity-escaped inside XML.
			"\t\t<loc>" . htmlspecialchars( $url ) . "</loc>\n" .
			"\t\t<lastmod>$date</lastmod>\n" .
			"\t\t<priority>$priority</priority>\n" .
			"\t</url>\n";
	}

	/**
	 * Return the XML required to close sitemap file
	 *
	 * @return string
	 */
	function closeFile() {
		return "</urlset>\n";
	}

	/**
	 * Write a string to stderr followed by a UNIX newline
	 *
	 * @param string $str The message to write
	 */
	function debug( $str ) {
		fwrite( $this->stderr, "$str\n" );
	}

	/**
	 * Populate $this->limit
	 *
	 * Measures the opening XML, one entry built from a long multi-byte
	 * title in the given namespace, and the closing XML.
	 *
	 * @param int $namespace The namespace the entry length is measured for
	 */
	function generateLimit( $namespace ) {
		$title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" );

		$this->limit = array(
			strlen( $this->openFile() ),
			strlen( $this->fileEntry( $title->getFullUrl(), wfTimestamp( TS_ISO_8601, wfTimestamp() ), $this->priority( $namespace ) ) ),
			strlen( $this->closeFile() )
		);
	}
}
472
// Print the usage text and stop when --help is among the arguments.
if ( in_array( '--help', $argv ) ) {
	echo <<<USAGE
Usage: php generateSitemap.php [options]
--help show this message

--fspath=<path> The file system path to save to, e.g /tmp/sitemap
Saves to current directory if not given.

--server=<server> The protocol and server name to use in URLs, e.g.
http://en.wikipedia.org. This is sometimes necessary because
server name detection may fail in command line scripts.

--compress=[yes|no] compress the sitemap files, default yes

USAGE;
	die( -1 );
}

// Options that take a value; presumably read by commandLine.inc when it
// fills $options (verify against commandLine.inc).
$optionsWithArgs = array( 'fspath', 'server', 'compress' );
require_once( dirname( __FILE__ ) . '/commandLine.inc' );

// Let the caller override the server name used in generated URLs.
if ( isset( $options['server'] ) ) {
	$wgServer = $options['server'];
}

// fspath defaults to null when not given; compression stays on unless
// --compress=no was passed.
$gs = new GenerateSitemap(
	isset( $options['fspath'] ) ? $options['fspath'] : null,
	!( isset( $options['compress'] ) && $options['compress'] === 'no' )
);
$gs->main();
500