b8d6a5d6c150500b1c61384993d6a14ca548fd99
[lhc/web/wiklou.git] / maintenance / generateSitemap.php
1 <?php
// Pseudo-namespace keys used in GenerateSitemap::$priorities. They hold the
// fallback priority for namespaces that have no explicit entry in that list:
// GS_MAIN is used when Namespace::isMain() is true for the namespace,
// GS_TALK otherwise (see GenerateSitemap::guessPriority()).
// NOTE(review): the values are negative, presumably so they cannot collide
// with namespace ids found in the page table -- confirm.
2 define( 'GS_MAIN', -2 );
3 define( 'GS_TALK', -1 );
4 /**
5 * Creates a Google sitemap for the site
6 *
7 * @package MediaWiki
8 * @subpackage Maintenance
9 *
10 * @copyright Copyright © 2005, Ævar Arnfjörð Bjarmason
11 * @copyright Copyright © 2005, Jens Frank <jeluf@gmx.de>
12 * @copyright Copyright © 2005, Brion Vibber <brion@pobox.com>
13 *
14 * @link http://www.google.com/webmasters/sitemaps/docs/en/about.html
15 * @link http://www.google.com/schemas/sitemap/0.84/sitemap.xsd
16 *
17 * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
18 */
19
/**
 * Writes, for every namespace found in the page table, a series of sitemap
 * files, and one sitemap index file that lists all of them.
 */
class GenerateSitemap {
	/**
	 * The maximum amount of urls in a sitemap file
	 *
	 * @link http://www.google.com/schemas/sitemap/0.84/sitemap.xsd
	 *
	 * @var int
	 */
	var $url_limit;

	/**
	 * The maximum size of a sitemap file, in bytes
	 *
	 * @link http://www.google.com/webmasters/sitemaps/docs/en/protocol.html#faq_sitemap_size
	 *
	 * @var int
	 */
	var $size_limit;

	/**
	 * The path to prepend to the filename
	 *
	 * @var string
	 */
	var $fspath;

	/**
	 * The path to append to the domain name
	 *
	 * NOTE(review): this value is stored by the constructor but is not read
	 * anywhere in this class.
	 *
	 * @var string
	 */
	var $path;

	/**
	 * Whether or not to use compression
	 *
	 * @var bool
	 */
	var $compress;

	/**
	 * Byte lengths used for the file size check, filled in by
	 * generateLimit():
	 *  [0] length of the XML that opens a sitemap file
	 *  [1] length of one entry built from a deliberately long title
	 *  [2] length of the XML that closes a sitemap file
	 *
	 * @var array
	 */
	var $limit = array();

	/**
	 * Key => value entries of namespaces and their priorities
	 *
	 * @var array
	 */
	var $priorities = array(
		// Custom main namespaces
		GS_MAIN => '0.5',
		// Custom talk namespaces
		GS_TALK => '0.1',
		// MediaWiki standard namespaces
		NS_MAIN => '1.0',
		NS_TALK => '0.1',
		NS_USER => '0.5',
		NS_USER_TALK => '0.1',
		NS_PROJECT => '0.5',
		NS_PROJECT_TALK => '0.1',
		NS_IMAGE => '0.5',
		NS_IMAGE_TALK => '0.1',
		NS_MEDIAWIKI => '0.0',
		NS_MEDIAWIKI_TALK => '0.1',
		NS_TEMPLATE => '0.0',
		NS_TEMPLATE_TALK => '0.1',
		NS_HELP => '0.5',
		NS_HELP_TALK => '0.1',
		NS_CATEGORY => '0.5',
		NS_CATEGORY_TALK => '0.1',
	);

	/**
	 * A one-dimensional array of namespaces in the wiki
	 *
	 * @var array
	 */
	var $namespaces = array();

	/**
	 * When this sitemap batch was generated
	 *
	 * @var string
	 */
	var $timestamp;

	/**
	 * A database slave object
	 *
	 * @var object
	 */
	var $dbr;

	/**
	 * A resource pointing to the sitemap index file
	 *
	 * @var resource
	 */
	var $findex;

	/**
	 * A resource pointing to a sitemap file, or false when none is open
	 *
	 * @var resource
	 */
	var $file;

	/**
	 * A resource pointing to php://stderr
	 *
	 * @var resource
	 */
	var $stderr;

	/**
	 * Constructor
	 *
	 * Besides storing its arguments this collects the list of namespaces
	 * from the database and opens the sitemap index file for writing.
	 *
	 * NOTE(review): this is a PHP 4 style constructor (method named after
	 * the class); newer PHP versions no longer treat it as a constructor.
	 *
	 * @param string $fspath The path to prepend to the filenames, used to
	 *     save them somewhere else than in the root directory
	 * @param string $path The path to append to the domain name
	 * @param bool $compress Whether to compress the sitemap files
	 */
	function GenerateSitemap( $fspath, $path, $compress ) {
		global $wgScriptPath;

		$this->url_limit = 50000;
		$this->size_limit = pow( 2, 20 ) * 10;
		$this->fspath = isset( $fspath ) ? $fspath : '';
		// The prefix is glued directly onto the file name, so an existing
		// directory given without a trailing separator would otherwise
		// produce files named "<dir>sitemap-..." next to that directory.
		if ( $this->fspath !== ''
			&& is_dir( $this->fspath )
			&& !in_array( substr( $this->fspath, -1 ), array( '/', '\\' ) )
		) {
			$this->fspath .= '/';
		}
		$this->path = isset( $path ) ? $path : $wgScriptPath;
		$this->compress = $compress;

		$this->stderr = fopen( 'php://stderr', 'wt' );
		$this->dbr =& wfGetDB( DB_SLAVE );
		$this->generateNamespaces();
		$this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() );
		$this->findex = fopen( "{$this->fspath}sitemap-index-" . wfWikiID() . ".xml", 'wb' );
	}

	/**
	 * Generate a one-dimensional array of existing namespaces
	 *
	 * Appends every distinct page_namespace value of the page table, in
	 * ascending order, to $this->namespaces.
	 */
	function generateNamespaces() {
		$fname = 'GenerateSitemap::generateNamespaces';

		$res = $this->dbr->select( 'page',
			array( 'page_namespace' ),
			array(),
			$fname,
			array(
				'GROUP BY' => 'page_namespace',
				'ORDER BY' => 'page_namespace',
			)
		);

		while ( $row = $this->dbr->fetchObject( $res ) ) {
			$this->namespaces[] = $row->page_namespace;
		}
	}

	/**
	 * Get the priority of a given namespace
	 *
	 * @param int $namespace The namespace to get the priority for
	 *
	 * @return string
	 */
	function priority( $namespace ) {
		return isset( $this->priorities[$namespace] ) ? $this->priorities[$namespace] : $this->guessPriority( $namespace );
	}

	/**
	 * If the namespace isn't listed on the priority list return the
	 * default priority for the namespace, varies depending on whether it's
	 * a talkpage or not.
	 *
	 * NOTE(review): "Namespace" is a reserved word from PHP 5.3 on, so this
	 * call only parses on older PHP; later MediaWiki releases appear to have
	 * renamed the class -- confirm before running on a newer PHP.
	 *
	 * @param int $namespace The namespace to get the priority for
	 *
	 * @return string
	 */
	function guessPriority( $namespace ) {
		return Namespace::isMain( $namespace ) ? $this->priorities[GS_MAIN] : $this->priorities[GS_TALK];
	}

	/**
	 * Return a database result holding namespace, title and touched
	 * timestamp of all the pages in a given namespace
	 *
	 * @param int $namespace Limit the query to this namespace
	 *
	 * @return resource
	 */
	function getPageRes( $namespace ) {
		$fname = 'GenerateSitemap::getPageRes';

		return $this->dbr->select( 'page',
			array(
				'page_namespace',
				'page_title',
				'page_touched',
			),
			array( 'page_namespace' => $namespace ),
			$fname
		);
	}

	/**
	 * Main loop
	 *
	 * For every namespace, writes the pages into numbered sitemap files,
	 * starting a new file whenever the next page would push the current one
	 * over the URL count limit or the size limit, and records every file in
	 * the sitemap index.
	 *
	 * @access public
	 */
	function main() {
		global $wgContLang;

		if ( !$this->findex ) {
			$this->debug( "Unable to open the sitemap index file for writing" );
			return;
		}

		fwrite( $this->findex, $this->openIndex() );

		// The variant codes other than the default one do not change while
		// the script runs, so collect them once.
		$variants = array();
		if ( $wgContLang->hasVariants() ) {
			foreach ( $wgContLang->getVariants() as $vCode ) {
				if ( $vCode != $wgContLang->getCode() ) {
					$variants[] = $vCode;
				}
			}
		}

		foreach ( $this->namespaces as $namespace ) {
			$res = $this->getPageRes( $namespace );
			$this->file = false;
			$this->generateLimit( $namespace );
			$priority = $this->priority( $namespace );
			$length = $this->limit[0];
			// $urlCount is the number of <url> entries in the current file
			$urlCount = $smcount = 0;

			$fns = $wgContLang->getFormattedNsText( $namespace );
			$this->debug( "$namespace ($fns)" );
			while ( $row = $this->dbr->fetchObject( $res ) ) {
				$title = Title::makeTitle( $row->page_namespace, $row->page_title );
				$date = wfTimestamp( TS_ISO_8601, $row->page_touched );

				// Build every entry for this page first (the page itself plus
				// one per language variant) so that the limits below are
				// checked against all URLs the page contributes.
				$entries = array( $this->fileEntry( $title->getFullURL(), $date, $priority ) );
				foreach ( $variants as $vCode ) {
					$entries[] = $this->fileEntry( $title->getFullURL( '', $vCode ), $date, $priority );
				}
				$pageUrls = count( $entries );
				$pageXml = implode( '', $entries );
				// Reserve at least the per-entry estimate, as the check did
				// before, but never less than what will really be written.
				$pageBytes = max( strlen( $pageXml ), $pageUrls * $this->limit[1] );

				if ( $this->file === false
					|| $urlCount + $pageUrls > $this->url_limit
					|| $length + $pageBytes + $this->limit[2] > $this->size_limit
				) {
					if ( $this->file !== false ) {
						$this->write( $this->file, $this->closeFile() );
						$this->close( $this->file );
					}
					$filename = $this->sitemapFilename( $namespace, $smcount++ );
					$this->file = $this->open( $this->fspath . $filename, 'wb' );
					if ( $this->file === false ) {
						// Nothing can be written for this namespace; do not
						// list the file in the index either.
						$this->debug( "Unable to open {$this->fspath}$filename for writing" );
						break;
					}
					$this->write( $this->file, $this->openFile() );
					fwrite( $this->findex, $this->indexEntry( $filename ) );
					$this->debug( "\t$filename" );
					$length = $this->limit[0];
					$urlCount = 0;
				}

				$this->write( $this->file, $pageXml );
				$length += strlen( $pageXml );
				$urlCount += $pageUrls;
			}
			if ( $this->file ) {
				$this->write( $this->file, $this->closeFile() );
				$this->close( $this->file );
			}
		}
		fwrite( $this->findex, $this->closeIndex() );
		fclose( $this->findex );
	}

	/**
	 * gzopen() / fopen() wrapper
	 *
	 * @param string $file The file to open
	 * @param string $flags The mode passed on to gzopen() / fopen()
	 *
	 * @return resource
	 */
	function open( $file, $flags ) {
		return $this->compress ? gzopen( $file, $flags ) : fopen( $file, $flags );
	}

	/**
	 * gzwrite() / fwrite() wrapper
	 *
	 * @param resource $handle A handle returned by open()
	 * @param string $str The string to write
	 */
	function write( &$handle, $str ) {
		if ( $this->compress ) {
			gzwrite( $handle, $str );
		} else {
			fwrite( $handle, $str );
		}
	}

	/**
	 * gzclose() / fclose() wrapper
	 *
	 * @param resource $handle A handle returned by open()
	 */
	function close( &$handle ) {
		if ( $this->compress ) {
			gzclose( $handle );
		} else {
			fclose( $handle );
		}
	}

	/**
	 * Get a sitemap filename
	 *
	 * The name ends in ".gz" when compression is enabled.
	 *
	 * @param int $namespace The namespace
	 * @param int $count The count
	 *
	 * @return string
	 */
	function sitemapFilename( $namespace, $count ) {
		$ext = $this->compress ? '.gz' : '';
		return "sitemap-".wfWikiID()."-NS_$namespace-$count.xml$ext";
	}

	/**
	 * Return the XML required to open an XML file
	 *
	 * @static
	 *
	 * @return string
	 */
	function xmlHead() {
		return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
	}

	/**
	 * Return the XML schema being used
	 *
	 * @static
	 *
	 * @return string
	 */
	function xmlSchema() {
		return 'http://www.google.com/schemas/sitemap/0.84';
	}

	/**
	 * Return the XML required to open a sitemap index file
	 *
	 * @return string
	 */
	function openIndex() {
		return $this->xmlHead() . '<sitemapindex xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the XML for a single sitemap indexfile entry
	 *
	 * NOTE(review): the <loc> value is the bare file name; neither the
	 * server name nor $this->path is prepended -- confirm this is what the
	 * consumers of the index expect.
	 *
	 * @param string $filename The filename of the sitemap file
	 *
	 * @return string
	 */
	function indexEntry( $filename ) {
		// Escape XML special characters so the index stays well-formed
		$loc = htmlspecialchars( $filename, ENT_QUOTES );
		return
			"\t<sitemap>\n" .
			"\t\t<loc>$loc</loc>\n" .
			"\t\t<lastmod>{$this->timestamp}</lastmod>\n" .
			"\t</sitemap>\n";
	}

	/**
	 * Return the XML required to close a sitemap index file
	 *
	 * @static
	 *
	 * @return string
	 */
	function closeIndex() {
		return "</sitemapindex>\n";
	}

	/**
	 * Return the XML required to open a sitemap file
	 *
	 * @return string
	 */
	function openFile() {
		return $this->xmlHead() . '<urlset xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the XML for a single sitemap entry
	 *
	 * @static
	 *
	 * @param string $url An RFC 2396 compliant URL, not yet XML-escaped
	 * @param string $date A ISO 8601 date
	 * @param string $priority A priority indicator, 0.0 - 1.0 inclusive with a 0.1 stepsize
	 *
	 * @return string
	 */
	function fileEntry( $url, $date, $priority ) {
		// A URL may contain "&" (query strings) or other XML special
		// characters; they have to be entity-escaped inside <loc>.
		$loc = htmlspecialchars( $url, ENT_QUOTES );
		return
			"\t<url>\n" .
			"\t\t<loc>$loc</loc>\n" .
			"\t\t<lastmod>$date</lastmod>\n" .
			"\t\t<priority>$priority</priority>\n" .
			"\t</url>\n";
	}

	/**
	 * Return the XML required to close sitemap file
	 *
	 * @static
	 * @return string
	 */
	function closeFile() {
		return "</urlset>\n";
	}

	/**
	 * Write a string to stderr followed by a UNIX newline
	 *
	 * @param string $str The message
	 */
	function debug( $str ) {
		fwrite( $this->stderr, "$str\n" );
	}

	/**
	 * Populate $this->limit
	 *
	 * Stores the byte lengths of the opening XML, of one entry and of the
	 * closing XML. The entry is built from a 255 byte title (63 four-byte
	 * characters followed by one three-byte character), presumably the
	 * longest title a page can have -- confirm.
	 *
	 * @param int $namespace The namespace the sample entry is built for
	 */
	function generateLimit( $namespace ) {
		$title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" );

		$this->limit = array(
			strlen( $this->openFile() ),
			strlen( $this->fileEntry( $title->getFullUrl(), wfTimestamp( TS_ISO_8601, wfTimestamp() ), $this->priority( $namespace ) ) ),
			strlen( $this->closeFile() )
		);
	}
}
449
// Command line entry point: print the usage text on --help, otherwise set
// up the MediaWiki command line environment and generate the sitemaps.
if ( in_array( '--help', $argv ) ) {
	$usageLines = array(
		"Usage: php generateSitemap.php [host] [options]\n",
		"\thost = hostname\n",
		"\toptions:\n",
		"\t\t--help\tshow this message\n",
		"\t\t--fspath\tThe file system path to save to, e.g /tmp/sitemap/\n",
		"\t\t--path\tThe http path to use, e.g. /wiki\n",
		"\t\t--compress=[yes|no]\tcompress the sitemap files, default yes\n",
	);
	echo implode( '', $usageLines );
	die( -1 );
}

// A first argument that is not an option is taken as the host name. It has
// to be in place before commandLine.inc is loaded.
$hostGiven = isset( $argv[1] ) && strpos( $argv[1], '--' ) !== 0;
if ( $hostGiven ) {
	$_SERVER['SERVER_NAME'] = $argv[1];
}

// Options that take a value; read by commandLine.inc, which fills $options
$optionsWithArgs = array( 'fspath', 'path', 'compress' );
require_once 'commandLine.inc';

// Options that were not given are passed on as null
$fspathOption = isset( $options['fspath'] ) ? $options['fspath'] : null;
$pathOption = isset( $options['path'] ) ? $options['path'] : null;
$compressOption = isset( $options['compress'] ) ? $options['compress'] : null;

// Compression stays enabled unless --compress=no was given
$gs = new GenerateSitemap( $fspathOption, $pathOption, $compressOption !== 'no' );
$gs->main();
470 ?>