// Pseudo-namespace keys into GenerateSitemap::$priorities. They select the
// fallback priority used by guessPriority() for a namespace that has no
// explicit entry: GS_MAIN for main namespaces, GS_TALK for everything else.
define( 'GS_TALK', -1 );
define( 'GS_MAIN', -2 );
5 * Creates a Google sitemap for the site
8 * @subpackage Maintenance
10 * @copyright Copyright © 2005, Ævar Arnfjörð Bjarmason
11 * @copyright Copyright © 2005, Jens Frank <jeluf@gmx.de>
12 * @copyright Copyright © 2005, Brion Vibber <brion@pobox.com>
14 * @link http://www.google.com/webmasters/sitemaps/docs/en/about.html
15 * @link http://www.google.com/schemas/sitemap/0.84/sitemap.xsd
17 * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
20 class GenerateSitemap
{
22 * The path to prepend to the filename
29 * The path to append to the domain name
36 * Whether or not to use compression
43 * The number of entries to save in each sitemap file
50 * Key => value entries of namespaces and their priorities
54 var $priorities = array(
55 // Custom main namespaces
// Custom talk namespaces
59 // MediaWiki standard namespaces
63 NS_USER_TALK
=> '0.1',
65 NS_PROJECT_TALK
=> '0.1',
67 NS_IMAGE_TALK
=> '0.1',
68 NS_MEDIAWIKI
=> '0.0',
69 NS_MEDIAWIKI_TALK
=> '0.1',
71 NS_TEMPLATE_TALK
=> '0.1',
73 NS_HELP_TALK
=> '0.1',
75 NS_CATEGORY_TALK
=> '0.1',
79 * A one-dimensional array of namespaces in the wiki
83 var $namespaces = array();
86 * When this sitemap batch was generated
93 * A database slave object
100 * A resource pointing to the sitemap index file
108 * A resource pointing to a sitemap file
115 * A resource pointing to php://stderr
/**
 * Constructor: stores the options, opens the output streams and the
 * database connection, and prepares the namespace list and size limit.
 *
 * @param string $fspath The path to prepend to the filenames, used to
 *                       save them somewhere other than the root directory
 * @param string $path The path to append to the domain name
 * @param bool $compress Whether to compress the sitemap files
 */
function GenerateSitemap( $fspath, $path, $compress ) {
	global $wgDBname, $wgScriptPath;

	// An argument that was not supplied (null) falls back to its default.
	if ( isset( $fspath ) ) {
		$this->fspath = $fspath;
	} else {
		$this->fspath = '';
	}
	if ( isset( $path ) ) {
		$this->path = $path;
	} else {
		$this->path = $wgScriptPath;
	}
	$this->compress = $compress;

	// A single timestamp for the whole batch; indexEntry() emits it as <lastmod>.
	$now = wfTimestampNow();
	$this->startts = wfTimestamp( TS_ISO_8601, $now );

	$this->stderr = fopen( 'php://stderr', 'wt' );
	$this->dbr =& wfGetDB( DB_SLAVE );
	$this->generateNamespaces();
	$this->generateLimit( NS_MAIN );

	$indexPath = "{$this->fspath}sitemap-index-$wgDBname.xml";
	$this->findex = fopen( $indexPath, 'wb' );
}
146 * Generate a one-dimensional array of existing namespaces
148 function generateNamespaces() {
149 $fname = 'GenerateSitemap::generateNamespaces';
151 $res = $this->dbr
->select( 'page',
152 array( 'page_namespace' ),
156 'GROUP BY' => 'page_namespace',
157 'ORDER BY' => 'page_namespace',
161 while ( $row = $this->dbr
->fetchObject( $res ) )
162 $this->namespaces
[] = $row->page_namespace
;
/**
 * Look up the sitemap priority of a namespace.
 *
 * @param int $namespace The namespace to get the priority for
 * @return string The entry from the priority list when one exists,
 *                otherwise whatever guessPriority() returns
 */
function priority( $namespace ) {
	if ( isset( $this->priorities[$namespace] ) ) {
		return $this->priorities[$namespace];
	}

	return $this->guessPriority( $namespace );
}
/**
 * Fallback for namespaces that are not on the priority list: return the
 * GS_MAIN entry when Namespace::isMain() reports a main namespace and
 * the GS_TALK entry otherwise.
 *
 * @param int $namespace The namespace to get the priority for
 * @return string The default priority for that kind of namespace
 */
function guessPriority( $namespace ) {
	$key = Namespace::isMain( $namespace ) ? GS_MAIN : GS_TALK;

	return $this->priorities[$key];
}
191 * Return a database resolution of all the pages in a given namespace
193 * @param int $namespace Limit the query to this namespace
197 function getPageRes( $namespace ) {
198 $fname = 'GenerateSitemap::getPageRes';
200 return $this->dbr
->select( 'page',
207 array( 'page_namespace' => $namespace ),
220 fwrite( $this->findex
, $this->openIndex() );
222 foreach ( $this->namespaces
as $namespace ) {
223 $res = $this->getPageRes( $namespace );
227 $this->debug( $namespace );
228 while ( $row = $this->dbr
->fetchObject( $res ) ) {
229 if ( $i %
$this->limit
=== 0 ) {
230 if ( $this->file
!== false ) {
231 $this->write( $this->file
, $this->closeFile() );
232 $this->close( $this->file
);
234 $this->generateLimit( $namespace );
235 $filename = $this->sitemapFilename( $namespace, $smcount++
);
236 $this->file
= $this->open( $this->fspath
. $filename, 'wb' );
237 $this->write( $this->file
, $this->openFile() );
238 fwrite( $this->findex
, $this->indexEntry( $filename ) );
239 $this->debug( "\t$filename" );
242 $title = Title
::makeTitle( $row->page_namespace
, $row->page_title
);
243 $date = wfTimestamp( TS_ISO_8601
, $row->page_touched
);
244 $this->write( $this->file
, $this->fileEntry( $title->getFullURL(), $date, $this->priority( $namespace ) ) );
247 $this->write( $this->file
, $this->closeFile() );
248 $this->close( $this->file
);
251 fwrite( $this->findex
, $this->closeIndex() );
252 fclose( $this->findex
);
/**
 * gzopen() / fopen() wrapper: opens the file with gzopen() when
 * compression is enabled and with fopen() otherwise.
 *
 * @param string $file The path of the file to open
 * @param string $flags The mode string handed to gzopen() / fopen()
 * @return resource|false Whatever gzopen() / fopen() returned
 */
function open( $file, $flags ) {
	if ( $this->compress ) {
		return gzopen( $file, $flags );
	}

	return fopen( $file, $flags );
}
/**
 * gzwrite() / fwrite() wrapper: writes the string with exactly one of the
 * two functions, chosen by the compression setting the handle was opened
 * with in open().
 *
 * @param resource $handle A handle returned by open()
 * @param string $str The data to write
 */
function write( &$handle, $str ) {
	if ( $this->compress ) {
		gzwrite( $handle, $str );
	} else {
		// Braced else: the plain write must never run for a gz handle too.
		fwrite( $handle, $str );
	}
}
/**
 * gzclose() / fclose() wrapper: closes the handle with the function that
 * matches the way open() created it.
 *
 * @param resource $handle A handle returned by open()
 */
function close( &$handle ) {
	if ( $this->compress ) {
		gzclose( $handle );
	} else {
		fclose( $handle );
	}
}
/**
 * Get a sitemap filename of the form
 * sitemap-<dbname>-NS_<namespace>-<count>.xml, with a .gz suffix when
 * compression is enabled.
 *
 * @param int $namespace The namespace
 * @param int $count The count
 * @return string The filename, without any directory part
 */
function sitemapFilename( $namespace, $count ) {
	// Must be declared global, otherwise the name interpolates as empty.
	global $wgDBname;

	$ext = $this->compress ? '.gz' : '';

	return "sitemap-$wgDBname-NS_$namespace-$count.xml$ext";
}
/**
 * Return the XML required to open an XML file
 *
 * @return string The XML declaration followed by a newline
 */
function xmlHead() {
	return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
}
/**
 * Return the XML schema being used
 *
 * @return string The namespace URI placed in the xmlns attribute of the
 *                root elements built by openIndex() and openFile()
 */
function xmlSchema() {
	$schema = 'http://www.google.com/schemas/sitemap/0.84';

	return $schema;
}
/**
 * Return the XML required to open a sitemap index file: the XML
 * declaration followed by the opening <sitemapindex> tag.
 *
 * @return string
 */
function openIndex() {
	$head = $this->xmlHead();
	$schema = $this->xmlSchema();

	return $head . '<sitemapindex xmlns="' . $schema . '">' . "\n";
}
/**
 * Return the XML for a single sitemap indexfile entry: a <sitemap>
 * element holding the public URL of the sitemap file and the timestamp
 * of this generation run.
 *
 * @param string $filename The filename of the sitemap file
 * @return string
 */
function indexEntry( $filename ) {
	// Must be declared global, otherwise the server part of the URL is empty.
	global $wgServer;

	return
		"\t<sitemap>\n" .
		"\t\t<loc>$wgServer{$this->path}/$filename</loc>\n" .
		"\t\t<lastmod>{$this->startts}</lastmod>\n" .
		"\t</sitemap>\n";
}
/**
 * Return the XML required to close a sitemap index file
 *
 * @return string The closing </sitemapindex> tag followed by a newline
 */
function closeIndex() {
	$closingTag = "</sitemapindex>\n";

	return $closingTag;
}
/**
 * Return the XML required to open a sitemap file: the XML declaration
 * followed by the opening <urlset> tag.
 *
 * @return string
 */
function openFile() {
	$head = $this->xmlHead();
	$schema = $this->xmlSchema();

	return $head . '<urlset xmlns="' . $schema . '">' . "\n";
}
/**
 * Return the XML for a single sitemap entry
 *
 * The URL is entity-escaped before it is embedded, because a raw "&" or
 * "<" inside <loc> would make the sitemap malformed XML.
 *
 * @param string $url An RFC 2396 compliant URL
 * @param string $date An ISO 8601 date
 * @param string $priority A priority indicator, 0.0 - 1.0 inclusive with a 0.1 stepsize
 * @return string The complete <url> element
 */
function fileEntry( $url, $date, $priority ) {
	// ENT_NOQUOTES: only &, < and > need escaping inside element content.
	$loc = htmlspecialchars( $url, ENT_NOQUOTES );

	return
		"\t<url>\n" .
		"\t\t<loc>$loc</loc>\n" .
		"\t\t<lastmod>$date</lastmod>\n" .
		"\t\t<priority>$priority</priority>\n" .
		"\t</url>\n";
}
/**
 * Return the XML required to close a sitemap file
 *
 * @return string The closing </urlset> tag followed by a newline
 */
function closeFile() {
	$closingTag = "</urlset>\n";

	return $closingTag;
}
/**
 * Write a string to stderr followed by a UNIX newline
 *
 * @param string $str The message to write
 */
function debug( $str ) {
	$line = "$str\n";

	fwrite( $this->stderr, $line );
}
 * According to the sitemap specification each sitemap must contain no
 * more than 50,000 urls and be no larger than 10MB (10 * 2^20 bytes); this
 * function calculates how many urls we can have in each file against a
 * budget of 2^20 bytes (1MB), assuming that we have the worst case of 63
 * four byte characters and 1 three byte character in the title
 * (63*4+1*3 = 255)
416 function generateLimit( $namespace ) {
417 $title = Title
::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" );
419 $olen = strlen( $this->openFile() );
420 $elen = strlen( $this->fileEntry( $title->getFullUrl(), wfTimestamp( TS_ISO_8601
, wfTimestamp() ), '1.0' ) );
421 $clen = strlen( $this->closeFile() );
423 for ( $i = 1, $etot = $elen; ( $olen +
$clen +
$etot +
$elen ) <= pow( 2, 20 ); ++
$i )
430 if ( in_array( '--help', $argv ) )
432 "Usage: php generateSitemap.php [host] [options]\n" .
433 "\thost = hostname\n" .
435 "\t\t--help\tshow this message\n" .
436 "\t\t--fspath\tThe file system path to save to, e.g /tmp/sitemap/\n" .
437 "\t\t--path\tThe http path to use, e.g. /wiki\n" .
438 "\t\t--compress=[yes|no]\tcompress the sitemap files, default yes\n"
441 if ( isset( $argv[1] ) && strpos( $argv[1], '--' ) !== 0 )
442 $_SERVER['SERVER_NAME'] = $argv[1];
444 $optionsWithArgs = array( 'fspath', 'path', 'compress' );
445 require_once 'commandLine.inc';
447 $gs = new GenerateSitemap( @$options['fspath'], @$options['path'], @$options['compress'] !== 'no' );