// Pseudo-namespace keys into GenerateSitemap::$priorities. guessPriority()
// falls back to the GS_MAIN entry when Namespace::isMain() is true for a
// namespace and to the GS_TALK entry otherwise.
define( 'GS_TALK', -1 );
define( 'GS_MAIN', -2 );
5 * Creates a Google sitemap for the site
8 * @subpackage Maintenance
10 * @copyright Copyright © 2005, Ævar Arnfjörð Bjarmason
11 * @copyright Copyright © 2005, Jens Frank <jeluf@gmx.de>
12 * @copyright Copyright © 2005, Brion Vibber <brion@pobox.com>
14 * @link http://www.google.com/webmasters/sitemaps/docs/en/about.html
15 * @link http://www.google.com/schemas/sitemap/0.84/sitemap.xsd
17 * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
20 class GenerateSitemap
{
22 * The path to prepend to the filename
29 * The path to append to the domain name
36 * Whether or not to use compression
43 * The number of entries to save in each sitemap file
50 * Key => value entries of namespaces and their priorities
// Maps a namespace index to the priority string that priority() returns for it.
// NOTE(review): the entries that should follow several of the comment lines
// below (and the closing of the array) are not visible here — confirm against
// the full file before editing.
54 var $priorities = array(
55 // Custom main namespaces
57 // Custom talk namespaces
59 // MediaWiki standard namespaces
63 NS_USER_TALK
=> '0.1',
65 NS_PROJECT_TALK
=> '0.1',
67 NS_IMAGE_TALK
=> '0.1',
68 NS_MEDIAWIKI
=> '0.0',
69 NS_MEDIAWIKI_TALK
=> '0.1',
71 NS_TEMPLATE_TALK
=> '0.1',
73 NS_HELP_TALK
=> '0.1',
75 NS_CATEGORY_TALK
=> '0.1',
79 * A one-dimensional array of namespaces in the wiki
// Starts empty; generateNamespaces() appends one page_namespace value per result row.
83 var $namespaces = array();
86 * When this sitemap batch was generated
93 * A database slave object
100 * A resource pointing to the sitemap index file
108 * A resource pointing to a sitemap file
115 * A resource pointing to php://stderr
/**
 * Constructor
 *
 * Stores the output settings, opens php://stderr and a DB_SLAVE database
 * handle, collects the namespaces and the batch timestamp, and finally
 * opens the sitemap index file for writing.
 *
 * @param string $fspath The path to prepend to the filenames, used to
 *     save them somewhere else than in the root directory; '' when unset
 * @param string $path The path to append to the domain name;
 *     $wgScriptPath when unset
 * @param bool $compress Whether to compress the sitemap files
 */
function GenerateSitemap( $fspath, $path, $compress ) {
	global $wgDBname, $wgScriptPath;

	// Options that were not supplied (null) fall back to their defaults.
	if ( isset( $fspath ) ) {
		$this->fspath = $fspath;
	} else {
		$this->fspath = '';
	}

	if ( isset( $path ) ) {
		$this->path = $path;
	} else {
		$this->path = $wgScriptPath;
	}

	$this->compress = $compress;

	$this->stderr = fopen( 'php://stderr', 'wt' );
	$this->dbr =& wfGetDB( DB_SLAVE );

	// Both helpers below fill in state used while writing the index.
	$this->generateNamespaces();
	$this->generateTimestamp();

	$indexFile = "{$this->fspath}sitemap-index-$wgDBname.xml";
	$this->findex = fopen( $indexFile, 'wb' );
}
/**
 * Generate a one-dimensional array of existing namespaces
 *
 * Selects page_namespace from the page table, grouped and ordered by
 * page_namespace, and appends every returned value to $this->namespaces.
 */
function generateNamespaces() {
	$fname = 'GenerateSitemap::generateNamespaces';

	// NOTE(review): the empty condition list and the $fname argument were
	// not visible in the reviewed listing and are restored here — confirm.
	$res = $this->dbr->select( 'page',
		array( 'page_namespace' ),
		array(),
		$fname,
		array(
			'GROUP BY' => 'page_namespace',
			'ORDER BY' => 'page_namespace',
		)
	);

	while ( $row = $this->dbr->fetchObject( $res ) ) {
		$this->namespaces[] = $row->page_namespace;
	}
}
/**
 * Get the priority of a given namespace
 *
 * @param int $namespace The namespace to get the priority for
 * @return string The entry from $this->priorities when one exists for the
 *     namespace, otherwise whatever guessPriority() returns
 */
function priority( $namespace ) {
	if ( isset( $this->priorities[$namespace] ) ) {
		return $this->priorities[$namespace];
	}

	// No explicit entry: fall back to the main/talk default.
	return $this->guessPriority( $namespace );
}
/**
 * Default priority for a namespace that is not on the priority list
 *
 * Returns the GS_MAIN entry of $this->priorities when
 * Namespace::isMain() is true for the namespace, the GS_TALK entry
 * otherwise.
 *
 * NOTE(review): `namespace` is a reserved word from PHP 5.3 on, so this
 * static call only parses on older interpreters — confirm the target
 * PHP version.
 *
 * @param int $namespace The namespace to get the priority for
 * @return string
 */
function guessPriority( $namespace ) {
	if ( Namespace::isMain( $namespace ) ) {
		$key = GS_MAIN;
	} else {
		$key = GS_TALK;
	}

	return $this->priorities[$key];
}
/**
 * Return a database result of all the pages in a given namespace
 *
 * @param int $namespace Limit the query to this namespace
 * @return mixed Whatever $this->dbr->select() returns; the caller feeds
 *     it to $this->dbr->fetchObject()
 */
function getPageRes( $namespace ) {
	$fname = 'GenerateSitemap::getPageRes';

	// NOTE(review): the column list and the $fname argument were not
	// visible in the reviewed listing. The columns are the ones the
	// sitemap-writing loop reads from each row (page_namespace,
	// page_title, page_touched) — confirm.
	return $this->dbr->select( 'page',
		array(
			'page_namespace',
			'page_title',
			'page_touched',
		),
		array( 'page_namespace' => $namespace ),
		$fname
	);
}
// NOTE(review): the header of the enclosing method and the statements that
// initialise $i, $smcount and $this->file are not visible here — confirm
// against the full file before changing this loop.
// Start the sitemap index document (always written with plain fwrite()).
218 fwrite( $this->findex
, $this->openIndex() );
// One pass per namespace collected by generateNamespaces().
220 foreach ( $this->namespaces
as $namespace ) {
221 $res = $this->getPageRes( $namespace );
// Recompute the per-file entry limit for this namespace.
224 $this->generateLimit( $namespace );
226 $this->debug( $namespace );
227 while ( $row = $this->dbr
->fetchObject( $res ) ) {
// Every $this->limit rows: finish the current sitemap file (if one is open)
// and start the next one, recording it in the index.
228 if ( $i++ %
$this->limit
=== 0 ) {
229 if ( $this->file
!== false ) {
230 $this->write( $this->file
, $this->closeFile() );
231 $this->close( $this->file
);
233 $filename = $this->sitemapFilename( $namespace, $smcount++
);
234 $this->file
= $this->open( $this->fspath
. $filename, 'wb' );
235 $this->write( $this->file
, $this->openFile() );
236 fwrite( $this->findex
, $this->indexEntry( $filename ) );
237 $this->debug( "\t$filename" );
// Emit one entry for the current page row.
239 $title = Title
::makeTitle( $row->page_namespace
, $row->page_title
);
240 $date = wfTimestamp( TS_ISO_8601
, $row->page_touched
);
241 $this->write( $this->file
, $this->fileEntry( $title->getFullURL(), $date, $this->priority( $namespace ) ) );
// Finish the last sitemap file.
244 $this->write( $this->file
, $this->closeFile() );
245 $this->close( $this->file
);
// Finish and close the sitemap index.
248 fwrite( $this->findex
, $this->closeIndex() );
249 fclose( $this->findex
);
/**
 * gzopen() / fopen() wrapper
 *
 * @param string $file Path handed to gzopen() or fopen()
 * @param string $flags Mode string handed to gzopen() or fopen()
 * @return resource|false The result of gzopen() when $this->compress is
 *     truthy, of fopen() otherwise
 */
function open( $file, $flags ) {
	if ( $this->compress ) {
		return gzopen( $file, $flags );
	}

	return fopen( $file, $flags );
}
/**
 * gzwrite() / fwrite() wrapper
 *
 * Exactly one of the two writers is used, chosen by $this->compress, so
 * a string is never written twice to the same handle.
 *
 * @param resource $handle Handle obtained from open()
 * @param string $str The data to write
 */
function write( &$handle, $str ) {
	if ( $this->compress ) {
		gzwrite( $handle, $str );
	} else {
		fwrite( $handle, $str );
	}
}
/**
 * gzclose() / fclose() wrapper
 *
 * NOTE(review): only the `if ( $this->compress )` line was visible in
 * the reviewed listing; both branches are restored from this method's
 * description and mirror write() — confirm.
 *
 * @param resource $handle Handle obtained from open()
 */
function close( &$handle ) {
	if ( $this->compress ) {
		gzclose( $handle );
	} else {
		fclose( $handle );
	}
}
/**
 * Get a sitemap filename
 *
 * @param int $namespace The namespace
 * @param int $count The count
 * @return string "sitemap-<wgDBname>-NS_<namespace>-<count>.xml", with
 *     ".gz" appended when $this->compress is truthy
 */
function sitemapFilename( $namespace, $count ) {
	// $wgDBname must be imported explicitly; without this declaration the
	// interpolation below would read an undefined local variable.
	global $wgDBname;

	$ext = $this->compress ? '.gz' : '';

	return "sitemap-$wgDBname-NS_$namespace-$count.xml$ext";
}
/**
 * Return the XML required to open an XML file
 *
 * NOTE(review): the function header was not visible in the reviewed
 * listing; the name is taken from the $this->xmlHead() calls in
 * openIndex() and openFile() — confirm.
 *
 * @return string The XML declaration followed by a newline
 */
function xmlHead() {
	return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
}
/**
 * Return the XML schema being used
 *
 * @return string The namespace URI placed in the xmlns attribute of the
 *     root elements built by openIndex() and openFile()
 */
function xmlSchema() {
	$schemaUri = 'http://www.google.com/schemas/sitemap/0.84';

	return $schemaUri;
}
/**
 * Return the XML required to open a sitemap index file
 *
 * @return string The XML head followed by the opening <sitemapindex>
 *     tag and a newline
 */
function openIndex() {
	$xml = $this->xmlHead();
	$xml .= '<sitemapindex xmlns="' . $this->xmlSchema() . '">';
	$xml .= "\n";

	return $xml;
}
/**
 * Return the XML for a single sitemap indexfile entry
 *
 * The <loc> element is now closed with </loc>; it used to be closed with
 * </log>, which made every generated index file malformed XML.
 *
 * NOTE(review): the `global` declaration, the `return` keyword and the
 * enclosing <sitemap> element were not visible in the reviewed listing
 * and are restored here — confirm.
 *
 * @param string $filename The filename of the sitemap file
 * @return string One <sitemap> element whose <loc> is
 *     $wgServer . $this->path . '/' . $filename and whose <lastmod> is
 *     $this->timestamp
 */
function indexEntry( $filename ) {
	global $wgServer;

	return
		"\t<sitemap>\n" .
		"\t\t<loc>$wgServer{$this->path}/$filename</loc>\n" .
		"\t\t<lastmod>{$this->timestamp}</lastmod>\n" .
		"\t</sitemap>\n";
}
/**
 * Return the XML required to close a sitemap index file
 *
 * @return string The closing </sitemapindex> tag followed by a newline
 */
function closeIndex() {
	$closingTag = "</sitemapindex>\n";

	return $closingTag;
}
/**
 * Return the XML required to open a sitemap file
 *
 * @return string The XML head followed by the opening <urlset> tag and
 *     a newline
 */
function openFile() {
	$xml = $this->xmlHead();
	$xml .= '<urlset xmlns="' . $this->xmlSchema() . '">';
	$xml .= "\n";

	return $xml;
}
/**
 * Return the XML for a single sitemap entry
 *
 * NOTE(review): the `return` keyword and the enclosing <url> element
 * were not visible in the reviewed listing and are restored here —
 * confirm.
 *
 * @param string $url An RFC 2396 compliant URL
 * @param string $date An ISO 8601 date
 * @param string $priority A priority indicator, 0.0 - 1.0 inclusive with a 0.1 stepsize
 * @return string One <url> element containing <loc>, <lastmod> and
 *     <priority>; the arguments are inserted as given, without escaping
 */
function fileEntry( $url, $date, $priority ) {
	return
		"\t<url>\n" .
		"\t\t<loc>$url</loc>\n" .
		"\t\t<lastmod>$date</lastmod>\n" .
		"\t\t<priority>$priority</priority>\n" .
		"\t</url>\n";
}
/**
 * Return the XML required to close sitemap file
 *
 * @return string The closing </urlset> tag followed by a newline
 */
function closeFile() {
	$closingTag = "</urlset>\n";

	return $closingTag;
}
/**
 * Write a string to stderr followed by a UNIX newline
 *
 * @param string $str The message; written to the handle kept in
 *     $this->stderr
 */
function debug( $str ) {
	$line = "$str\n";

	fwrite( $this->stderr, $line );
}
/**
 * Work out how many URLs go into each sitemap file of a namespace and
 * store the result in $this->limit.
 *
 * According to the sitemap specification each sitemap must contain no
 * more than 50,000 urls and be no larger than 10 * 2^20 bytes (10MB).
 * The size of one entry is estimated from a title made of as many 'a'
 * characters as the average title length in the namespace, so the byte
 * budget is an estimate rather than a worst-case guarantee. The result
 * is the number of such entries that fit between the opening and
 * closing XML, at least 1 and now never more than 50,000.
 *
 * NOTE(review): the statement that stores the result was not visible in
 * the reviewed listing; $this->limit is what the sitemap-writing loop
 * reads — confirm.
 *
 * @param int $namespace The namespace to compute the limit for
 */
function generateLimit( $namespace ) {
	$count = (int)$this->getAveragePageLength( $namespace );
	$title = Title::makeTitle( $namespace, str_repeat( 'a', $count ) );

	$olen = strlen( $this->openFile() );
	$elen = strlen( $this->fileEntry( $title->getFullUrl(), wfTimestamp( TS_ISO_8601, wfTimestamp() ), '1.0' ) );
	$clen = strlen( $this->closeFile() );

	$maxBytes = pow( 2, 20 ) * 10;
	$maxUrls = 50000;

	// Closed form of counting entries one by one until the next one would
	// overflow the byte budget; $elen is never 0 because every entry
	// contains its fixed tags.
	$fit = (int)floor( ( $maxBytes - $olen - $clen ) / $elen );

	$this->limit = max( 1, min( $maxUrls, $fit ) );
}
/**
 * Get the average length of the page titles in a namespace, rounded up
 *
 * NOTE(review): the $fname argument and the closing of the call were
 * not visible in the reviewed listing and are restored here — confirm.
 *
 * @param int $namespace Limit the query to this namespace
 * @return mixed Whatever $this->dbr->selectField() returns for
 *     CEIL(AVG(LENGTH(page_title)))
 */
function getAveragePageLength( $namespace ) {
	$fname = 'GenerateSitemap::getAveragePageLength';

	return $this->dbr->selectField( 'page',
		'CEIL(AVG(LENGTH(page_title)))',
		array( 'page_namespace' => $namespace ),
		$fname
	);
}
/**
 * Update $this->timestamp to the current time
 *
 * The value is wfTimestampNow() converted with wfTimestamp() to
 * TS_ISO_8601.
 */
function generateTimestamp() {
	$now = wfTimestampNow();

	$this->timestamp = wfTimestamp( TS_ISO_8601, $now );
}
// Command-line entry point.
// NOTE(review): the statement that consumes the usage text below and whatever
// follows the construction of $gs are not visible here — confirm against the
// full file.
// --help anywhere on the command line selects the usage text.
446 if ( in_array( '--help', $argv ) )
448 "Usage: php generateSitemap.php [host] [options]\n" .
449 "\thost = hostname\n" .
451 "\t\t--help\tshow this message\n" .
452 "\t\t--fspath\tThe file system path to save to, e.g /tmp/sitemap/\n" .
453 "\t\t--path\tThe http path to use, e.g. /wiki\n" .
454 "\t\t--compress=[yes|no]\tcompress the sitemap files, default yes\n"
// A first argument that does not start with "--" is taken as the host name.
457 if ( isset( $argv[1] ) && strpos( $argv[1], '--' ) !== 0 )
458 $_SERVER['SERVER_NAME'] = $argv[1];
// Must be set before commandLine.inc is loaded; presumably that file fills
// $options from the command line — verify.
460 $optionsWithArgs = array( 'fspath', 'path', 'compress' );
461 require_once 'commandLine.inc';
// Missing options are passed as null (the @ hides the undefined-index notice);
// compression is enabled unless --compress is exactly 'no'.
463 $gs = new GenerateSitemap( @$options['fspath'], @$options['path'], @$options['compress'] !== 'no' );