* Use page_touched rather than rev_timestamp, not as expensive and more correct,...
[lhc/web/wiklou.git] / maintenance / generateSitemap.php
1 <?php
2 /**
3 * @package MediaWiki
4 * @subpackage Maintenance
5 *
6 * Creates a Google sitemap.
7 * https://www.google.com/webmasters/sitemaps/docs/en/about.html
8 */
9
10 # Copyright (C) 2005 Jens Frank <jeluf@gmx.de>, Brion Vibber <brion@pobox.com>
11 # http://www.mediawiki.org/
12 #
13 # This program is free software; you can redistribute it and/or modify
14 # it under the terms of the GNU General Public License as published by
15 # the Free Software Foundation; either version 2 of the License, or
16 # (at your option) any later version.
17 #
18 # This program is distributed in the hope that it will be useful,
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 # GNU General Public License for more details.
22 #
23 # You should have received a copy of the GNU General Public License along
24 # with this program; if not, write to the Free Software Foundation, Inc.,
25 # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
26 # http://www.gnu.org/copyleft/gpl.html
27
# The server name is a mandatory first argument. Without it, print the usage
# text and stop with a non-zero status so that calling shells and cron jobs
# can tell that nothing was generated.
if ( $argc < 2 ) {
	print "Usage: php generateSitemap.php servername [options]\n";
	print " servername is the name of the website, e.g. mywiki.mydomain.org\n";
	exit( 1 );
}

# Publish the requested host name before the MediaWiki environment is loaded
# (presumably the configuration pulled in below reads it to select the wiki --
# confirm against the local setup), and echo it for the operator.
$_SERVER['HOSTNAME'] = $argv[1];
print $argv[1] . "\n";

# Load the command-line maintenance environment, then report which database
# and database user the rest of the script will work with.
require_once( "commandLine.inc" );
print "DB name: $wgDBname\n";
print "DB user: $wgDBuser\n";
41
# Sitemap <priority> value for each namespace that gets exported.
# Only the namespaces listed here are processed, in the order given below;
# pages in any other namespace are not written to a sitemap.
$priorities = array(
	# Content and its discussion pages
	NS_MAIN           => 0.9,
	NS_TALK           => 0.4,
	# User pages
	NS_USER           => 0.3,
	NS_USER_TALK      => 0.3,
	# Project pages
	NS_PROJECT        => 0.5,
	NS_PROJECT_TALK   => 0.2,
	# Image description pages
	NS_IMAGE          => 0.2,
	NS_IMAGE_TALK     => 0.1,
	# Interface messages
	NS_MEDIAWIKI      => 0.1,
	NS_MEDIAWIKI_TALK => 0.1,
	# Templates
	NS_TEMPLATE       => 0.1,
	NS_TEMPLATE_TALK  => 0.1,
	# Help pages
	NS_HELP           => 0.3,
	NS_HELP_TALK      => 0.1,
	# Categories
	NS_CATEGORY       => 0.3,
	NS_CATEGORY_TALK  => 0.1,
);
60
# Database handle used for the page queries, and the name of the page table
# as returned by the handle's tableName().
$dbr =& wfGetDB( DB_SLAVE );
$page = $dbr->tableName( 'page' );

# Open the sitemap index in the current working directory. Every generated
# sitemap file is listed in it. Stop right away if the file cannot be
# created, instead of emitting a warning for every later write.
$indexName = "sitemap-index-$wgDBname.xml";
$findex = fopen( $indexName, "wb" );
if ( $findex === false ) {
	print "Unable to open $indexName for writing\n";
	exit( 1 );
}
fwrite( $findex, '<?xml version="1.0" encoding="UTF-8"?>' . "\n" .
	'<sitemapindex xmlns="http://www.google.com/schemas/sitemap/0.84">' . "\n" );
67
# Number of <url> entries written to one sitemap file before the script
# rolls over to a new file for the same namespace.
$urlsPerFile = 9000;

# Write one or more gzip-compressed sitemap files per namespace and list each
# of them in the sitemap index.
foreach ( $priorities as $ns => $priority ) {
	# NOTE(review): page_is_redirect is selected but never used below, so
	# redirects are exported like ordinary pages -- confirm this is intended.
	$sql = "SELECT page_namespace,page_title,page_is_redirect,page_touched FROM $page WHERE page_namespace = $ns";
	print "DB query : $sql\nprocessing ...";
	$res = $dbr->query( $sql );
	print " done\n";

	$gzfile = false;
	$rowcount = 0;
	$sitemapcount = 0;
	while ( $row = $dbr->fetchObject( $res ) ) {
		# The first row, and every $urlsPerFile-th row after it, starts a new file.
		if ( $rowcount % $urlsPerFile == 0 ) {
			if ( $gzfile !== false ) {
				# Finish the previous file the same way the last one is finished.
				gzwrite( $gzfile, "</urlset>\n" );
				gzclose( $gzfile );
			}
			$sitemapcount++;
			$fname = "sitemap-{$wgDBname}-NS{$ns}-{$sitemapcount}.xml.gz";
			$gzfile = gzopen( $fname, "wb" );
			if ( $gzfile === false ) {
				# Without this check every following gzwrite() would only warn.
				print "Unable to open $fname for writing\n";
				exit( 1 );
			}
			gzwrite( $gzfile, '<?xml version="1.0" encoding="UTF-8"?>' . "\n" .
				'<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">' . "\n" );
			# Values placed in XML must be entity-escaped.
			fwrite( $findex, "\t<sitemap>\n\t\t<loc>" .
				htmlspecialchars( "$wgServer/$fname", ENT_QUOTES ) .
				"</loc>\n\t</sitemap>\n" );
			print "$fname\n";
		}
		$rowcount++;

		$nt = Title::makeTitle( $row->page_namespace, $row->page_title );

		# Build YYYY-MM-DD from the first eight characters of page_touched.
		$date = substr( $row->page_touched, 0, 4 ) . '-' .
			substr( $row->page_touched, 4, 2 ) . '-' .
			substr( $row->page_touched, 6, 2 );

		# Escape the URL so that characters such as "&" cannot break the XML.
		$loc = htmlspecialchars( $nt->getFullURL(), ENT_QUOTES );

		gzwrite( $gzfile, "\t<url>\n\t\t<loc>" . $loc .
			"</loc>\n\t\t<lastmod>$date</lastmod>\n" .
			"\t\t<priority>$priority</priority>\n" .
			"\t</url>\n" );
	}

	# Close the last (possibly partially filled) file of this namespace.
	# Namespaces without any rows never open a file.
	if ( $gzfile ) {
		gzwrite( $gzfile, "</urlset>\n" );
		gzclose( $gzfile );
	}
	print "\n";
}

# Terminate and close the sitemap index.
fwrite( $findex, "</sitemapindex>\n" );
fclose( $findex );
109
110
111 ?>