X-Git-Url: https://git.heureux-cyclage.org/?a=blobdiff_plain;f=maintenance%2FgenerateSitemap.php;h=040586896953ecb2b5da586576e19a505bb80da6;hb=d3f564d27fb0377e70db3460e281dc06c47f8858;hp=b9b306908c0cf2bebaeb263133eee9033932b713;hpb=b33f582b4918e17cbcdf90422a1c7af0651b9848;p=lhc%2Fweb%2Fwiklou.git diff --git a/maintenance/generateSitemap.php b/maintenance/generateSitemap.php index b9b306908c..0405868969 100644 --- a/maintenance/generateSitemap.php +++ b/maintenance/generateSitemap.php @@ -1,116 +1,481 @@ + * @copyright Copyright © 2005, Brion Vibber + * + * @see http://www.sitemaps.org/ + * @see http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd + * + * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later */ -# Copyright (C) 2005 Jens Frank , Brion Vibber -# http://www.mediawiki.org/ -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program; if not, write to the Free Software Foundation, Inc., -# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -# http://www.gnu.org/copyleft/gpl.html - -if ( $argc < 2) { - print "Usage: php generateSitemap.php servername [options]\n"; - print " servername is the name of the website, e.g. mywiki.mydomain.org\n"; - exit ; -} -$_SERVER['HOSTNAME'] = $argv[1]; -print $argv[1] . "\n"; - - -/** */ -require_once( "commandLine.inc" ); - print "DB name: $wgDBname\n"; - print "DB user: $wgDBuser\n"; -# print "DB password: $wgDBpassword\n"; - - -$priorities = array ( - NS_MAIN => 0.9, - NS_TALK => 0.4, - NS_USER => 0.3, - NS_USER_TALK => 0.3, - NS_PROJECT => 0.5, - NS_PROJECT_TALK => 0.2, - NS_IMAGE => 0.2, - NS_IMAGE_TALK => 0.1, - NS_MEDIAWIKI => 0.1, - NS_MEDIAWIKI_TALK => 0.1, - NS_TEMPLATE => 0.1, - NS_TEMPLATE_TALK => 0.1, - NS_HELP => 0.3, - NS_HELP_TALK => 0.1, - NS_CATEGORY => 0.3, - NS_CATEGORY_TALK => 0.1, -); - -$dbr =& wfGetDB( DB_SLAVE ); -$page = $dbr->tableName( 'page' ); -$rev = $dbr->tableName( 'revision' ); - -$findex = fopen( "sitemap_index.xml", "wb" ); -fwrite( $findex, ' - - ' ); - -foreach ( $priorities as $ns => $priority) { - $sql = "SELECT page_namespace,page_title,page_is_redirect,rev_timestamp FROM $page, $rev ". - "WHERE page_namespace = $ns AND page_latest = rev_id "; - print "DB query : $sql\nprocessing ..."; - $res = $dbr->query( $sql ); - print " done\n"; - - $gzfile = false; - $rowcount=0; - $sitemapcount=0; - while ( $row = $dbr->fetchObject( $res ) ) { - if ( $rowcount % 9000 == 0 ) { - if ( $gzfile !== false ) { - gzwrite( $gzfile, '' ); - gzclose( $gzfile ); +class GenerateSitemap { + /** + * The maximum amount of urls in a sitemap file + * + * @link http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd + * + * @var int + */ + var $url_limit; + + /** + * The maximum size of a sitemap file + * + * @link http://www.sitemaps.org/faq.php#faq_sitemap_size + * + * @var int + */ + var $size_limit; + + /** + * The path to prepend to the filename + * + * @var string + */ + var $fspath; + + /** + * The path to append to the domain name + * + * @var string + */ + var $path; + + /** + * Whether or not to use compression + * + * @var bool + */ + var $compress; + + /** + * The number of entries to save in each sitemap file + * + * @var array + */ + var $limit = array(); + + /** + * Key => value entries of namespaces and their priorities + * + * @var array + */ + var $priorities = array( + // Custom main namespaces + GS_MAIN => '0.5', + // Custom talk namesspaces + GS_TALK => '0.1', + // MediaWiki standard namespaces + NS_MAIN => '1.0', + NS_TALK => '0.1', + NS_USER => '0.5', + NS_USER_TALK => '0.1', + NS_PROJECT => '0.5', + NS_PROJECT_TALK => '0.1', + NS_IMAGE => '0.5', + NS_IMAGE_TALK => '0.1', + NS_MEDIAWIKI => '0.0', + NS_MEDIAWIKI_TALK => '0.1', + NS_TEMPLATE => '0.0', + NS_TEMPLATE_TALK => '0.1', + NS_HELP => '0.5', + NS_HELP_TALK => '0.1', + NS_CATEGORY => '0.5', + NS_CATEGORY_TALK => '0.1', + ); + + /** + * A one-dimensional array of namespaces in the wiki + * + * @var array + */ + var $namespaces = array(); + + /** + * When this sitemap batch was generated + * + * @var string + */ + var $timestamp; + + /** + * A database slave object + * + * @var object + */ + var $dbr; + + /** + * A resource pointing to the sitemap index file + * + * @var resource + */ + var $findex; + + + /** + * A resource pointing to a sitemap file + * + * @var resource + */ + var $file; + + /** + * A resource pointing to php://stderr + * + * @var resource + */ + var $stderr; + + /** + * Constructor + * + * @param string $fspath The path to prepend to the filenames, used to + * save them somewhere else than in the root directory + * @param string $path The path to append to the domain name + * @param bool $compress Whether to compress the sitemap files + */ + function GenerateSitemap( $fspath, $compress ) { + global $wgScriptPath; + + $this->url_limit = 50000; + $this->size_limit = pow( 2, 20 ) * 10; + $this->fspath = isset( $fspath ) ? $fspath : ''; + $this->compress = $compress; + + $this->stderr = fopen( 'php://stderr', 'wt' ); + $this->dbr = wfGetDB( DB_SLAVE ); + $this->generateNamespaces(); + $this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() ); + $this->findex = fopen( "{$this->fspath}sitemap-index-" . wfWikiID() . ".xml", 'wb' ); + } + + /** + * Generate a one-dimensional array of existing namespaces + */ + function generateNamespaces() { + $fname = 'GenerateSitemap::generateNamespaces'; + + // Only generate for specific namespaces if $wgSitemapNamespaces is an array. + global $wgSitemapNamespaces; + if( is_array( $wgSitemapNamespaces ) ) { + $this->namespaces = $wgSitemapNamespaces; + return; + } + + $res = $this->dbr->select( 'page', + array( 'page_namespace' ), + array(), + $fname, + array( + 'GROUP BY' => 'page_namespace', + 'ORDER BY' => 'page_namespace', + ) + ); + + while ( $row = $this->dbr->fetchObject( $res ) ) + $this->namespaces[] = $row->page_namespace; + } + + /** + * Get the priority of a given namespace + * + * @param int $namespace The namespace to get the priority for + + + * @return string + */ + + function priority( $namespace ) { + return isset( $this->priorities[$namespace] ) ? $this->priorities[$namespace] : $this->guessPriority( $namespace ); + } + + /** + * If the namespace isn't listed on the priority list return the + * default priority for the namespace, varies depending on whether it's + * a talkpage or not. + * + * @param int $namespace The namespace to get the priority for + * + * @return string + */ + function guessPriority( $namespace ) { + return MWNamespace::isMain( $namespace ) ? $this->priorities[GS_MAIN] : $this->priorities[GS_TALK]; + } + + /** + * Return a database resolution of all the pages in a given namespace + * + * @param int $namespace Limit the query to this namespace + * + * @return resource + */ + function getPageRes( $namespace ) { + $fname = 'GenerateSitemap::getPageRes'; + + return $this->dbr->select( 'page', + array( + 'page_namespace', + 'page_title', + 'page_touched', + ), + array( 'page_namespace' => $namespace ), + $fname + ); + } + + /** + * Main loop + * + * @access public + */ + function main() { + global $wgContLang; + + fwrite( $this->findex, $this->openIndex() ); + + foreach ( $this->namespaces as $namespace ) { + $res = $this->getPageRes( $namespace ); + $this->file = false; + $this->generateLimit( $namespace ); + $length = $this->limit[0]; + $i = $smcount = 0; + + $fns = $wgContLang->getFormattedNsText( $namespace ); + $this->debug( "$namespace ($fns)" ); + while ( $row = $this->dbr->fetchObject( $res ) ) { + if ( $i++ === 0 || $i === $this->url_limit + 1 || $length + $this->limit[1] + $this->limit[2] > $this->size_limit ) { + if ( $this->file !== false ) { + $this->write( $this->file, $this->closeFile() ); + $this->close( $this->file ); + } + $filename = $this->sitemapFilename( $namespace, $smcount++ ); + $this->file = $this->open( $this->fspath . $filename, 'wb' ); + $this->write( $this->file, $this->openFile() ); + fwrite( $this->findex, $this->indexEntry( $filename ) ); + $this->debug( "\t$filename" ); + $length = $this->limit[0]; + $i = 1; + } + $title = Title::makeTitle( $row->page_namespace, $row->page_title ); + $date = wfTimestamp( TS_ISO_8601, $row->page_touched ); + $entry = $this->fileEntry( $title->getFullURL(), $date, $this->priority( $namespace ) ); + $length += strlen( $entry ); + $this->write( $this->file, $entry ); + // generate pages for language variants + if($wgContLang->hasVariants()){ + $variants = $wgContLang->getVariants(); + foreach($variants as $vCode){ + if($vCode==$wgContLang->getCode()) continue; // we don't want default variant + $entry = $this->fileEntry( $title->getFullURL('',$vCode), $date, $this->priority( $namespace ) ); + $length += strlen( $entry ); + $this->write( $this->file, $entry ); + } + } + } + if ( $this->file ) { + $this->write( $this->file, $this->closeFile() ); + $this->close( $this->file ); } - $sitemapcount ++; - $fname = "sitemap-NS".$ns."-".$sitemapcount.".xml.gz"; - $gzfile = gzopen( $fname, "wb" ); - gzwrite( $gzfile, ' - < urlset xmlns="http://www.google.com/schemas/sitemap/0.84">' ); - fwrite( $findex, ''.$wgServer.'/'.$fname."\n" ); - print "$fname\n"; } - $rowcount ++; - $nt = Title::makeTitle( $row->page_namespace, $row->page_title ); - $date = substr($row->rev_timestamp, 0, 4). '-' . - substr($row->rev_timestamp, 4, 2). '-' . - substr($row->rev_timestamp, 6, 2); - gzwrite( $gzfile, "\n " . $nt->getFullURL() . - "\n ".$date."\n " . - '' . $priority . '' . - "\n\n" ); - } - if ( $gzfile ) { - gzwrite( $gzfile, "\n" ); - gzclose( $gzfile ); - } - print "\n"; + fwrite( $this->findex, $this->closeIndex() ); + fclose( $this->findex ); + } + + /** + * gzopen() / fopen() wrapper + * + * @return resource + */ + function open( $file, $flags ) { + return $this->compress ? gzopen( $file, $flags ) : fopen( $file, $flags ); + } + + /** + * gzwrite() / fwrite() wrapper + */ + function write( &$handle, $str ) { + if ( $this->compress ) + gzwrite( $handle, $str ); + else + fwrite( $handle, $str ); + } + + /** + * gzclose() / fclose() wrapper + */ + function close( &$handle ) { + if ( $this->compress ) + gzclose( $handle ); + else + fclose( $handle ); + } + + /** + * Get a sitemap filename + * + * @static + * + * @param int $namespace The namespace + * @param int $count The count + * + * @return string + */ + function sitemapFilename( $namespace, $count ) { + $ext = $this->compress ? '.gz' : ''; + return "sitemap-".wfWikiID()."-NS_$namespace-$count.xml$ext"; + } + + /** + * Return the XML required to open an XML file + * + * @static + * + * @return string + */ + function xmlHead() { + return '' . "\n"; + } + + /** + * Return the XML schema being used + * + * @static + * + * @returns string + */ + function xmlSchema() { + return 'http://www.sitemaps.org/schemas/sitemap/0.9'; + } + + /** + * Return the XML required to open a sitemap index file + * + * @return string + */ + function openIndex() { + return $this->xmlHead() . '' . "\n"; + } + + /** + * Return the XML for a single sitemap indexfile entry + * + * @static + * + * @param string $filename The filename of the sitemap file + * + * @return string + */ + function indexEntry( $filename ) { + return + "\t\n" . + "\t\t$filename\n" . + "\t\t{$this->timestamp}\n" . + "\t\n"; + } + + /** + * Return the XML required to close a sitemap index file + * + * @static + * + * @return string + */ + function closeIndex() { + return "\n"; + } + + /** + * Return the XML required to open a sitemap file + * + * @return string + */ + function openFile() { + return $this->xmlHead() . '' . "\n"; + } + + /** + * Return the XML for a single sitemap entry + * + * @static + * + * @param string $url An RFC 2396 compilant URL + * @param string $date A ISO 8601 date + * @param string $priority A priority indicator, 0.0 - 1.0 inclusive with a 0.1 stepsize + * + * @return string + */ + function fileEntry( $url, $date, $priority ) { + return + "\t\n" . + "\t\t$url\n" . + "\t\t$date\n" . + "\t\t$priority\n" . + "\t\n"; + } + + /** + * Return the XML required to close sitemap file + * + * @static + * @return string + */ + function closeFile() { + return "\n"; + } + + /** + * Write a string to stderr followed by a UNIX newline + */ + function debug( $str ) { + fwrite( $this->stderr, "$str\n" ); + } + + /** + * Populate $this->limit + */ + function generateLimit( $namespace ) { + $title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" ); + + $this->limit = array( + strlen( $this->openFile() ), + strlen( $this->fileEntry( $title->getFullUrl(), wfTimestamp( TS_ISO_8601, wfTimestamp() ), $this->priority( $namespace ) ) ), + strlen( $this->closeFile() ) + ); + } +} + +if ( in_array( '--help', $argv ) ) { + echo << The file system path to save to, e.g /tmp/sitemap/ + + --server= The protocol and server name to use in URLs, e.g. + http://en.wikipedia.org. This is sometimes necessary because + server name detection may fail in command line scripts. + + --compress=[yes|no] compress the sitemap files, default yes + +EOT; + die( -1 ); +} + +$optionsWithArgs = array( 'fspath', 'server', 'compress' ); +require_once( dirname( __FILE__ ) . '/commandLine.inc' ); + +if ( isset( $options['server'] ) ) { + $wgServer = $options['server']; } -fwrite( $findex, "\n" ); -fclose( $findex ); +$gs = new GenerateSitemap( @$options['fspath'], @$options['compress'] !== 'no' ); +$gs->main(); -?>