X-Git-Url: http://git.heureux-cyclage.org/?a=blobdiff_plain;f=maintenance%2FgenerateSitemap.php;h=52dc33ae51bae1a96b05bd1a3cc43c6eac811031;hb=7deb05f35a9b5d01c4110e1e1a10c4285b806998;hp=dcd124e5a956ee2bd172878483658a05226fad19;hpb=5408e81fdf27bbc52830e7a1007f0e21df6804c7;p=lhc%2Fweb%2Fwiklou.git diff --git a/maintenance/generateSitemap.php b/maintenance/generateSitemap.php index dcd124e5a9..52dc33ae51 100644 --- a/maintenance/generateSitemap.php +++ b/maintenance/generateSitemap.php @@ -2,22 +2,39 @@ define( 'GS_MAIN', -2 ); define( 'GS_TALK', -1 ); /** - * Creates a Google sitemap for the site + * Creates a sitemap for the site * - * @package MediaWiki - * @subpackage Maintenance + * @ingroup Maintenance * * @copyright Copyright © 2005, Ævar Arnfjörð Bjarmason * @copyright Copyright © 2005, Jens Frank * @copyright Copyright © 2005, Brion Vibber * - * @link http://www.google.com/webmasters/sitemaps/docs/en/about.html - * @link http://www.google.com/schemas/sitemap/0.84/sitemap.xsd + * @see http://www.sitemaps.org/ + * @see http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd * * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later */ class GenerateSitemap { + /** + * The maximum amount of urls in a sitemap file + * + * @link http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd + * + * @var int + */ + var $url_limit; + + /** + * The maximum size of a sitemap file + * + * @link http://www.sitemaps.org/faq.php#faq_sitemap_size + * + * @var int + */ + var $size_limit; + /** * The path to prepend to the filename * @@ -31,20 +48,20 @@ class GenerateSitemap { * @var string */ var $path; - + /** * Whether or not to use compression * * @var bool */ var $compress; - + /** * The number of entries to save in each sitemap file * - * @var int + * @var array */ - var $limit; + var $limit = array(); /** * Key => value entries of namespaces and their priorities @@ -63,8 +80,8 @@ class GenerateSitemap { NS_USER_TALK => '0.1', NS_PROJECT => '0.5', NS_PROJECT_TALK => '0.1', - NS_IMAGE => '0.5', - NS_IMAGE_TALK => '0.1', + NS_FILE => '0.5', + NS_FILE_TALK => '0.1', NS_MEDIAWIKI => '0.0', NS_MEDIAWIKI_TALK => '0.1', NS_TEMPLATE => '0.0', @@ -95,14 +112,14 @@ class GenerateSitemap { * @var object */ var $dbr; - + /** * A resource pointing to the sitemap index file * * @var resource */ var $findex; - + /** * A resource pointing to a sitemap file @@ -126,18 +143,37 @@ class GenerateSitemap { * @param string $path The path to append to the domain name * @param bool $compress Whether to compress the sitemap files */ - function GenerateSitemap( $fspath, $path, $compress ) { - global $wgDBname, $wgScriptPath; - - $this->fspath = isset( $fspath ) ? $fspath : ''; - $this->path = isset( $path ) ? $path : $wgScriptPath; + function GenerateSitemap( $fspath, $compress ) { + global $wgScriptPath; + + $this->url_limit = 50000; + $this->size_limit = pow( 2, 20 ) * 10; + $this->fspath = self::init_path( $fspath ); + $this->compress = $compress; $this->stderr = fopen( 'php://stderr', 'wt' ); - $this->dbr =& wfGetDB( DB_SLAVE ); + $this->dbr = wfGetDB( DB_SLAVE ); $this->generateNamespaces(); - $this->generateLimit( NS_MAIN ); - $this->findex = fopen( "{$this->fspath}sitemap-index-$wgDBname.xml", 'wb' ); + $this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() ); + + + $this->findex = fopen( "{$this->fspath}sitemap-index-" . wfWikiID() . ".xml", 'wb' ); + } + + /** + * Create directory if it does not exist and return pathname with a trailing slash + */ + private static function init_path( $fspath ) { + if( !isset( $fspath ) ) { + return null; + } + # Create directory if needed + if( $fspath && !is_dir( $fspath ) ) { + mkdir( $fspath, 0755 ) or die("Can not create directory $fspath.\n"); + } + + return realpath( $fspath ). DIRECTORY_SEPARATOR ; } /** @@ -145,7 +181,14 @@ class GenerateSitemap { */ function generateNamespaces() { $fname = 'GenerateSitemap::generateNamespaces'; - + + // Only generate for specific namespaces if $wgSitemapNamespaces is an array. + global $wgSitemapNamespaces; + if( is_array( $wgSitemapNamespaces ) ) { + $this->namespaces = $wgSitemapNamespaces; + return; + } + $res = $this->dbr->select( 'page', array( 'page_namespace' ), array(), @@ -176,13 +219,13 @@ class GenerateSitemap { * If the namespace isn't listed on the priority list return the * default priority for the namespace, varies depending on whether it's * a talkpage or not. - * + * * @param int $namespace The namespace to get the priority for * * @return string */ function guessPriority( $namespace ) { - return Namespace::isMain( $namespace ) ? $this->priorities[GS_MAIN] : $this->priorities[GS_TALK]; + return MWNamespace::isMain( $namespace ) ? $this->priorities[GS_MAIN] : $this->priorities[GS_TALK]; } /** @@ -196,10 +239,9 @@ class GenerateSitemap { $fname = 'GenerateSitemap::getPageRes'; return $this->dbr->select( 'page', - array( + array( 'page_namespace', 'page_title', - 'page_is_redirect', 'page_touched', ), array( 'page_namespace' => $namespace ), @@ -213,33 +255,48 @@ class GenerateSitemap { * @access public */ function main() { - global $wgDBname; + global $wgContLang; fwrite( $this->findex, $this->openIndex() ); - + foreach ( $this->namespaces as $namespace ) { $res = $this->getPageRes( $namespace ); $this->file = false; + $this->generateLimit( $namespace ); + $length = $this->limit[0]; $i = $smcount = 0; - - $this->debug( $namespace ); + + $fns = $wgContLang->getFormattedNsText( $namespace ); + $this->debug( "$namespace ($fns)" ); while ( $row = $this->dbr->fetchObject( $res ) ) { - if ( $i++ % $this->limit === 0 ) { + if ( $i++ === 0 || $i === $this->url_limit + 1 || $length + $this->limit[1] + $this->limit[2] > $this->size_limit ) { if ( $this->file !== false ) { $this->write( $this->file, $this->closeFile() ); $this->close( $this->file ); } - $this->generateLimit( $namespace ); - $this->generateTimestamp(); $filename = $this->sitemapFilename( $namespace, $smcount++ ); $this->file = $this->open( $this->fspath . $filename, 'wb' ); $this->write( $this->file, $this->openFile() ); fwrite( $this->findex, $this->indexEntry( $filename ) ); - $this->debug( "\t$filename" ); + $this->debug( "\t$this->fspath$filename" ); + $length = $this->limit[0]; + $i = 1; } $title = Title::makeTitle( $row->page_namespace, $row->page_title ); $date = wfTimestamp( TS_ISO_8601, $row->page_touched ); - $this->write( $this->file, $this->fileEntry( $title->getFullURL(), $date, $this->priority( $namespace ) ) ); + $entry = $this->fileEntry( $title->getFullURL(), $date, $this->priority( $namespace ) ); + $length += strlen( $entry ); + $this->write( $this->file, $entry ); + // generate pages for language variants + if($wgContLang->hasVariants()){ + $variants = $wgContLang->getVariants(); + foreach($variants as $vCode){ + if($vCode==$wgContLang->getCode()) continue; // we don't want default variant + $entry = $this->fileEntry( $title->getFullURL('',$vCode), $date, $this->priority( $namespace ) ); + $length += strlen( $entry ); + $this->write( $this->file, $entry ); + } + } } if ( $this->file ) { $this->write( $this->file, $this->closeFile() ); @@ -258,14 +315,14 @@ class GenerateSitemap { function open( $file, $flags ) { return $this->compress ? gzopen( $file, $flags ) : fopen( $file, $flags ); } - + /** * gzwrite() / fwrite() wrapper */ function write( &$handle, $str ) { if ( $this->compress ) gzwrite( $handle, $str ); - else + else fwrite( $handle, $str ); } @@ -290,11 +347,8 @@ class GenerateSitemap { * @return string */ function sitemapFilename( $namespace, $count ) { - global $wgDBname; - $ext = $this->compress ? '.gz' : ''; - - return "sitemap-$wgDBname-NS_$namespace-$count.xml$ext"; + return "sitemap-".wfWikiID()."-NS_$namespace-$count.xml$ext"; } /** @@ -316,7 +370,7 @@ class GenerateSitemap { * @returns string */ function xmlSchema() { - return 'http://www.google.com/schemas/sitemap/0.84'; + return 'http://www.sitemaps.org/schemas/sitemap/0.9'; } /** @@ -338,12 +392,10 @@ class GenerateSitemap { * @return string */ function indexEntry( $filename ) { - global $wgServer; - return "\t\n" . - "\t\t$wgServer{$this->path}/$filename\n" . - "\t\t{$this->timestamp}\n" . + "\t\t$filename\n" . + "\t\t{$this->timestamp}\n" . "\t\n"; } @@ -372,7 +424,7 @@ class GenerateSitemap { * * @static * - * @param string $url An RFC 2396 compilant URL + * @param string $url An RFC 2396 compliant URL * @param string $date A ISO 8601 date * @param string $priority A priority indicator, 0.0 - 1.0 inclusive with a 0.1 stepsize * @@ -405,50 +457,43 @@ class GenerateSitemap { } /** - * According to the sitemap specification each sitemap must contain no - * more than 50,000 urls and no more than 2^20 bytes (10MB), this - * function calculates how many urls we can have in each file assuming - * that we have the worst case of 63 four byte characters and 1 three - * byte character in the title (63*4+1*3 = 255) + * Populate $this->limit */ function generateLimit( $namespace ) { $title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" ); - - $olen = strlen( $this->openFile() ); - $elen = strlen( $this->fileEntry( $title->getFullUrl(), wfTimestamp( TS_ISO_8601, wfTimestamp() ), '1.0' ) ); - $clen = strlen( $this->closeFile() ); - - for ( $i = 1, $etot = $elen; ( $olen + $clen + $etot + $elen ) <= pow( 2, 20 ); ++$i ) - $etot += $elen; - - $this->limit = $i; - } - /** - * Update $this->timestamp to the current time - */ - function generateTimestamp() { - $this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() ); + $this->limit = array( + strlen( $this->openFile() ), + strlen( $this->fileEntry( $title->getFullUrl(), wfTimestamp( TS_ISO_8601, wfTimestamp() ), $this->priority( $namespace ) ) ), + strlen( $this->closeFile() ) + ); } } -if ( in_array( '--help', $argv ) ) - die( - "Usage: php generateSitemap.php [host] [options]\n" . - "\thost = hostname\n" . - "\toptions:\n" . - "\t\t--help\tshow this message\n" . - "\t\t--fspath\tThe file system path to save to, e.g /tmp/sitemap/\n" . - "\t\t--path\tThe http path to use, e.g. /wiki\n" . - "\t\t--compress=[yes|no]\tcompress the sitemap files, default yes\n" - ); +if ( in_array( '--help', $argv ) ) { + echo << The file system path to save to, e.g /tmp/sitemap -if ( isset( $argv[1] ) && strpos( $argv[1], '--' ) !== 0 ) - $_SERVER['SERVER_NAME'] = $argv[1]; + --server= The protocol and server name to use in URLs, e.g. + http://en.wikipedia.org. This is sometimes necessary because + server name detection may fail in command line scripts. -$optionsWithArgs = array( 'fspath', 'path', 'compress' ); -require_once 'commandLine.inc'; + --compress=[yes|no] compress the sitemap files, default yes -$gs = new GenerateSitemap( @$options['fspath'], @$options['path'], @$options['compress'] !== 'no' ); +EOT; + die( -1 ); +} + +$optionsWithArgs = array( 'fspath', 'server', 'compress' ); +require_once( dirname( __FILE__ ) . '/commandLine.inc' ); + +if ( isset( $options['server'] ) ) { + $wgServer = $options['server']; +} + +$gs = new GenerateSitemap( @$options['fspath'], @$options['compress'] !== 'no' ); $gs->main(); -?> +