* Fixed regression: It's now possible to specify a custom hostname again
authorÆvar Arnfjörð Bjarmason <avar@users.mediawiki.org>
Thu, 3 Nov 2005 04:23:02 +0000 (04:23 +0000)
committerÆvar Arnfjörð Bjarmason <avar@users.mediawiki.org>
Thu, 3 Nov 2005 04:23:02 +0000 (04:23 +0000)
* Added magic code to make sure that the size of the sitemap file never exceeds that allowed by the standard
* Documented every function
* Made --path really work

maintenance/generateSitemap.php

index b832808..2e933d8 100644 (file)
  * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
  */
 
-$optionsWithArgs = array( 'host', 'path' );
+if ( isset( $argv[1] ) )
+       $_SERVER['SERVER_NAME'] = $argv[1];
+
+$optionsWithArgs = array( 'path' );
 /* */
 require_once 'commandLine.inc';
 
-if ( ! isset( $options['host'] ) ) {
-       echo "Usage: php generateSitemap.php --host=hostname [--path=/pa/th/]\n";
-       exit(1);
-} else {
-       $_SERVER['HOSTNAME'] = $options['host'];
-}
+define( 'GS_MAIN', -2 );
+define( 'GS_TALK', -1 );
 
-$gs = new GenerateSitemap( $options['host'], $options['path'] );
+$gs = new GenerateSitemap( @$options['path'] );
 $gs->main();
 
 class GenerateSitemap {
-       var $host;
-       var $cutoff = 9000;
+       /**
+        * The number of entries to save in each sitemap file
+        *
+        * @var int
+        */
+       var $limit;
+
+       /**
+        * Key => value entries of namespaces and their priorities
+        *
+        * @var array
+        */
        var $priorities = array(
                // Custom main namespaces
-               -2                      => '0.5',
+               GS_MAIN                 => '0.5',
                // Custom talk namesspaces
-               -1                      => '0.1',       
+               GS_TALK                 => '0.1',
+               // MediaWiki standard namespaces
                NS_MAIN                 => '1.0',
                NS_TALK                 => '0.1',
                NS_USER                 => '0.5',
@@ -54,23 +64,64 @@ class GenerateSitemap {
                NS_CATEGORY             => '0.5',
                NS_CATEGORY_TALK        => '0.1',
        );
+
+       /**
+        * A one-dimensional array of namespaces in the wiki
+        *
+        * @var array
+        */
        var $namespaces = array();
+
+       /**
+        * A database slave object
+        *
+        * @var object
+        */
        var $dbr;
-       var $path, $file, $findex;
-       var $stderr;
        
-       function GenerateSitemap( $host, $path ) {
+       /**
+        * A resource pointing to the sitemap index file
+        *
+        * @var resource
+        */
+       var $findex;
+       
+
+       /**
+        * A resource pointing to a sitemap file
+        *
+        * @var resource
+        */
+       var $file;
+
+       /**
+        * A resource pointing to php://stderr
+        *
+        * @var resource
+        */
+       var $stderr;
+
+       /**
+        * Constructor
+        *
+        * @param string $path The path to prepend to the filenames, used to
+        *                     save them somewhere else than in the root directory
+        */
+       function GenerateSitemap( $path ) {
                global $wgDBname;
 
                $this->path = isset( $path ) ? $path : '';
                $this->stderr = fopen( 'php://stderr', 'wt' );
                
-               $this->host = $host;
                $this->dbr =& wfGetDB( DB_SLAVE );
                $this->generateNamespaces();
+               $this->generateLimit( NS_MAIN );
                $this->findex = fopen( "{$this->path}sitemap-index-$wgDBname.xml", 'wb' );
        }
 
+       /**
+        * Generate a one-dimensional array of existing namespaces
+        */
        function generateNamespaces() {
                $fname = 'GenerateSitemap::generateNamespaces';
                
@@ -88,14 +139,38 @@ class GenerateSitemap {
                        $this->namespaces[] = $row->page_namespace;
        }
 
+       /**
+        * Get the priority of a given namespace
+        *
+        * @param int $namespace The namespace to get the priority for
+        +
+        * @return string
+        */
+
        function priority( $namespace ) {
                return isset( $this->priorities[$namespace] ) ? $this->priorities[$namespace] : $this->guessPriority( $namespace );
        }
 
+       /**
+        * If the namespace isn't listed on the priority list return the
+        * default priority for the namespace, varies depending on whether it's
+        * a talkpage or not.
+        * 
+        * @param int $namespace The namespace to get the priority for
+        *
+        * @return string
+        */
        function guessPriority( $namespace ) {
-               return Namespace::isMain( $namespace ) ? $this->priorities[-2] : $this->priorities[-1];
+               return Namespace::isMain( $namespace ) ? $this->priorities[GS_MAIN] : $this->priorities[GS_TALK];
        }
 
+       /**
+        * Return a database resolution of all the pages in a given namespace
+        *
+        * @param int $namespace Limit the query to this namespace
+        *
+        * @return resource
+        */
        function getPageRes( $namespace ) {
                $fname = 'GenerateSitemap::getPageRes';
 
@@ -111,6 +186,11 @@ class GenerateSitemap {
                );
        }
 
+       /**
+        * Main loop
+        *
+        * @access public
+        */
        function main() {
                global $wgDBname;
 
@@ -123,14 +203,15 @@ class GenerateSitemap {
                        
                        $this->debug( $namespace );
                        while ( $row = $this->dbr->fetchObject( $res ) ) {
-                               if ( $i % $this->cutoff == 0 ) {
+                               if ( $i % $this->limit === 0 ) {
                                        if ( $this->file !== false ) {
                                                gzwrite( $this->file, $this->closeFile() );
                                                gzclose( $this->file );
                                        }
-                                       $filename = "{$this->path}sitemap-$wgDBname-NS_$namespace-$smcount.xml.gz";
+                                       $this->generateLimit( $namespace );
+                                       $filename = "sitemap-$wgDBname-NS_$namespace-$smcount.xml.gz";
                                        ++$smcount;
-                                       $this->file = gzopen( $filename, 'wb' );
+                                       $this->file = gzopen( $this->path . $filename, 'wb' );
                                        gzwrite( $this->file, $this->openFile() );
                                        fwrite( $this->findex, $this->indexEntry( $filename ) );
                                        $this->debug( "\t$filename" );
@@ -149,35 +230,87 @@ class GenerateSitemap {
                fclose( $this->findex );
        }
 
+       /**
+        * Return the XML required to open an XML file
+        *
+        * @static
+        *
+        * @return string
+        */
        function xmlHead() {
                return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
        }
 
+       /**
+        * Return the XML schema being used
+        *
+        * @static
+        *
+        * @returns string
+        */
        function xmlSchema() {
                return 'http://www.google.com/schemas/sitemap/0.84';
        }
 
+       /**
+        * Return the XML required to open a sitemap index file
+        *
+        * @return string
+        */
        function openIndex() {
                return $this->xmlHead() . '<sitemapindex xmlns="' . $this->xmlSchema() . '">' . "\n";
        }
 
+       /**
+        * Return the XML for a single sitemap indexfile entry
+        *
+        * @static
+        *
+        * @param string $filename The filename of the sitemap file
+        *
+        * @return string
+        */
        function indexEntry( $filename ) {
-               global $wgServer;
+               global $wgServer, $wgScriptPath;
                
                return
                        "\t<sitemap>\n" .
-                       "\t\t<loc>$wgServer/$filename</log>\n" .
+                       "\t\t<loc>$wgServer$wgScriptPath/$filename</log>\n" .
                        "\t</sitemap>\n";
        }
 
+       /**
+        * Return the XML required to close a sitemap index file
+        *
+        * @static
+        *
+        * @return string
+        */
        function closeIndex() {
                return "</sitemapindex>\n";
        }
-       
+
+       /**
+        * Return the XML required to open a sitemap file
+        *
+        * @return string
+        */
        function openFile() {
                return $this->xmlHead() . '<urlset xmlns="' . $this->xmlSchema() . '">' . "\n";
        }
-       
+
+       /**
+        * Return the XML for a single sitemap entry
+        *
+        * @static
+        *
+        * @param string $url An RFC 2396 compilant URL
+        * @param string $date A ISO 8601 date
+        * @param string $priority A priority indicator, 0.0 - 1.0 inclusive with a 0.1 stepsize
+        *
+        r
+        * @return string
+        */
        function fileEntry( $url, $date, $priority ) {
                return
                        "\t<url>\n" .
@@ -187,13 +320,42 @@ class GenerateSitemap {
                        "\t</url>\n";
        }
 
+       /**
+        * Return the XML required to close sitemap file
+        *
+        * @static
+        * @return string
+        */
        function closeFile() {
                return "</urlset>\n";
        }
-       
+
+       /**
+        * Write a string to stderr followed by a UNIX newline
+        */
        function debug( $str ) {
                fwrite( $this->stderr, "$str\n" );
        }
+
+       /**
+        * According to the sitemap specification each sitemap must contain no
+        * more than 50,000 urls and no more than 2^20 bytes (10MB), this
+        * function calculates how many urls we can have in each file assuming
+        * that we have the worst case of 63 four byte characters and 1 three
+        * byte character in the title (63*4+1*3 = 255)
+        */
+       function generateLimit( $namespace ) {
+               $title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" );
+               
+               $olen = strlen( $this->openFile() );
+               $elen = strlen( $this->fileEntry( $title->getFullUrl(), wfTimestamp( TS_ISO_8601, wfTimestamp() ), '1.0' ) );
+               $clen = strlen( $this->closeFile() );
+
+               for ( $i = 1, $etot = $elen; ( $olen + $clen + $etot + $elen ) <= pow( 2, 20 ); ++$i )
+                       $etot += $elen;
+               
+               $this->limit = $i;
+       }
 }
 
 ?>