refreshLinks.php: allow refreshing by categories, tracking or not
author Max Semenik <maxsem.wiki@gmail.com>
Tue, 29 Nov 2016 23:04:07 +0000 (15:04 -0800)
committer Max Semenik <maxsem.wiki@gmail.com>
Mon, 23 Jan 2017 22:30:16 +0000 (14:30 -0800)
Needed for selective updates of pages using a particular feature.
Intended to be run in production, so needs to scale.

Bug: T149723
Change-Id: If20fb1f91de8d4227def5b07d6d52b91161ed3fd

RELEASE-NOTES-1.29
autoload.php
includes/TrackingCategories.php [new file with mode: 0644]
includes/parser/ParserOutput.php
includes/specials/SpecialTrackingCategories.php
maintenance/refreshLinks.php

index d0738e2..3bf50ac 100644 (file)
@@ -206,6 +206,8 @@ changes to languages because of Phabricator reports.
 * Article::doEditContent() was marked as deprecated, to be removed in 1.30
   or later.
 * ContentHandler::runLegacyHooks() was removed.
+* refreshLinks.php can now be limited to a particular category with --category=...
+  or a tracking category with --tracking-category=...
 
 == Compatibility ==
 
index 7ed08df..e7c97ad 100644 (file)
@@ -1459,6 +1459,7 @@ $wgAutoloadLocalClasses = [
        'TitlePrefixSearch' => __DIR__ . '/includes/PrefixSearch.php',
        'TitleValue' => __DIR__ . '/includes/title/TitleValue.php',
        'TrackBlobs' => __DIR__ . '/maintenance/storage/trackBlobs.php',
+       'TrackingCategories' => __DIR__ . '/includes/TrackingCategories.php',
        'TraditionalImageGallery' => __DIR__ . '/includes/gallery/TraditionalImageGallery.php',
        'TransactionProfiler' => __DIR__ . '/includes/libs/rdbms/TransactionProfiler.php',
        'TransformParameterError' => __DIR__ . '/includes/media/MediaTransformOutput.php',
diff --git a/includes/TrackingCategories.php b/includes/TrackingCategories.php
new file mode 100644 (file)
index 0000000..825860a
--- /dev/null
@@ -0,0 +1,130 @@
+<?php
+/**
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @file
+ * @ingroup Categories
+ */
+
+/**
+ * This class performs some operations related to tracking categories, such as creating
+ * a list of all such categories.
+ */
+class TrackingCategories {
+       /** @var Config Source of the 'TrackingCategories' and 'EnableMagicLinks' settings */
+       private $config;
+
+       /**
+        * Message keys of the tracking categories that exist in core
+        *
+        * @var string[]
+        */
+       private static $coreTrackingCategories = [
+               'index-category',
+               'noindex-category',
+               'duplicate-args-category',
+               'expensive-parserfunction-category',
+               'post-expand-template-argument-category',
+               'post-expand-template-inclusion-category',
+               'hidden-category-category',
+               'broken-file-category',
+               'node-count-exceeded-category',
+               'expansion-depth-exceeded-category',
+               'restricted-displaytitle-ignored',
+               'deprecated-self-close-category',
+       ];
+
+       /**
+        * @param Config $config
+        */
+       public function __construct( Config $config ) {
+               $this->config = $config;
+       }
+
+       /**
+        * Collect all tracking category message keys (core, extension-registered and
+        * configured) and resolve each of them to the category titles it stands for.
+        *
+        * @return array[] Map of message key => [ 'cats' => Title[], 'msg' => Title ].
+        *   'msg' is the title of the message in the MediaWiki namespace. 'cats' lists the
+        *   category titles the message resolves to; it is empty when the category is
+        *   disabled with "-" and may contain the same category several times when the
+        *   message is evaluated once per namespace.
+        */
+       public function getTrackingCategories() {
+               $categories = array_merge(
+                       self::$coreTrackingCategories,
+                       ExtensionRegistry::getInstance()->getAttribute( 'TrackingCategories' ),
+                       $this->config->get( 'TrackingCategories' ) // deprecated
+               );
+
+               // Only list magic link tracking categories if the magic link type is enabled
+               $enableMagicLinks = $this->config->get( 'EnableMagicLinks' );
+               if ( $enableMagicLinks['ISBN'] ) {
+                       $categories[] = 'magiclink-tracking-isbn';
+               }
+               if ( $enableMagicLinks['RFC'] ) {
+                       $categories[] = 'magiclink-tracking-rfc';
+               }
+               if ( $enableMagicLinks['PMID'] ) {
+                       $categories[] = 'magiclink-tracking-pmid';
+               }
+
+               $trackingCategories = [];
+               foreach ( $categories as $catMsg ) {
+                       $msgObj = wfMessage( $catMsg )->inContentLanguage();
+                       $allCats = [];
+                       $catMsgTitle = Title::makeTitleSafe( NS_MEDIAWIKI, $catMsg );
+                       if ( !$catMsgTitle ) {
+                               // Not a usable message key, leave it out of the result entirely
+                               continue;
+                       }
+
+                       // Match things like {{NAMESPACE}} and {{NAMESPACENUMBER}}.
+                       // False positives are ok, this is just an efficiency shortcut
+                       if ( strpos( $msgObj->plain(), '{{' ) !== false ) {
+                               // The category name may vary by namespace, so evaluate the message
+                               // in the context of a title from every valid namespace
+                               foreach ( MWNamespace::getValidNamespaces() as $namesp ) {
+                                       $tempTitle = Title::makeTitleSafe( $namesp, $catMsg );
+                                       if ( !$tempTitle ) {
+                                               continue;
+                                       }
+                                       $catName = $msgObj->title( $tempTitle )->text();
+                                       $catTitle = self::makeCategoryTitle( $catName );
+                                       if ( $catTitle ) {
+                                               $allCats[] = $catTitle;
+                                       }
+                               }
+                       } else {
+                               // No wikitext constructs, a single evaluation is enough
+                               $catTitle = self::makeCategoryTitle( $msgObj->text() );
+                               if ( $catTitle ) {
+                                       $allCats[] = $catTitle;
+                               }
+                       }
+                       $trackingCategories[$catMsg] = [
+                               'cats' => $allCats,
+                               'msg' => $catMsgTitle,
+                       ];
+               }
+
+               return $trackingCategories;
+       }
+
+       /**
+        * Turn the text of a tracking category message into a category title
+        *
+        * @param string $catName Category name produced by the message
+        * @return Title|null Null if the category is disabled ("-") or the name is not
+        *   a valid title
+        */
+       private static function makeCategoryTitle( $catName ) {
+               # Allow tracking categories to be disabled by setting them to "-"
+               if ( $catName === '-' ) {
+                       return null;
+               }
+
+               return Title::makeTitleSafe( NS_CATEGORY, $catName );
+       }
+}
index 7bf848f..0c162b4 100644 (file)
@@ -696,6 +696,8 @@ class ParserOutput extends CacheTime {
         * to SpecialTrackingCategories::$coreTrackingCategories, and extensions
         * should add to "TrackingCategories" in their extension.json.
         *
+        * @todo Migrate some code to TrackingCategories
+        *
         * @param string $msg Message key
         * @param Title $title title of the page which is being tracked
         * @return bool Whether the addition was successful
index 8ff0527..e503d92 100644 (file)
@@ -36,26 +36,6 @@ class SpecialTrackingCategories extends SpecialPage {
                parent::__construct( 'TrackingCategories' );
        }
 
-       /**
-        * Tracking categories that exist in core
-        *
-        * @var array
-        */
-       private static $coreTrackingCategories = [
-               'index-category',
-               'noindex-category',
-               'duplicate-args-category',
-               'expensive-parserfunction-category',
-               'post-expand-template-argument-category',
-               'post-expand-template-inclusion-category',
-               'hidden-category-category',
-               'broken-file-category',
-               'node-count-exceeded-category',
-               'expansion-depth-exceeded-category',
-               'restricted-displaytitle-ignored',
-               'deprecated-self-close-category',
-       ];
-
        function execute( $par ) {
                $this->setHeaders();
                $this->outputHeader();
@@ -76,10 +56,11 @@ class SpecialTrackingCategories extends SpecialPage {
                        </tr></thead>"
                );
 
-               $trackingCategories = $this->prepareTrackingCategoriesData();
+               $trackingCategories = new TrackingCategories( $this->getConfig() );
+               $categoryList = $trackingCategories->getTrackingCategories();
 
                $batch = new LinkBatch();
-               foreach ( $trackingCategories as $catMsg => $data ) {
+               foreach ( $categoryList as $catMsg => $data ) {
                        $batch->addObj( $data['msg'] );
                        foreach ( $data['cats'] as $catTitle ) {
                                $batch->addObj( $catTitle );
@@ -87,11 +68,11 @@ class SpecialTrackingCategories extends SpecialPage {
                }
                $batch->execute();
 
-               Hooks::run( 'SpecialTrackingCategories::preprocess', [ $this, $trackingCategories ] );
+               Hooks::run( 'SpecialTrackingCategories::preprocess', [ $this, $categoryList ] );
 
                $linkRenderer = $this->getLinkRenderer();
 
-               foreach ( $trackingCategories as $catMsg => $data ) {
+               foreach ( $categoryList as $catMsg => $data ) {
                        $allMsgs = [];
                        $catDesc = $catMsg . '-desc';
 
@@ -143,80 +124,6 @@ class SpecialTrackingCategories extends SpecialPage {
                $this->getOutput()->addHTML( Html::closeElement( 'table' ) );
        }
 
-       /**
-        * Read the global and extract title objects from the corresponding messages
-        * @return array Array( 'msg' => Title, 'cats' => Title[] )
-        */
-       private function prepareTrackingCategoriesData() {
-               $categories = array_merge(
-                       self::$coreTrackingCategories,
-                       ExtensionRegistry::getInstance()->getAttribute( 'TrackingCategories' ),
-                       $this->getConfig()->get( 'TrackingCategories' ) // deprecated
-               );
-
-               // Only show magic link tracking categories if they are enabled
-               $enableMagicLinks = $this->getConfig()->get( 'EnableMagicLinks' );
-               if ( $enableMagicLinks['ISBN'] ) {
-                       $categories[] = 'magiclink-tracking-isbn';
-               }
-               if ( $enableMagicLinks['RFC'] ) {
-                       $categories[] = 'magiclink-tracking-rfc';
-               }
-               if ( $enableMagicLinks['PMID'] ) {
-                       $categories[] = 'magiclink-tracking-pmid';
-               }
-
-               $trackingCategories = [];
-               foreach ( $categories as $catMsg ) {
-                       /*
-                        * Check if the tracking category varies by namespace
-                        * Otherwise only pages in the current namespace will be displayed
-                        * If it does vary, show pages considering all namespaces
-                        */
-                       $msgObj = $this->msg( $catMsg )->inContentLanguage();
-                       $allCats = [];
-                       $catMsgTitle = Title::makeTitleSafe( NS_MEDIAWIKI, $catMsg );
-                       if ( !$catMsgTitle ) {
-                               continue;
-                       }
-
-                       // Match things like {{NAMESPACE}} and {{NAMESPACENUMBER}}.
-                       // False positives are ok, this is just an efficiency shortcut
-                       if ( strpos( $msgObj->plain(), '{{' ) !== false ) {
-                               $ns = MWNamespace::getValidNamespaces();
-                               foreach ( $ns as $namesp ) {
-                                       $tempTitle = Title::makeTitleSafe( $namesp, $catMsg );
-                                       if ( !$tempTitle ) {
-                                               continue;
-                                       }
-                                       $catName = $msgObj->title( $tempTitle )->text();
-                                       # Allow tracking categories to be disabled by setting them to "-"
-                                       if ( $catName !== '-' ) {
-                                               $catTitle = Title::makeTitleSafe( NS_CATEGORY, $catName );
-                                               if ( $catTitle ) {
-                                                       $allCats[] = $catTitle;
-                                               }
-                                       }
-                               }
-                       } else {
-                               $catName = $msgObj->text();
-                               # Allow tracking categories to be disabled by setting them to "-"
-                               if ( $catName !== '-' ) {
-                                       $catTitle = Title::makeTitleSafe( NS_CATEGORY, $catName );
-                                       if ( $catTitle ) {
-                                               $allCats[] = $catTitle;
-                                       }
-                               }
-                       }
-                       $trackingCategories[$catMsg] = [
-                               'cats' => $allCats,
-                               'msg' => $catMsgTitle,
-                       ];
-               }
-
-               return $trackingCategories;
-       }
-
        protected function getGroupName() {
                return 'pages';
        }
index e7a4d06..fb24a1d 100644 (file)
@@ -29,6 +29,8 @@ require_once __DIR__ . '/Maintenance.php';
  * @ingroup Maintenance
  */
 class RefreshLinks extends Maintenance {
+       const REPORTING_INTERVAL = 100;
+
        /** @var int|bool */
        protected $namespace = false;
 
@@ -43,6 +45,8 @@ class RefreshLinks extends Maintenance {
                $this->addOption( 'dfn-chunk-size', 'Maximum number of existent IDs to check per ' .
                        'query, default 100000', false, true );
                $this->addOption( 'namespace', 'Only fix pages in this namespace', false, true );
+               $this->addOption( 'category', 'Only fix pages in this category', false, true );
+               $this->addOption( 'tracking-category', 'Only fix pages in this tracking category', false, true );
                $this->addArg( 'start', 'Page_id to start from, default 1', false );
                $this->setBatchSize( 100 );
        }
@@ -61,7 +65,15 @@ class RefreshLinks extends Maintenance {
                } else {
                        $this->namespace = (int)$ns;
                }
-               if ( !$this->hasOption( 'dfn-only' ) ) {
+               if ( ( $category = $this->getOption( 'category', false ) ) !== false ) {
+                       $title = Title::makeTitleSafe( NS_CATEGORY, $category );
+                       if ( !$title ) {
+                               $this->error( "'$category' is an invalid category name!\n", true );
+                       }
+                       // refreshCategory() is type-hinted to take a Title, so hand it the
+                       // validated title rather than the raw option string
+                       $this->refreshCategory( $title );
+               } elseif ( ( $category = $this->getOption( 'tracking-category', false ) ) !== false ) {
+                       $this->refreshTrackingCategory( $category );
+               } elseif ( !$this->hasOption( 'dfn-only' ) ) {
                        $new = $this->getOption( 'new-only', false );
                        $redir = $this->getOption( 'redirects-only', false );
                        $oldRedir = $this->getOption( 'old-redirects-only', false );
@@ -89,7 +101,6 @@ class RefreshLinks extends Maintenance {
        private function doRefreshLinks( $start, $newOnly = false,
                $end = null, $redirectsOnly = false, $oldRedirectsOnly = false
        ) {
-               $reportingInterval = 100;
                $dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
 
                if ( $start === null ) {
@@ -124,7 +135,7 @@ class RefreshLinks extends Maintenance {
                        $i = 0;
 
                        foreach ( $res as $row ) {
-                               if ( !( ++$i % $reportingInterval ) ) {
+                               if ( !( ++$i % self::REPORTING_INTERVAL ) ) {
                                        $this->output( "$i\n" );
                                        wfWaitForSlaves();
                                }
@@ -145,7 +156,7 @@ class RefreshLinks extends Maintenance {
 
                        $i = 0;
                        foreach ( $res as $row ) {
-                               if ( !( ++$i % $reportingInterval ) ) {
+                               if ( !( ++$i % self::REPORTING_INTERVAL ) ) {
                                        $this->output( "$i\n" );
                                        wfWaitForSlaves();
                                }
@@ -166,7 +177,7 @@ class RefreshLinks extends Maintenance {
 
                        for ( $id = $start; $id <= $end; $id++ ) {
 
-                               if ( !( $id % $reportingInterval ) ) {
+                               if ( !( $id % self::REPORTING_INTERVAL ) ) {
                                        $this->output( "$id\n" );
                                        wfWaitForSlaves();
                                }
@@ -179,7 +190,7 @@ class RefreshLinks extends Maintenance {
 
                                for ( $id = $start; $id <= $end; $id++ ) {
 
-                                       if ( !( $id % $reportingInterval ) ) {
+                                       if ( !( $id % self::REPORTING_INTERVAL ) ) {
                                                $this->output( "$id\n" );
                                                wfWaitForSlaves();
                                        }
@@ -379,6 +390,7 @@ class RefreshLinks extends Maintenance {
         * @param string $var Field name
         * @param mixed $start First value to include or null
         * @param mixed $end Last value to include or null
+        * @return string
         */
        private static function intervalCond( IDatabase $db, $var, $start, $end ) {
                if ( $start === null && $end === null ) {
@@ -391,6 +403,87 @@ class RefreshLinks extends Maintenance {
                        return "$var BETWEEN {$db->addQuotes( $start )} AND {$db->addQuotes( $end )}";
                }
        }
+
+       /**
+        * Refreshes links for pages in a tracking category
+        *
+        * A tracking category key can resolve to several categories, and the list may
+        * name the same category more than once; every distinct category is refreshed
+        * exactly once.
+        *
+        * @param string $category Category key
+        */
+       private function refreshTrackingCategory( $category ) {
+               $cats = $this->getPossibleCategories( $category );
+
+               if ( !$cats ) {
+                       // Report the problem but don't bail out: with no categories the loop
+                       // below simply does nothing
+                       $this->error( "Tracking category '$category' is disabled\n" );
+               }
+
+               $seen = [];
+               foreach ( $cats as $cat ) {
+                       $dbKey = $cat->getDBkey();
+                       if ( isset( $seen[$dbKey] ) ) {
+                               // Same category listed again, its pages were already refreshed
+                               continue;
+                       }
+                       $seen[$dbKey] = true;
+                       $this->refreshCategory( $cat );
+               }
+       }
+
+       /**
+        * Refreshes links for all pages that are members of a category
+        *
+        * Members are read in batches of $this->mBatchSize rows ordered by
+        * (cl_timestamp, cl_from); each batch resumes right after the last row of the
+        * previous one. When $this->namespace is set, only pages in that namespace are
+        * processed.
+        *
+        * @param Title $category
+        */
+       private function refreshCategory( Title $category ) {
+               $this->output( "Refreshing pages in category '{$category->getText()}'...\n" );
+
+               $dbr = $this->getDB( DB_REPLICA );
+               $conds = [
+                       'page_id=cl_from',
+                       'cl_to' => $category->getDBkey(),
+               ];
+               if ( $this->namespace !== false ) {
+                       $conds['page_namespace'] = $this->namespace;
+               }
+
+               $i = 0;
+               // Position of the last processed row; the initial values sort before any real row
+               $timestamp = '';
+               $lastId = 0;
+               do {
+                       // Keep the raw and the SQL-quoted timestamp in separate variables
+                       $quotedTimestamp = $dbr->addQuotes( $timestamp );
+                       $finalConds = $conds;
+                       $finalConds[] = "(cl_timestamp > $quotedTimestamp " .
+                               "OR (cl_timestamp = $quotedTimestamp AND cl_from > $lastId))";
+                       $res = $dbr->select( [ 'page', 'categorylinks' ],
+                               [ 'page_id', 'cl_timestamp' ],
+                               $finalConds,
+                               __METHOD__,
+                               [
+                                       'ORDER BY' => [ 'cl_timestamp', 'cl_from' ],
+                                       'LIMIT' => $this->mBatchSize,
+                               ]
+                       );
+
+                       foreach ( $res as $row ) {
+                               if ( !( ++$i % self::REPORTING_INTERVAL ) ) {
+                                       $this->output( "$i\n" );
+                                       wfWaitForSlaves();
+                               }
+                               // Cast to int: the value is interpolated unquoted into the next query
+                               $lastId = (int)$row->page_id;
+                               $timestamp = $row->cl_timestamp;
+                               self::fixLinksFromArticle( $lastId );
+                       }
+
+                       // A short batch means the end of the category has been reached
+               } while ( $res->numRows() == $this->mBatchSize );
+       }
+
+       /**
+        * Looks up the categories that a tracking category key can resolve to
+        *
+        * An error is reported for a key that is not a known tracking category.
+        *
+        * @param string $categoryKey
+        * @return Title[]
+        */
+       private function getPossibleCategories( $categoryKey ) {
+               $tracker = new TrackingCategories( $this->getConfig() );
+               $known = $tracker->getTrackingCategories();
+               $entry = isset( $known[$categoryKey] ) ? $known[$categoryKey] : null;
+               if ( $entry !== null ) {
+                       return $entry['cats'];
+               }
+               // NOTE(review): the second argument presumably makes error() end the script,
+               // so nothing is returned on this path - confirm against Maintenance::error()
+               $this->error( "Unknown tracking category {$categoryKey}\n", true );
+       }
 }
 
 $maintClass = 'RefreshLinks';