Fix SQLite patch-(page|template)links-fix-pk.sql column order
[lhc/web/wiklou.git] / maintenance / categoryChangesAsRdf.php
index d613352..1d85dcc 100644 (file)
@@ -157,8 +157,12 @@ SPARQLDI;
                $this->handleMoves( $dbr, $output );
                // We need to handle restores too since delete may have happened in previous update.
                $this->handleRestores( $dbr, $output );
+               // Process newly added pages
                $this->handleAdds( $dbr, $output );
-               $this->handleChanges( $dbr, $output );
+               // Process page edits
+               $this->handleEdits( $dbr, $output );
+               // Process categorization changes
+               $this->handleCategorization( $dbr, $output );
 
                // Update timestamp
                fwrite( $output, $this->updateTS( $this->endTS ) );
@@ -198,9 +202,10 @@ SPARQLDI;
        }
 
        /**
-        * Write data for a set of categories
+        * Write parent data for a set of categories.
+        * The list has the child categories.
         * @param IDatabase $dbr
-        * @param string[] $pages List of categories: id => title
+        * @param string[] $pages List of child categories: id => title
         */
        private function writeParentCategories( IDatabase $dbr, $pages ) {
                foreach ( $this->getCategoryLinksIterator( $dbr, array_keys( $pages ) ) as $row ) {
@@ -302,7 +307,7 @@ SPARQL;
                        'rc_type' => RC_LOG,
                ] );
                $it->addJoinConditions( [
-                       'page' => [ 'INNER JOIN', 'rc_cur_id = page_id' ],
+                       'page' => [ 'JOIN', 'rc_cur_id = page_id' ],
                ] );
                $this->addIndex( $it );
                return $it;
@@ -356,16 +361,17 @@ SPARQL;
        }
 
        /**
-        * Fetch categorization changes
+        * Fetch categorization changes or edits
         * @param IDatabase $dbr
         * @return BatchRowIterator
         */
-       protected function getChangedCatsIterator( IDatabase $dbr ) {
-               $it = $this->setupChangesIterator( $dbr );
+       protected function getChangedCatsIterator( IDatabase $dbr, $type ) {
+               $it =
+                       $this->setupChangesIterator( $dbr );
                $it->addConditions( [
                        'rc_namespace' => NS_CATEGORY,
                        'rc_new' => 0,
-                       'rc_type' => [ RC_EDIT, RC_CATEGORIZE ],
+                       'rc_type' => $type,
                ] );
                $this->addIndex( $it );
                return $it;
@@ -396,7 +402,7 @@ SPARQL;
        /**
         * Get iterator for links for categories.
         * @param IDatabase $dbr
-        * @param array $ids List of page IDs
+        * @param int[] $ids List of page IDs
         * @return Traversable
         */
        protected function getCategoryLinksIterator( IDatabase $dbr, array $ids ) {
@@ -540,14 +546,21 @@ SPARQL;
        }
 
        /**
+        * Handle edits for category texts
         * @param IDatabase $dbr
         * @param resource $output
         */
-       public function handleChanges( IDatabase $dbr, $output ) {
-               foreach ( $this->getChangedCatsIterator( $dbr ) as $batch ) {
+       public function handleEdits( IDatabase $dbr, $output ) {
+               // Editing category can change hidden flag and add new parents.
+               // TODO: it's pretty expensive to update all edited categories, and most edits
+               // aren't actually interesting for us. Some way to know which are interesting?
+               // We can capture recategorization on the next step, but not change in hidden status.
+               foreach ( $this->getChangedCatsIterator( $dbr, RC_EDIT ) as $batch ) {
                        $pages = [];
                        $deleteUrls = [];
                        foreach ( $batch as $row ) {
+                               // Note that on categorization event, cur_id points to
+                               // the child page, not the parent category!
                                if ( isset( $this->processed[$row->rc_cur_id] ) ) {
                                        // We already captured this one before
                                        continue;
@@ -558,6 +571,121 @@ SPARQL;
                                $deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
                        }
 
+                       fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Edits" ) );
+               }
+       }
+
+       /**
+        * Handles categorization changes
+        * @param IDatabase $dbr
+        * @param resource $output
+        */
+       public function handleCategorization( IDatabase $dbr, $output ) {
+               $processedTitle = [];
+               // Categorization change can add new parents and change counts
+               // for the parent category.
+               foreach ( $this->getChangedCatsIterator( $dbr, RC_CATEGORIZE ) as $batch ) {
+                       /*
+                        * Note that on categorization event, cur_id points to
+                        * the child page, not the parent category!
+                        * So we need to have a two-stage process, since we have ID from one
+                        * category and title from another, and we need both for proper updates.
+                        * TODO: For now, we do full update even though some data hasn't changed,
+                        * e.g. parents for parent cat and counts for child cat.
+                        */
+                       foreach ( $batch as $row ) {
+                               $childPages[$row->rc_cur_id] = true;
+                               $parentCats[$row->rc_title] = true;
+                       }
+
+                       $joinConditions = [
+                               'page_props' => [
+                                       'LEFT JOIN',
+                                       [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
+                               ],
+                               'category' => [
+                                       'LEFT JOIN',
+                                       [ 'cat_title = page_title' ],
+                               ],
+                       ];
+
+                       $pages = [];
+                       $deleteUrls = [];
+
+                       if ( !empty( $childPages ) ) {
+                               // Load child rows by ID
+                               $childRows = $dbr->select(
+                                       [ 'page', 'page_props', 'category' ],
+                                       [
+                                               'page_id',
+                                               'rc_title' => 'page_title',
+                                               'pp_propname',
+                                               'cat_pages',
+                                               'cat_subcats',
+                                               'cat_files',
+                                       ],
+                                       [ 'page_namespace' => NS_CATEGORY, 'page_id' => array_keys( $childPages ) ],
+                                       __METHOD__,
+                                       [],
+                                       $joinConditions
+                               );
+                               foreach ( $childRows as $row ) {
+                                       if ( isset( $this->processed[$row->page_id] ) ) {
+                                               // We already captured this one before
+                                               continue;
+                                       }
+                                       $this->writeCategoryData( $row );
+                                       $deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
+                                       $this->processed[$row->page_id] = true;
+                               }
+                       }
+
+                       if ( !empty( $parentCats ) ) {
+                               // Load parent rows by title
+                               $joinConditions = [
+                                       'page' => [
+                                               'LEFT JOIN',
+                                               [ 'page_title = cat_title', 'page_namespace' => NS_CATEGORY ],
+                                       ],
+                                       'page_props' => [
+                                               'LEFT JOIN',
+                                               [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
+                                       ],
+                               ];
+
+                               $parentRows = $dbr->select(
+                                       [ 'category', 'page', 'page_props' ],
+                                       [
+                                               'page_id',
+                                               'rc_title' => 'cat_title',
+                                               'pp_propname',
+                                               'cat_pages',
+                                               'cat_subcats',
+                                               'cat_files',
+                                       ],
+                                       [ 'cat_title' => array_keys( $parentCats ) ],
+                                       __METHOD__,
+                                       [],
+                                       $joinConditions
+                               );
+                               foreach ( $parentRows as $row ) {
+                                       if ( $row->page_id && isset( $this->processed[$row->page_id] ) ) {
+                                               // We already captured this one before
+                                               continue;
+                                       }
+                                       if ( isset( $processedTitle[$row->rc_title] ) ) {
+                                               // We already captured this one before
+                                               continue;
+                                       }
+                                       $this->writeCategoryData( $row );
+                                       $deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
+                                       if ( $row->page_id ) {
+                                               $this->processed[$row->page_id] = true;
+                                       }
+                                       $processedTitle[$row->rc_title] = true;
+                               }
+                       }
+
                        fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" ) );
                }
        }