X-Git-Url: https://git.heureux-cyclage.org/?a=blobdiff_plain;f=maintenance%2FcategoryChangesAsRdf.php;h=1d85dccd151bbcb1072e06e733e64d71c23eaf53;hb=d965b0b4652b566b1f53be756c13190b958dd7fa;hp=d61335212c300d7b8dcc2afe5b660ef9359973bd;hpb=fd08137ebf8b34cb1a9711c34d6c4d0357aea5b7;p=lhc%2Fweb%2Fwiklou.git diff --git a/maintenance/categoryChangesAsRdf.php b/maintenance/categoryChangesAsRdf.php index d61335212c..1d85dccd15 100644 --- a/maintenance/categoryChangesAsRdf.php +++ b/maintenance/categoryChangesAsRdf.php @@ -157,8 +157,12 @@ SPARQLDI; $this->handleMoves( $dbr, $output ); // We need to handle restores too since delete may have happened in previous update. $this->handleRestores( $dbr, $output ); + // Process newly added pages $this->handleAdds( $dbr, $output ); - $this->handleChanges( $dbr, $output ); + // Process page edits + $this->handleEdits( $dbr, $output ); + // Process categorization changes + $this->handleCategorization( $dbr, $output ); // Update timestamp fwrite( $output, $this->updateTS( $this->endTS ) ); @@ -198,9 +202,10 @@ SPARQLDI; } /** - * Write data for a set of categories + * Write parent data for a set of categories. + * The list has the child categories. * @param IDatabase $dbr - * @param string[] $pages List of categories: id => title + * @param string[] $pages List of child categories: id => title */ private function writeParentCategories( IDatabase $dbr, $pages ) { foreach ( $this->getCategoryLinksIterator( $dbr, array_keys( $pages ) ) as $row ) { @@ -302,7 +307,7 @@ SPARQL; 'rc_type' => RC_LOG, ] ); $it->addJoinConditions( [ - 'page' => [ 'INNER JOIN', 'rc_cur_id = page_id' ], + 'page' => [ 'JOIN', 'rc_cur_id = page_id' ], ] ); $this->addIndex( $it ); return $it; @@ -356,16 +361,17 @@ SPARQL; } /** - * Fetch categorization changes + * Fetch categorization changes or edits * @param IDatabase $dbr * @return BatchRowIterator */ - protected function getChangedCatsIterator( IDatabase $dbr ) { - $it = $this->setupChangesIterator( $dbr ); + protected function getChangedCatsIterator( IDatabase $dbr, $type ) { + $it = + $this->setupChangesIterator( $dbr ); $it->addConditions( [ 'rc_namespace' => NS_CATEGORY, 'rc_new' => 0, - 'rc_type' => [ RC_EDIT, RC_CATEGORIZE ], + 'rc_type' => $type, ] ); $this->addIndex( $it ); return $it; @@ -396,7 +402,7 @@ SPARQL; /** * Get iterator for links for categories. * @param IDatabase $dbr - * @param array $ids List of page IDs + * @param int[] $ids List of page IDs * @return Traversable */ protected function getCategoryLinksIterator( IDatabase $dbr, array $ids ) { @@ -540,14 +546,21 @@ SPARQL; } /** + * Handle edits for category texts * @param IDatabase $dbr * @param resource $output */ - public function handleChanges( IDatabase $dbr, $output ) { - foreach ( $this->getChangedCatsIterator( $dbr ) as $batch ) { + public function handleEdits( IDatabase $dbr, $output ) { + // Editing category can change hidden flag and add new parents. + // TODO: it's pretty expensive to update all edited categories, and most edits + // aren't actually interesting for us. Some way to know which are interesting? + // We can capture recategorization on the next step, but not change in hidden status. + foreach ( $this->getChangedCatsIterator( $dbr, RC_EDIT ) as $batch ) { $pages = []; $deleteUrls = []; foreach ( $batch as $row ) { + // Note that on categorization event, cur_id points to + // the child page, not the parent category! if ( isset( $this->processed[$row->rc_cur_id] ) ) { // We already captured this one before continue; @@ -558,6 +571,121 @@ SPARQL; $deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>'; } + fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Edits" ) ); + } + } + + /** + * Handles categorization changes + * @param IDatabase $dbr + * @param resource $output + */ + public function handleCategorization( IDatabase $dbr, $output ) { + $processedTitle = []; + // Categorization change can add new parents and change counts + // for the parent category. + foreach ( $this->getChangedCatsIterator( $dbr, RC_CATEGORIZE ) as $batch ) { + /* + * Note that on categorization event, cur_id points to + * the child page, not the parent category! + * So we need to have a two-stage process, since we have ID from one + * category and title from another, and we need both for proper updates. + * TODO: For now, we do full update even though some data hasn't changed, + * e.g. parents for parent cat and counts for child cat. + */ + foreach ( $batch as $row ) { + $childPages[$row->rc_cur_id] = true; + $parentCats[$row->rc_title] = true; + } + + $joinConditions = [ + 'page_props' => [ + 'LEFT JOIN', + [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ], + ], + 'category' => [ + 'LEFT JOIN', + [ 'cat_title = page_title' ], + ], + ]; + + $pages = []; + $deleteUrls = []; + + if ( !empty( $childPages ) ) { + // Load child rows by ID + $childRows = $dbr->select( + [ 'page', 'page_props', 'category' ], + [ + 'page_id', + 'rc_title' => 'page_title', + 'pp_propname', + 'cat_pages', + 'cat_subcats', + 'cat_files', + ], + [ 'page_namespace' => NS_CATEGORY, 'page_id' => array_keys( $childPages ) ], + __METHOD__, + [], + $joinConditions + ); + foreach ( $childRows as $row ) { + if ( isset( $this->processed[$row->page_id] ) ) { + // We already captured this one before + continue; + } + $this->writeCategoryData( $row ); + $deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>'; + $this->processed[$row->page_id] = true; + } + } + + if ( !empty( $parentCats ) ) { + // Load parent rows by title + $joinConditions = [ + 'page' => [ + 'LEFT JOIN', + [ 'page_title = cat_title', 'page_namespace' => NS_CATEGORY ], + ], + 'page_props' => [ + 'LEFT JOIN', + [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ], + ], + ]; + + $parentRows = $dbr->select( + [ 'category', 'page', 'page_props' ], + [ + 'page_id', + 'rc_title' => 'cat_title', + 'pp_propname', + 'cat_pages', + 'cat_subcats', + 'cat_files', + ], + [ 'cat_title' => array_keys( $parentCats ) ], + __METHOD__, + [], + $joinConditions + ); + foreach ( $parentRows as $row ) { + if ( $row->page_id && isset( $this->processed[$row->page_id] ) ) { + // We already captured this one before + continue; + } + if ( isset( $processedTitle[$row->rc_title] ) ) { + // We already captured this one before + continue; + } + $this->writeCategoryData( $row ); + $deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>'; + if ( $row->page_id ) { + $this->processed[$row->page_id] = true; + } + $processedTitle[$row->rc_title] = true; + } + } + fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" ) ); } }