addDescription( "Generate RDF dump of category changes in a wiki." );
		$this->setBatchSize( 200 );
		$this->addOption( 'output', "Output file (default is stdout). Will be overwritten.", false,
			true, 'o' );
		$this->addOption( 'start', 'Starting timestamp (inclusive), in ISO or Mediawiki format.',
			true, true, 's' );
		$this->addOption( 'end', 'Ending timestamp (exclusive), in ISO or Mediawiki format.', true,
			true, 'e' );
	}

	/**
	 * Initialize external service classes.
	 *
	 * Creates the Turtle writer and the CategoriesRdf helper that emits into it.
	 */
	public function initialize() {
		// SPARQL Update syntax is close to Turtle format, so we can use Turtle writer.
		$this->rdfWriter = new TurtleRdfWriter();
		$this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );
	}

	/**
	 * Produce the SPARQL Update script for all category changes between the
	 * 'start' (inclusive) and 'end' (exclusive) timestamps and write it to the
	 * 'output' option target (stdout when the option is absent or is '-').
	 *
	 * The script consists of the prefix declarations, one section per kind of
	 * change (deletes, moves, restores, additions, edits, categorization
	 * changes - in that order) and finally the update of the dump timestamp.
	 */
	public function execute() {
		$this->initialize();
		$startTS = new MWTimestamp( $this->getOption( "start" ) );

		$endTS = new MWTimestamp( $this->getOption( "end" ) );
		$now = new MWTimestamp();
		$rcMaxAge = $this->getConfig()->get( 'RCMaxAge' );

		// Too-old timestamps are only reported here; processing goes on afterwards.
		if ( $now->getTimestamp() - $startTS->getTimestamp() > $rcMaxAge ) {
			$this->error( "Start timestamp too old, maximum RC age is $rcMaxAge!" );
		}
		if ( $now->getTimestamp() - $endTS->getTimestamp() > $rcMaxAge ) {
			$this->error( "End timestamp too old, maximum RC age is $rcMaxAge!" );
		}

		$this->startTS = $startTS->getTimestamp();
		$this->endTS = $endTS->getTimestamp();

		$outFile = $this->getOption( 'output', 'php://stdout' );
		if ( $outFile === '-' ) {
			$outFile = 'php://stdout';
		}

		$output = fopen( $outFile, 'wb' );
		if ( $output === false ) {
			// Without a usable handle every fwrite() below would fail, so there
			// is no point in running the (slow) queries at all.
			$this->error( "Could not open output file $outFile for writing!" );
			return;
		}

		$this->categoriesRdf->setupPrefixes();
		$this->rdfWriter->start();

		$prefixes = $this->getRdf();
		// We have to strip @ from prefix, since SPARQL UPDATE doesn't use them
		// Also strip dot at the end.
		$prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes );
		fwrite( $output, $prefixes );

		$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

		// Deletes go first because if the page was deleted, other changes
		// do not matter. This only gets true deletes, i.e. not pages that were restored.
		$this->handleDeletes( $dbr, $output );
		// Moves go before additions because if category is moved, we should not process creation
		// as it would produce wrong data - because create row has old title
		$this->handleMoves( $dbr, $output );
		// We need to handle restores too since delete may have happened in previous update.
		$this->handleRestores( $dbr, $output );
		// Process newly added pages
		$this->handleAdds( $dbr, $output );
		// Process page edits
		$this->handleEdits( $dbr, $output );
		// Process categorization changes
		$this->handleCategorization( $dbr, $output );

		// Update timestamp
		fwrite( $output, $this->updateTS( $this->endTS ) );

		// Release our handle; for php://stdout this closes only the copy opened above.
		fclose( $output );
	}

	/**
	 * Get the text of SPARQL INSERT DATA clause
	 *
	 * Drains the RDF accumulated in the writer so far and wraps it into the
	 * SPARQL_INSERT template.
	 * @return string Empty string if no RDF has been accumulated.
	 */
	private function getInsertRdf() {
		$rdfText = $this->getRdf();
		if ( !$rdfText ) {
			return "";
		}
		return sprintf( self::SPARQL_INSERT, $rdfText );
	}

	/**
	 * Get SPARQL for updating set of categories
	 *
	 * The result is a comment line with the mark, a delete clause for all the
	 * given URIs and an insert clause with the RDF accumulated in the writer.
	 * Parent links for $pages are added to the accumulated RDF first.
	 * @param IDatabase $dbr
	 * @param string[] $deleteUrls List of URIs to be deleted, with <>
	 * @param string[] $pages List of categories: id => title
	 * @param string $mark Marks which operation requests the query
	 * @return string SPARQL query, empty string if there is nothing to delete
	 */
	private function getCategoriesUpdate( IDatabase $dbr, $deleteUrls, $pages, $mark ) {
		if ( empty( $deleteUrls ) ) {
			return "";
		}

		if ( !empty( $pages ) ) {
			$this->writeParentCategories( $dbr, $pages );
		}

		return "# $mark\n" . sprintf( self::SPARQL_DELETE, implode( ' ', $deleteUrls ) ) .
			$this->getInsertRdf();
	}

	/**
	 * Write parent data for a set of categories.
	 * The list has the child categories.
	 *
	 * For each subcategory link row found for the given page IDs, one link
	 * (child title, parent title) is written to the RDF writer.
	 * @param IDatabase $dbr
	 * @param string[] $pages List of child categories: id => title
	 */
	private function writeParentCategories( IDatabase $dbr, $pages ) {
		foreach ( $this->getCategoryLinksIterator( $dbr, array_keys( $pages ) ) as $row ) {
			$this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
		}
	}

	/**
	 * Generate SPARQL Update code for updating dump timestamp
	 * @param string|int $timestamp Timestamp for last change
	 * @return string SPARQL Update query for timestamp.
	 */
	public function updateTS( $timestamp ) {
		$dumpUrl = '<' . $this->categoriesRdf->getDumpURI() . '>';
		$ts = wfTimestamp( TS_ISO_8601, $timestamp );
		// NOTE(review): in this copy of the file the text between the heredoc opener
		// below and the end of the BatchRowIterator constructor call is missing (the
		// query template, the end of this method and the head of the iterator-setup
		// method that other methods call as setupChangesIterator()). Restore it from
		// version control rather than reconstructing it by hand.
		$tsQuery = <<mBatchSize );
		$this->addTimestampConditions( $it, $dbr );
		// Hidden-category flag and member counts come from left joins, so rows
		// without a matching page_props/category record are still returned.
		$it->addJoinConditions(
			[
				'page_props' => [
					'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ]
				],
				'category' => [
					'LEFT JOIN', [ 'cat_title = rc_title' ]
				]
			]
		);
		$it->setFetchColumns( array_merge( $columns, [
			'rc_title',
			'rc_cur_id',
			'pp_propname',
			'cat_pages',
			'cat_subcats',
			'cat_files'
		] ) );
		return $it;
	}

	/**
	 * Fetch newly created categories
	 * @param IDatabase $dbr
	 * @return BatchRowIterator
	 */
	protected function getNewCatsIterator( IDatabase $dbr ) {
		$it = $this->setupChangesIterator( $dbr );
		$it->addConditions( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 1,
		] );
		return $it;
	}

	/**
	 * Fetch moved categories
	 *
	 * Rows additionally carry page_title and page_namespace of the page record
	 * joined on rc_cur_id.
	 * @param IDatabase $dbr
	 * @return BatchRowIterator
	 */
	protected function getMovedCatsIterator( IDatabase $dbr ) {
		$it = $this->setupChangesIterator( $dbr, [ 'page_title', 'page_namespace' ], [ 'page' ] );
		$it->addConditions( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 0,
			'rc_log_type' => 'move',
			'rc_type' => RC_LOG,
		] );
		$it->addJoinConditions( [
			'page' => [ 'JOIN', 'rc_cur_id = page_id' ],
		] );
		$this->addIndex( $it );
		return $it;
	}

	/**
	 * Fetch deleted categories
	 *
	 * Only rc_cur_id and rc_title are fetched, and only for delete log entries
	 * whose page record no longer exists.
	 * @param IDatabase $dbr
	 * @return BatchRowIterator
	 */
	protected function getDeletedCatsIterator( IDatabase $dbr ) {
		$it = new BatchRowIterator( $dbr,
			'recentchanges',
			[ 'rc_timestamp' ],
			$this->mBatchSize
		);
		$this->addTimestampConditions( $it, $dbr );
		$it->addConditions( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 0,
			'rc_log_type' => 'delete',
			'rc_log_action' => 'delete',
			'rc_type' => RC_LOG,
			// We will fetch ones that do not have page record. If they do,
			// this means they were restored, thus restoring handler will pick it up.
			'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
		] );
		$this->addIndex( $it );
		$it->setFetchColumns( [ 'rc_cur_id', 'rc_title' ] );
		return $it;
	}

	/**
	 * Fetch restored categories
	 * @param IDatabase $dbr
	 * @return BatchRowIterator
	 */
	protected function getRestoredCatsIterator( IDatabase $dbr ) {
		$it = $this->setupChangesIterator( $dbr );
		$it->addConditions( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 0,
			'rc_log_type' => 'delete',
			'rc_log_action' => 'restore',
			'rc_type' => RC_LOG,
			// We will only fetch ones that have page record
			'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
		] );
		$this->addIndex( $it );
		return $it;
	}

	/**
	 * Fetch categorization changes or edits
	 * @param IDatabase $dbr
	 * @param int $type Value to match rc_type against; callers in this class
	 *  pass RC_EDIT or RC_CATEGORIZE
	 * @return BatchRowIterator
	 */
	protected function getChangedCatsIterator( IDatabase $dbr, $type ) {
		$it = $this->setupChangesIterator( $dbr );
		$it->addConditions( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 0,
			'rc_type' => $type,
		] );
		$this->addIndex( $it );
		return $it;
	}

	/**
	 * Add timestamp limits to iterator
	 *
	 * Restricts rc_timestamp to the half-open range [startTS, endTS).
	 * @param BatchRowIterator $it Iterator
	 * @param IDatabase $dbr
	 */
	private function addTimestampConditions( BatchRowIterator $it, IDatabase $dbr ) {
		$it->addConditions( [
			'rc_timestamp >= ' . $dbr->addQuotes( $dbr->timestamp( $this->startTS ) ),
			'rc_timestamp < ' . $dbr->addQuotes( $dbr->timestamp( $this->endTS ) ),
		] );
	}

	/**
	 * Need to force index, somehow on terbium the optimizer chooses wrong one
	 * @param BatchRowIterator $it
	 */
	private function addIndex( BatchRowIterator $it ) {
		$it->addOptions( [
			'USE INDEX' => [ 'recentchanges' => 'new_name_timestamp' ]
		] );
	}

	/**
	 * Get iterator for links for categories.
	 *
	 * Only links of type 'subcat' are returned. The batch iterator is wrapped
	 * into a RecursiveIteratorIterator; writeParentCategories() consumes the
	 * result row by row.
	 * @param IDatabase $dbr
	 * @param int[] $ids List of page IDs
	 * @return Traversable
	 */
	protected function getCategoryLinksIterator( IDatabase $dbr, array $ids ) {
		$it = new BatchRowIterator(
			$dbr,
			'categorylinks',
			[ 'cl_from', 'cl_to' ],
			$this->mBatchSize
		);
		$it->addConditions( [
			'cl_type' => 'subcat',
			'cl_from' => $ids
		] );
		$it->setFetchColumns( [ 'cl_from', 'cl_to' ] );
		return new RecursiveIteratorIterator( $it );
	}

	/**
	 * Get accumulated RDF.
	 *
	 * Returns whatever the RDF writer's drain() yields at this point.
	 * @return string
	 */
	public function getRdf() {
		return $this->rdfWriter->drain();
	}

	/**
	 * Handle category deletes.
	 *
	 * Writes one delete-only update per batch and marks every deleted page ID
	 * as processed so that later handlers skip it.
	 * @param IDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleDeletes( IDatabase $dbr, $output ) {
		// This only does "true" deletes - i.e. those that the page stays deleted
		foreach ( $this->getDeletedCatsIterator( $dbr ) as $batch ) {
			$deleteUrls = [];
			foreach ( $batch as $row ) {
				// This can produce duplicates, we don't care
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
				$this->processed[$row->rc_cur_id] = true;
			}
			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, [], "Deletes" ) );
		}
	}

	/**
	 * Write category data to RDF.
	 *
	 * Passes to CategoriesRdf::writeCategoryData() the title (rc_title), whether
	 * pp_propname is 'hiddencat', cat_pages minus cat_subcats minus cat_files,
	 * and cat_subcats.
	 * @param stdclass $row Database row
	 */
	private function writeCategoryData( $row ) {
		$this->categoriesRdf->writeCategoryData(
			$row->rc_title,
			$row->pp_propname === 'hiddencat',
			(int)$row->cat_pages - (int)$row->cat_subcats - (int)$row->cat_files,
			(int)$row->cat_subcats
		);
	}

	/**
	 * Handle category moves.
	 *
	 * The title in the change row (rc_title) is always scheduled for deletion;
	 * data is re-created under the title of the joined page record
	 * (page_title), unless the page was already processed or no longer is
	 * in the Category namespace.
	 * @param IDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleMoves( IDatabase $dbr, $output ) {
		foreach ( $this->getMovedCatsIterator( $dbr ) as $batch ) {
			$pages = [];
			$deleteUrls = [];
			foreach ( $batch as $row ) {
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';

				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
					// We already captured this one before
					continue;
				}

				if ( $row->page_namespace != NS_CATEGORY ) {
					// If page was moved out of Category:, we'll just delete
					continue;
				}
				// From here on the row describes the category under its page title.
				$row->rc_title = $row->page_title;
				$this->writeCategoryData( $row );
				$pages[$row->rc_cur_id] = $row->page_title;
				$this->processed[$row->rc_cur_id] = true;
			}

			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Moves" ) );
		}
	}

	/**
	 * Handle category restores.
	 *
	 * Insert-only: category data and parent links are written for every
	 * restored category that was not processed before.
	 * @param IDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleRestores( IDatabase $dbr, $output ) {
		fwrite( $output, "# Restores\n" );
		// This will only find those restores that were not deleted later.
		foreach ( $this->getRestoredCatsIterator( $dbr ) as $batch ) {
			$pages = [];
			foreach ( $batch as $row ) {
				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$pages[$row->rc_cur_id] = $row->rc_title;
				$this->processed[$row->rc_cur_id] = true;
			}

			if ( empty( $pages ) ) {
				continue;
			}

			$this->writeParentCategories( $dbr, $pages );

			fwrite( $output, $this->getInsertRdf() );
		}
	}

	/**
	 * Handle category additions.
	 *
	 * Insert-only: category data and parent links are written for every new
	 * category that was not processed before.
	 * @param IDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleAdds( IDatabase $dbr, $output ) {
		fwrite( $output, "# Additions\n" );
		foreach ( $this->getNewCatsIterator( $dbr ) as $batch ) {
			$pages = [];
			foreach ( $batch as $row ) {
				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$pages[$row->rc_cur_id] = $row->rc_title;
				$this->processed[$row->rc_cur_id] = true;
			}

			if ( empty( $pages ) ) {
				continue;
			}

			$this->writeParentCategories( $dbr, $pages );
			fwrite( $output, $this->getInsertRdf() );
		}
	}

	/**
	 * Handle edits for category texts
	 *
	 * Every edited category that was not processed before is deleted and
	 * re-created in full (category data plus parent links).
	 * @param IDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleEdits( IDatabase $dbr, $output ) {
		// Editing category can change hidden flag and add new parents.
		// TODO: it's pretty expensive to update all edited categories, and most edits
		// aren't actually interesting for us. Some way to know which are interesting?
		// We can capture recategorization on the next step, but not change in hidden status.
		foreach ( $this->getChangedCatsIterator( $dbr, RC_EDIT ) as $batch ) {
			$pages = [];
			$deleteUrls = [];
			foreach ( $batch as $row ) {
				// Note that on categorization event, cur_id points to
				// the child page, not the parent category!
				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$pages[$row->rc_cur_id] = $row->rc_title;
				$this->processed[$row->rc_cur_id] = true;
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
			}

			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Edits" ) );
		}
	}

	/**
	 * Handles categorization changes
	 *
	 * Both sides of each change are refreshed: the child page (looked up by
	 * rc_cur_id, only if it is a category) and the parent category (looked up
	 * by rc_title). Each refreshed category is deleted and re-created in full,
	 * including its parent links.
	 * @param IDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleCategorization( IDatabase $dbr, $output ) {
		// Titles of parent categories already refreshed; needed in addition to
		// $this->processed because a category row may have no page ID.
		$processedTitle = [];
		// Categorization change can add new parents and change counts
		// for the parent category.
		foreach ( $this->getChangedCatsIterator( $dbr, RC_CATEGORIZE ) as $batch ) {
			/*
			 * Note that on categorization event, cur_id points to
			 * the child page, not the parent category!
			 * So we need to have a two-stage process, since we have ID from one
			 * category and title from another, and we need both for proper updates.
			 * TODO: For now, we do full update even though some data hasn't changed,
			 * e.g. parents for parent cat and counts for child cat.
			 */
			$childPages = [];
			$parentCats = [];
			foreach ( $batch as $row ) {
				$childPages[$row->rc_cur_id] = true;
				$parentCats[$row->rc_title] = true;
			}

			$joinConditions = [
				'page_props' => [
					'LEFT JOIN',
					[ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
				],
				'category' => [
					'LEFT JOIN',
					[ 'cat_title = page_title' ],
				],
			];

			// Categories (id => title) whose parent links must be re-inserted,
			// because the delete clause below is issued for their URIs.
			$pages = [];
			$deleteUrls = [];

			if ( $childPages ) {
				// Load child rows by ID
				$childRows = $dbr->select(
					[ 'page', 'page_props', 'category' ],
					[
						'page_id',
						'rc_title' => 'page_title',
						'pp_propname',
						'cat_pages',
						'cat_subcats',
						'cat_files',
					],
					[ 'page_namespace' => NS_CATEGORY, 'page_id' => array_keys( $childPages ) ],
					__METHOD__,
					[],
					$joinConditions
				);
				foreach ( $childRows as $row ) {
					if ( isset( $this->processed[$row->page_id] ) ) {
						// We already captured this one before
						continue;
					}
					$this->writeCategoryData( $row );
					$pages[$row->page_id] = $row->rc_title;
					$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
					$this->processed[$row->page_id] = true;
				}
			}

			if ( $parentCats ) {
				// Load parent rows by title
				$joinConditions = [
					'page' => [
						'LEFT JOIN',
						[ 'page_title = cat_title', 'page_namespace' => NS_CATEGORY ],
					],
					'page_props' => [
						'LEFT JOIN',
						[ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
					],
				];

				$parentRows = $dbr->select(
					[ 'category', 'page', 'page_props' ],
					[
						'page_id',
						'rc_title' => 'cat_title',
						'pp_propname',
						'cat_pages',
						'cat_subcats',
						'cat_files',
					],
					[ 'cat_title' => array_keys( $parentCats ) ],
					__METHOD__,
					[],
					$joinConditions
				);
				foreach ( $parentRows as $row ) {
					if ( $row->page_id && isset( $this->processed[$row->page_id] ) ) {
						// We already captured this one before
						continue;
					}
					if ( isset( $processedTitle[$row->rc_title] ) ) {
						// We already captured this one before
						continue;
					}
					$this->writeCategoryData( $row );
					$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
					// The page is left-joined, so a category row may come without a page ID;
					// such a category cannot have category links of its own.
					if ( $row->page_id ) {
						$pages[$row->page_id] = $row->rc_title;
						$this->processed[$row->page_id] = true;
					}
					$processedTitle[$row->rc_title] = true;
				}
			}

			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" ) );
		}
	}
}

$maintClass = CategoryChangesAsRdf::class;
require_once RUN_MAINTENANCE_IF_MAIN;