Merge "Make DBAccessBase use DBConnRef, rename $wiki, and hide getLoadBalancer()"
[lhc/web/wiklou.git] / maintenance / categoryChangesAsRdf.php
1 <?php
2 /**
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License along
14 * with this program; if not, write to the Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 * http://www.gnu.org/copyleft/gpl.html
17 *
18 */
19 use Wikimedia\Purtle\RdfWriter;
20 use Wikimedia\Purtle\TurtleRdfWriter;
21 use Wikimedia\Rdbms\IDatabase;
22
23 require_once __DIR__ . '/Maintenance.php';
24
25 /**
26 * Maintenance script to provide RDF representation of the recent changes in category tree.
27 *
28 * @ingroup Maintenance
29 * @since 1.30
30 */
31 class CategoryChangesAsRdf extends Maintenance {
32 /**
33 * Insert query
34 */
35 const SPARQL_INSERT = <<<SPARQL
36 INSERT DATA {
37 %s
38 };
39
40 SPARQL;
41
42 /**
43 * Delete query
44 */
45 const SPARQL_DELETE = <<<SPARQLD
46 DELETE {
47 ?category ?x ?y
48 } WHERE {
49 ?category ?x ?y
50 VALUES ?category {
51 %s
52 }
53 };
54
55 SPARQLD;
56
57 /**
58 * Delete/Insert query
59 */
60 const SPARQL_DELETE_INSERT = <<<SPARQLDI
61 DELETE {
62 ?category ?x ?y
63 } INSERT {
64 %s
65 } WHERE {
66 ?category ?x ?y
67 VALUES ?category {
68 %s
69 }
70 };
71
72 SPARQLDI;
73
74 /**
75 * @var RdfWriter
76 */
77 private $rdfWriter;
78 /**
79 * Categories RDF helper.
80 * @var CategoriesRdf
81 */
82 private $categoriesRdf;
83
/**
 * Start of the processed time range; set in execute() from the
 * MWTimestamp built from the "start" option.
 */
private $startTS;
/**
 * End of the processed time range; set in execute() from the
 * MWTimestamp built from the "end" option.
 */
private $endTS;
86
87 /**
88 * List of processed page IDs,
89 * so we don't try to process same thing twice
90 * @var int[]
91 */
92 protected $processed = [];
93
/**
 * Register the script description, the batch size and the
 * command line options (output, start, end).
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( "Generate RDF dump of category changes in a wiki." );
	$this->setBatchSize( 200 );

	$this->addOption(
		'output',
		"Output file (default is stdout). Will be overwritten.",
		false,
		true,
		'o'
	);
	$this->addOption(
		'start',
		'Starting timestamp (inclusive), in ISO or Mediawiki format.',
		true,
		true,
		's'
	);
	$this->addOption(
		'end',
		'Ending timestamp (exclusive), in ISO or Mediawiki format.',
		true,
		true,
		'e'
	);
}
107
/**
 * Create the RDF writer and the categories RDF helper that shares it.
 */
public function initialize() {
	// SPARQL Update bodies are close enough to Turtle that the Turtle writer can be reused.
	$writer = new TurtleRdfWriter();

	$this->rdfWriter = $writer;
	$this->categoriesRdf = new CategoriesRdf( $writer );
}
116
/**
 * Script entry point.
 *
 * Writes the prefix declarations, then the SPARQL Update statements for
 * deletes, moves, restores, additions, edits and categorization changes
 * found in recentchanges between the "start" and "end" options, and finally
 * the statement that updates the dump timestamp.
 */
public function execute() {
	$this->initialize();
	$startTS = new MWTimestamp( $this->getOption( "start" ) );

	$endTS = new MWTimestamp( $this->getOption( "end" ) );
	$now = new MWTimestamp();
	$rcMaxAge = $this->getConfig()->get( 'RCMaxAge' );

	// These are reported through error(); processing still continues afterwards.
	if ( $now->getTimestamp() - $startTS->getTimestamp() > $rcMaxAge ) {
		$this->error( "Start timestamp too old, maximum RC age is $rcMaxAge!" );
	}
	if ( $now->getTimestamp() - $endTS->getTimestamp() > $rcMaxAge ) {
		$this->error( "End timestamp too old, maximum RC age is $rcMaxAge!" );
	}

	$this->startTS = $startTS->getTimestamp();
	$this->endTS = $endTS->getTimestamp();

	$outFile = $this->getOption( 'output', 'php://stdout' );
	if ( $outFile === '-' ) {
		$outFile = 'php://stdout';
	}

	$output = fopen( $outFile, 'wb' );
	if ( $output === false ) {
		// Without a valid handle every fwrite() below would fail, so stop right away.
		$this->fatalError( "Could not open output file '$outFile' for writing." );
	}

	$this->categoriesRdf->setupPrefixes();
	$this->rdfWriter->start();

	$prefixes = $this->getRdf();
	// We have to strip @ from prefix, since SPARQL UPDATE doesn't use them
	// Also strip dot at the end.
	$prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes );
	fwrite( $output, $prefixes );

	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	// Deletes go first because if the page was deleted, other changes
	// do not matter. This only gets true deletes, i.e. not pages that were restored.
	$this->handleDeletes( $dbr, $output );
	// Moves go before additions because if category is moved, we should not process creation
	// as it would produce wrong data - because create row has old title
	$this->handleMoves( $dbr, $output );
	// We need to handle restores too since delete may have happened in previous update.
	$this->handleRestores( $dbr, $output );
	// Process newly added pages
	$this->handleAdds( $dbr, $output );
	// Process page edits
	$this->handleEdits( $dbr, $output );
	// Process categorization changes
	$this->handleCategorization( $dbr, $output );

	// Update timestamp
	fwrite( $output, $this->updateTS( $this->endTS ) );

	// Release the handle opened above; for php://stdout this only closes our own copy.
	fclose( $output );
}
171
/**
 * Drain the accumulated RDF and wrap it into a SPARQL INSERT DATA clause.
 * @return string The clause, or an empty string when there is no RDF text
 */
private function getInsertRdf() {
	$accumulated = $this->getRdf();

	return $accumulated ? sprintf( self::SPARQL_INSERT, $accumulated ) : "";
}
183
/**
 * Build the SPARQL that replaces the data of a set of categories:
 * a DELETE for the given URIs followed by an INSERT of the accumulated RDF.
 * @param IDatabase $dbr
 * @param string[] $deleteUrls List of URIs to be deleted, with <>
 * @param string[] $pages List of categories: id => title
 * @param string $mark Marks which operation requests the query
 * @return string SPARQL query, empty string when there is nothing to delete
 */
private function getCategoriesUpdate( IDatabase $dbr, $deleteUrls, $pages, $mark ) {
	if ( !$deleteUrls ) {
		return "";
	}

	// Parent links have to be in the writer before the RDF is drained below.
	if ( $pages ) {
		$this->writeParentCategories( $dbr, $pages );
	}

	$deleteClause = sprintf( self::SPARQL_DELETE, implode( ' ', $deleteUrls ) );
	$insertClause = $this->getInsertRdf();

	return "# $mark\n" . $deleteClause . $insertClause;
}
204
/**
 * Write the parent category links for a set of child categories.
 * @param IDatabase $dbr
 * @param string[] $pages List of child categories: id => title
 */
private function writeParentCategories( IDatabase $dbr, $pages ) {
	$childIds = array_keys( $pages );
	$links = $this->getCategoryLinksIterator( $dbr, $childIds );

	foreach ( $links as $link ) {
		$childTitle = $pages[$link->cl_from];
		$this->categoriesRdf->writeCategoryLinkData( $childTitle, $link->cl_to );
	}
}
216
/**
 * Build the SPARQL Update statements that replace the schema:dateModified
 * value of the dump URI with the given timestamp.
 * @param string|int $timestamp Timestamp for last change
 * @return string SPARQL Update query for timestamp.
 */
public function updateTS( $timestamp ) {
	$dumpNode = '<' . $this->categoriesRdf->getDumpURI() . '>';
	$isoTime = wfTimestamp( TS_ISO_8601, $timestamp );

	return <<<SPARQL
DELETE {
$dumpNode schema:dateModified ?o .
}
WHERE {
$dumpNode schema:dateModified ?o .
};
INSERT DATA {
$dumpNode schema:dateModified "$isoTime"^^xsd:dateTime .
}

SPARQL;
}
239
/**
 * Build the common batch iterator over recentchanges, left-joined with
 * page_props (hiddencat property) and category (member counts).
 * @param IDatabase $dbr
 * @param string[] $columns List of additional fields to get
 * @param string[] $extra_tables List of additional tables to join
 * @return BatchRowIterator
 */
private function setupChangesIterator(
	IDatabase $dbr,
	array $columns = [],
	array $extra_tables = []
) {
	$allTables = array_merge( [ 'recentchanges', 'page_props', 'category' ], $extra_tables );
	$baseColumns = [
		'rc_title',
		'rc_cur_id',
		'pp_propname',
		'cat_pages',
		'cat_subcats',
		'cat_files'
	];
	$joins = [
		'page_props' => [
			'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ]
		],
		'category' => [
			'LEFT JOIN', [ 'cat_title = rc_title' ]
		]
	];

	$iterator = new BatchRowIterator( $dbr, $allTables, [ 'rc_timestamp' ], $this->mBatchSize );
	$this->addTimestampConditions( $iterator, $dbr );
	$iterator->addJoinConditions( $joins );
	$iterator->setFetchColumns( array_merge( $columns, $baseColumns ) );

	return $iterator;
}
282
/**
 * Iterator over recentchanges rows for pages created in the Category namespace.
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getNewCatsIterator( IDatabase $dbr ) {
	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 1,
	];

	$iterator = $this->setupChangesIterator( $dbr );
	$iterator->addConditions( $conditions );

	return $iterator;
}
296
/**
 * Iterator over move log entries for category pages, joined with the
 * page table so the current title and namespace are available.
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getMovedCatsIterator( IDatabase $dbr ) {
	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'move',
		'rc_type' => RC_LOG,
	];
	$pageJoin = [
		'page' => [ 'JOIN', 'rc_cur_id = page_id' ],
	];

	$iterator = $this->setupChangesIterator(
		$dbr,
		[ 'page_title', 'page_namespace' ],
		[ 'page' ]
	);
	$iterator->addConditions( $conditions );
	$iterator->addJoinConditions( $pageJoin );
	$this->addIndex( $iterator );

	return $iterator;
}
316
/**
 * Iterator over delete log entries for category pages that currently
 * have no row in the page table.
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getDeletedCatsIterator( IDatabase $dbr ) {
	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'delete',
		'rc_type' => RC_LOG,
		// Only rows without a page record are wanted. A page record means the
		// category was restored, and the restore handler takes care of that.
		'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
	];

	$iterator = new BatchRowIterator(
		$dbr,
		'recentchanges',
		[ 'rc_timestamp' ],
		$this->mBatchSize
	);
	$this->addTimestampConditions( $iterator, $dbr );
	$iterator->addConditions( $conditions );
	$this->addIndex( $iterator );
	$iterator->setFetchColumns( [ 'rc_cur_id', 'rc_title' ] );

	return $iterator;
}
343
/**
 * Iterator over restore log entries for category pages that currently
 * have a row in the page table.
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getRestoredCatsIterator( IDatabase $dbr ) {
	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'restore',
		'rc_type' => RC_LOG,
		// Restrict to entries whose page record exists
		'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
	];

	$iterator = $this->setupChangesIterator( $dbr );
	$iterator->addConditions( $conditions );
	$this->addIndex( $iterator );

	return $iterator;
}
363
/**
 * Iterator over non-creation recentchanges rows of the given type
 * in the Category namespace (used for edits and categorization changes).
 * @param IDatabase $dbr
 * @param int $type Value matched against rc_type (callers pass RC_EDIT or RC_CATEGORIZE)
 * @return BatchRowIterator
 */
protected function getChangedCatsIterator( IDatabase $dbr, $type ) {
	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_type' => $type,
	];

	$iterator = $this->setupChangesIterator( $dbr );
	$iterator->addConditions( $conditions );
	$this->addIndex( $iterator );

	return $iterator;
}
380
/**
 * Restrict an iterator to rows with startTS <= rc_timestamp < endTS.
 * @param BatchRowIterator $it Iterator
 * @param IDatabase $dbr
 */
private function addTimestampConditions( BatchRowIterator $it, IDatabase $dbr ) {
	$lowerBound = $dbr->addQuotes( $dbr->timestamp( $this->startTS ) );
	$upperBound = $dbr->addQuotes( $dbr->timestamp( $this->endTS ) );

	$it->addConditions( [
		'rc_timestamp >= ' . $lowerBound,
		'rc_timestamp < ' . $upperBound,
	] );
}
392
/**
 * Force the new_name_timestamp index on recentchanges; the optimizer
 * was seen picking the wrong one on terbium.
 * @param BatchRowIterator $it
 */
private function addIndex( BatchRowIterator $it ) {
	$forcedIndex = [ 'recentchanges' => 'new_name_timestamp' ];

	$it->addOptions( [ 'USE INDEX' => $forcedIndex ] );
}
402
/**
 * Iterator over the subcategory links (cl_from, cl_to) of the given pages.
 * @param IDatabase $dbr
 * @param int[] $ids List of page IDs
 * @return Traversable
 */
protected function getCategoryLinksIterator( IDatabase $dbr, array $ids ) {
	$linkColumns = [ 'cl_from', 'cl_to' ];

	$batches = new BatchRowIterator( $dbr, 'categorylinks', $linkColumns, $this->mBatchSize );
	$batches->addConditions( [
		'cl_type' => 'subcat',
		'cl_from' => $ids
	] );
	$batches->setFetchColumns( [ 'cl_from', 'cl_to' ] );

	// Wrapped so that callers iterate over rows rather than over batches.
	return new RecursiveIteratorIterator( $batches );
}
423
/**
 * Drain the RDF writer and return what it has accumulated.
 * @return string
 */
public function getRdf() {
	$drained = $this->rdfWriter->drain();

	return $drained;
}
431
/**
 * Write DELETE statements for categories that were deleted and stay deleted.
 * @param IDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleDeletes( IDatabase $dbr, $output ) {
	$batches = $this->getDeletedCatsIterator( $dbr );

	foreach ( $batches as $rows ) {
		$urls = [];
		foreach ( $rows as $deleted ) {
			$this->processed[$deleted->rc_cur_id] = true;
			// Duplicate URIs may end up in the list; that is harmless.
			$urls[] = '<' . $this->categoriesRdf->labelToUrl( $deleted->rc_title ) . '>';
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $urls, [], "Deletes" );
		fwrite( $output, $sparql );
	}
}
449
/**
 * Pass one category row to the RDF helper: title, hidden flag,
 * page count (total minus subcategories and files) and subcategory count.
 * @param stdClass $row Database row
 */
private function writeCategoryData( $row ) {
	$isHidden = $row->pp_propname === 'hiddencat';
	$subcatCount = (int)$row->cat_subcats;
	$pageCount = (int)$row->cat_pages - $subcatCount - (int)$row->cat_files;

	$this->categoriesRdf->writeCategoryData(
		$row->rc_title,
		$isHidden,
		$pageCount,
		$subcatCount
	);
}
462
/**
 * Write updates for moved categories: the old title is always deleted,
 * and data for the current title is written when the page is still a category.
 * @param IDatabase $dbr
 * @param resource $output
 */
public function handleMoves( IDatabase $dbr, $output ) {
	$batches = $this->getMovedCatsIterator( $dbr );

	foreach ( $batches as $rows ) {
		$oldUrls = [];
		$moved = [];

		foreach ( $rows as $row ) {
			$pageId = $row->rc_cur_id;
			$oldUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';

			// Skip pages that were already handled, and pages that were moved
			// out of Category: (for those the delete alone is enough).
			if ( isset( $this->processed[$pageId] ) || $row->page_namespace != NS_CATEGORY ) {
				continue;
			}

			// From here on the row describes the category under its current title.
			$row->rc_title = $row->page_title;
			$this->writeCategoryData( $row );
			$moved[$pageId] = $row->page_title;
			$this->processed[$pageId] = true;
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $oldUrls, $moved, "Moves" );
		fwrite( $output, $sparql );
	}
}
492
/**
 * Write INSERT statements for restored categories that were not
 * already handled by an earlier step.
 * @param IDatabase $dbr
 * @param resource $output
 */
public function handleRestores( IDatabase $dbr, $output ) {
	fwrite( $output, "# Restores\n" );

	// Only restores that were not followed by another delete are returned here.
	$batches = $this->getRestoredCatsIterator( $dbr );
	foreach ( $batches as $rows ) {
		$restored = [];
		foreach ( $rows as $row ) {
			$pageId = $row->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $row );
				$restored[$pageId] = $row->rc_title;
				$this->processed[$pageId] = true;
			}
		}

		if ( $restored ) {
			$this->writeParentCategories( $dbr, $restored );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
521
/**
 * Write INSERT statements for newly created categories that were not
 * already handled by an earlier step.
 * @param IDatabase $dbr
 * @param resource $output
 */
public function handleAdds( IDatabase $dbr, $output ) {
	fwrite( $output, "# Additions\n" );

	$batches = $this->getNewCatsIterator( $dbr );
	foreach ( $batches as $rows ) {
		$created = [];
		foreach ( $rows as $row ) {
			$pageId = $row->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $row );
				$created[$pageId] = $row->rc_title;
				$this->processed[$pageId] = true;
			}
		}

		if ( $created ) {
			$this->writeParentCategories( $dbr, $created );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
548
/**
 * Write delete-and-reinsert updates for categories whose page was edited.
 * @param IDatabase $dbr
 * @param resource $output
 */
public function handleEdits( IDatabase $dbr, $output ) {
	// An edit of a category page may flip the hidden flag and add new parents.
	// TODO: refreshing every edited category is expensive and most edits are not
	// interesting for us. Is there a way to tell which ones are?
	// Recategorization is caught by the next step, a change of hidden status is not.
	$batches = $this->getChangedCatsIterator( $dbr, RC_EDIT );

	foreach ( $batches as $rows ) {
		$urls = [];
		$edited = [];

		foreach ( $rows as $row ) {
			$pageId = $row->rc_cur_id;
			// Pages captured by an earlier step are not written again.
			if ( isset( $this->processed[$pageId] ) ) {
				continue;
			}

			$this->writeCategoryData( $row );
			$edited[$pageId] = $row->rc_title;
			$this->processed[$pageId] = true;
			$urls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $urls, $edited, "Edits" );
		fwrite( $output, $sparql );
	}
}
578
/**
 * Handles categorization changes.
 *
 * For every batch of categorization events, both the child pages (by page ID)
 * and the parent categories (by title) are fully rewritten: their existing
 * triples are deleted and category data plus parent links are re-inserted.
 * @param IDatabase $dbr
 * @param resource $output
 */
public function handleCategorization( IDatabase $dbr, $output ) {
	$processedTitle = [];
	// Categorization change can add new parents and change counts
	// for the parent category.
	foreach ( $this->getChangedCatsIterator( $dbr, RC_CATEGORIZE ) as $batch ) {
		/*
		 * Note that on categorization event, cur_id points to
		 * the child page, not the parent category!
		 * So we need to have a two-stage process, since we have ID from one
		 * category and title from another, and we need both for proper updates.
		 * TODO: For now, we do full update even though some data hasn't changed,
		 * e.g. parents for parent cat and counts for child cat.
		 */
		$childPages = [];
		$parentCats = [];
		foreach ( $batch as $row ) {
			$childPages[$row->rc_cur_id] = true;
			$parentCats[$row->rc_title] = true;
		}

		$joinConditions = [
			'page_props' => [
				'LEFT JOIN',
				[ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
			],
			'category' => [
				'LEFT JOIN',
				[ 'cat_title = page_title' ],
			],
		];

		// id => title of every category rewritten in this batch. The DELETE below
		// removes all triples of these categories, parent links included, so the
		// list must reach getCategoriesUpdate() for the links to be written back.
		$pages = [];
		$deleteUrls = [];

		if ( $childPages ) {
			// Load child rows by ID
			$childRows = $dbr->select(
				[ 'page', 'page_props', 'category' ],
				[
					'page_id',
					'rc_title' => 'page_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				],
				[ 'page_namespace' => NS_CATEGORY, 'page_id' => array_keys( $childPages ) ],
				__METHOD__,
				[],
				$joinConditions
			);
			foreach ( $childRows as $row ) {
				if ( isset( $this->processed[$row->page_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$pages[$row->page_id] = $row->rc_title;
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
				$this->processed[$row->page_id] = true;
			}
		}

		if ( $parentCats ) {
			// Load parent rows by title
			$joinConditions = [
				'page' => [
					'LEFT JOIN',
					[ 'page_title = cat_title', 'page_namespace' => NS_CATEGORY ],
				],
				'page_props' => [
					'LEFT JOIN',
					[ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
				],
			];

			$parentRows = $dbr->select(
				[ 'category', 'page', 'page_props' ],
				[
					'page_id',
					'rc_title' => 'cat_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				],
				[ 'cat_title' => array_keys( $parentCats ) ],
				__METHOD__,
				[],
				$joinConditions
			);
			foreach ( $parentRows as $row ) {
				if ( $row->page_id && isset( $this->processed[$row->page_id] ) ) {
					// We already captured this one before
					continue;
				}
				if ( isset( $processedTitle[$row->rc_title] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
				// The page join is a LEFT JOIN, so a category row may come without a
				// page ID; such rows cannot have category links looked up by ID.
				if ( $row->page_id ) {
					$pages[$row->page_id] = $row->rc_title;
					$this->processed[$row->page_id] = true;
				}
				$processedTitle[$row->rc_title] = true;
			}
		}

		fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" ) );
	}
}
695 }
696
697 $maintClass = CategoryChangesAsRdf::class;
698 require_once RUN_MAINTENANCE_IF_MAIN;