Merge "Improve docs for Title::getInternalURL/getCanonicalURL"
[lhc/web/wiklou.git] / maintenance / categoryChangesAsRdf.php
1 <?php
2 /**
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License along
14 * with this program; if not, write to the Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 * http://www.gnu.org/copyleft/gpl.html
17 *
18 */
19 use Wikimedia\Purtle\RdfWriter;
20 use Wikimedia\Purtle\TurtleRdfWriter;
21 use Wikimedia\Rdbms\IDatabase;
22
23 require_once __DIR__ . '/Maintenance.php';
24
25 /**
26 * Maintenance script to provide RDF representation of the recent changes in category tree.
27 *
28 * @ingroup Maintenance
29 * @since 1.30
30 */
31 class CategoryChangesAsRdf extends Maintenance {
/**
 * SPARQL Update template inserting a block of triples.
 * The %s placeholder receives the serialized RDF.
 */
const SPARQL_INSERT = <<<'SPARQL'
INSERT DATA {
%s
};

SPARQL;

/**
 * SPARQL Update template removing every triple whose subject is one of
 * the listed categories. The %s placeholder receives the category URIs.
 */
const SPARQL_DELETE = <<<'SPARQLD'
DELETE {
?category ?x ?y
} WHERE {
VALUES ?category {
%s
}
};

SPARQLD;

/**
 * Combined template: removes every triple of the listed categories and
 * inserts new data. First %s is the data to insert, second %s the category URIs.
 */
const SPARQL_DELETE_INSERT = <<<'SPARQLDI'
DELETE {
?category ?x ?y
} INSERT {
%s
} WHERE {
VALUES ?category {
%s
}
};

SPARQLDI;

/**
 * Writer that accumulates RDF between drains.
 * @var RdfWriter
 */
private $rdfWriter;
/**
 * Helper producing category-related RDF through the writer.
 * @var CategoriesRdf
 */
private $categoriesRdf;

/** Start of the processed interval (inclusive), as returned by MWTimestamp::getTimestamp(). */
private $startTS;
/** End of the processed interval (exclusive), as returned by MWTimestamp::getTimestamp(). */
private $endTS;

/**
 * Page IDs that were already handled, so the same page is not processed twice.
 * Used as a set: page ID => true.
 * @var bool[]
 */
protected $processed = [];
91
/**
 * Register the script description, the default batch size and the
 * command-line options (output file, start and end timestamps).
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( "Generate RDF dump of category changes in a wiki." );
	$this->setBatchSize( 200 );

	$this->addOption(
		'output',
		"Output file (default is stdout). Will be overwritten.",
		false,
		true,
		'o'
	);
	$this->addOption(
		'start',
		'Starting timestamp (inclusive), in ISO or Mediawiki format.',
		true,
		true,
		's'
	);
	$this->addOption(
		'end',
		'Ending timestamp (exclusive), in ISO or Mediawiki format.',
		true,
		true,
		'e'
	);
}
105
/**
 * Create the RDF writer and the categories helper that writes through it.
 */
public function initialize() {
	// A Turtle writer is used because SPARQL Update syntax is close to Turtle.
	$writer = new TurtleRdfWriter();
	$this->rdfWriter = $writer;
	$this->categoriesRdf = new CategoriesRdf( $writer );
}
114
/**
 * Script entry point: writes SPARQL Update statements describing the category
 * changes recorded in recentchanges between the 'start' and 'end' options.
 *
 * Output order: prefixes, deletes, moves, restores, additions, edits,
 * categorization changes, and finally the dump timestamp update.
 */
public function execute() {
	global $wgRCMaxAge;

	$this->initialize();
	$startTS = new MWTimestamp( $this->getOption( "start" ) );

	$endTS = new MWTimestamp( $this->getOption( "end" ) );
	$now = new MWTimestamp();

	// NOTE(review): these checks only report via error(); processing continues
	// below regardless - confirm that a warning (not an abort) is intended.
	if ( $now->getTimestamp() - $startTS->getTimestamp() > $wgRCMaxAge ) {
		$this->error( "Start timestamp too old, maximum RC age is $wgRCMaxAge!" );
	}
	if ( $now->getTimestamp() - $endTS->getTimestamp() > $wgRCMaxAge ) {
		$this->error( "End timestamp too old, maximum RC age is $wgRCMaxAge!" );
	}

	$this->startTS = $startTS->getTimestamp();
	$this->endTS = $endTS->getTimestamp();

	$outFile = $this->getOption( 'output', 'php://stdout' );
	if ( $outFile === '-' ) {
		$outFile = 'php://stdout';
	}

	$output = fopen( $outFile, 'wb' );
	if ( $output === false ) {
		// Without a valid handle every fwrite() below would fail,
		// so stop here with a clear message instead.
		$this->fatalError( "Unable to open output file '$outFile' for writing." );
	}

	$this->categoriesRdf->setupPrefixes();
	$this->rdfWriter->start();

	$prefixes = $this->getRdf();
	// We have to strip @ from prefix, since SPARQL UPDATE doesn't use them
	// Also strip dot at the end.
	$prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes );
	fwrite( $output, $prefixes );

	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	// Deletes go first because if the page was deleted, other changes
	// do not matter. This only gets true deletes, i.e. not pages that were restored.
	$this->handleDeletes( $dbr, $output );
	// Moves go before additions because if category is moved, we should not process creation
	// as it would produce wrong data - because create row has old title
	$this->handleMoves( $dbr, $output );
	// We need to handle restores too since delete may have happened in previous update.
	$this->handleRestores( $dbr, $output );
	// Process newly added pages
	$this->handleAdds( $dbr, $output );
	// Process page edits
	$this->handleEdits( $dbr, $output );
	// Process categorization changes
	$this->handleCategorization( $dbr, $output );

	// Update timestamp
	fwrite( $output, $this->updateTS( $this->endTS ) );

	// Release the handle explicitly so the data is flushed to the target.
	fclose( $output );
}
170
/**
 * Drain the accumulated RDF and wrap it into a SPARQL INSERT DATA clause.
 * @return string The INSERT DATA statement, or an empty string when
 *  nothing has been accumulated.
 */
private function getInsertRdf() {
	$triples = $this->getRdf();
	return $triples ? sprintf( self::SPARQL_INSERT, $triples ) : "";
}
182
/**
 * Build the SPARQL that refreshes a set of categories: a DELETE for the
 * given URIs followed by an INSERT of whatever RDF has been accumulated.
 * @param IDatabase $dbr
 * @param string[] $deleteUrls List of URIs to be deleted, with <>
 * @param string[] $pages List of categories: id => title. When non-empty,
 *  their parent links are written before the RDF is drained.
 * @param string $mark Label of the requesting operation, emitted as a comment line
 * @return string SPARQL query, or an empty string if there is nothing to delete
 */
private function getCategoriesUpdate( IDatabase $dbr, $deleteUrls, $pages, $mark ) {
	if ( !$deleteUrls ) {
		return "";
	}

	if ( $pages ) {
		$this->writeParentCategories( $dbr, $pages );
	}

	$deleteQuery = sprintf( self::SPARQL_DELETE, implode( ' ', $deleteUrls ) );
	$insertQuery = $this->getInsertRdf();

	return "# $mark\n" . $deleteQuery . $insertQuery;
}
203
/**
 * Write the parent-category links of the given child categories to the RDF writer.
 * @param IDatabase $dbr
 * @param string[] $pages List of child categories: id => title
 */
private function writeParentCategories( IDatabase $dbr, $pages ) {
	$childIds = array_keys( $pages );
	$links = $this->getCategoryLinksIterator( $dbr, $childIds );
	foreach ( $links as $link ) {
		// cl_from is the child page ID, cl_to the parent category title.
		$childTitle = $pages[$link->cl_from];
		$this->categoriesRdf->writeCategoryLinkData( $childTitle, $link->cl_to );
	}
}
215
/**
 * Produce the SPARQL Update that replaces the dump's schema:dateModified value.
 * @param string|int $timestamp Timestamp for last change
 * @return string SPARQL Update query for timestamp.
 */
public function updateTS( $timestamp ) {
	$dumpUri = '<' . $this->categoriesRdf->getDumpURI() . '>';
	$isoTime = wfTimestamp( TS_ISO_8601, $timestamp );
	// Drop whatever modification date is stored, then insert the new one.
	return <<<SPARQL
DELETE {
$dumpUri schema:dateModified ?o .
}
WHERE {
$dumpUri schema:dateModified ?o .
};
INSERT DATA {
$dumpUri schema:dateModified "$isoTime"^^xsd:dateTime .
}

SPARQL;
}
238
/**
 * Build the common batch iterator over recentchanges, left-joined with the
 * 'hiddencat' page property and the category table, limited to the
 * configured time interval.
 * @param IDatabase $dbr
 * @param string[] $columns List of additional fields to get
 * @param string[] $extra_tables List of additional tables to join
 * @return BatchRowIterator
 */
private function setupChangesIterator(
	IDatabase $dbr,
	array $columns = [],
	array $extra_tables = []
) {
	$baseTables = [ 'recentchanges', 'page_props', 'category' ];
	$baseColumns = [
		'rc_title',
		'rc_cur_id',
		'pp_propname',
		'cat_pages',
		'cat_subcats',
		'cat_files'
	];
	$joins = [
		'page_props' => [
			'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ]
		],
		'category' => [
			'LEFT JOIN', [ 'cat_title = rc_title' ]
		]
	];

	$iterator = new BatchRowIterator(
		$dbr,
		array_merge( $baseTables, $extra_tables ),
		[ 'rc_timestamp' ],
		$this->mBatchSize
	);
	$this->addTimestampConditions( $iterator, $dbr );
	$iterator->addJoinConditions( $joins );
	$iterator->setFetchColumns( array_merge( $columns, $baseColumns ) );

	return $iterator;
}
281
/**
 * Iterator over recent changes that created a page in the Category namespace.
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getNewCatsIterator( IDatabase $dbr ) {
	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 1,
	];

	$iterator = $this->setupChangesIterator( $dbr );
	$iterator->addConditions( $conditions );
	return $iterator;
}
295
/**
 * Iterator over 'move' log entries in the Category namespace, joined with the
 * page table so each row also carries the page's current title and namespace.
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getMovedCatsIterator( IDatabase $dbr ) {
	$extraColumns = [ 'page_title', 'page_namespace' ];
	$iterator = $this->setupChangesIterator( $dbr, $extraColumns, [ 'page' ] );

	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'move',
		'rc_type' => RC_LOG,
	];
	$iterator->addConditions( $conditions );

	$pageJoin = [
		'page' => [ 'JOIN', 'rc_cur_id = page_id' ],
	];
	$iterator->addJoinConditions( $pageJoin );

	$this->addIndex( $iterator );
	return $iterator;
}
315
/**
 * Iterator over 'delete' log entries in the Category namespace whose page
 * has no page record any more.
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getDeletedCatsIterator( IDatabase $dbr ) {
	$iterator = new BatchRowIterator(
		$dbr,
		'recentchanges',
		[ 'rc_timestamp' ],
		$this->mBatchSize
	);
	$this->addTimestampConditions( $iterator, $dbr );

	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'delete',
		'rc_type' => RC_LOG,
		// Only rows without a page record are wanted. If the record exists the
		// page was restored, and the restore handler takes care of it.
		'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
	];
	$iterator->addConditions( $conditions );

	$this->addIndex( $iterator );
	$iterator->setFetchColumns( [ 'rc_cur_id', 'rc_title' ] );
	return $iterator;
}
342
/**
 * Iterator over 'restore' log entries in the Category namespace whose page
 * currently has a page record.
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getRestoredCatsIterator( IDatabase $dbr ) {
	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'restore',
		'rc_type' => RC_LOG,
		// Keep only the entries whose page record exists
		'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
	];

	$iterator = $this->setupChangesIterator( $dbr );
	$iterator->addConditions( $conditions );
	$this->addIndex( $iterator );
	return $iterator;
}
362
/**
 * Iterator over non-creation recent changes of the given type in the
 * Category namespace.
 * @param IDatabase $dbr
 * @param int $type Value matched against rc_type; callers in this class
 *  pass RC_EDIT or RC_CATEGORIZE
 * @return BatchRowIterator
 */
protected function getChangedCatsIterator( IDatabase $dbr, $type ) {
	$conditions = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_type' => $type,
	];

	$iterator = $this->setupChangesIterator( $dbr );
	$iterator->addConditions( $conditions );
	$this->addIndex( $iterator );
	return $iterator;
}
379
/**
 * Restrict an iterator to rows with startTS <= rc_timestamp < endTS.
 * @param BatchRowIterator $it Iterator
 * @param IDatabase $dbr
 */
private function addTimestampConditions( BatchRowIterator $it, IDatabase $dbr ) {
	$from = $dbr->addQuotes( $dbr->timestamp( $this->startTS ) );
	$until = $dbr->addQuotes( $dbr->timestamp( $this->endTS ) );

	$it->addConditions( [
		'rc_timestamp >= ' . $from,
		'rc_timestamp < ' . $until,
	] );
}
391
/**
 * Force the 'new_name_timestamp' index on recentchanges; the query
 * optimizer was observed (on terbium) to pick the wrong index otherwise.
 * @param BatchRowIterator $it
 */
private function addIndex( BatchRowIterator $it ) {
	$indexHint = [ 'recentchanges' => 'new_name_timestamp' ];
	$it->addOptions( [ 'USE INDEX' => $indexHint ] );
}
401
/**
 * Iterator over the 'subcat' category links originating from the given pages.
 * Rows carry cl_from and cl_to; the batches are flattened so that callers
 * iterate over single rows.
 * @param IDatabase $dbr
 * @param int[] $ids List of page IDs
 * @return Traversable
 */
protected function getCategoryLinksIterator( IDatabase $dbr, array $ids ) {
	$linkColumns = [ 'cl_from', 'cl_to' ];

	$batches = new BatchRowIterator(
		$dbr,
		'categorylinks',
		$linkColumns,
		$this->mBatchSize
	);
	$batches->addConditions( [
		'cl_type' => 'subcat',
		'cl_from' => $ids
	] );
	$batches->setFetchColumns( $linkColumns );

	return new RecursiveIteratorIterator( $batches );
}
422
/**
 * Drain the RDF writer and return what it has accumulated.
 * @return string
 */
public function getRdf() {
	$accumulated = $this->rdfWriter->drain();
	return $accumulated;
}
430
/**
 * Emit DELETE statements for categories that were deleted and stay deleted,
 * and mark their page IDs as processed.
 * @param IDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleDeletes( IDatabase $dbr, $output ) {
	$batches = $this->getDeletedCatsIterator( $dbr );
	foreach ( $batches as $batch ) {
		$deleteUrls = [];
		foreach ( $batch as $row ) {
			// Duplicate URIs may end up in the list; that is harmless.
			$url = $this->categoriesRdf->labelToUrl( $row->rc_title );
			$deleteUrls[] = '<' . $url . '>';
			$this->processed[$row->rc_cur_id] = true;
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $deleteUrls, [], "Deletes" );
		fwrite( $output, $sparql );
	}
}
448
/**
 * Write the data of one category row to the RDF writer: title, hidden flag,
 * page count (cat_pages minus subcategories and files) and subcategory count.
 * @param stdClass $row Database row with rc_title, pp_propname, cat_pages,
 *  cat_subcats and cat_files
 */
private function writeCategoryData( $row ) {
	$isHidden = $row->pp_propname === 'hiddencat';
	$subcatCount = (int)$row->cat_subcats;
	$pageCount = (int)$row->cat_pages - $subcatCount - (int)$row->cat_files;

	$this->categoriesRdf->writeCategoryData(
		$row->rc_title,
		$isHidden,
		$pageCount,
		$subcatCount
	);
}
461
/**
 * Handle category moves: the old title's data is always deleted, and the
 * data for the current title is written unless the page was already
 * processed or no longer lives in the Category namespace.
 * @param IDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleMoves( IDatabase $dbr, $output ) {
	$batches = $this->getMovedCatsIterator( $dbr );
	foreach ( $batches as $batch ) {
		$pages = [];
		$deleteUrls = [];
		foreach ( $batch as $row ) {
			$pageId = $row->rc_cur_id;
			// rc_title still holds the title the category was moved away from.
			$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';

			$alreadyCaptured = isset( $this->processed[$pageId] );
			// A page moved out of Category: is only deleted, never re-added.
			$movedOutOfCategory = $row->page_namespace != NS_CATEGORY;
			if ( $alreadyCaptured || $movedOutOfCategory ) {
				continue;
			}

			// From here on the row describes the category under its current title.
			$row->rc_title = $row->page_title;
			$this->writeCategoryData( $row );
			$pages[$pageId] = $row->page_title;
			$this->processed[$pageId] = true;
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Moves" );
		fwrite( $output, $sparql );
	}
}
491
/**
 * Handle category restores: insert the data and parent links of restored
 * categories that were not already processed.
 * @param IDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleRestores( IDatabase $dbr, $output ) {
	fwrite( $output, "# Restores\n" );
	// Only restores that were not followed by a later delete are found here.
	$batches = $this->getRestoredCatsIterator( $dbr );
	foreach ( $batches as $batch ) {
		$pages = [];
		foreach ( $batch as $row ) {
			$pageId = $row->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $row );
				$pages[$pageId] = $row->rc_title;
				$this->processed[$pageId] = true;
			}
		}

		if ( $pages ) {
			$this->writeParentCategories( $dbr, $pages );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
520
/**
 * Handle newly created categories: insert the data and parent links of
 * those that were not already processed.
 * @param IDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleAdds( IDatabase $dbr, $output ) {
	fwrite( $output, "# Additions\n" );
	$batches = $this->getNewCatsIterator( $dbr );
	foreach ( $batches as $batch ) {
		$pages = [];
		foreach ( $batch as $row ) {
			$pageId = $row->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $row );
				$pages[$pageId] = $row->rc_title;
				$this->processed[$pageId] = true;
			}
		}

		if ( $pages ) {
			$this->writeParentCategories( $dbr, $pages );
			fwrite( $output, $this->getInsertRdf() );
		}
	}
}
547
/**
 * Handle edits of category pages: every edited category that was not yet
 * processed is deleted and re-inserted together with its parent links.
 * @param IDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleEdits( IDatabase $dbr, $output ) {
	// Editing category can change hidden flag and add new parents.
	// TODO: it's pretty expensive to update all edited categories, and most edits
	// aren't actually interesting for us. Some way to know which are interesting?
	// We can capture recategorization on the next step, but not change in hidden status.
	$batches = $this->getChangedCatsIterator( $dbr, RC_EDIT );
	foreach ( $batches as $batch ) {
		$pages = [];
		$deleteUrls = [];
		foreach ( $batch as $row ) {
			$pageId = $row->rc_cur_id;
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $row );
				$pages[$pageId] = $row->rc_title;
				$this->processed[$pageId] = true;
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
			}
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Edits" );
		fwrite( $output, $sparql );
	}
}
577
/**
 * Handles categorization changes.
 *
 * For every batch of categorization events, both the child categories
 * (looked up by page ID) and the parent categories (looked up by title)
 * are deleted and re-inserted, including their parent links.
 * @param IDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleCategorization( IDatabase $dbr, $output ) {
	// Parent titles already written. Needed besides $this->processed because
	// the page join below is a LEFT JOIN, so a parent row may have no page ID.
	$processedTitle = [];

	// Categorization change can add new parents and change counts
	// for the parent category.
	foreach ( $this->getChangedCatsIterator( $dbr, RC_CATEGORIZE ) as $batch ) {
		/*
		 * Note that on categorization event, cur_id points to
		 * the child page, not the parent category!
		 * So we need to have a two-stage process, since we have ID from one
		 * category and title from another, and we need both for proper updates.
		 * TODO: For now, we do full update even though some data hasn't changed,
		 * e.g. parents for parent cat and counts for child cat.
		 */
		// Start from empty sets for each batch, so that IDs and titles of earlier
		// batches are not queried again and the variables are always defined.
		$childPages = [];
		$parentCats = [];
		foreach ( $batch as $row ) {
			$childPages[$row->rc_cur_id] = true;
			$parentCats[$row->rc_title] = true;
		}

		// Categories (id => title) whose parent links must be re-inserted,
		// because the DELETE emitted below removes all their triples.
		$pages = [];
		$deleteUrls = [];

		if ( $childPages ) {
			$joinConditions = [
				'page_props' => [
					'LEFT JOIN',
					[ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
				],
				'category' => [
					'LEFT JOIN',
					[ 'cat_title = page_title' ],
				],
			];

			// Load child rows by ID
			$childRows = $dbr->select(
				[ 'page', 'page_props', 'category' ],
				[
					'page_id',
					'rc_title' => 'page_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				],
				[ 'page_namespace' => NS_CATEGORY, 'page_id' => array_keys( $childPages ) ],
				__METHOD__,
				[],
				$joinConditions
			);
			foreach ( $childRows as $row ) {
				if ( isset( $this->processed[$row->page_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$pages[$row->page_id] = $row->rc_title;
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
				$this->processed[$row->page_id] = true;
			}
		}

		if ( $parentCats ) {
			// Load parent rows by title
			$joinConditions = [
				'page' => [
					'LEFT JOIN',
					[ 'page_title = cat_title', 'page_namespace' => NS_CATEGORY ],
				],
				'page_props' => [
					'LEFT JOIN',
					[ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
				],
			];

			$parentRows = $dbr->select(
				[ 'category', 'page', 'page_props' ],
				[
					'page_id',
					'rc_title' => 'cat_title',
					'pp_propname',
					'cat_pages',
					'cat_subcats',
					'cat_files',
				],
				[ 'cat_title' => array_keys( $parentCats ) ],
				__METHOD__,
				[],
				$joinConditions
			);
			foreach ( $parentRows as $row ) {
				if ( $row->page_id && isset( $this->processed[$row->page_id] ) ) {
					// We already captured this one before
					continue;
				}
				if ( isset( $processedTitle[$row->rc_title] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
				if ( $row->page_id ) {
					// Only categories that have a page can have parent links.
					$pages[$row->page_id] = $row->rc_title;
					$this->processed[$row->page_id] = true;
				}
				$processedTitle[$row->rc_title] = true;
			}
		}

		fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" ) );
	}
}
692 }
693
694 $maintClass = CategoryChangesAsRdf::class;
695 require_once RUN_MAINTENANCE_IF_MAIN;