Merge "filebackend: add idiom constant to FileBackend for null results"
[lhc/web/wiklou.git] / maintenance / categoryChangesAsRdf.php
1 <?php
2 /**
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License along
14 * with this program; if not, write to the Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 * http://www.gnu.org/copyleft/gpl.html
17 *
18 */
19 use Wikimedia\Purtle\RdfWriter;
20 use Wikimedia\Purtle\TurtleRdfWriter;
21 use Wikimedia\Rdbms\IDatabase;
22
23 require_once __DIR__ . '/Maintenance.php';
24
25 /**
26 * Maintenance script to provide RDF representation of the recent changes in category tree.
27 *
28 * @ingroup Maintenance
29 * @since 1.30
30 */
31 class CategoryChangesAsRdf extends Maintenance {
/**
 * Insert query.
 * sprintf() template; the single %s placeholder receives the RDF text
 * that is placed inside the INSERT DATA clause.
 */
const SPARQL_INSERT = <<<SPARQL
INSERT DATA {
%s
};

SPARQL;

/**
 * Delete query.
 * sprintf() template that deletes every triple whose subject is one of the
 * listed categories; the single %s placeholder receives the category URIs
 * for the VALUES clause.
 */
const SPARQL_DELETE = <<<SPARQLD
DELETE {
?category ?x ?y
} WHERE {
?category ?x ?y
VALUES ?category {
%s
}
};

SPARQLD;

/**
 * Delete/Insert query.
 * sprintf() template with two %s placeholders: the first receives the data
 * for the INSERT clause, the second the category URIs for the VALUES clause
 * that selects the triples to delete.
 */
const SPARQL_DELETE_INSERT = <<<SPARQLDI
DELETE {
?category ?x ?y
} INSERT {
%s
} WHERE {
?category ?x ?y
VALUES ?category {
%s
}
};

SPARQLDI;
73
/**
 * RDF writer that accumulates the generated output; created in initialize().
 * @var RdfWriter
 */
private $rdfWriter;
/**
 * Categories RDF helper.
 * @var CategoriesRdf
 */
private $categoriesRdf;

/**
 * Start of the processed time range (inclusive), taken from the 'start'
 * option in execute() via MWTimestamp::getTimestamp().
 */
private $startTS;
/**
 * End of the processed time range (exclusive), taken from the 'end'
 * option in execute() via MWTimestamp::getTimestamp().
 */
private $endTS;

/**
 * Set of processed page IDs (page ID => true),
 * so we don't try to process same thing twice
 * @var bool[]
 */
protected $processed = [];
93
/**
 * Register the script description, the default batch size and the
 * command line options (output file, start and end timestamps).
 */
public function __construct() {
    parent::__construct();

    $this->addDescription( "Generate RDF dump of category changes in a wiki." );
    $this->setBatchSize( 200 );

    // Optional: where the SPARQL goes.
    $this->addOption(
        'output',
        "Output file (default is stdout). Will be overwritten.",
        false,
        true,
        'o'
    );
    // Required: the time window to scan in recent changes.
    $this->addOption(
        'start',
        'Starting timestamp (inclusive), in ISO or Mediawiki format.',
        true,
        true,
        's'
    );
    $this->addOption(
        'end',
        'Ending timestamp (exclusive), in ISO or Mediawiki format.',
        true,
        true,
        'e'
    );
}
107
/**
 * Create the RDF writer and the categories RDF helper that shares it.
 */
public function initialize() {
    // A Turtle writer is used because SPARQL Update syntax is close to Turtle.
    $writer = new TurtleRdfWriter();

    $this->rdfWriter = $writer;
    $this->categoriesRdf = new CategoriesRdf( $writer );
}
116
/**
 * Entry point: write SPARQL Update statements describing all category
 * changes between the 'start' and 'end' options to the output file.
 *
 * The output consists of the prefix declarations, followed by the
 * statements for deletes, moves, restores, additions, edits and
 * categorization changes (in that order), and finally the update of
 * the dump timestamp.
 */
public function execute() {
    global $wgRCMaxAge;

    $this->initialize();
    $startTS = new MWTimestamp( $this->getOption( "start" ) );

    $endTS = new MWTimestamp( $this->getOption( "end" ) );
    $now = new MWTimestamp();

    // These are reported but deliberately not treated as fatal.
    if ( $now->getTimestamp() - $startTS->getTimestamp() > $wgRCMaxAge ) {
        $this->error( "Start timestamp too old, maximum RC age is $wgRCMaxAge!" );
    }
    if ( $now->getTimestamp() - $endTS->getTimestamp() > $wgRCMaxAge ) {
        $this->error( "End timestamp too old, maximum RC age is $wgRCMaxAge!" );
    }

    $this->startTS = $startTS->getTimestamp();
    $this->endTS = $endTS->getTimestamp();

    $outFile = $this->getOption( 'output', 'php://stdout' );
    if ( $outFile === '-' ) {
        $outFile = 'php://stdout';
    }

    $output = fopen( $outFile, 'wb' );
    if ( $output === false ) {
        // Without a valid handle every fwrite() below would fail,
        // so stop right away with a clear message.
        $this->fatalError( "Unable to open output file '$outFile' for writing." );
    }

    $this->categoriesRdf->setupPrefixes();
    $this->rdfWriter->start();

    $prefixes = $this->getRdf();
    // We have to strip @ from prefix, since SPARQL UPDATE doesn't use them
    // Also strip dot at the end.
    $prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes );
    fwrite( $output, $prefixes );

    $dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

    // Deletes go first because if the page was deleted, other changes
    // do not matter. This only gets true deletes, i.e. not pages that were restored.
    $this->handleDeletes( $dbr, $output );
    // Moves go before additions because if category is moved, we should not process creation
    // as it would produce wrong data - because create row has old title
    $this->handleMoves( $dbr, $output );
    // We need to handle restores too since delete may have happened in previous update.
    $this->handleRestores( $dbr, $output );
    // Process newly added pages
    $this->handleAdds( $dbr, $output );
    // Process page edits
    $this->handleEdits( $dbr, $output );
    // Process categorization changes
    $this->handleCategorization( $dbr, $output );

    // Update timestamp
    fwrite( $output, $this->updateTS( $this->endTS ) );

    // Release the handle so buffered data is flushed to the target.
    fclose( $output );
}
172
/**
 * Drain the accumulated RDF and wrap it into a SPARQL INSERT DATA clause.
 * @return string The INSERT DATA statement, or an empty string when
 *  there is no accumulated RDF.
 */
private function getInsertRdf() {
    $triples = $this->getRdf();

    return $triples ? sprintf( self::SPARQL_INSERT, $triples ) : "";
}
184
/**
 * Build the SPARQL that refreshes a set of categories: a DELETE for the
 * given URIs followed by an INSERT DATA of the accumulated RDF.
 * @param IDatabase $dbr
 * @param string[] $deleteUrls List of URIs to be deleted, with <>
 * @param string[] $pages List of categories: id => title
 * @param string $mark Marks which operation requests the query
 * @return string SPARQL query, empty string if there is nothing to delete
 */
private function getCategoriesUpdate( IDatabase $dbr, $deleteUrls, $pages, $mark ) {
    if ( !$deleteUrls ) {
        return "";
    }

    // Parent links for the listed pages go into the same INSERT.
    if ( $pages ) {
        $this->writeParentCategories( $dbr, $pages );
    }

    $deleteQuery = sprintf( self::SPARQL_DELETE, implode( ' ', $deleteUrls ) );
    $insertQuery = $this->getInsertRdf();

    return "# $mark\n" . $deleteQuery . $insertQuery;
}
205
/**
 * Write the parent category links for a set of child categories
 * into the RDF writer.
 * @param IDatabase $dbr
 * @param string[] $pages List of child categories: id => title
 */
private function writeParentCategories( IDatabase $dbr, $pages ) {
    $childIds = array_keys( $pages );
    $links = $this->getCategoryLinksIterator( $dbr, $childIds );

    foreach ( $links as $link ) {
        // cl_from is the child page ID, cl_to is the parent category.
        $childTitle = $pages[$link->cl_from];
        $this->categoriesRdf->writeCategoryLinkData( $childTitle, $link->cl_to );
    }
}
217
/**
 * Build the SPARQL Update statements that replace the schema:dateModified
 * value of the dump with the given timestamp.
 * @param string|int $timestamp Timestamp for last change
 * @return string SPARQL Update query for timestamp.
 */
public function updateTS( $timestamp ) {
    $isoTime = wfTimestamp( TS_ISO_8601, $timestamp );
    $subject = '<' . $this->categoriesRdf->getDumpURI() . '>';

    return <<<SPARQL
DELETE {
{$subject} schema:dateModified ?o .
}
WHERE {
{$subject} schema:dateModified ?o .
};
INSERT DATA {
{$subject} schema:dateModified "{$isoTime}"^^xsd:dateTime .
}

SPARQL;
}
240
/**
 * Create the common batch iterator over recent changes, left-joined with
 * the 'hiddencat' page property and the category table.
 * @param IDatabase $dbr
 * @param string[] $columns List of additional fields to get
 * @param string[] $extra_tables List of additional tables to join
 * @return BatchRowIterator
 */
private function setupChangesIterator(
    IDatabase $dbr,
    array $columns = [],
    array $extra_tables = []
) {
    $baseTables = [ 'recentchanges', 'page_props', 'category' ];
    $tables = $extra_tables ? array_merge( $baseTables, $extra_tables ) : $baseTables;

    $joins = [
        'page_props' => [
            'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ]
        ],
        'category' => [
            'LEFT JOIN', [ 'cat_title = rc_title' ]
        ],
    ];

    // Caller-supplied columns come first, followed by the standard set.
    $fields = array_merge( $columns, [
        'rc_title',
        'rc_cur_id',
        'pp_propname',
        'cat_pages',
        'cat_subcats',
        'cat_files',
    ] );

    $it = new BatchRowIterator( $dbr, $tables, [ 'rc_timestamp' ], $this->mBatchSize );
    $this->addTimestampConditions( $it, $dbr );
    $it->addJoinConditions( $joins );
    $it->setFetchColumns( $fields );

    return $it;
}
283
/**
 * Iterator over recent changes that created a page in the category namespace.
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getNewCatsIterator( IDatabase $dbr ) {
    $conds = [
        'rc_namespace' => NS_CATEGORY,
        'rc_new' => 1,
    ];

    $it = $this->setupChangesIterator( $dbr );
    $it->addConditions( $conds );

    return $it;
}
297
/**
 * Iterator over move log entries for category pages, joined with the
 * page table so the current title and namespace are available.
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getMovedCatsIterator( IDatabase $dbr ) {
    $extraFields = [ 'page_title', 'page_namespace' ];
    $extraTables = [ 'page' ];
    $conds = [
        'rc_namespace' => NS_CATEGORY,
        'rc_new' => 0,
        'rc_log_type' => 'move',
        'rc_type' => RC_LOG,
    ];
    $pageJoin = [
        'page' => [ 'JOIN', 'rc_cur_id = page_id' ],
    ];

    $it = $this->setupChangesIterator( $dbr, $extraFields, $extraTables );
    $it->addConditions( $conds );
    $it->addJoinConditions( $pageJoin );
    $this->addIndex( $it );

    return $it;
}
317
/**
 * Iterator over delete log entries for category pages that no longer
 * have a page record.
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getDeletedCatsIterator( IDatabase $dbr ) {
    $conds = [
        'rc_namespace' => NS_CATEGORY,
        'rc_new' => 0,
        'rc_log_type' => 'delete',
        'rc_log_action' => 'delete',
        'rc_type' => RC_LOG,
        // Only rows without a page record are wanted. A page record means
        // the category was restored, which the restore handler picks up.
        'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
    ];

    $it = new BatchRowIterator( $dbr, 'recentchanges', [ 'rc_timestamp' ], $this->mBatchSize );
    $this->addTimestampConditions( $it, $dbr );
    $it->addConditions( $conds );
    $this->addIndex( $it );
    $it->setFetchColumns( [ 'rc_cur_id', 'rc_title' ] );

    return $it;
}
344
/**
 * Iterator over restore log entries for category pages that currently
 * have a page record.
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getRestoredCatsIterator( IDatabase $dbr ) {
    $conds = [
        'rc_namespace' => NS_CATEGORY,
        'rc_new' => 0,
        'rc_log_type' => 'delete',
        'rc_log_action' => 'restore',
        'rc_type' => RC_LOG,
        // Restrict to rows whose page record exists.
        'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
    ];

    $it = $this->setupChangesIterator( $dbr );
    $it->addConditions( $conds );
    $this->addIndex( $it );

    return $it;
}
364
/**
 * Iterator over non-creation recent changes of the given type in the
 * category namespace.
 * @param IDatabase $dbr
 * @param int $type Value matched against rc_type; callers in this class
 *  pass RC_EDIT or RC_CATEGORIZE
 * @return BatchRowIterator
 */
protected function getChangedCatsIterator( IDatabase $dbr, $type ) {
    $conds = [
        'rc_namespace' => NS_CATEGORY,
        'rc_new' => 0,
        'rc_type' => $type,
    ];

    $it = $this->setupChangesIterator( $dbr );
    $it->addConditions( $conds );
    $this->addIndex( $it );

    return $it;
}
381
/**
 * Restrict an iterator to the configured time range:
 * start timestamp inclusive, end timestamp exclusive.
 * @param BatchRowIterator $it Iterator
 * @param IDatabase $dbr
 */
private function addTimestampConditions( BatchRowIterator $it, IDatabase $dbr ) {
    $from = $dbr->addQuotes( $dbr->timestamp( $this->startTS ) );
    $until = $dbr->addQuotes( $dbr->timestamp( $this->endTS ) );

    $it->addConditions( [
        'rc_timestamp >= ' . $from,
        'rc_timestamp < ' . $until,
    ] );
}
393
/**
 * Force the index used on recentchanges; the optimizer was observed
 * to choose a wrong one on terbium.
 * @param BatchRowIterator $it
 */
private function addIndex( BatchRowIterator $it ) {
    $forcedIndex = [ 'recentchanges' => 'new_name_timestamp' ];

    $it->addOptions( [ 'USE INDEX' => $forcedIndex ] );
}
403
/**
 * Iterator over the subcategory links (child page ID, parent category)
 * of the given pages.
 * @param IDatabase $dbr
 * @param int[] $ids List of page IDs
 * @return Traversable
 */
protected function getCategoryLinksIterator( IDatabase $dbr, array $ids ) {
    $linkFields = [ 'cl_from', 'cl_to' ];

    $batches = new BatchRowIterator( $dbr, 'categorylinks', $linkFields, $this->mBatchSize );
    $batches->addConditions( [
        'cl_type' => 'subcat',
        'cl_from' => $ids,
    ] );
    $batches->setFetchColumns( [ 'cl_from', 'cl_to' ] );

    // Wrapped so that callers iterate over rows rather than batches.
    return new RecursiveIteratorIterator( $batches );
}
424
/**
 * Drain and return the RDF accumulated in the writer so far.
 * @return string
 */
public function getRdf() {
    $accumulated = $this->rdfWriter->drain();

    return $accumulated;
}
432
/**
 * Emit delete statements for categories that were deleted and stayed deleted.
 * @param IDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleDeletes( IDatabase $dbr, $output ) {
    $batches = $this->getDeletedCatsIterator( $dbr );

    foreach ( $batches as $batch ) {
        $deleteUrls = [];
        foreach ( $batch as $row ) {
            // Duplicate URIs may end up in the list; that is harmless.
            $uri = $this->categoriesRdf->labelToUrl( $row->rc_title );
            $deleteUrls[] = '<' . $uri . '>';
            $this->processed[$row->rc_cur_id] = true;
        }

        $sparql = $this->getCategoriesUpdate( $dbr, $deleteUrls, [], "Deletes" );
        fwrite( $output, $sparql );
    }
}
450
/**
 * Pass one category row to the RDF helper.
 * The page count handed over excludes subcategories and files.
 * @param stdClass $row Database row with rc_title, pp_propname,
 *  cat_pages, cat_subcats and cat_files
 */
private function writeCategoryData( $row ) {
    $isHidden = $row->pp_propname === 'hiddencat';
    $subcatCount = (int)$row->cat_subcats;
    $pageCount = (int)$row->cat_pages - $subcatCount - (int)$row->cat_files;

    $this->categoriesRdf->writeCategoryData(
        $row->rc_title,
        $isHidden,
        $pageCount,
        $subcatCount
    );
}
463
/**
 * Emit updates for moved categories: the old title is always deleted, and
 * the data under the current title is written when the page is still a
 * category and was not handled before.
 * @param IDatabase $dbr
 * @param resource $output
 */
public function handleMoves( IDatabase $dbr, $output ) {
    $batches = $this->getMovedCatsIterator( $dbr );

    foreach ( $batches as $batch ) {
        $deleteUrls = [];
        $pages = [];

        foreach ( $batch as $row ) {
            // The old title is removed in every case.
            $deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';

            $pageId = $row->rc_cur_id;
            // Skip pages captured earlier, and pages moved out of
            // Category: (for those the delete above is all we need).
            if ( isset( $this->processed[$pageId] ) || $row->page_namespace != NS_CATEGORY ) {
                continue;
            }

            // Describe the category under its current title.
            $row->rc_title = $row->page_title;
            $this->writeCategoryData( $row );
            $pages[$pageId] = $row->page_title;
            $this->processed[$pageId] = true;
        }

        $sparql = $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Moves" );
        fwrite( $output, $sparql );
    }
}
493
/**
 * Emit insert statements for restored categories that were not
 * handled by an earlier step.
 * @param IDatabase $dbr
 * @param resource $output
 */
public function handleRestores( IDatabase $dbr, $output ) {
    fwrite( $output, "# Restores\n" );

    // Only restores that were not followed by another delete are found here.
    $batches = $this->getRestoredCatsIterator( $dbr );
    foreach ( $batches as $batch ) {
        $pages = [];
        foreach ( $batch as $row ) {
            $pageId = $row->rc_cur_id;
            if ( !isset( $this->processed[$pageId] ) ) {
                $this->writeCategoryData( $row );
                $pages[$pageId] = $row->rc_title;
                $this->processed[$pageId] = true;
            }
        }

        if ( $pages ) {
            $this->writeParentCategories( $dbr, $pages );
            fwrite( $output, $this->getInsertRdf() );
        }
    }
}
522
/**
 * Emit insert statements for newly created categories that were not
 * handled by an earlier step.
 * @param IDatabase $dbr
 * @param resource $output
 */
public function handleAdds( IDatabase $dbr, $output ) {
    fwrite( $output, "# Additions\n" );

    $batches = $this->getNewCatsIterator( $dbr );
    foreach ( $batches as $batch ) {
        $pages = [];
        foreach ( $batch as $row ) {
            $pageId = $row->rc_cur_id;
            if ( !isset( $this->processed[$pageId] ) ) {
                $this->writeCategoryData( $row );
                $pages[$pageId] = $row->rc_title;
                $this->processed[$pageId] = true;
            }
        }

        if ( $pages ) {
            $this->writeParentCategories( $dbr, $pages );
            fwrite( $output, $this->getInsertRdf() );
        }
    }
}
549
/**
 * Emit full updates for categories whose page text was edited.
 * @param IDatabase $dbr
 * @param resource $output
 */
public function handleEdits( IDatabase $dbr, $output ) {
    // An edit may toggle the hidden flag or add new parents.
    // TODO: refreshing every edited category is expensive and most edits are
    // not interesting here. Is there a way to tell which ones are?
    // Recategorization is caught by the next step, a hidden status change is not.
    $batches = $this->getChangedCatsIterator( $dbr, RC_EDIT );

    foreach ( $batches as $batch ) {
        $deleteUrls = [];
        $pages = [];

        foreach ( $batch as $row ) {
            $pageId = $row->rc_cur_id;
            if ( isset( $this->processed[$pageId] ) ) {
                // Already captured by an earlier step.
                continue;
            }

            $title = $row->rc_title;
            $this->writeCategoryData( $row );
            $pages[$pageId] = $title;
            $this->processed[$pageId] = true;
            $deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $title ) . '>';
        }

        $sparql = $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Edits" );
        fwrite( $output, $sparql );
    }
}
579
/**
 * Handles categorization changes.
 *
 * For every batch of categorization events, both the child categories
 * (identified by page ID) and the parent categories (identified by title)
 * get a full update: their triples are deleted and the category data plus
 * the parent links are inserted again.
 *
 * @param IDatabase $dbr
 * @param resource $output
 */
public function handleCategorization( IDatabase $dbr, $output ) {
    // Titles of parent categories already written by this method; needed
    // because a parent row may have no page record and thus no page ID.
    $processedTitle = [];

    // Categorization change can add new parents and change counts
    // for the parent category.
    foreach ( $this->getChangedCatsIterator( $dbr, RC_CATEGORIZE ) as $batch ) {
        /*
         * Note that on categorization event, cur_id points to
         * the child page, not the parent category!
         * So we need to have a two-stage process, since we have ID from one
         * category and title from another, and we need both for proper updates.
         * TODO: For now, we do full update even though some data hasn't changed,
         * e.g. parents for parent cat and counts for child cat.
         */
        // Start from scratch for every batch; otherwise the lookups below
        // would query all IDs and titles of the previous batches again.
        $childPages = [];
        $parentCats = [];
        foreach ( $batch as $row ) {
            $childPages[$row->rc_cur_id] = true;
            $parentCats[$row->rc_title] = true;
        }

        $joinConditions = [
            'page_props' => [
                'LEFT JOIN',
                [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
            ],
            'category' => [
                'LEFT JOIN',
                [ 'cat_title = page_title' ],
            ],
        ];

        // id => title of every category rewritten in this batch. The DELETE
        // removes all triples of these categories, including their parent
        // links, so the links have to be written again for each of them.
        $pages = [];
        $deleteUrls = [];

        if ( !empty( $childPages ) ) {
            // Load child rows by ID
            $childRows = $dbr->select(
                [ 'page', 'page_props', 'category' ],
                [
                    'page_id',
                    'rc_title' => 'page_title',
                    'pp_propname',
                    'cat_pages',
                    'cat_subcats',
                    'cat_files',
                ],
                [ 'page_namespace' => NS_CATEGORY, 'page_id' => array_keys( $childPages ) ],
                __METHOD__,
                [],
                $joinConditions
            );
            foreach ( $childRows as $row ) {
                if ( isset( $this->processed[$row->page_id] ) ) {
                    // We already captured this one before
                    continue;
                }
                $this->writeCategoryData( $row );
                $pages[$row->page_id] = $row->rc_title;
                $deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
                $this->processed[$row->page_id] = true;
            }
        }

        if ( !empty( $parentCats ) ) {
            // Load parent rows by title
            $joinConditions = [
                'page' => [
                    'LEFT JOIN',
                    [ 'page_title = cat_title', 'page_namespace' => NS_CATEGORY ],
                ],
                'page_props' => [
                    'LEFT JOIN',
                    [ 'pp_propname' => 'hiddencat', 'pp_page = page_id' ],
                ],
            ];

            $parentRows = $dbr->select(
                [ 'category', 'page', 'page_props' ],
                [
                    'page_id',
                    'rc_title' => 'cat_title',
                    'pp_propname',
                    'cat_pages',
                    'cat_subcats',
                    'cat_files',
                ],
                [ 'cat_title' => array_keys( $parentCats ) ],
                __METHOD__,
                [],
                $joinConditions
            );
            foreach ( $parentRows as $row ) {
                if ( $row->page_id && isset( $this->processed[$row->page_id] ) ) {
                    // We already captured this one before
                    continue;
                }
                if ( isset( $processedTitle[$row->rc_title] ) ) {
                    // We already captured this one before
                    continue;
                }
                $this->writeCategoryData( $row );
                $deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $row->rc_title ) . '>';
                if ( $row->page_id ) {
                    // Only categories with a page record can have parent links.
                    $pages[$row->page_id] = $row->rc_title;
                    $this->processed[$row->page_id] = true;
                }
                $processedTitle[$row->rc_title] = true;
            }
        }

        fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" ) );
    }
}
694 }
695
// Name the class for the maintenance bootstrap, then include the file
// referenced by RUN_MAINTENANCE_IF_MAIN (defined outside this file,
// presumably by Maintenance.php - it is expected to run the script
// when this file is the entry point).
$maintClass = CategoryChangesAsRdf::class;
require_once RUN_MAINTENANCE_IF_MAIN;