Add option to populateChangeTagDef not to update the count
[lhc/web/wiklou.git] / maintenance / categoryChangesAsRdf.php
1 <?php
2 /**
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License along
14 * with this program; if not, write to the Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 * http://www.gnu.org/copyleft/gpl.html
17 *
18 */
19 use Wikimedia\Purtle\RdfWriter;
20 use Wikimedia\Purtle\TurtleRdfWriter;
21 use Wikimedia\Rdbms\IDatabase;
22
23 require_once __DIR__ . '/Maintenance.php';
24
25 /**
26 * Maintenance script to provide RDF representation of the recent changes in category tree.
27 *
28 * @ingroup Maintenance
29 * @since 1.30
30 */
class CategoryChangesAsRdf extends Maintenance {
	/**
	 * Insert query. The single %s placeholder receives the RDF text to insert.
	 */
	const SPARQL_INSERT = <<<SPARQL
INSERT DATA {
%s
};

SPARQL;

	/**
	 * Delete query. The single %s placeholder receives the list of
	 * category URIs (each wrapped in <>) whose triples are removed.
	 */
	const SPARQL_DELETE = <<<SPARQLD
DELETE {
?category ?x ?y
} WHERE {
VALUES ?category {
%s
}
};

SPARQLD;

	/**
	 * Delete/Insert query. First %s is the RDF text to insert, second %s is
	 * the list of category URIs to delete. Not referenced inside this class.
	 */
	const SPARQL_DELETE_INSERT = <<<SPARQLDI
DELETE {
?category ?x ?y
} INSERT {
%s
} WHERE {
VALUES ?category {
%s
}
};

SPARQLDI;

	/**
	 * Writer that accumulates RDF text between calls to getRdf().
	 * @var RdfWriter
	 */
	private $rdfWriter;
	/**
	 * Categories RDF helper.
	 * @var CategoriesRdf
	 */
	private $categoriesRdf;

	/**
	 * Lower bound (inclusive) of the rc_timestamp window.
	 * Holds the MWTimestamp::getTimestamp() value of the 'start' option; set in execute().
	 */
	private $startTS;
	/**
	 * Upper bound (exclusive) of the rc_timestamp window.
	 * Holds the MWTimestamp::getTimestamp() value of the 'end' option; set in execute().
	 */
	private $endTS;

	/**
	 * Set of processed page IDs (page ID => true),
	 * so we don't try to process same thing twice
	 * @var bool[]
	 */
	protected $processed = [];

	public function __construct() {
		parent::__construct();

		$this->addDescription( "Generate RDF dump of category changes in a wiki." );

		$this->setBatchSize( 200 );
		$this->addOption( 'output', "Output file (default is stdout). Will be overwritten.", false,
			true, 'o' );
		$this->addOption( 'start', 'Starting timestamp (inclusive), in ISO or Mediawiki format.',
			true, true, 's' );
		$this->addOption( 'end', 'Ending timestamp (exclusive), in ISO or Mediawiki format.', true,
			true, 'e' );
	}

	/**
	 * Initialize external service classes.
	 */
	public function initialize() {
		// SPARQL Update syntax is close to Turtle format, so we can use Turtle writer.
		$this->rdfWriter = new TurtleRdfWriter();
		$this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );
	}

	/**
	 * Write the SPARQL Update statements for all category changes inside the
	 * [start, end) window to the selected output, followed by the dump
	 * timestamp update.
	 *
	 * @return bool|null False when the output file cannot be opened, nothing otherwise
	 */
	public function execute() {
		global $wgRCMaxAge;

		$this->initialize();
		$startTS = new MWTimestamp( $this->getOption( "start" ) );
		$endTS = new MWTimestamp( $this->getOption( "end" ) );
		$now = new MWTimestamp();

		// Both checks only report the problem; processing continues afterwards.
		if ( $now->getTimestamp() - $startTS->getTimestamp() > $wgRCMaxAge ) {
			$this->error( "Start timestamp too old, maximum RC age is $wgRCMaxAge!" );
		}
		if ( $now->getTimestamp() - $endTS->getTimestamp() > $wgRCMaxAge ) {
			$this->error( "End timestamp too old, maximum RC age is $wgRCMaxAge!" );
		}

		$this->startTS = $startTS->getTimestamp();
		$this->endTS = $endTS->getTimestamp();

		$outFile = $this->getOption( 'output', 'php://stdout' );
		if ( $outFile === '-' ) {
			$outFile = 'php://stdout';
		}

		$output = fopen( $outFile, 'wb' );
		if ( $output === false ) {
			// Without a usable handle every fwrite() below would fail, so stop here.
			$this->error( "Could not open output file $outFile for writing!" );
			return false;
		}

		try {
			$this->categoriesRdf->setupPrefixes();
			$this->rdfWriter->start();

			$prefixes = $this->getRdf();
			// We have to strip @ from prefix, since SPARQL UPDATE doesn't use them
			// Also strip dot at the end.
			$prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes );
			fwrite( $output, $prefixes );

			$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

			// Deletes go first because if the page was deleted, other changes
			// do not matter. This only gets true deletes, i.e. not pages that were restored.
			$this->handleDeletes( $dbr, $output );
			// Moves go before additions because if category is moved, we should not process creation
			// as it would produce wrong data - because create row has old title
			$this->handleMoves( $dbr, $output );
			// We need to handle restores too since delete may have happened in previous update.
			$this->handleRestores( $dbr, $output );
			$this->handleAdds( $dbr, $output );
			$this->handleChanges( $dbr, $output );

			// Update timestamp
			fwrite( $output, $this->updateTS( $this->endTS ) );
		} finally {
			// Release the handle opened above, even when a handler throws.
			fclose( $output );
		}
	}

	/**
	 * Get the text of SPARQL INSERT DATA clause
	 *
	 * Drains the RDF accumulated so far; yields an empty string when nothing
	 * has been written since the previous drain.
	 * @return string
	 */
	private function getInsertRdf() {
		$rdfText = $this->getRdf();
		if ( !$rdfText ) {
			return "";
		}
		return sprintf( self::SPARQL_INSERT, $rdfText );
	}

	/**
	 * Build the <...>-wrapped URI of a category, as used in the delete lists.
	 * @param string $title Category title
	 * @return string
	 */
	private function categoryUrl( $title ) {
		return '<' . $this->categoriesRdf->labelToUrl( $title ) . '>';
	}

	/**
	 * Get SPARQL for updating set of categories
	 * @param IDatabase $dbr
	 * @param string[] $deleteUrls List of URIs to be deleted, with <>
	 * @param string[] $pages List of categories: id => title
	 * @param string $mark Marks which operation requests the query
	 * @return string SPARQL query, or empty string when there is nothing to delete
	 */
	private function getCategoriesUpdate( IDatabase $dbr, $deleteUrls, $pages, $mark ) {
		if ( empty( $deleteUrls ) ) {
			return "";
		}

		if ( !empty( $pages ) ) {
			$this->writeParentCategories( $dbr, $pages );
		}

		return "# $mark\n" . sprintf( self::SPARQL_DELETE, implode( ' ', $deleteUrls ) ) .
			$this->getInsertRdf();
	}

	/**
	 * Write data for a set of categories
	 *
	 * Emits one category link per categorylinks row found for the given pages.
	 * @param IDatabase $dbr
	 * @param string[] $pages List of categories: id => title
	 */
	private function writeParentCategories( IDatabase $dbr, $pages ) {
		foreach ( $this->getCategoryLinksIterator( $dbr, array_keys( $pages ) ) as $row ) {
			$this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
		}
	}

	/**
	 * Generate SPARQL Update code for updating dump timestamp
	 * @param string|int $timestamp Timestamp for last change
	 * @return string SPARQL Update query for timestamp.
	 */
	public function updateTS( $timestamp ) {
		$dumpUrl = '<' . $this->categoriesRdf->getDumpURI() . '>';
		$ts = wfTimestamp( TS_ISO_8601, $timestamp );
		$tsQuery = <<<SPARQL
DELETE {
$dumpUrl schema:dateModified ?o .
}
WHERE {
$dumpUrl schema:dateModified ?o .
};
INSERT DATA {
$dumpUrl schema:dateModified "$ts"^^xsd:dateTime .
}

SPARQL;
		return $tsQuery;
	}

	/**
	 * Set up standard iterator for retrieving category changes.
	 *
	 * The iterator reads recentchanges left-joined with page_props (hiddencat
	 * property only) and category, restricted to the configured time window.
	 * @param IDatabase $dbr
	 * @param string[] $columns List of additional fields to get
	 * @param string[] $extra_tables List of additional tables to join
	 * @return BatchRowIterator
	 */
	private function setupChangesIterator(
		IDatabase $dbr,
		array $columns = [],
		array $extra_tables = []
	) {
		$tables = array_merge( [ 'recentchanges', 'page_props', 'category' ], $extra_tables );
		$it = new BatchRowIterator( $dbr,
			$tables,
			[ 'rc_timestamp' ],
			$this->mBatchSize
		);
		$this->addTimestampConditions( $it, $dbr );
		$it->addJoinConditions(
			[
				'page_props' => [
					'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ]
				],
				'category' => [
					'LEFT JOIN', [ 'cat_title = rc_title' ]
				]
			]
		);
		$it->setFetchColumns( array_merge( $columns, [
			'rc_title',
			'rc_cur_id',
			'pp_propname',
			'cat_pages',
			'cat_subcats',
			'cat_files'
		] ) );
		return $it;
	}

	/**
	 * Fetch newly created categories
	 * @param IDatabase $dbr
	 * @return BatchRowIterator
	 */
	protected function getNewCatsIterator( IDatabase $dbr ) {
		$it = $this->setupChangesIterator( $dbr );
		$it->addConditions( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 1,
		] );
		return $it;
	}

	/**
	 * Fetch moved categories
	 *
	 * Joins the page table so each row also carries the page's current
	 * title and namespace.
	 * @param IDatabase $dbr
	 * @return BatchRowIterator
	 */
	protected function getMovedCatsIterator( IDatabase $dbr ) {
		$it = $this->setupChangesIterator( $dbr, [ 'page_title', 'page_namespace' ], [ 'page' ] );
		$it->addConditions( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 0,
			'rc_log_type' => 'move',
			'rc_type' => RC_LOG,
		] );
		$it->addJoinConditions( [
			'page' => [ 'INNER JOIN', 'rc_cur_id = page_id' ],
		] );
		$this->addIndex( $it );
		return $it;
	}

	/**
	 * Fetch deleted categories
	 * @param IDatabase $dbr
	 * @return BatchRowIterator
	 */
	protected function getDeletedCatsIterator( IDatabase $dbr ) {
		$it = new BatchRowIterator( $dbr,
			'recentchanges',
			[ 'rc_timestamp' ],
			$this->mBatchSize
		);
		$this->addTimestampConditions( $it, $dbr );
		$it->addConditions( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 0,
			'rc_log_type' => 'delete',
			'rc_log_action' => 'delete',
			'rc_type' => RC_LOG,
			// We will fetch ones that do not have page record. If they do,
			// this means they were restored, thus restoring handler will pick it up.
			'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
		] );
		$this->addIndex( $it );
		$it->setFetchColumns( [ 'rc_cur_id', 'rc_title' ] );
		return $it;
	}

	/**
	 * Fetch restored categories
	 * @param IDatabase $dbr
	 * @return BatchRowIterator
	 */
	protected function getRestoredCatsIterator( IDatabase $dbr ) {
		$it = $this->setupChangesIterator( $dbr );
		$it->addConditions( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 0,
			'rc_log_type' => 'delete',
			'rc_log_action' => 'restore',
			'rc_type' => RC_LOG,
			// We will only fetch ones that have page record
			'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
		] );
		$this->addIndex( $it );
		return $it;
	}

	/**
	 * Fetch categorization changes
	 * @param IDatabase $dbr
	 * @return BatchRowIterator
	 */
	protected function getChangedCatsIterator( IDatabase $dbr ) {
		$it = $this->setupChangesIterator( $dbr );
		$it->addConditions( [
			'rc_namespace' => NS_CATEGORY,
			'rc_new' => 0,
			'rc_type' => [ RC_EDIT, RC_CATEGORIZE ],
		] );
		$this->addIndex( $it );
		return $it;
	}

	/**
	 * Add timestamp limits to iterator
	 *
	 * Restricts rows to startTS <= rc_timestamp < endTS.
	 * @param BatchRowIterator $it Iterator
	 * @param IDatabase $dbr
	 */
	private function addTimestampConditions( BatchRowIterator $it, IDatabase $dbr ) {
		$it->addConditions( [
			'rc_timestamp >= ' . $dbr->addQuotes( $dbr->timestamp( $this->startTS ) ),
			'rc_timestamp < ' . $dbr->addQuotes( $dbr->timestamp( $this->endTS ) ),
		] );
	}

	/**
	 * Need to force index, somehow on terbium the optimizer chooses wrong one
	 * @param BatchRowIterator $it
	 */
	private function addIndex( BatchRowIterator $it ) {
		$it->addOptions( [
			'USE INDEX' => [ 'recentchanges' => 'new_name_timestamp' ]
		] );
	}

	/**
	 * Get iterator for links for categories.
	 *
	 * Only 'subcat' links whose cl_from is one of the given IDs are returned;
	 * the batches are flattened so callers iterate over single rows.
	 * @param IDatabase $dbr
	 * @param array $ids List of page IDs
	 * @return Traversable
	 */
	protected function getCategoryLinksIterator( IDatabase $dbr, array $ids ) {
		$it = new BatchRowIterator(
			$dbr,
			'categorylinks',
			[ 'cl_from', 'cl_to' ],
			$this->mBatchSize
		);
		$it->addConditions( [
			'cl_type' => 'subcat',
			'cl_from' => $ids
		] );
		$it->setFetchColumns( [ 'cl_from', 'cl_to' ] );
		return new RecursiveIteratorIterator( $it );
	}

	/**
	 * Get accumulated RDF.
	 * @return string
	 */
	public function getRdf() {
		return $this->rdfWriter->drain();
	}

	/**
	 * Handle category deletes.
	 * @param IDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleDeletes( IDatabase $dbr, $output ) {
		// This only does "true" deletes - i.e. those that the page stays deleted
		foreach ( $this->getDeletedCatsIterator( $dbr ) as $batch ) {
			$deleteUrls = [];
			foreach ( $batch as $row ) {
				// This can produce duplicates, we don't care
				$deleteUrls[] = $this->categoryUrl( $row->rc_title );
				$this->processed[$row->rc_cur_id] = true;
			}
			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, [], "Deletes" ) );
		}
	}

	/**
	 * Write category data to RDF.
	 *
	 * The page count passed on is cat_pages minus subcategories and files.
	 * @param stdClass $row Database row
	 */
	private function writeCategoryData( $row ) {
		$this->categoriesRdf->writeCategoryData(
			$row->rc_title,
			$row->pp_propname === 'hiddencat',
			(int)$row->cat_pages - (int)$row->cat_subcats - (int)$row->cat_files,
			(int)$row->cat_subcats
		);
	}

	/**
	 * Write fresh data for every not yet processed category of an iterator
	 * as INSERT DATA statements, one statement per batch.
	 *
	 * Shared by handleRestores() and handleAdds(), which differ only in the
	 * iterator and the header comment they emit.
	 * @param IDatabase $dbr
	 * @param resource $output File to write the output
	 * @param Traversable $batches Batches of rows from setupChangesIterator()
	 */
	private function writeInsertsForBatches( IDatabase $dbr, $output, $batches ) {
		foreach ( $batches as $batch ) {
			$pages = [];
			foreach ( $batch as $row ) {
				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$pages[$row->rc_cur_id] = $row->rc_title;
				$this->processed[$row->rc_cur_id] = true;
			}

			if ( empty( $pages ) ) {
				continue;
			}

			$this->writeParentCategories( $dbr, $pages );
			fwrite( $output, $this->getInsertRdf() );
		}
	}

	/**
	 * Handle category moves.
	 *
	 * The old title is always deleted; new data is written only when the page
	 * is still in the category namespace and was not handled earlier.
	 * @param IDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleMoves( IDatabase $dbr, $output ) {
		foreach ( $this->getMovedCatsIterator( $dbr ) as $batch ) {
			$pages = [];
			$deleteUrls = [];
			foreach ( $batch as $row ) {
				$deleteUrls[] = $this->categoryUrl( $row->rc_title );

				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
					// We already captured this one before
					continue;
				}

				if ( $row->page_namespace != NS_CATEGORY ) {
					// If page was moved out of Category:, we'll just delete
					continue;
				}
				// Describe the category under its current title, not the old one
				$row->rc_title = $row->page_title;
				$this->writeCategoryData( $row );
				$pages[$row->rc_cur_id] = $row->page_title;
				$this->processed[$row->rc_cur_id] = true;
			}

			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Moves" ) );
		}
	}

	/**
	 * Handle category restores.
	 * @param IDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleRestores( IDatabase $dbr, $output ) {
		fwrite( $output, "# Restores\n" );
		// This will only find those restores that were not deleted later.
		$this->writeInsertsForBatches( $dbr, $output, $this->getRestoredCatsIterator( $dbr ) );
	}

	/**
	 * Handle category creations.
	 * @param IDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleAdds( IDatabase $dbr, $output ) {
		fwrite( $output, "# Additions\n" );
		$this->writeInsertsForBatches( $dbr, $output, $this->getNewCatsIterator( $dbr ) );
	}

	/**
	 * Handle edits and categorization changes of existing categories.
	 *
	 * Each not yet processed category is deleted and re-inserted with
	 * current data.
	 * @param IDatabase $dbr
	 * @param resource $output File to write the output
	 */
	public function handleChanges( IDatabase $dbr, $output ) {
		foreach ( $this->getChangedCatsIterator( $dbr ) as $batch ) {
			$pages = [];
			$deleteUrls = [];
			foreach ( $batch as $row ) {
				if ( isset( $this->processed[$row->rc_cur_id] ) ) {
					// We already captured this one before
					continue;
				}
				$this->writeCategoryData( $row );
				$pages[$row->rc_cur_id] = $row->rc_title;
				$this->processed[$row->rc_cur_id] = true;
				$deleteUrls[] = $this->categoryUrl( $row->rc_title );
			}

			fwrite( $output, $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" ) );
		}
	}
}
565
// Store this script's class name in $maintClass; presumably read by the file included below — confirm.
566 $maintClass = CategoryChangesAsRdf::class;
// NOTE(review): RUN_MAINTENANCE_IF_MAIN is defined outside this file; it looks like it runs the
// class above when this script is the entry point — confirm against Maintenance.php.
567 require_once RUN_MAINTENANCE_IF_MAIN;