registration: Improve duplicate config setting exception
[lhc/web/wiklou.git] / maintenance / categoryChangesAsRdf.php
1 <?php
2 /**
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License along
14 * with this program; if not, write to the Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 * http://www.gnu.org/copyleft/gpl.html
17 *
18 */
19 use Wikimedia\Purtle\RdfWriter;
20 use Wikimedia\Purtle\TurtleRdfWriter;
21 use Wikimedia\Rdbms\IDatabase;
22
23 require_once __DIR__ . '/Maintenance.php';
24
25 /**
26 * Maintenance script to provide RDF representation of the recent changes in category tree.
27 *
28 * @ingroup Maintenance
29 * @since 1.30
30 */
31 class CategoryChangesAsRdf extends Maintenance {
/**
 * Insert query.
 *
 * sprintf() template with a single %s placeholder, filled with the
 * accumulated RDF that should be inserted.
 */
const SPARQL_INSERT = <<<SPARQL
INSERT DATA {
%s
};

SPARQL;

/**
 * Delete/Insert query.
 *
 * sprintf() template with two %s placeholders: the first receives the
 * accumulated RDF to insert, the second a space-separated list of
 * category URIs (each wrapped in <>) whose existing triples are deleted.
 */
const SPARQL_DELETE_INSERT = <<<SPARQLDI
DELETE {
?category ?x ?y
} INSERT {
%s
} WHERE {
VALUES ?category {
%s
}
};

SPARQLDI;
57
/**
 * Writer that accumulates the RDF output; it is created in initialize()
 * and drained through getRdf().
 * @var RdfWriter
 */
private $rdfWriter;
/**
 * Categories RDF helper.
 * @var CategoriesRdf
 */
private $categoriesRdf;

// Inclusive lower bound of the processed window, set in execute() from the
// 'start' option and used for the "rc_timestamp >=" condition.
private $startTS;
// Exclusive upper bound of the processed window, set in execute() from the
// 'end' option and used for the "rc_timestamp <" condition.
private $endTS;

/**
 * Processed page IDs, so we don't try to process the same thing twice.
 * The page ID is the array key; the stored value is always true.
 * @var bool[]
 */
protected $processed = [];
77
/**
 * Register the script description, batch size and command line options.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( "Generate RDF dump of category changes in a wiki." );

	$this->setBatchSize( 200 );

	// One entry per option, in registration order: name, description,
	// third addOption() argument, short alias.
	// NOTE(review): the third argument is presumably the "required" flag and the
	// constant fourth argument "takes a value" - confirm against Maintenance::addOption().
	$optionTable = [
		[ 'output', "Output file (default is stdout). Will be overwritten.", false, 'o' ],
		[ 'start', 'Starting timestamp (inclusive), in ISO or Mediawiki format.', true, 's' ],
		[ 'end', 'Ending timestamp (exclusive), in ISO or Mediawiki format.', true, 'e' ],
	];
	foreach ( $optionTable as $entry ) {
		list( $name, $description, $required, $alias ) = $entry;
		$this->addOption( $name, $description, $required, true, $alias );
	}
}
91
/**
 * Create the RDF writer and the categories helper that shares it.
 */
public function initialize() {
	// A Turtle writer is used because SPARQL Update syntax is close to Turtle.
	$writer = new TurtleRdfWriter();

	$this->rdfWriter = $writer;
	$this->categoriesRdf = new CategoriesRdf( $writer );
}
100
/**
 * Produce the SPARQL Update script for all category changes between the
 * 'start' (inclusive) and 'end' (exclusive) timestamps.
 *
 * The output starts with the prefix declarations, followed by the updates
 * for deletes, moves, restores, additions and changes (in that order), and
 * ends with the update of the dump timestamp.
 *
 * @return bool|null False when the output file cannot be opened, otherwise no value.
 */
public function execute() {
	global $wgRCMaxAge;

	$this->initialize();

	$startTS = new MWTimestamp( $this->getOption( "start" ) );
	$endTS = new MWTimestamp( $this->getOption( "end" ) );
	$now = new MWTimestamp();

	if ( $now->getTimestamp() - $startTS->getTimestamp() > $wgRCMaxAge ) {
		$this->error( "Start timestamp too old, maximum RC age is $wgRCMaxAge!" );
	}
	if ( $now->getTimestamp() - $endTS->getTimestamp() > $wgRCMaxAge ) {
		$this->error( "End timestamp too old, maximum RC age is $wgRCMaxAge!" );
	}

	$this->startTS = $startTS->getTimestamp();
	$this->endTS = $endTS->getTimestamp();

	$outFile = $this->getOption( 'output', 'php://stdout' );
	if ( $outFile === '-' ) {
		// "-" is accepted as an alias for standard output.
		$outFile = 'php://stdout';
	}

	$output = fopen( $outFile, 'wb' );
	if ( $output === false ) {
		// Without a valid handle every fwrite() below would fail, so stop here
		// instead of running all the queries for nothing.
		$this->error( "Could not open output file $outFile for writing!" );
		return false;
	}

	$this->categoriesRdf->setupPrefixes();
	$this->rdfWriter->start();

	$prefixes = $this->getRdf();
	// We have to strip @ from prefix, since SPARQL UPDATE doesn't use them
	// Also strip dot at the end.
	$prefixes = preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $prefixes );
	fwrite( $output, $prefixes );

	$dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );

	// Deletes go first because if the page was deleted, other changes
	// do not matter. This only gets true deletes, i.e. not pages that were restored.
	$this->handleDeletes( $dbr, $output );
	// Moves go before additions because if category is moved, we should not process creation
	// as it would produce wrong data - because create row has old title
	$this->handleMoves( $dbr, $output );
	// We need to handle restores too since delete may have happened in previous update.
	$this->handleRestores( $dbr, $output );
	$this->handleAdds( $dbr, $output );
	$this->handleChanges( $dbr, $output );

	// Update timestamp
	fwrite( $output, $this->updateTS( $this->endTS ) );

	// Release the handle explicitly; for php://stdout this only closes our own
	// copy of the descriptor.
	fclose( $output );
}
152
/**
 * Build the SPARQL delete/insert statement for a set of categories.
 *
 * Nothing is generated (and the accumulated RDF is left untouched) when
 * there are no URIs to delete.
 *
 * @param IDatabase $dbr
 * @param string[] $deleteUrls List of URIs to be deleted, with <>
 * @param string[] $pages List of categories: id => title
 * @param string $mark Marks which operation requests the query
 * @return string SPARQL query, or empty string if $deleteUrls is empty
 */
private function getCategoriesUpdate( IDatabase $dbr, $deleteUrls, $pages, $mark ) {
	if ( !$deleteUrls ) {
		return "";
	}

	if ( $pages ) {
		// Parent links are added to the same RDF buffer that is drained below.
		$this->writeParentCategories( $dbr, $pages );
	}

	$insertRdf = $this->getRdf();
	$categoryValues = implode( ' ', $deleteUrls );
	$statement = sprintf( self::SPARQL_DELETE_INSERT, $insertRdf, $categoryValues );

	return "# $mark\n" . $statement;
}
174
/**
 * Write the category link data (cl_from title => cl_to) for a set of categories.
 * @param IDatabase $dbr
 * @param string[] $pages List of categories: id => title
 */
private function writeParentCategories( IDatabase $dbr, $pages ) {
	$pageIds = array_keys( $pages );
	$links = $this->getCategoryLinksIterator( $dbr, $pageIds );

	foreach ( $links as $link ) {
		// cl_from is a page ID; translate it back to the title we were given.
		$fromTitle = $pages[$link->cl_from];
		$this->categoriesRdf->writeCategoryLinkData( $fromTitle, $link->cl_to );
	}
}
185
/**
 * Build the SPARQL Update statements that replace the dump's
 * schema:dateModified value with the given timestamp.
 * @param string|int $timestamp Timestamp for last change
 * @return string SPARQL Update query for timestamp.
 */
public function updateTS( $timestamp ) {
	$ts = wfTimestamp( TS_ISO_8601, $timestamp );
	$dumpUrl = '<' . $this->categoriesRdf->getDumpURI() . '>';

	// Any existing value is deleted first, then the new one is inserted.
	return <<<SPARQL
DELETE {
$dumpUrl schema:dateModified ?o .
}
WHERE {
$dumpUrl schema:dateModified ?o .
};
INSERT DATA {
$dumpUrl schema:dateModified "$ts"^^xsd:dateTime .
}

SPARQL;
}
208
/**
 * Set up standard iterator for retrieving category changes.
 *
 * The iterator reads recentchanges within the configured time window,
 * left-joined with page_props (hiddencat property of the page) and
 * category (by title), batched by rc_timestamp.
 *
 * @param IDatabase $dbr
 * @param string[] $columns List of additional fields to get
 * @param string[] $extra_tables List of additional tables to join
 * @return BatchRowIterator
 */
private function setupChangesIterator(
	IDatabase $dbr,
	array $columns = [],
	array $extra_tables = []
) {
	// array_merge() appends the extra tables. The array union operator (+) must
	// not be used here: both lists are keyed 0..n, so the union would keep the
	// existing keys and silently drop the extra tables.
	$tables = array_merge( [ 'recentchanges', 'page_props', 'category' ], $extra_tables );

	$it = new BatchRowIterator( $dbr,
		$tables,
		[ 'rc_timestamp' ],
		$this->mBatchSize
	);
	$this->addTimestampConditions( $it, $dbr );
	$it->addJoinConditions(
		[
			'page_props' => [
				'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ]
			],
			'category' => [
				'LEFT JOIN', [ 'cat_title = rc_title' ]
			]
		]
	);
	// Caller-supplied columns come first, followed by the standard set.
	$it->setFetchColumns( array_merge( $columns, [
		'rc_title',
		'rc_cur_id',
		'pp_propname',
		'cat_pages',
		'cat_subcats',
		'cat_files'
	] ) );
	return $it;
}
251
/**
 * Iterator over recent changes that created a page in the category namespace.
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getNewCatsIterator( IDatabase $dbr ) {
	$creationConds = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 1,
	];

	$creations = $this->setupChangesIterator( $dbr );
	$creations->addConditions( $creationConds );

	return $creations;
}
265
/**
 * Iterator over move log entries for pages in the category namespace.
 *
 * The current page row is joined in so that the present title and
 * namespace of the moved page are available as page_title / page_namespace.
 *
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getMovedCatsIterator( IDatabase $dbr ) {
	$pageColumns = [ 'page_title', 'page_namespace' ];
	$pageTables = [ 'page' ];
	$moves = $this->setupChangesIterator( $dbr, $pageColumns, $pageTables );

	$moveConds = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'move',
		'rc_type' => RC_LOG,
	];
	$moves->addConditions( $moveConds );

	$pageJoin = [ 'INNER JOIN', 'rc_cur_id = page_id' ];
	$moves->addJoinConditions( [
		'page' => $pageJoin,
	] );

	$this->addIndex( $moves );

	return $moves;
}
285
/**
 * Iterator over delete log entries for categories that currently have no page row.
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getDeletedCatsIterator( IDatabase $dbr ) {
	$deletions = new BatchRowIterator(
		$dbr,
		'recentchanges',
		[ 'rc_timestamp' ],
		$this->mBatchSize
	);
	$this->addTimestampConditions( $deletions, $dbr );

	$deletionConds = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'delete',
		'rc_type' => RC_LOG,
		// Only entries without a page record are fetched. If a page record
		// exists the page was restored, and the restore handler picks it up.
		'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)',
	];
	$deletions->addConditions( $deletionConds );

	$this->addIndex( $deletions );
	$deletions->setFetchColumns( [ 'rc_cur_id', 'rc_title' ] );

	return $deletions;
}
312
/**
 * Iterator over restore log entries for categories that currently have a page row.
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getRestoredCatsIterator( IDatabase $dbr ) {
	$restoreConds = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_log_type' => 'delete',
		'rc_log_action' => 'restore',
		'rc_type' => RC_LOG,
		// Restrict to entries whose page record exists
		'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)',
	];

	$restores = $this->setupChangesIterator( $dbr );
	$restores->addConditions( $restoreConds );
	$this->addIndex( $restores );

	return $restores;
}
332
/**
 * Iterator over edits and categorization changes of existing category pages.
 * @param IDatabase $dbr
 * @return BatchRowIterator
 */
protected function getChangedCatsIterator( IDatabase $dbr ) {
	$changeTypes = [ RC_EDIT, RC_CATEGORIZE ];
	$changeConds = [
		'rc_namespace' => NS_CATEGORY,
		'rc_new' => 0,
		'rc_type' => $changeTypes,
	];

	$changes = $this->setupChangesIterator( $dbr );
	$changes->addConditions( $changeConds );
	$this->addIndex( $changes );

	return $changes;
}
348
/**
 * Restrict an iterator to the configured window:
 * start timestamp inclusive, end timestamp exclusive.
 * @param BatchRowIterator $it Iterator
 * @param IDatabase $dbr
 */
private function addTimestampConditions( BatchRowIterator $it, IDatabase $dbr ) {
	$lowerBound = $dbr->addQuotes( $dbr->timestamp( $this->startTS ) );
	$upperBound = $dbr->addQuotes( $dbr->timestamp( $this->endTS ) );

	$window = [
		'rc_timestamp >= ' . $lowerBound,
		'rc_timestamp < ' . $upperBound,
	];
	$it->addConditions( $window );
}
360
/**
 * Force the new_name_timestamp index on recentchanges.
 * Needed because on terbium the optimizer somehow chooses the wrong one.
 * @param BatchRowIterator $it
 */
private function addIndex( BatchRowIterator $it ) {
	$forcedIndexes = [ 'recentchanges' => 'new_name_timestamp' ];
	$it->addOptions( [ 'USE INDEX' => $forcedIndexes ] );
}
370
/**
 * Get iterator over the subcategory links (cl_from, cl_to) of the given pages.
 * @param IDatabase $dbr
 * @param array $ids List of page IDs
 * @return Traversable
 */
protected function getCategoryLinksIterator( IDatabase $dbr, array $ids ) {
	// The same two columns serve as batching key and as fetched columns.
	$linkColumns = [ 'cl_from', 'cl_to' ];

	$batches = new BatchRowIterator( $dbr, 'categorylinks', $linkColumns, $this->mBatchSize );
	$batches->addConditions( [
		'cl_type' => 'subcat',
		'cl_from' => $ids
	] );
	$batches->setFetchColumns( $linkColumns );

	// Wrapped so that callers iterate over rows rather than over batches.
	return new RecursiveIteratorIterator( $batches );
}
391
/**
 * Drain the RDF writer and return what it had accumulated.
 * @return string
 */
public function getRdf() {
	$accumulated = $this->rdfWriter->drain();

	return $accumulated;
}
399
/**
 * Write the updates for deleted categories, one statement per batch.
 * Only "true" deletes are covered, i.e. pages that stay deleted.
 * @param IDatabase $dbr
 * @param resource $output File to write the output
 */
public function handleDeletes( IDatabase $dbr, $output ) {
	$batches = $this->getDeletedCatsIterator( $dbr );

	foreach ( $batches as $rows ) {
		$urls = [];
		foreach ( $rows as $deleted ) {
			// Duplicate URLs may end up in the list; that is harmless.
			$url = $this->categoriesRdf->labelToUrl( $deleted->rc_title );
			$urls[] = '<' . $url . '>';
			$this->processed[$deleted->rc_cur_id] = true;
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $urls, [], "Deletes" );
		fwrite( $output, $sparql );
	}
}
417
/**
 * Write the data of one category row to RDF.
 *
 * Passes the title, whether the row carries the 'hiddencat' page property,
 * cat_pages minus subcategories and files, and the subcategory count.
 *
 * @param stdClass $row Database row
 */
private function writeCategoryData( $row ) {
	$hasHiddenCatProp = $row->pp_propname === 'hiddencat';
	$subcatCount = (int)$row->cat_subcats;
	$remainingPages = (int)$row->cat_pages - $subcatCount - (int)$row->cat_files;

	$this->categoriesRdf->writeCategoryData(
		$row->rc_title,
		$hasHiddenCatProp,
		$remainingPages,
		$subcatCount
	);
}
430
/**
 * Write the updates for moved categories, one statement per batch.
 *
 * The old title is always scheduled for deletion; data for the new title is
 * written only if the page is still a category and was not handled already.
 *
 * @param IDatabase $dbr
 * @param resource $output
 */
public function handleMoves( IDatabase $dbr, $output ) {
	$batches = $this->getMovedCatsIterator( $dbr );

	foreach ( $batches as $rows ) {
		$deleteUrls = [];
		$pages = [];

		foreach ( $rows as $move ) {
			$pageId = $move->rc_cur_id;
			$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $move->rc_title ) . '>';

			// Skip pages captured earlier, and pages moved out of Category:
			// (for those the delete above is all that is needed).
			if ( isset( $this->processed[$pageId] ) || $move->page_namespace != NS_CATEGORY ) {
				continue;
			}

			// From here on the row describes the category under its current title.
			$move->rc_title = $move->page_title;
			$this->writeCategoryData( $move );
			$pages[$pageId] = $move->page_title;
			$this->processed[$pageId] = true;
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Moves" );
		fwrite( $output, $sparql );
	}
}
460
/**
 * Write insert statements for restored categories, one per non-empty batch.
 * Only restores that were not followed by another delete are found.
 * @param IDatabase $dbr
 * @param resource $output
 */
public function handleRestores( IDatabase $dbr, $output ) {
	fwrite( $output, "# Restores\n" );

	$batches = $this->getRestoredCatsIterator( $dbr );
	foreach ( $batches as $rows ) {
		$pages = [];

		foreach ( $rows as $restored ) {
			$pageId = $restored->rc_cur_id;
			// Pages captured by an earlier handler are not written again.
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $restored );
				$pages[$pageId] = $restored->rc_title;
				$this->processed[$pageId] = true;
			}
		}

		if ( $pages ) {
			$this->writeParentCategories( $dbr, $pages );
			$insert = sprintf( self::SPARQL_INSERT, $this->getRdf() );
			fwrite( $output, $insert );
		}
	}
}
489
/**
 * Write insert statements for newly created categories, one per non-empty batch.
 * @param IDatabase $dbr
 * @param resource $output
 */
public function handleAdds( IDatabase $dbr, $output ) {
	fwrite( $output, "# Additions\n" );

	$batches = $this->getNewCatsIterator( $dbr );
	foreach ( $batches as $rows ) {
		$pages = [];

		foreach ( $rows as $created ) {
			$pageId = $created->rc_cur_id;
			// Pages captured by an earlier handler are not written again.
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $created );
				$pages[$pageId] = $created->rc_title;
				$this->processed[$pageId] = true;
			}
		}

		if ( $pages ) {
			$this->writeParentCategories( $dbr, $pages );
			$insert = sprintf( self::SPARQL_INSERT, $this->getRdf() );
			fwrite( $output, $insert );
		}
	}
}
516
/**
 * Write delete/insert statements for edited or re-categorized categories,
 * one per batch.
 * @param IDatabase $dbr
 * @param resource $output
 */
public function handleChanges( IDatabase $dbr, $output ) {
	$batches = $this->getChangedCatsIterator( $dbr );

	foreach ( $batches as $rows ) {
		$deleteUrls = [];
		$pages = [];

		foreach ( $rows as $changed ) {
			$pageId = $changed->rc_cur_id;
			// Pages captured by an earlier handler are not written again.
			if ( !isset( $this->processed[$pageId] ) ) {
				$this->writeCategoryData( $changed );
				$pages[$pageId] = $changed->rc_title;
				$this->processed[$pageId] = true;
				$deleteUrls[] = '<' . $this->categoriesRdf->labelToUrl( $changed->rc_title ) . '>';
			}
		}

		$sparql = $this->getCategoriesUpdate( $dbr, $deleteUrls, $pages, "Changes" );
		fwrite( $output, $sparql );
	}
}
539 }
540
// Name the maintenance class defined in this file, then include the file
// named by RUN_MAINTENANCE_IF_MAIN.
// NOTE(review): presumably that file runs $maintClass when this script is the
// entry point - confirm against Maintenance.php.
$maintClass = CategoryChangesAsRdf::class;
require_once RUN_MAINTENANCE_IF_MAIN;