Create update SPARQL for category changes
authorStanislav Malyshev <smalyshev@gmail.com>
Tue, 22 Aug 2017 00:05:53 +0000 (17:05 -0700)
committerStanislav Malyshev <smalyshev@gmail.com>
Fri, 20 Apr 2018 23:40:35 +0000 (16:40 -0700)
This script creates SPARQL UPDATE statements for changes in a given time
period. These statements can be applied to an existing database to
update it.

See the tests for examples of what the statements look like.

Bug: T173774
Change-Id: I9867ad566c0619b55a48a011bd3c55321b1bfcff

autoload.php
maintenance/categoryChangesAsRdf.php [new file with mode: 0644]
tests/phpunit/data/categoriesrdf/change.sparql [new file with mode: 0644]
tests/phpunit/data/categoriesrdf/delete.sparql [new file with mode: 0644]
tests/phpunit/data/categoriesrdf/move.sparql [new file with mode: 0644]
tests/phpunit/data/categoriesrdf/new.sparql [new file with mode: 0644]
tests/phpunit/data/categoriesrdf/restore.sparql [new file with mode: 0644]
tests/phpunit/data/categoriesrdf/updatets.txt [new file with mode: 0644]
tests/phpunit/maintenance/categoryChangesRdfTest.php [new file with mode: 0644]

index bc0e69e..f93d723 100644 (file)
@@ -225,6 +225,7 @@ $wgAutoloadLocalClasses = [
        'CapsCleanup' => __DIR__ . '/maintenance/cleanupCaps.php',
        'CategoriesRdf' => __DIR__ . '/includes/CategoriesRdf.php',
        'Category' => __DIR__ . '/includes/Category.php',
+       'CategoryChangesAsRdf' => __DIR__ . '/maintenance/categoryChangesAsRdf.php',
        'CategoryFinder' => __DIR__ . '/includes/CategoryFinder.php',
        'CategoryMembershipChange' => __DIR__ . '/includes/changes/CategoryMembershipChange.php',
        'CategoryMembershipChangeJob' => __DIR__ . '/includes/jobqueue/jobs/CategoryMembershipChangeJob.php',
diff --git a/maintenance/categoryChangesAsRdf.php b/maintenance/categoryChangesAsRdf.php
new file mode 100644 (file)
index 0000000..a12cda7
--- /dev/null
@@ -0,0 +1,542 @@
+<?php
+/**
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+use Wikimedia\Purtle\RdfWriter;
+use Wikimedia\Purtle\TurtleRdfWriter;
+use Wikimedia\Rdbms\IDatabase;
+
+require_once __DIR__ . '/Maintenance.php';
+
+/**
+ * Maintenance script to provide RDF representation of the recent changes in category tree.
+ *
+ * @ingroup Maintenance
+ * @since 1.30
+ */
+class CategoryChangesAsRdf extends Maintenance {
+       /**
+        * Insert query
+        */
+       const SPARQL_INSERT = <<<SPARQL
+INSERT DATA {
+%s
+};
+
+SPARQL;
+
+       /**
+        * Delete/Insert query
+        */
+       const SPARQL_DELETE_INSERT = <<<SPARQLDI
+DELETE {
+?category ?x ?y
+} INSERT {
+%s
+} WHERE {
+   VALUES ?category {
+     %s
+   }
+};
+
+SPARQLDI;
+
+       /**
+        * @var RdfWriter
+        */
+       private $rdfWriter;
+       /**
+        * Categories RDF helper.
+        * @var CategoriesRdf
+        */
+       private $categoriesRdf;
+
+       private $startTS;
+       private $endTS;
+
+       /**
+        * List of processed page IDs,
+        * so we don't try to process same thing twice
+        * @var int[]
+        */
+       protected $processed = [];
+
+       /**
+        * Declare the script description, batch size and command-line options.
+        */
+       public function __construct() {
+               parent::__construct();
+
+               $this->addDescription( 'Generate RDF dump of category changes in a wiki.' );
+               $this->setBatchSize( 200 );
+
+               $this->addOption(
+                       'output',
+                       'Output file (default is stdout). Will be overwritten.',
+                       false,
+                       true,
+                       'o'
+               );
+               $this->addOption(
+                       'start',
+                       'Starting timestamp (inclusive), in ISO or Mediawiki format.',
+                       true,
+                       true,
+                       's'
+               );
+               $this->addOption(
+                       'end',
+                       'Ending timestamp (exclusive), in ISO or Mediawiki format.',
+                       true,
+                       true,
+                       'e'
+               );
+       }
+
+       /**
+        * Initialize external service classes.
+        */
+       public function initialize() {
+               // A Turtle writer is used because SPARQL Update syntax is close to Turtle.
+               $writer = new TurtleRdfWriter();
+
+               $this->rdfWriter = $writer;
+               // The helper emits its triples through the very same writer instance.
+               $this->categoriesRdf = new CategoriesRdf( $writer );
+       }
+
+       /**
+        * Write the prefix header, one SPARQL UPDATE section per kind of category
+        * change found between the "start" and "end" options, and finally the
+        * statement that updates the dump timestamp.
+        */
+       public function execute() {
+               global $wgRCMaxAge;
+
+               $this->initialize();
+
+               $rangeStart = new MWTimestamp( $this->getOption( "start" ) );
+               $rangeEnd = new MWTimestamp( $this->getOption( "end" ) );
+               $now = new MWTimestamp();
+
+               // Complain about range bounds that lie further back than $wgRCMaxAge.
+               if ( $now->getTimestamp() - $rangeStart->getTimestamp() > $wgRCMaxAge ) {
+                       $this->error( "Start timestamp too old, maximum RC age is $wgRCMaxAge!" );
+               }
+               if ( $now->getTimestamp() - $rangeEnd->getTimestamp() > $wgRCMaxAge ) {
+                       $this->error( "End timestamp too old, maximum RC age is $wgRCMaxAge!" );
+               }
+
+               $this->startTS = $rangeStart->getTimestamp();
+               $this->endTS = $rangeEnd->getTimestamp();
+
+               // Both the option default and an explicit "-" select stdout.
+               $target = $this->getOption( 'output', 'php://stdout' );
+               $output = fopen( $target === '-' ? 'php://stdout' : $target, 'wb' );
+
+               $this->categoriesRdf->setupPrefixes();
+               $this->rdfWriter->start();
+
+               // SPARQL UPDATE prefix declarations carry neither the leading "@" nor the
+               // trailing dot that the writer produces, so both are stripped.
+               fwrite( $output, preg_replace( [ '/^@/m', '/\s*[.]$/m' ], '', $this->getRdf() ) );
+
+               $dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
+
+               // The order of the handlers matters:
+               // 1. Deletes first - once a page is deleted no other change to it matters.
+               //    Only true deletes are covered, i.e. not pages that were restored.
+               $this->handleDeletes( $dbr, $output );
+               // 2. Moves before additions - the creation row of a moved category still
+               //    carries the old title and would produce wrong data.
+               $this->handleMoves( $dbr, $output );
+               // 3. Restores - the matching delete may belong to a previous update.
+               $this->handleRestores( $dbr, $output );
+               // 4. New categories, then changes to existing ones.
+               $this->handleAdds( $dbr, $output );
+               $this->handleChanges( $dbr, $output );
+
+               // Record the end of the processed range as the new dump timestamp.
+               fwrite( $output, $this->updateTS( $this->endTS ) );
+       }
+
+       /**
+        * Get SPARQL for updating set of categories
+        * @param IDatabase $dbr
+        * @param string[] $deleteUrls List of URIs to be deleted, with <>
+        * @param string[] $pages List of categories: id => title
+        * @param string $mark Marks which operation requests the query
+        * @return string SPARQL query
+        */
+       private function getCategoriesUpdate( IDatabase $dbr, $deleteUrls, $pages, $mark ) {
+               // Without anything to delete there is no statement to produce.
+               if ( !$deleteUrls ) {
+                       return "";
+               }
+
+               // Parent links are added to the RDF already accumulated by the caller.
+               if ( $pages ) {
+                       $this->writeParentCategories( $dbr, $pages );
+               }
+
+               $insertBody = $this->getRdf();
+               $categoryValues = implode( ' ', $deleteUrls );
+
+               return "# $mark\n" . sprintf( self::SPARQL_DELETE_INSERT, $insertBody, $categoryValues );
+       }
+
+       /**
+        * Write data for a set of categories
+        * @param IDatabase $dbr
+        * @param string[] $pages List of categories: id => title
+        */
+       private function writeParentCategories( IDatabase $dbr, $pages ) {
+               $pageIds = array_keys( $pages );
+               $links = $this->getCategoryLinksIterator( $dbr, $pageIds );
+
+               foreach ( $links as $link ) {
+                       // cl_from is a page ID; $pages maps it back to the category title.
+                       $child = $pages[$link->cl_from];
+                       $this->categoriesRdf->writeCategoryLinkData( $child, $link->cl_to );
+               }
+       }
+
+       /**
+        * Generate SPARQL Update code for updating dump timestamp
+        * @param string|int $timestamp Timestamp for last change
+        * @return string SPARQL Update query for timestamp.
+        */
+       public function updateTS( $timestamp ) {
+               $dump = '<' . $this->categoriesRdf->getDumpURI() . '>';
+               $modified = wfTimestamp( TS_ISO_8601, $timestamp );
+
+               // First remove whatever modification date is stored for the dump,
+               // then insert the new one.
+               return <<<SPARQL
+DELETE {
+  {$dump} schema:dateModified ?o .
+}
+WHERE {
+  {$dump} schema:dateModified ?o .
+};
+INSERT DATA {
+  {$dump} schema:dateModified "{$modified}"^^xsd:dateTime .
+}
+
+SPARQL;
+       }
+
+       /**
+        * Set up standard iterator for retrieving category changes.
+        * @param IDatabase $dbr
+        * @param string[] $columns List of additional fields to get
+        * @param string[] $extra_tables List of additional tables to join
+        * @return BatchRowIterator
+        */
+       private function setupChangesIterator(
+               IDatabase $dbr,
+               array $columns = [],
+               array $extra_tables = []
+       ) {
+               $tables = [ 'recentchanges', 'page_props', 'category' ];
+               if ( $extra_tables ) {
+                       // Both lists are numerically indexed, so the array union operator (+=)
+                       // would drop every extra table whose index already exists in $tables
+                       // (e.g. [ 'page' ] at index 0). array_merge() appends them instead.
+                       $tables = array_merge( $tables, $extra_tables );
+               }
+               // Rows are batched by rc_timestamp, $this->mBatchSize at a time.
+               $it = new BatchRowIterator( $dbr,
+                       $tables,
+                       [ 'rc_timestamp' ],
+                       $this->mBatchSize
+               );
+               $this->addTimestampConditions( $it, $dbr );
+               // LEFT JOINs: a change is still returned when the page has no "hiddencat"
+               // property or no category row; the joined columns are then NULL.
+               $it->addJoinConditions(
+                       [
+                               'page_props' => [
+                                       'LEFT JOIN', [ 'pp_propname' => 'hiddencat', 'pp_page = rc_cur_id' ]
+                               ],
+                               'category' => [
+                                       'LEFT JOIN', [ 'cat_title = rc_title' ]
+                               ]
+                       ]
+               );
+               // Caller-supplied columns come first, followed by the ones that
+               // writeCategoryData() and the handlers read from every row.
+               $it->setFetchColumns( array_merge( $columns, [
+                       'rc_title',
+                       'rc_cur_id',
+                       'pp_propname',
+                       'cat_pages',
+                       'cat_subcats',
+                       'cat_files'
+               ] ) );
+               return $it;
+       }
+
+       /**
+        * Fetch newly created categories
+        * @param IDatabase $dbr
+        * @return BatchRowIterator
+        */
+       protected function getNewCatsIterator( IDatabase $dbr ) {
+               $created = $this->setupChangesIterator( $dbr );
+
+               // Restrict to recent changes flagged rc_new in the Category namespace.
+               $filter = [
+                       'rc_namespace' => NS_CATEGORY,
+                       'rc_new' => 1,
+               ];
+               $created->addConditions( $filter );
+
+               return $created;
+       }
+
+       /**
+        * Fetch moved categories
+        * @param IDatabase $dbr
+        * @return BatchRowIterator
+        */
+       protected function getMovedCatsIterator( IDatabase $dbr ) {
+               // The page row (joined on rc_cur_id = page_id below) supplies the title
+               // and namespace the page has now.
+               $moved = $this->setupChangesIterator(
+                       $dbr,
+                       [ 'page_title', 'page_namespace' ],
+                       [ 'page' ]
+               );
+
+               $moveLogFilter = [
+                       'rc_namespace' => NS_CATEGORY,
+                       'rc_new' => 0,
+                       'rc_log_type' => 'move',
+                       'rc_type' => RC_LOG,
+               ];
+               $moved->addConditions( $moveLogFilter );
+
+               $pageJoin = [ 'INNER JOIN', 'rc_cur_id = page_id' ];
+               $moved->addJoinConditions( [ 'page' => $pageJoin ] );
+
+               $this->addIndex( $moved );
+               return $moved;
+       }
+
+       /**
+        * Fetch deleted categories
+        * @param IDatabase $dbr
+        * @return BatchRowIterator
+        */
+       protected function getDeletedCatsIterator( IDatabase $dbr ) {
+               $deleted = new BatchRowIterator(
+                       $dbr,
+                       'recentchanges',
+                       [ 'rc_timestamp' ],
+                       $this->mBatchSize
+               );
+               $this->addTimestampConditions( $deleted, $dbr );
+
+               $deleteLogFilter = [
+                       'rc_namespace' => NS_CATEGORY,
+                       'rc_new' => 0,
+                       'rc_log_type' => 'delete',
+                       'rc_log_action' => 'delete',
+                       'rc_type' => RC_LOG,
+               ];
+               // Keep only entries whose page record is gone. A page record that exists
+               // means the category was restored, which the restore handler picks up.
+               $deleteLogFilter[] = 'NOT EXISTS (SELECT * FROM page WHERE page_id = rc_cur_id)';
+               $deleted->addConditions( $deleteLogFilter );
+
+               $this->addIndex( $deleted );
+               $deleted->setFetchColumns( [ 'rc_cur_id', 'rc_title' ] );
+               return $deleted;
+       }
+
+       /**
+        * Fetch restored categories
+        * @param IDatabase $dbr
+        * @return BatchRowIterator
+        */
+       protected function getRestoredCatsIterator( IDatabase $dbr ) {
+               $restored = $this->setupChangesIterator( $dbr );
+
+               $restoreLogFilter = [
+                       'rc_namespace' => NS_CATEGORY,
+                       'rc_new' => 0,
+                       'rc_log_type' => 'delete',
+                       'rc_log_action' => 'restore',
+                       'rc_type' => RC_LOG,
+               ];
+               // Only restores whose page record exists are of interest.
+               $restoreLogFilter[] = 'EXISTS (SELECT page_id FROM page WHERE page_id = rc_cur_id)';
+               $restored->addConditions( $restoreLogFilter );
+
+               $this->addIndex( $restored );
+               return $restored;
+       }
+
+       /**
+        * Fetch categorization changes
+        * @param IDatabase $dbr
+        * @return BatchRowIterator
+        */
+       protected function getChangedCatsIterator( IDatabase $dbr ) {
+               $changed = $this->setupChangesIterator( $dbr );
+
+               // Edit and categorize entries in the Category namespace with rc_new = 0.
+               $filter = [
+                       'rc_namespace' => NS_CATEGORY,
+                       'rc_new' => 0,
+                       'rc_type' => [ RC_EDIT, RC_CATEGORIZE ],
+               ];
+               $changed->addConditions( $filter );
+
+               $this->addIndex( $changed );
+               return $changed;
+       }
+
+       /**
+        * Add timestamp limits to iterator
+        * @param BatchRowIterator $it Iterator
+        * @param IDatabase $dbr
+        */
+       private function addTimestampConditions( BatchRowIterator $it, IDatabase $dbr ) {
+               $lowerBound = $dbr->addQuotes( $dbr->timestamp( $this->startTS ) );
+               $upperBound = $dbr->addQuotes( $dbr->timestamp( $this->endTS ) );
+
+               // Half-open range: the start is inclusive, the end is exclusive.
+               $range = [
+                       'rc_timestamp >= ' . $lowerBound,
+                       'rc_timestamp < ' . $upperBound,
+               ];
+               $it->addConditions( $range );
+       }
+
+       /**
+        * Force the index to use; on terbium the optimizer somehow chooses the wrong one.
+        * @param BatchRowIterator $it
+        */
+       private function addIndex( BatchRowIterator $it ) {
+               // Index to use for the recentchanges table.
+               $forcedIndex = [ 'recentchanges' => 'new_name_timestamp' ];
+               $it->addOptions( [ 'USE INDEX' => $forcedIndex ] );
+       }
+
+       /**
+        * Get iterator for links for categories.
+        * @param IDatabase $dbr
+        * @param array $ids List of page IDs
+        * @return Traversable
+        */
+       protected function getCategoryLinksIterator( IDatabase $dbr, array $ids ) {
+               // The same two columns serve as batching key and as fetched columns.
+               $fields = [ 'cl_from', 'cl_to' ];
+
+               $links = new BatchRowIterator( $dbr, 'categorylinks', $fields, $this->mBatchSize );
+               $links->addConditions( [
+                       'cl_type' => 'subcat',
+                       'cl_from' => $ids
+               ] );
+               $links->setFetchColumns( $fields );
+
+               // Wrapped so that callers (see writeParentCategories) receive single
+               // rows instead of batches.
+               return new RecursiveIteratorIterator( $links );
+       }
+
+       /**
+        * Get accumulated RDF.
+        * @return string
+        */
+       public function getRdf() {
+               // Hand back whatever the writer's drain() returns.
+               $accumulated = $this->rdfWriter->drain();
+               return $accumulated;
+       }
+
+       /**
+        * Handle category deletes.
+        * @param IDatabase $dbr
+        * @param resource $output File to write the output
+        */
+       public function handleDeletes( IDatabase $dbr, $output ) {
+               // Only "true" deletes are handled here, i.e. pages that stayed deleted.
+               $batches = $this->getDeletedCatsIterator( $dbr );
+               foreach ( $batches as $rows ) {
+                       $urls = [];
+                       foreach ( $rows as $deleted ) {
+                               // The same URL may be listed more than once; that is harmless.
+                               $urls[] = '<' . $this->categoriesRdf->labelToUrl( $deleted->rc_title ) . '>';
+                               $this->processed[$deleted->rc_cur_id] = true;
+                       }
+
+                       // No pages are passed, so the statement only deletes.
+                       $sparql = $this->getCategoriesUpdate( $dbr, $urls, [], "Deletes" );
+                       fwrite( $output, $sparql );
+               }
+       }
+
+       /**
+        * Pass one category's data from a database row on to the RDF helper.
+        * @param stdClass $row Database row; rc_title, pp_propname, cat_pages,
+        *  cat_subcats and cat_files are read from it
+        */
+       private function writeCategoryData( $row ) {
+               $isHidden = $row->pp_propname === 'hiddencat';
+               $subcats = (int)$row->cat_subcats;
+               // cat_pages minus subcategories and files.
+               $pages = (int)$row->cat_pages - $subcats - (int)$row->cat_files;
+
+               $this->categoriesRdf->writeCategoryData( $row->rc_title, $isHidden, $pages, $subcats );
+       }
+
+       /**
+        * @param IDatabase $dbr
+        * @param resource $output
+        */
+       public function handleMoves( IDatabase $dbr, $output ) {
+               foreach ( $this->getMovedCatsIterator( $dbr ) as $rows ) {
+                       $urls = [];
+                       $movedPages = [];
+                       foreach ( $rows as $move ) {
+                               // Data stored under the old title is removed in every case.
+                               $urls[] = '<' . $this->categoriesRdf->labelToUrl( $move->rc_title ) . '>';
+
+                               $pageId = $move->rc_cur_id;
+                               // Nothing more to do for a page captured by an earlier step, or for
+                               // one moved out of Category: - then the delete above is enough.
+                               if ( isset( $this->processed[$pageId] ) || $move->page_namespace != NS_CATEGORY ) {
+                                       continue;
+                               }
+
+                               // Describe the category under the title the page has now.
+                               $move->rc_title = $move->page_title;
+                               $this->writeCategoryData( $move );
+                               $movedPages[$pageId] = $move->page_title;
+                               $this->processed[$pageId] = true;
+                       }
+
+                       $sparql = $this->getCategoriesUpdate( $dbr, $urls, $movedPages, "Moves" );
+                       fwrite( $output, $sparql );
+               }
+       }
+
+       /**
+        * @param IDatabase $dbr
+        * @param resource $output
+        */
+       public function handleRestores( IDatabase $dbr, $output ) {
+               fwrite( $output, "# Restores\n" );
+               // Restores that were followed by another delete are not found here.
+               foreach ( $this->getRestoredCatsIterator( $dbr ) as $rows ) {
+                       $restored = [];
+                       foreach ( $rows as $cat ) {
+                               $pageId = $cat->rc_cur_id;
+                               // Pages captured by an earlier step are left alone.
+                               if ( !isset( $this->processed[$pageId] ) ) {
+                                       $this->writeCategoryData( $cat );
+                                       $restored[$pageId] = $cat->rc_title;
+                                       $this->processed[$pageId] = true;
+                               }
+                       }
+
+                       // A batch without any new page produces no statement.
+                       if ( $restored ) {
+                               $this->writeParentCategories( $dbr, $restored );
+                               fwrite( $output, sprintf( self::SPARQL_INSERT, $this->getRdf() ) );
+                       }
+               }
+       }
+
+       /**
+        * @param IDatabase $dbr
+        * @param resource $output
+        */
+       public function handleAdds( IDatabase $dbr, $output ) {
+               fwrite( $output, "# Additions\n" );
+               foreach ( $this->getNewCatsIterator( $dbr ) as $rows ) {
+                       $added = [];
+                       foreach ( $rows as $cat ) {
+                               $pageId = $cat->rc_cur_id;
+                               // Pages captured by an earlier step are left alone.
+                               if ( !isset( $this->processed[$pageId] ) ) {
+                                       $this->writeCategoryData( $cat );
+                                       $added[$pageId] = $cat->rc_title;
+                                       $this->processed[$pageId] = true;
+                               }
+                       }
+
+                       // A batch without any new page produces no statement.
+                       if ( $added ) {
+                               $this->writeParentCategories( $dbr, $added );
+                               fwrite( $output, sprintf( self::SPARQL_INSERT, $this->getRdf() ) );
+                       }
+               }
+       }
+
+       /**
+        * @param IDatabase $dbr
+        * @param resource $output
+        */
+       public function handleChanges( IDatabase $dbr, $output ) {
+               foreach ( $this->getChangedCatsIterator( $dbr ) as $rows ) {
+                       $urls = [];
+                       $changedPages = [];
+                       foreach ( $rows as $cat ) {
+                               $pageId = $cat->rc_cur_id;
+                               // Pages captured by an earlier step are left alone.
+                               if ( !isset( $this->processed[$pageId] ) ) {
+                                       $this->writeCategoryData( $cat );
+                                       $changedPages[$pageId] = $cat->rc_title;
+                                       $this->processed[$pageId] = true;
+                                       // The old data of the category is replaced as a whole.
+                                       $urls[] = '<' . $this->categoriesRdf->labelToUrl( $cat->rc_title ) . '>';
+                               }
+                       }
+
+                       $sparql = $this->getCategoriesUpdate( $dbr, $urls, $changedPages, "Changes" );
+                       fwrite( $output, $sparql );
+               }
+       }
+}
+
+// Name the class defined in this file as the maintenance script class.
+// NOTE(review): RUN_MAINTENANCE_IF_MAIN is defined outside this file - presumably it
+// runs $maintClass only when this file is the entry point; confirm in Maintenance.php.
+$maintClass = CategoryChangesAsRdf::class;
+require_once RUN_MAINTENANCE_IF_MAIN;
diff --git a/tests/phpunit/data/categoriesrdf/change.sparql b/tests/phpunit/data/categoriesrdf/change.sparql
new file mode 100644 (file)
index 0000000..d7ec83a
--- /dev/null
@@ -0,0 +1,16 @@
+# Changes
+DELETE {
+?category ?x ?y
+} INSERT {
+
+<http://acme.test/wiki/Category:Changed_category> a mediawiki:Category ;
+       rdfs:label "Changed category" ;
+       mediawiki:pages "7"^^xsd:integer ;
+       mediawiki:subcategories "2"^^xsd:integer ;
+       mediawiki:isInCategory <http://acme.test/wiki/Category:Parent_of_30> .
+
+} WHERE {
+   VALUES ?category {
+     <http://acme.test/wiki/Category:Changed_category>
+   }
+};
diff --git a/tests/phpunit/data/categoriesrdf/delete.sparql b/tests/phpunit/data/categoriesrdf/delete.sparql
new file mode 100644 (file)
index 0000000..7fb642d
--- /dev/null
@@ -0,0 +1,10 @@
+# Deletes
+DELETE {
+?category ?x ?y
+} INSERT {
+
+} WHERE {
+   VALUES ?category {
+     <http://acme.test/wiki/Category:Test> <http://acme.test/wiki/Category:Test_2>
+   }
+};
diff --git a/tests/phpunit/data/categoriesrdf/move.sparql b/tests/phpunit/data/categoriesrdf/move.sparql
new file mode 100644 (file)
index 0000000..c9f284e
--- /dev/null
@@ -0,0 +1,24 @@
+# Moves
+DELETE {
+?category ?x ?y
+} INSERT {
+
+<http://acme.test/wiki/Category:MovedTo> a mediawiki:Category ;
+       rdfs:label "MovedTo" ;
+       mediawiki:pages "7"^^xsd:integer ;
+       mediawiki:subcategories "2"^^xsd:integer .
+
+<http://acme.test/wiki/Category:AlsoMoved> a mediawiki:Category ;
+       rdfs:label "AlsoMoved" ;
+       mediawiki:pages "7"^^xsd:integer ;
+       mediawiki:subcategories "2"^^xsd:integer .
+
+<http://acme.test/wiki/Category:MovedTo> mediawiki:isInCategory <http://acme.test/wiki/Category:Parent_of_4> .
+
+<http://acme.test/wiki/Category:AlsoMoved> mediawiki:isInCategory <http://acme.test/wiki/Category:Parent_of_5> .
+
+} WHERE {
+   VALUES ?category {
+     <http://acme.test/wiki/Category:Test> <http://acme.test/wiki/Category:MovedTo> <http://acme.test/wiki/Category:Test_2> <http://acme.test/wiki/Category:Test_3> <http://acme.test/wiki/Category:Test_4>
+   }
+};
diff --git a/tests/phpunit/data/categoriesrdf/new.sparql b/tests/phpunit/data/categoriesrdf/new.sparql
new file mode 100644 (file)
index 0000000..f9a742d
--- /dev/null
@@ -0,0 +1,19 @@
+# Additions
+INSERT DATA {
+
+<http://acme.test/wiki/Category:New_category> a mediawiki:Category ;
+       rdfs:label "New category" ;
+       mediawiki:pages "7"^^xsd:integer ;
+       mediawiki:subcategories "2"^^xsd:integer .
+
+<http://acme.test/wiki/Category:%D0%9D%D0%BE%D0%B2%D0%B0%D1%8F_%D0%BA%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F_%F0%9F%98%83> a mediawiki:Category,
+               mediawiki:HiddenCategory ;
+       rdfs:label "Новая категория 😃" ;
+       mediawiki:pages "7"^^xsd:integer ;
+       mediawiki:subcategories "2"^^xsd:integer .
+
+<http://acme.test/wiki/Category:New_category> mediawiki:isInCategory <http://acme.test/wiki/Category:Parent_of_20> .
+
+<http://acme.test/wiki/Category:%D0%9D%D0%BE%D0%B2%D0%B0%D1%8F_%D0%BA%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F_%F0%9F%98%83> mediawiki:isInCategory <http://acme.test/wiki/Category:Parent_of_21> .
+
+};
diff --git a/tests/phpunit/data/categoriesrdf/restore.sparql b/tests/phpunit/data/categoriesrdf/restore.sparql
new file mode 100644 (file)
index 0000000..16c0561
--- /dev/null
@@ -0,0 +1,10 @@
+# Restores
+INSERT DATA {
+
+<http://acme.test/wiki/Category:Restored_cat> a mediawiki:Category ;
+       rdfs:label "Restored cat" ;
+       mediawiki:pages "7"^^xsd:integer ;
+       mediawiki:subcategories "2"^^xsd:integer ;
+       mediawiki:isInCategory <http://acme.test/wiki/Category:Parent_of_10> .
+
+};
diff --git a/tests/phpunit/data/categoriesrdf/updatets.txt b/tests/phpunit/data/categoriesrdf/updatets.txt
new file mode 100644 (file)
index 0000000..426bb92
--- /dev/null
@@ -0,0 +1,9 @@
+DELETE {
+  <http://acme.test/wiki/Special:CategoryDump> schema:dateModified ?o .
+}
+WHERE {
+  <http://acme.test/wiki/Special:CategoryDump> schema:dateModified ?o .
+};
+INSERT DATA {
+  <http://acme.test/wiki/Special:CategoryDump> schema:dateModified "2017-08-25T00:29:09Z"^^xsd:dateTime .
+}
diff --git a/tests/phpunit/maintenance/categoryChangesRdfTest.php b/tests/phpunit/maintenance/categoryChangesRdfTest.php
new file mode 100644 (file)
index 0000000..30a56f4
--- /dev/null
@@ -0,0 +1,263 @@
+<?php
+
+/**
+ * Tests for CategoryChangesAsRdf recent changes exporter.
+ *
+ * Each data-driven case mocks one of the script's iterator methods so that it
+ * yields a fixed batch of rows, runs the matching handler against an in-memory
+ * stream and compares the generated SPARQL with a fixture file.
+ *
+ * @covers CategoryChangesAsRdf
+ */
+class CategoryChangesRdfTest extends MediaWikiLangTestCase {
+
+       /**
+        * Pin the server and article path so that generated category URLs
+        * match the http://acme.test/wiki/... URLs used in the fixtures.
+        */
+       public function setUp() {
+               parent::setUp();
+               $this->setMwGlobals( [
+                       'wgServer' => 'http://acme.test',
+                       'wgCanonicalServer' => 'http://acme.test',
+                       'wgArticlePath' => '/wiki/$1',
+               ] );
+       }
+
+       /**
+        * Data sets for testSparqlUpdate().
+        *
+        * Every entry consists of:
+        *  - path of the file holding the expected SPARQL,
+        *  - name of the iterator method to mock,
+        *  - name of the handler method to invoke,
+        *  - rows the mocked iterator returns (as a single batch),
+        *  - optionally, the initial content of the script's "processed" map.
+        *
+        * A row carrying a '_processed' key is expected to leave that page ID
+        * in the "processed" map once the handler has run; the key is consumed
+        * by the test only.
+        *
+        * @return array[]
+        */
+       public function provideCategoryData() {
+               return [
+                       'delete category' => [
+                               __DIR__ . "/../data/categoriesrdf/delete.sparql",
+                               'getDeletedCatsIterator',
+                               'handleDeletes',
+                               [
+                                       (object)[ 'rc_title' => 'Test', 'rc_cur_id' => 1, '_processed' => 1 ],
+                                       (object)[ 'rc_title' => 'Test 2', 'rc_cur_id' => 2, '_processed' => 2 ],
+                               ],
+                       ],
+                       'move category' => [
+                               __DIR__ . "/../data/categoriesrdf/move.sparql",
+                               'getMovedCatsIterator',
+                               'handleMoves',
+                               [
+                                       (object)[
+                                               'rc_title' => 'Test',
+                                               'rc_cur_id' => 4,
+                                               'page_title' => 'MovedTo',
+                                               'page_namespace' => NS_CATEGORY,
+                                               '_processed' => 4,
+                                               'pp_propname' => null,
+                                               'cat_pages' => 10,
+                                               'cat_subcats' => 2,
+                                               'cat_files' => 1,
+                                       ],
+                                       (object)[
+                                               'rc_title' => 'MovedTo',
+                                               'rc_cur_id' => 4,
+                                               'page_title' => 'MovedAgain',
+                                               'page_namespace' => NS_CATEGORY,
+                                               'pp_propname' => 'hiddencat',
+                                               'cat_pages' => 10,
+                                               'cat_subcats' => 2,
+                                               'cat_files' => 1,
+                                       ],
+                                       (object)[
+                                               'rc_title' => 'Test 2',
+                                               'rc_cur_id' => 5,
+                                               'page_title' => 'AlsoMoved',
+                                               'page_namespace' => NS_CATEGORY,
+                                               '_processed' => 5,
+                                               'pp_propname' => null,
+                                               'cat_pages' => 10,
+                                               'cat_subcats' => 2,
+                                               'cat_files' => 1,
+                                       ],
+                                       (object)[
+                                               'rc_title' => 'Test 3',
+                                               'rc_cur_id' => 6,
+                                               'page_title' => 'MovedOut',
+                                               'page_namespace' => NS_MAIN,
+                                               'pp_propname' => null,
+                                               'cat_pages' => 10,
+                                               'cat_subcats' => 2,
+                                               'cat_files' => 1,
+                                       ],
+                                       (object)[
+                                               'rc_title' => 'Test 4',
+                                               'rc_cur_id' => 7,
+                                               'page_title' => 'Already Done',
+                                               'page_namespace' => NS_CATEGORY,
+                                               'pp_propname' => null,
+                                               'cat_pages' => 10,
+                                               'cat_subcats' => 2,
+                                               'cat_files' => 1,
+                                       ],
+                               ],
+                               [ 7 => true ],
+                       ],
+                       'restore deleted category' => [
+                               __DIR__ . "/../data/categoriesrdf/restore.sparql",
+                               'getRestoredCatsIterator',
+                               'handleRestores',
+                               [
+                                       (object)[
+                                               'rc_title' => 'Restored cat',
+                                               'rc_cur_id' => 10,
+                                               '_processed' => 10,
+                                               'pp_propname' => null,
+                                               'cat_pages' => 10,
+                                               'cat_subcats' => 2,
+                                               'cat_files' => 1,
+                                       ],
+                                       (object)[
+                                               'rc_title' => 'Restored again',
+                                               'rc_cur_id' => 10,
+                                               'pp_propname' => null,
+                                               'cat_pages' => 10,
+                                               'cat_subcats' => 2,
+                                               'cat_files' => 1,
+                                       ],
+                                       (object)[
+                                               'rc_title' => 'Already seen',
+                                               'rc_cur_id' => 11,
+                                               'pp_propname' => null,
+                                               'cat_pages' => 10,
+                                               'cat_subcats' => 2,
+                                               'cat_files' => 1,
+                                       ],
+                               ],
+                               [ 11 => true ],
+                       ],
+                       'new page' => [
+                               __DIR__ . "/../data/categoriesrdf/new.sparql",
+                               'getNewCatsIterator',
+                               'handleAdds',
+                               [
+                                       (object)[
+                                               'rc_title' => 'New category',
+                                               'rc_cur_id' => 20,
+                                               '_processed' => 20,
+                                               'pp_propname' => null,
+                                               'cat_pages' => 10,
+                                               'cat_subcats' => 2,
+                                               'cat_files' => 1,
+                                       ],
+                                       (object)[
+                                               'rc_title' => 'Новая категория 😃',
+                                               'rc_cur_id' => 21,
+                                               '_processed' => 21,
+                                               'pp_propname' => 'hiddencat',
+                                               'cat_pages' => 10,
+                                               'cat_subcats' => 2,
+                                               'cat_files' => 1,
+                                       ],
+                                       (object)[
+                                               'rc_title' => 'Processed already',
+                                               'rc_cur_id' => 22,
+                                       ],
+                               ],
+                               [ 22 => true ],
+                       ],
+                       'change in categories' => [
+                               __DIR__ . "/../data/categoriesrdf/change.sparql",
+                               'getChangedCatsIterator',
+                               'handleChanges',
+                               [
+                                       (object)[
+                                               'rc_title' => 'Changed category',
+                                               'rc_cur_id' => 30,
+                                               '_processed' => 30,
+                                               'pp_propname' => null,
+                                               'cat_pages' => 10,
+                                               'cat_subcats' => 2,
+                                               'cat_files' => 1,
+                                       ],
+                                       (object)[
+                                               'rc_title' => 'Changed again',
+                                               'rc_cur_id' => 30,
+                                               'pp_propname' => null,
+                                               'cat_pages' => 10,
+                                               'cat_subcats' => 2,
+                                               'cat_files' => 1,
+                                       ],
+                                       (object)[
+                                               'rc_title' => 'Processed already',
+                                               'rc_cur_id' => 31,
+                                               'pp_propname' => null,
+                                               'cat_pages' => 10,
+                                               'cat_subcats' => 2,
+                                               'cat_files' => 1,
+                                       ],
+                               ],
+                               [ 31 => true ],
+                       ],
+
+               ];
+       }
+
+       /**
+        * Mock category links iterator.
+        *
+        * Gives every requested page exactly one parent category named
+        * "Parent of <page id>".
+        *
+        * @param mixed $dbr Database argument of the mocked method; unused here
+        * @param array $ids Page IDs to produce category links for
+        * @return stdClass[] One row per ID, with cl_from and cl_to fields
+        */
+       public function getCategoryLinksIterator( $dbr, array $ids ) {
+               $res = [];
+               foreach ( $ids as $pageid ) {
+                       $res[] = (object)[ 'cl_from' => $pageid, 'cl_to' => "Parent of $pageid" ];
+               }
+               return $res;
+       }
+
+       /**
+        * Run one handler against mocked iterator data and verify both the
+        * produced SPARQL and the resulting set of processed page IDs.
+        *
+        * @dataProvider provideCategoryData
+        * @param string $testFileName Path of the file with the expected SPARQL output
+        * @param string $iterator Iterator method name to mock
+        * @param string $handler Handler method to call
+        * @param array $result Result to be returned from mock iterator
+        * @param array $preProcessed Map of page ID => true for items that are
+        *  considered processed before the handler runs
+        */
+       public function testSparqlUpdate( $testFileName, $iterator, $handler, $result,
+                       array $preProcessed = [] ) {
+               $dumpScript =
+                       $this->getMockBuilder( CategoryChangesAsRdf::class )
+                               ->setMethods( [ $iterator, 'getCategoryLinksIterator' ] )
+                               ->getMock();
+
+               $dumpScript->expects( $this->any() )
+                       ->method( 'getCategoryLinksIterator' )
+                       ->willReturnCallback( [ $this, 'getCategoryLinksIterator' ] );
+
+               // The mocked iterator hands out all rows as one single batch.
+               $dumpScript->expects( $this->once() )
+                       ->method( $iterator )
+                       ->willReturn( [ $result ] );
+
+               // Seed the non-public "processed" map with the pre-processed IDs.
+               $ref = new ReflectionObject( $dumpScript );
+               $processedProperty = $ref->getProperty( 'processed' );
+               $processedProperty->setAccessible( true );
+               $processedProperty->setValue( $dumpScript, $preProcessed );
+
+               $output = fopen( "php://memory", "w+b" );
+               $dbr = wfGetDB( DB_REPLICA );
+               /** @var CategoryChangesAsRdf $dumpScript */
+               $dumpScript->initialize();
+               $dumpScript->getRdf();
+               $dumpScript->$handler( $dbr, $output );
+
+               rewind( $output );
+               $sparql = stream_get_contents( $output );
+               // Everything has been read back; release the stream before asserting.
+               fclose( $output );
+               $this->assertFileContains( $testFileName, $sparql );
+
+               $processed = $processedProperty->getValue( $dumpScript );
+               // $preProcessed is a map of ID => true, so the expected IDs are its
+               // keys. Copying the map itself would put boolean values into the
+               // list, which only compare equal to IDs through loose equality.
+               $expectedProcessed = array_keys( $preProcessed );
+               foreach ( $result as $row ) {
+                       if ( isset( $row->_processed ) ) {
+                               $this->assertArrayHasKey( $row->_processed, $processed,
+                                       "ID {$row->_processed} was not processed!" );
+                               $expectedProcessed[] = $row->_processed;
+                       }
+               }
+               // assertArrayEquals() takes ( $expected, $actual, $ordered, $named )
+               // and forwards any further arguments to assertEquals(), so the
+               // failure message has to go in fifth position.
+               $this->assertArrayEquals( $expectedProcessed, array_keys( $processed ),
+                       false, false, 'Processed array has wrong items' );
+       }
+
+       /**
+        * The timestamp update statement for 1503620949
+        * (2017-08-25T00:29:09Z) must match the stored fixture.
+        */
+       public function testUpdateTs() {
+               $dumpScript = new CategoryChangesAsRdf();
+               $dumpScript->initialize();
+               $update = $dumpScript->updateTS( 1503620949 );
+               $outFile = __DIR__ . '/../data/categoriesrdf/updatets.txt';
+               $this->assertFileContains( $outFile, $update );
+       }
+
+}