Produce RDF dump of all categories and subcategories in a wiki.
author Stanislav Malyshev <smalyshev@gmail.com>
Fri, 16 Dec 2016 22:48:46 +0000 (14:48 -0800)
committer Stanislav Malyshev <smalyshev@gmail.com>
Mon, 28 Aug 2017 07:30:35 +0000 (00:30 -0700)
Example:
<http://en.wiki.local.wmftest.net:8080/wiki/Category:Ducks> a mediawiki:Category ;
        rdfs:label "Ducks" ;
        mediawiki:isInCategory <http://en.wiki.local.wmftest.net:8080/wiki/Category:Birds> .

Bug: T157676
Change-Id: I59b9603581b37af59d17dd6c38247c85aee44911

autoload.php
composer.json
docs/ontology.owl [new file with mode: 0644]
includes/CategoriesRdf.php [new file with mode: 0644]
maintenance/dumpCategoriesAsRdf.php [new file with mode: 0644]
tests/phpunit/MediaWikiTestCase.php
tests/phpunit/data/categoriesrdf/categoriesRdf-out.nt [new file with mode: 0644]
tests/phpunit/maintenance/categoriesRdfTest.php [new file with mode: 0644]

index 2bf1d4c..5fd415b 100644 (file)
@@ -219,6 +219,7 @@ $wgAutoloadLocalClasses = [
        'CachedBagOStuff' => __DIR__ . '/includes/libs/objectcache/CachedBagOStuff.php',
        'CachingSiteStore' => __DIR__ . '/includes/site/CachingSiteStore.php',
        'CapsCleanup' => __DIR__ . '/maintenance/cleanupCaps.php',
        'CachedBagOStuff' => __DIR__ . '/includes/libs/objectcache/CachedBagOStuff.php',
        'CachingSiteStore' => __DIR__ . '/includes/site/CachingSiteStore.php',
        'CapsCleanup' => __DIR__ . '/maintenance/cleanupCaps.php',
+       'CategoriesRdf' => __DIR__ . '/includes/CategoriesRdf.php',
        'Category' => __DIR__ . '/includes/Category.php',
        'CategoryFinder' => __DIR__ . '/includes/CategoryFinder.php',
        'CategoryMembershipChange' => __DIR__ . '/includes/changes/CategoryMembershipChange.php',
        'Category' => __DIR__ . '/includes/Category.php',
        'CategoryFinder' => __DIR__ . '/includes/CategoryFinder.php',
        'CategoryMembershipChange' => __DIR__ . '/includes/changes/CategoryMembershipChange.php',
@@ -401,6 +402,7 @@ $wgAutoloadLocalClasses = [
        'Dump7ZipOutput' => __DIR__ . '/includes/export/Dump7ZipOutput.php',
        'DumpBZip2Output' => __DIR__ . '/includes/export/DumpBZip2Output.php',
        'DumpBackup' => __DIR__ . '/maintenance/dumpBackup.php',
        'Dump7ZipOutput' => __DIR__ . '/includes/export/Dump7ZipOutput.php',
        'DumpBZip2Output' => __DIR__ . '/includes/export/DumpBZip2Output.php',
        'DumpBackup' => __DIR__ . '/maintenance/dumpBackup.php',
+       'DumpCategoriesAsRdf' => __DIR__ . '/maintenance/dumpCategoriesAsRdf.php',
        'DumpDBZip2Output' => __DIR__ . '/includes/export/DumpDBZip2Output.php',
        'DumpFileOutput' => __DIR__ . '/includes/export/DumpFileOutput.php',
        'DumpFilter' => __DIR__ . '/includes/export/DumpFilter.php',
        'DumpDBZip2Output' => __DIR__ . '/includes/export/DumpDBZip2Output.php',
        'DumpFileOutput' => __DIR__ . '/includes/export/DumpFileOutput.php',
        'DumpFilter' => __DIR__ . '/includes/export/DumpFilter.php',
index aefc158..5ee1940 100644 (file)
@@ -37,6 +37,7 @@
                "wikimedia/html-formatter": "1.0.1",
                "wikimedia/ip-set": "1.1.0",
                "wikimedia/php-session-serializer": "1.0.4",
                "wikimedia/html-formatter": "1.0.1",
                "wikimedia/ip-set": "1.1.0",
                "wikimedia/php-session-serializer": "1.0.4",
+               "wikimedia/purtle": "1.0.6",
                "wikimedia/relpath": "2.0.0",
                "wikimedia/remex-html": "1.0.1",
                "wikimedia/running-stat": "1.1.0",
                "wikimedia/relpath": "2.0.0",
                "wikimedia/remex-html": "1.0.1",
                "wikimedia/running-stat": "1.1.0",
diff --git a/docs/ontology.owl b/docs/ontology.owl
new file mode 100644 (file)
index 0000000..6b2e0b7
--- /dev/null
@@ -0,0 +1,56 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!DOCTYPE rdf:RDF [
+  <!ENTITY xsd "http://www.w3.org/2001/XMLSchema#">
+  <!ENTITY rdf "http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+  <!ENTITY rdfs "http://www.w3.org/2000/01/rdf-schema#">
+  <!ENTITY owl "http://www.w3.org/2002/07/owl#">
+  <!ENTITY mediawiki "https://www.mediawiki.org/ontology#">
+]>
+
+<rdf:RDF
+  xmlns:xsd="&xsd;"
+  xmlns:rdf="&rdf;"
+  xmlns:rdfs="&rdfs;"
+  xmlns:owl="&owl;"
+>
+
+  <owl:Ontology rdf:about="&mediawiki;">
+    <rdfs:label>MediaWiki ontology</rdfs:label>
+    <rdfs:comment>The ontology of MediaWiki</rdfs:comment>
+  </owl:Ontology>
+
+  <!--
+  ///////////////////////////////////////////////////////////////////////////////////////
+  //
+  // Classes
+  //
+  ///////////////////////////////////////////////////////////////////////////////////////
+  -->
+
+  <owl:Class rdf:about="&mediawiki;Dump">
+    <rdfs:label>Dump</rdfs:label>
+    <rdfs:comment>A dump of MediaWiki content.</rdfs:comment>
+  </owl:Class>
+
+  <owl:Class rdf:about="&mediawiki;Category">
+    <rdfs:label>Category</rdfs:label>
+    <rdfs:comment>MediaWiki category.</rdfs:comment>
+  </owl:Class>
+
+  <!--
+  ///////////////////////////////////////////////////////////////////////////////////////
+  //
+  // Properties
+  //
+  ///////////////////////////////////////////////////////////////////////////////////////
+  -->
+
+  <owl:ObjectProperty rdf:about="&mediawiki;isInCategory">
+      <rdfs:label>isInCategory</rdfs:label>
+      <rdfs:comment>The subject category is a direct subcategory of the object category (the object is the parent).</rdfs:comment>
+      <rdfs:range rdf:resource="&mediawiki;Category"/>
+      <rdfs:domain rdf:resource="&mediawiki;Category"/>
+  </owl:ObjectProperty>
+
+</rdf:RDF>
diff --git a/includes/CategoriesRdf.php b/includes/CategoriesRdf.php
new file mode 100644 (file)
index 0000000..e19dc2a
--- /dev/null
@@ -0,0 +1,95 @@
+<?php
+/**
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+use Wikimedia\Purtle\RdfWriter;
+
+/**
+ * Emits RDF statements describing wiki categories and the links between
+ * them, using the RdfWriter supplied at construction time.
+ */
+class CategoriesRdf {
+       /**
+        * Prefix under which the MediaWiki ontology is registered in the dump.
+        */
+       const ONTOLOGY_PREFIX = 'mediawiki';
+       /**
+        * Base URL of the MediaWiki ontology terms.
+        */
+       const ONTOLOGY_URL = 'https://www.mediawiki.org/ontology#';
+       /**
+        * URL of the OWL document describing the ontology.
+        */
+       const OWL_URL = 'https://www.mediawiki.org/ontology/ontology.owl';
+       /**
+        * Version identifier of the dump format.
+        */
+       const FORMAT_VERSION = "1.0";
+       /**
+        * Writer that receives every statement produced by this helper.
+        * @var RdfWriter
+        */
+       private $rdfWriter;
+
+       /**
+        * @param RdfWriter $writer Writer to send the category statements to
+        */
+       public function __construct( RdfWriter $writer ) {
+               $this->rdfWriter = $writer;
+       }
+
+       /**
+        * Register on the writer every namespace prefix the dump relies on.
+        */
+       public function setupPrefixes() {
+               // Registration order is kept: ontology first, then the shared vocabularies.
+               $namespaces = [
+                       self::ONTOLOGY_PREFIX => self::ONTOLOGY_URL,
+                       'rdfs' => 'http://www.w3.org/2000/01/rdf-schema#',
+                       'owl' => 'http://www.w3.org/2002/07/owl#',
+                       'schema' => 'http://schema.org/',
+                       'cc' => 'http://creativecommons.org/ns#',
+               ];
+               foreach ( $namespaces as $prefix => $iri ) {
+                       $this->rdfWriter->prefix( $prefix, $iri );
+               }
+       }
+
+       /**
+        * Emit the statement saying that one category is in another.
+        * @param string $fromName Child category name
+        * @param string $toName Parent category name
+        */
+       public function writeCategoryLinkData( $fromName, $toName ) {
+               $childUrl = $this->titleToUrl( Title::makeTitle( NS_CATEGORY, $fromName ) );
+               $parentUrl = $this->titleToUrl( Title::makeTitle( NS_CATEGORY, $toName ) );
+               $this->rdfWriter->about( $childUrl )
+                       ->say( self::ONTOLOGY_PREFIX, 'isInCategory' )
+                       ->is( $parentUrl );
+       }
+
+       /**
+        * Emit the type and label statements for one category.
+        * @param string $categoryName Category name
+        */
+       public function writeCategoryData( $categoryName ) {
+               $category = Title::makeTitle( NS_CATEGORY, $categoryName );
+               $categoryUrl = $this->titleToUrl( $category );
+               $label = $category->getText();
+
+               $this->rdfWriter->about( $categoryUrl )
+                       ->say( 'a' )
+                       ->is( self::ONTOLOGY_PREFIX, 'Category' );
+               // The label is attached to the subject opened by about() above.
+               $this->rdfWriter->say( 'rdfs', 'label' )->value( $label );
+       }
+
+       /**
+        * Build the full URL of the page a Title points to, requesting the
+        * canonical protocol.
+        * @param Title $title
+        * @return string
+        */
+       private function titleToUrl( Title $title ) {
+               $fullUrl = $title->getFullURL( '', false, PROTO_CANONICAL );
+               return $fullUrl;
+       }
+}
diff --git a/maintenance/dumpCategoriesAsRdf.php b/maintenance/dumpCategoriesAsRdf.php
new file mode 100644 (file)
index 0000000..ff50498
--- /dev/null
@@ -0,0 +1,158 @@
+<?php
+/**
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+use Wikimedia\Purtle\RdfWriter;
+use Wikimedia\Purtle\RdfWriterFactory;
+use Wikimedia\Rdbms\IDatabase;
+
+require_once __DIR__ . '/Maintenance.php';
+
+/**
+ * Maintenance script to provide RDF representation of the category tree.
+ *
+ * @ingroup Maintenance
+ * @since 1.30
+ */
+class DumpCategoriesAsRdf extends Maintenance {
+       /**
+        * Writer the dump is produced with; created in execute().
+        * @var RdfWriter
+        */
+       private $rdfWriter;
+       /**
+        * Categories RDF helper.
+        * @var CategoriesRdf
+        */
+       private $categoriesRdf;
+
+       /**
+        * Declare the script description, the batch size and the
+        * 'output' and 'format' command line options.
+        */
+       public function __construct() {
+               parent::__construct();
+
+               $this->addDescription( "Generate RDF dump of categories in a wiki." );
+
+               $this->setBatchSize( 200 );
+               $this->addOption( 'output', "Output file (default is stdout). Will be overwritten.",
+                       false, true );
+               $this->addOption( 'format', "Set the dump format.", false, true );
+       }
+
+       /**
+        * Produce row iterator for categories.
+        *
+        * Each iteration step yields one batch of rows from the page table,
+        * limited to the category namespace; rows carry page_title and page_id.
+        * @param IDatabase $dbr Database connection
+        * @return RecursiveIterator
+        */
+       public function getCategoryIterator( IDatabase $dbr ) {
+               $it = new BatchRowIterator(
+                       $dbr,
+                       'page',
+                       [ 'page_title' ],
+                       $this->mBatchSize
+               );
+               $it->addConditions( [
+                       'page_namespace' => NS_CATEGORY,
+               ] );
+               $it->setFetchColumns( [ 'page_title', 'page_id' ] );
+               return $it;
+       }
+
+       /**
+        * Get iterator for links for categories.
+        *
+        * Only 'subcat' links originating from the given pages are returned.
+        * The batches are wrapped in a RecursiveIteratorIterator so that
+        * callers can loop over the rows (cl_from, cl_to) directly.
+        * @param IDatabase $dbr
+        * @param array $ids List of page IDs
+        * @return Traversable
+        */
+       public function getCategoryLinksIterator( IDatabase $dbr, array $ids ) {
+               $it = new BatchRowIterator(
+                       $dbr,
+                       'categorylinks',
+                       [ 'cl_from', 'cl_to' ],
+                       $this->mBatchSize
+               );
+               $it->addConditions( [
+                       'cl_type' => 'subcat',
+                       'cl_from' => $ids
+               ] );
+               $it->setFetchColumns( [ 'cl_from', 'cl_to' ] );
+               return new RecursiveIteratorIterator( $it );
+       }
+
+       /**
+        * Write the statements describing the dump itself: its types, license,
+        * format version, modification date, owning site and ontology import.
+        * @param int|string $timestamp Time recorded as schema:dateModified;
+        *  execute() passes the current Unix time.
+        */
+       public function addDumpHeader( $timestamp ) {
+               global $wgRightsUrl;
+               $licenseUrl = $wgRightsUrl;
+               // A protocol-relative license URL is turned into an https one.
+               if ( substr( $licenseUrl, 0, 2 ) === '//' ) {
+                       $licenseUrl = 'https:' . $licenseUrl;
+               }
+               $this->rdfWriter->about( wfExpandUrl( '/categoriesDump', PROTO_CANONICAL ) )
+                       ->a( 'schema', 'Dataset' )
+                       ->a( 'owl', 'Ontology' )
+                       ->say( 'cc', 'license' )->is( $licenseUrl )
+                       ->say( 'schema', 'softwareVersion' )->value( CategoriesRdf::FORMAT_VERSION )
+                       ->say( 'schema', 'dateModified' )
+                               ->value( wfTimestamp( TS_ISO_8601, $timestamp ), 'xsd', 'dateTime' )
+                       ->say( 'schema', 'isPartOf' )->is( wfExpandUrl( '/', PROTO_CANONICAL ) )
+                       ->say( 'owl', 'imports' )->is( CategoriesRdf::OWL_URL );
+       }
+
+       /**
+        * Write the dump header, then the data and parent links of every
+        * category, one batch at a time, to the selected output.
+        * @throws RuntimeException If the output cannot be opened for writing
+        */
+       public function execute() {
+               $stdout = 'php://stdout';
+               $outFile = $this->getOption( 'output', $stdout );
+
+               // '-' is accepted as an alias for standard output.
+               if ( $outFile === '-' ) {
+                       $outFile = $stdout;
+               }
+
+               $output = fopen( $outFile, 'w' );
+               if ( $output === false ) {
+                       // Fail early instead of passing a non-resource to fwrite()/fflush() below.
+                       throw new RuntimeException( "Could not open '$outFile' for writing" );
+               }
+               $this->rdfWriter = $this->createRdfWriter( $this->getOption( 'format', 'ttl' ) );
+               $this->categoriesRdf = new CategoriesRdf( $this->rdfWriter );
+
+               $this->categoriesRdf->setupPrefixes();
+               $this->rdfWriter->start();
+
+               $this->addDumpHeader( time() );
+               fwrite( $output, $this->rdfWriter->drain() );
+
+               $dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] );
+
+               foreach ( $this->getCategoryIterator( $dbr ) as $batch ) {
+                       // Maps page_id => page_title for the categories of this batch,
+                       // so link rows (keyed by cl_from page ID) can be resolved to names.
+                       $pages = [];
+                       foreach ( $batch as $row ) {
+                               $this->categoriesRdf->writeCategoryData( $row->page_title );
+                               $pages[$row->page_id] = $row->page_title;
+                       }
+
+                       foreach ( $this->getCategoryLinksIterator( $dbr, array_keys( $pages ) ) as $row ) {
+                               $this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to );
+                       }
+                       // Write out each batch as soon as it is complete.
+                       fwrite( $output, $this->rdfWriter->drain() );
+               }
+               fflush( $output );
+               // $outFile was normalised above, so compare against the stream name
+               // rather than '-' to leave standard output open.
+               if ( $outFile !== $stdout ) {
+                       fclose( $output );
+               }
+       }
+
+       /**
+        * Create the writer for the requested output format.
+        * @param string $format Writer format
+        * @return RdfWriter
+        */
+       private function createRdfWriter( $format ) {
+               $factory = new RdfWriterFactory();
+               return $factory->getWriter( $factory->getFormatName( $format ) );
+       }
+}
+
+// Name the class to run before including the runner constant below.
+// NOTE(review): RUN_MAINTENANCE_IF_MAIN is not defined in this file - presumably by Maintenance.php; confirm.
+$maintClass = "DumpCategoriesAsRdf";
+require_once RUN_MAINTENANCE_IF_MAIN;
index 215d292..156de4c 100644 (file)
@@ -1846,4 +1846,29 @@ abstract class MediaWikiTestCase extends PHPUnit_Framework_TestCase {
                $this->mergeMwGlobalArrayValue( 'wgHooks', [ $hookName => [ $handler ] ] );
        }
 
                $this->mergeMwGlobalArrayValue( 'wgHooks', [ $hookName => [ $handler ] ] );
        }
 
+       /**
+        * Check whether file contains given data.
+        * @param string $fileName Path of the file holding the expected data
+        * @param string $actualData Data compared against the contents of the file
+        * @param bool $createIfMissing If true, and file does not exist, create it with given data
+        *                              and skip the test.
+        * @param string $msg Message passed to the equality assertion
+        * @since 1.30
+        */
+       protected function assertFileContains(
+               $fileName,
+               $actualData,
+               $createIfMissing = true,
+               $msg = ''
+       ) {
+               if ( $createIfMissing ) {
+                       if ( !file_exists( $fileName ) ) {
+                               file_put_contents( $fileName, $actualData );
+                               // Double quotes: the file name must be interpolated into the message.
+                               $this->markTestSkipped( "Data file $fileName does not exist" );
+                       }
+               } else {
+                       self::assertFileExists( $fileName );
+               }
+               self::assertEquals( file_get_contents( $fileName ), $actualData, $msg );
+       }
 }
 }
diff --git a/tests/phpunit/data/categoriesrdf/categoriesRdf-out.nt b/tests/phpunit/data/categoriesrdf/categoriesRdf-out.nt
new file mode 100644 (file)
index 0000000..d2d7ea8
--- /dev/null
@@ -0,0 +1,16 @@
+<http://acme.test/categoriesDump> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Dataset> .
+<http://acme.test/categoriesDump> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2002/07/owl#Ontology> .
+<http://acme.test/categoriesDump> <http://creativecommons.org/ns#license> <https://creativecommons.org/licenses/by-sa/3.0/> .
+<http://acme.test/categoriesDump> <http://schema.org/softwareVersion> "1.0" .
+<http://acme.test/categoriesDump> <http://schema.org/dateModified> "{DATE}"^^<http://www.w3.org/2001/XMLSchema#dateTime> .
+<http://acme.test/categoriesDump> <http://schema.org/isPartOf> <http://acme.test/> .
+<http://acme.test/categoriesDump> <http://www.w3.org/2002/07/owl#imports> <https://www.mediawiki.org/ontology/ontology.owl> .
+<http://acme.test/wiki/Category:Category_One> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://www.mediawiki.org/ontology#Category> .
+<http://acme.test/wiki/Category:Category_One> <http://www.w3.org/2000/01/rdf-schema#label> "Category One" .
+<http://acme.test/wiki/Category:2_Category_Two> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://www.mediawiki.org/ontology#Category> .
+<http://acme.test/wiki/Category:2_Category_Two> <http://www.w3.org/2000/01/rdf-schema#label> "2 Category Two" .
+<http://acme.test/wiki/Category:Category_One> <https://www.mediawiki.org/ontology#isInCategory> <http://acme.test/wiki/Category:Parent_of_1> .
+<http://acme.test/wiki/Category:2_Category_Two> <https://www.mediawiki.org/ontology#isInCategory> <http://acme.test/wiki/Category:Parent_of_2> .
+<http://acme.test/wiki/Category:%D0%A2%D1%80%D0%B5%D1%82%D1%8C%D1%8F_%D0%BA%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <https://www.mediawiki.org/ontology#Category> .
+<http://acme.test/wiki/Category:%D0%A2%D1%80%D0%B5%D1%82%D1%8C%D1%8F_%D0%BA%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F> <http://www.w3.org/2000/01/rdf-schema#label> "\u0422\u0440\u0435\u0442\u044C\u044F \u043A\u0430\u0442\u0435\u0433\u043E\u0440\u0438\u044F" .
+<http://acme.test/wiki/Category:%D0%A2%D1%80%D0%B5%D1%82%D1%8C%D1%8F_%D0%BA%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F> <https://www.mediawiki.org/ontology#isInCategory> <http://acme.test/wiki/Category:Parent_of_3> .
diff --git a/tests/phpunit/maintenance/categoriesRdfTest.php b/tests/phpunit/maintenance/categoriesRdfTest.php
new file mode 100644 (file)
index 0000000..ec2746e
--- /dev/null
@@ -0,0 +1,72 @@
+<?php
+
+/**
+ * Runs DumpCategoriesAsRdf with its database iterators replaced by fixed
+ * data and compares the N-Triples output with a stored data file.
+ */
+class CategoriesRdfTest extends MediaWikiLangTestCase {
+       /**
+        * Stand-in for DumpCategoriesAsRdf::getCategoryIterator().
+        * @return array[] Batches of category rows (page_title, page_id)
+        */
+       public function getCategoryIterator() {
+               return [
+                       // batch 1
+                       [
+                               (object)[ 'page_title' => 'Category One', 'page_id' => 1 ],
+                               (object)[ 'page_title' => '2 Category Two', 'page_id' => 2 ],
+                       ],
+                       // batch 2
+                       [
+                               (object)[ 'page_title' => 'Третья категория', 'page_id' => 3 ],
+                       ]
+               ];
+       }
+
+       /**
+        * Stand-in for DumpCategoriesAsRdf::getCategoryLinksIterator():
+        * gives every page ID exactly one parent named "Parent of <id>".
+        * @param mixed $dbr Unused
+        * @param array $ids List of page IDs
+        * @return object[] Link rows (cl_from, cl_to)
+        */
+       public function getCategoryLinksIterator( $dbr, array $ids ) {
+               $res = [];
+               foreach ( $ids as $pageid ) {
+                       $res[] = (object)[ 'cl_from' => $pageid, 'cl_to' => "Parent of $pageid" ];
+               }
+               return $res;
+       }
+
+       public function testCategoriesDump() {
+               $this->setMwGlobals( [
+                       'wgServer' => 'http://acme.test',
+                       'wgCanonicalServer' => 'http://acme.test',
+                       'wgArticlePath' => '/wiki/$1',
+                       'wgRightsUrl' => '//creativecommons.org/licenses/by-sa/3.0/',
+               ] );
+
+               $dumpScript =
+                       $this->getMockBuilder( DumpCategoriesAsRdf::class )
+                               ->setMethods( [ 'getCategoryIterator', 'getCategoryLinksIterator' ] )
+                               ->getMock();
+
+               $dumpScript->expects( $this->once() )
+                       ->method( 'getCategoryIterator' )
+                       ->willReturn( $this->getCategoryIterator() );
+
+               $dumpScript->expects( $this->any() )
+                       ->method( 'getCategoryLinksIterator' )
+                       ->willReturnCallback( [ $this, 'getCategoryLinksIterator' ] );
+
+               /** @var DumpCategoriesAsRdf  $dumpScript */
+               $logFileName = tempnam( sys_get_temp_dir(), "Categories-DumpRdfTest" );
+               $outFileName = tempnam( sys_get_temp_dir(), "Categories-DumpRdfTest" );
+
+               try {
+                       $dumpScript->loadParamsAndArgs(
+                               null,
+                               [
+                                       'log' => $logFileName,
+                                       'output' => $outFileName,
+                                       'format' => 'nt',
+                               ]
+                       );
+
+                       $dumpScript->execute();
+                       $actualOut = file_get_contents( $outFileName );
+               } finally {
+                       // Remove the temporary files even when the dump fails, so
+                       // repeated runs do not pile up files in the temp directory.
+                       foreach ( [ $logFileName, $outFileName ] as $tmpFile ) {
+                               if ( is_string( $tmpFile ) && file_exists( $tmpFile ) ) {
+                                       unlink( $tmpFile );
+                               }
+                       }
+               }
+
+               // The modification date changes on every run; replace it with a
+               // fixed placeholder before comparing.
+               $actualOut = preg_replace(
+                       '|<http://acme.test/categoriesDump> <http://schema.org/dateModified> "[^"]+?"|',
+                       '<http://acme.test/categoriesDump> <http://schema.org/dateModified> "{DATE}"',
+                       $actualOut
+               );
+
+               $outFile = __DIR__ . '/../data/categoriesrdf/categoriesRdf-out.nt';
+               $this->assertFileContains( $outFile, $actualOut );
+       }
+
+}