From d9bb673b72f4c5ff1b2ef982e05c1e4da81feb26 Mon Sep 17 00:00:00 2001 From: Stanislav Malyshev Date: Fri, 16 Dec 2016 14:48:46 -0800 Subject: [PATCH] Produce RDF dump of all categories and subcategories in a wiki. Example: <http://en.wiki.local.wmftest.net:8080/wiki/Category:Ducks> a mediawiki:Category ; rdfs:label "Ducks" ; mediawiki:isInCategory . Bug: T157676 Change-Id: I59b9603581b37af59d17dd6c38247c85aee44911 --- autoload.php | 2 + composer.json | 1 + docs/ontology.owl | 56 +++++++ includes/CategoriesRdf.php | 95 +++++++++++ maintenance/dumpCategoriesAsRdf.php | 158 ++++++++++++++++++ tests/phpunit/MediaWikiTestCase.php | 25 +++ .../data/categoriesrdf/categoriesRdf-out.nt | 16 ++ .../phpunit/maintenance/categoriesRdfTest.php | 72 ++++++++ 8 files changed, 425 insertions(+) create mode 100644 docs/ontology.owl create mode 100644 includes/CategoriesRdf.php create mode 100644 maintenance/dumpCategoriesAsRdf.php create mode 100644 tests/phpunit/data/categoriesrdf/categoriesRdf-out.nt create mode 100644 tests/phpunit/maintenance/categoriesRdfTest.php diff --git a/autoload.php b/autoload.php index 2bf1d4cc5d..5fd415b554 100644 --- a/autoload.php +++ b/autoload.php @@ -219,6 +219,7 @@ $wgAutoloadLocalClasses = [ 'CachedBagOStuff' => __DIR__ . '/includes/libs/objectcache/CachedBagOStuff.php', 'CachingSiteStore' => __DIR__ . '/includes/site/CachingSiteStore.php', 'CapsCleanup' => __DIR__ . '/maintenance/cleanupCaps.php', + 'CategoriesRdf' => __DIR__ . '/includes/CategoriesRdf.php', 'Category' => __DIR__ . '/includes/Category.php', 'CategoryFinder' => __DIR__ . '/includes/CategoryFinder.php', 'CategoryMembershipChange' => __DIR__ . '/includes/changes/CategoryMembershipChange.php', @@ -401,6 +402,7 @@ $wgAutoloadLocalClasses = [ 'Dump7ZipOutput' => __DIR__ . '/includes/export/Dump7ZipOutput.php', 'DumpBZip2Output' => __DIR__ . '/includes/export/DumpBZip2Output.php', 'DumpBackup' => __DIR__ . '/maintenance/dumpBackup.php', + 'DumpCategoriesAsRdf' => __DIR__ .
'/maintenance/dumpCategoriesAsRdf.php', 'DumpDBZip2Output' => __DIR__ . '/includes/export/DumpDBZip2Output.php', 'DumpFileOutput' => __DIR__ . '/includes/export/DumpFileOutput.php', 'DumpFilter' => __DIR__ . '/includes/export/DumpFilter.php', diff --git a/composer.json b/composer.json index aefc158c71..5ee1940fd0 100644 --- a/composer.json +++ b/composer.json @@ -37,6 +37,7 @@ "wikimedia/html-formatter": "1.0.1", "wikimedia/ip-set": "1.1.0", "wikimedia/php-session-serializer": "1.0.4", + "wikimedia/purtle": "1.0.6", "wikimedia/relpath": "2.0.0", "wikimedia/remex-html": "1.0.1", "wikimedia/running-stat": "1.1.0", diff --git a/docs/ontology.owl b/docs/ontology.owl new file mode 100644 index 0000000000..6b2e0b7f44 --- /dev/null +++ b/docs/ontology.owl @@ -0,0 +1,56 @@ + + + + + + + +]> + + + + + MediaWiki ontology + The ontology of MediaWiki + + + + + + Dump + A dump of MediaWiki content. + + + + Category + MediaWiki category. + + + + + + isInCategory + One category is the parent of another. + + + + + diff --git a/includes/CategoriesRdf.php b/includes/CategoriesRdf.php new file mode 100644 index 0000000000..e19dc2aadb --- /dev/null +++ b/includes/CategoriesRdf.php @@ -0,0 +1,95 @@ +rdfWriter = $writer; + } + + /** + * Set up the RDF prefixes used by the dump + */ + public function setupPrefixes() { + $this->rdfWriter->prefix( self::ONTOLOGY_PREFIX, self::ONTOLOGY_URL ); + $this->rdfWriter->prefix( 'rdfs', 'http://www.w3.org/2000/01/rdf-schema#' ); + $this->rdfWriter->prefix( 'owl', 'http://www.w3.org/2002/07/owl#' ); + $this->rdfWriter->prefix( 'schema', 'http://schema.org/' ); + $this->rdfWriter->prefix( 'cc', 'http://creativecommons.org/ns#' ); + } + + /** + * Write RDF data for a link between two categories.
+ * @param string $fromName Child category name + * @param string $toName Parent category name + */ + public function writeCategoryLinkData( $fromName, $toName ) { + $titleFrom = Title::makeTitle( NS_CATEGORY, $fromName ); + $titleTo = Title::makeTitle( NS_CATEGORY, $toName ); + $this->rdfWriter->about( $this->titleToUrl( $titleFrom ) ) + ->say( self::ONTOLOGY_PREFIX, 'isInCategory' ) + ->is( $this->titleToUrl( $titleTo ) ); + } + + /** + * Write out the data for a single category. + * @param string $categoryName Category name + */ + public function writeCategoryData( $categoryName ) { + $title = Title::makeTitle( NS_CATEGORY, $categoryName ); + $this->rdfWriter->about( $this->titleToUrl( $title ) ) + ->say( 'a' ) + ->is( self::ONTOLOGY_PREFIX, 'Category' ); + $titletext = $title->getText(); + $this->rdfWriter->say( 'rdfs', 'label' )->value( $titletext ); + } + + /** + * Convert a Title to the canonical full URL of its page. + * @param Title $title + * @return string + */ + private function titleToUrl( Title $title ) { + return $title->getFullURL( '', false, PROTO_CANONICAL ); + } +} diff --git a/maintenance/dumpCategoriesAsRdf.php b/maintenance/dumpCategoriesAsRdf.php new file mode 100644 index 0000000000..ff50498f0a --- /dev/null +++ b/maintenance/dumpCategoriesAsRdf.php @@ -0,0 +1,158 @@ +addDescription( "Generate RDF dump of categories in a wiki." ); + + $this->setBatchSize( 200 ); + $this->addOption( 'output', "Output file (default is stdout). Will be overwritten.", + false, true ); + $this->addOption( 'format', "Set the dump format.", false, true ); + } + + /** + * Produce row iterator for categories.
+ * @param IDatabase $dbr Database connection + * @return RecursiveIterator + */ + public function getCategoryIterator( IDatabase $dbr ) { + $it = new BatchRowIterator( + $dbr, + 'page', + [ 'page_title' ], + $this->mBatchSize + ); + $it->addConditions( [ + 'page_namespace' => NS_CATEGORY, + ] ); + $it->setFetchColumns( [ 'page_title', 'page_id' ] ); + return $it; + } + + /** + * Get iterator over subcategory links of the given pages. + * @param IDatabase $dbr Database connection + * @param array $ids List of page IDs + * @return Traversable + */ + public function getCategoryLinksIterator( IDatabase $dbr, array $ids ) { + $it = new BatchRowIterator( + $dbr, + 'categorylinks', + [ 'cl_from', 'cl_to' ], + $this->mBatchSize + ); + $it->addConditions( [ + 'cl_type' => 'subcat', + 'cl_from' => $ids + ] ); + $it->setFetchColumns( [ 'cl_from', 'cl_to' ] ); + return new RecursiveIteratorIterator( $it ); + } + + public function addDumpHeader( $timestamp ) { + global $wgRightsUrl; + $licenseUrl = $wgRightsUrl; + if ( substr( $licenseUrl, 0, 2 ) === '//' ) { + $licenseUrl = 'https:' .
$licenseUrl; + } + $this->rdfWriter->about( wfExpandUrl( '/categoriesDump', PROTO_CANONICAL ) ) + ->a( 'schema', 'Dataset' ) + ->a( 'owl', 'Ontology' ) + ->say( 'cc', 'license' )->is( $licenseUrl ) + ->say( 'schema', 'softwareVersion' )->value( CategoriesRdf::FORMAT_VERSION ) + ->say( 'schema', 'dateModified' ) + ->value( wfTimestamp( TS_ISO_8601, $timestamp ), 'xsd', 'dateTime' ) + ->say( 'schema', 'isPartOf' )->is( wfExpandUrl( '/', PROTO_CANONICAL ) ) + ->say( 'owl', 'imports' )->is( CategoriesRdf::OWL_URL ); + } + + public function execute() { + $outFile = $this->getOption( 'output', 'php://stdout' ); + + if ( $outFile === '-' ) { + $outFile = 'php://stdout'; + } + + $output = fopen( $outFile, 'w' ); + $this->rdfWriter = $this->createRdfWriter( $this->getOption( 'format', 'ttl' ) ); + $this->categoriesRdf = new CategoriesRdf( $this->rdfWriter ); + + $this->categoriesRdf->setupPrefixes(); + $this->rdfWriter->start(); + + $this->addDumpHeader( time() ); + fwrite( $output, $this->rdfWriter->drain() ); + + $dbr = $this->getDB( DB_REPLICA, [ 'vslow' ] ); + + foreach ( $this->getCategoryIterator( $dbr ) as $batch ) { + $pages = []; + foreach ( $batch as $row ) { + $this->categoriesRdf->writeCategoryData( $row->page_title ); + $pages[$row->page_id] = $row->page_title; + } + + foreach ( $this->getCategoryLinksIterator( $dbr, array_keys( $pages ) ) as $row ) { + $this->categoriesRdf->writeCategoryLinkData( $pages[$row->cl_from], $row->cl_to ); + } + fwrite( $output, $this->rdfWriter->drain() ); + } + fflush( $output ); + if ( $outFile !== 'php://stdout' ) { + fclose( $output ); + } + } + + /** + * @param string $format Writer format + * @return RdfWriter + */ + private function createRdfWriter( $format ) { + $factory = new RdfWriterFactory(); + return $factory->getWriter( $factory->getFormatName( $format ) ); + } +} + +$maintClass = "DumpCategoriesAsRdf"; +require_once RUN_MAINTENANCE_IF_MAIN; diff --git a/tests/phpunit/MediaWikiTestCase.php b/tests/phpunit/MediaWikiTestCase.php
index 215d292410..156de4c4b9 100644 --- a/tests/phpunit/MediaWikiTestCase.php +++ b/tests/phpunit/MediaWikiTestCase.php @@ -1846,4 +1846,29 @@ abstract class MediaWikiTestCase extends PHPUnit_Framework_TestCase { $this->mergeMwGlobalArrayValue( 'wgHooks', [ $hookName => [ $handler ] ] ); } + /** + * Check whether file contains given data. + * @param string $fileName File holding the expected data + * @param string $actualData Data compared against the file contents + * @param bool $createIfMissing If true, and file does not exist, create it with given data + * and skip the test. + * @param string $msg Message passed to the equality assertion + * @since 1.30 + */ + protected function assertFileContains( + $fileName, + $actualData, + $createIfMissing = true, + $msg = '' + ) { + if ( $createIfMissing ) { + if ( !file_exists( $fileName ) ) { + file_put_contents( $fileName, $actualData ); + $this->markTestSkipped( "Data file $fileName does not exist" ); + } + } else { + self::assertFileExists( $fileName ); + } + self::assertEquals( file_get_contents( $fileName ), $actualData, $msg ); + } } diff --git a/tests/phpunit/data/categoriesrdf/categoriesRdf-out.nt b/tests/phpunit/data/categoriesrdf/categoriesRdf-out.nt new file mode 100644 index 0000000000..d2d7ea81f1 --- /dev/null +++ b/tests/phpunit/data/categoriesrdf/categoriesRdf-out.nt @@ -0,0 +1,16 @@ + . + . + . + "1.0" . + "{DATE}"^^ . + . + . + . + "Category One" . + . + "2 Category Two" . + . + . + . + "\u0422\u0440\u0435\u0442\u044C\u044F \u043A\u0430\u0442\u0435\u0433\u043E\u0440\u0438\u044F" . + .
diff --git a/tests/phpunit/maintenance/categoriesRdfTest.php b/tests/phpunit/maintenance/categoriesRdfTest.php new file mode 100644 index 0000000000..ec2746e8ee --- /dev/null +++ b/tests/phpunit/maintenance/categoriesRdfTest.php @@ -0,0 +1,72 @@ + 'Category One', 'page_id' => 1 ], + (object)[ 'page_title' => '2 Category Two', 'page_id' => 2 ], + ], + // batch 2 + [ + (object)[ 'page_title' => 'Третья категория', 'page_id' => 3 ], + ] + ]; + } + + public function getCategoryLinksIterator( $dbr, array $ids ) { + $res = []; + foreach ( $ids as $pageid ) { + $res[] = (object)[ 'cl_from' => $pageid, 'cl_to' => "Parent of $pageid" ]; + } + return $res; + } + + public function testCategoriesDump() { + $this->setMwGlobals( [ + 'wgServer' => 'http://acme.test', + 'wgCanonicalServer' => 'http://acme.test', + 'wgArticlePath' => '/wiki/$1', + 'wgRightsUrl' => '//creativecommons.org/licenses/by-sa/3.0/', + ] ); + + $dumpScript = + $this->getMockBuilder( DumpCategoriesAsRdf::class ) + ->setMethods( [ 'getCategoryIterator', 'getCategoryLinksIterator' ] ) + ->getMock(); + + $dumpScript->expects( $this->once() ) + ->method( 'getCategoryIterator' ) + ->willReturn( $this->getCategoryIterator() ); + + $dumpScript->expects( $this->any() ) + ->method( 'getCategoryLinksIterator' ) + ->willReturnCallback( [ $this, 'getCategoryLinksIterator' ] ); + + /** @var DumpCategoriesAsRdf $dumpScript */ + $logFileName = tempnam( sys_get_temp_dir(), "Categories-DumpRdfTest" ); + $outFileName = tempnam( sys_get_temp_dir(), "Categories-DumpRdfTest" ); + + $dumpScript->loadParamsAndArgs( + null, + [ + 'log' => $logFileName, + 'output' => $outFileName, + 'format' => 'nt', + ] + ); + + $dumpScript->execute(); + $actualOut = file_get_contents( $outFileName ); + $actualOut = preg_replace( + '|<http://schema.org/dateModified> "[^"]+?"|', + '<http://schema.org/dateModified> "{DATE}"', + $actualOut + ); + + $outFile = __DIR__ . '/../data/categoriesrdf/categoriesRdf-out.nt'; + $this->assertFileContains( $outFile, $actualOut ); + } + +} -- 2.20.1